From d09f8311d2c6c938579d3299d499fbd9b9583549 Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Wed, 18 Jun 2025 21:08:07 -0700
Subject: [PATCH 1/9] Update
[ghstack-poisoned]
---
examples/models/llama/config/llm_config.py | 4 +-
examples/models/llama/export_llama_lib.py | 8 +-
extension/llm/export/README.md | 145 +++++++++++++++++++
extension/llm/export/export_llm.py | 55 ++++++-
extension/llm/export/test/test_export_llm.py | 112 ++++++++++++++
extension/llm/install_requirements.sh | 9 ++
6 files changed, 327 insertions(+), 6 deletions(-)
create mode 100644 extension/llm/export/README.md
create mode 100644 extension/llm/export/test/test_export_llm.py
create mode 100755 extension/llm/install_requirements.sh
diff --git a/examples/models/llama/config/llm_config.py b/examples/models/llama/config/llm_config.py
index 034d8af7562..201e3a5414a 100644
--- a/examples/models/llama/config/llm_config.py
+++ b/examples/models/llama/config/llm_config.py
@@ -65,7 +65,9 @@ class BaseConfig:
params: Model parameters, such as n_layers, hidden_size, etc.
If left empty will use defaults specified in model_args.py.
checkpoint: Path to the checkpoint file.
- If left empty, the model will be initialized with random weights.
+ If left empty, the model will be initialized with random weights if it is
+ a Llama model, or the weights will be downloaded from HuggingFace if it is
+ a non-Llama model.
checkpoint_dir: Path to directory containing sharded checkpoint files.
tokenizer_path: Path to the tokenizer file.
metadata: Json string containing metadata information.
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 1f055d65822..78c6244abee 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -53,6 +53,8 @@
)
from executorch.util.activation_memory_profiler import generate_memory_trace
+from omegaconf import DictConfig
+
from ..model_factory import EagerModelFactory
from .source_transformation.apply_spin_quant_r1_r2 import (
fuse_layer_norms,
@@ -571,12 +573,14 @@ def canonical_path(path: Union[str, Path], *, dir: bool = False) -> str:
def export_llama(
- export_options: Union[argparse.Namespace, LlmConfig],
+ export_options: Union[argparse.Namespace, LlmConfig, DictConfig],
) -> str:
if isinstance(export_options, argparse.Namespace):
# Legacy CLI.
llm_config = LlmConfig.from_args(export_options)
- elif isinstance(export_options, LlmConfig):
+ elif isinstance(export_options, LlmConfig) or isinstance(
+ export_options, DictConfig
+ ):
# Hydra CLI.
llm_config = export_options
else:
diff --git a/extension/llm/export/README.md b/extension/llm/export/README.md
new file mode 100644
index 00000000000..e15c7fd7f77
--- /dev/null
+++ b/extension/llm/export/README.md
@@ -0,0 +1,145 @@
+# LLM Export API
+
+This directory contains the unified API for exporting Large Language Models (LLMs) to ExecuTorch. The `export_llm` module provides a streamlined interface to convert various LLM architectures to optimized `.pte` files for on-device inference.
+
+## Overview
+
+The LLM export process transforms a model from its original format to an optimized representation suitable for mobile and edge devices. This involves several key steps:
+
+1. **Model Instantiation**: Load the model architecture and weights from sources like Hugging Face
+2. **Source Transformations**: Apply model-specific optimizations and quantization
+3. **IR Export**: Convert to intermediate representations (EXIR, Edge dialect)
+4. **Graph Transformations**: Apply backend-specific optimizations and PT2E quantization
+5. **Backend Delegation**: Partition operations to hardware-specific backends (XNNPACK, CoreML, QNN, etc.)
+6. **Serialization**: Export to final ExecuTorch `.pte` format
+
+## Supported Models
+
+- **Llama**: Llama 2, Llama 3, Llama 3.1, Llama 3.2 (1B, 3B, 8B variants)
+- **Qwen**: Qwen 2.5, Qwen 3 (0.6B, 1.7B, 4B variants)
+- **Phi**: Phi-3-Mini, Phi-4-Mini
+- **Stories**: Stories110M (educational model)
+- **SmolLM**: SmolLM2
+
+## Installation
+
+First, install the required dependencies:
+
+```bash
+./extension/llm/install_requirements.sh
+```
+
+## Usage
+
+The export API supports two configuration approaches:
+
+### Option 1: Hydra CLI Arguments
+
+Use structured configuration arguments directly on the command line:
+
+```bash
+python -m extension.llm.export.export_llm \
+ base.model_class=llama3 \
+ model.use_sdpa_with_kv_cache=True \
+ model.use_kv_cache=True \
+ export.max_seq_length=128 \
+ debug.verbose=True \
+ backend.xnnpack.enabled=True \
+ backend.xnnpack.extended_ops=True \
+ quantization.qmode=8da4w
+```
+
+### Option 2: Configuration File
+
+Create a YAML configuration file and reference it:
+
+```bash
+python -m extension.llm.export.export_llm --config my_config.yaml
+```
+
+Example `my_config.yaml`:
+```yaml
+base:
+ model_class: llama3
+ tokenizer_path: /path/to/tokenizer.json
+
+model:
+ use_kv_cache: true
+ use_sdpa_with_kv_cache: true
+ enable_dynamic_shape: true
+
+export:
+ max_seq_length: 512
+ output_dir: ./exported_models
+ output_name: llama3_optimized.pte
+
+quantization:
+ qmode: 8da4w
+ group_size: 32
+
+backend:
+ xnnpack:
+ enabled: true
+ extended_ops: true
+
+debug:
+ verbose: true
+```
+
+**Important**: You cannot mix both approaches. Use either CLI arguments OR a config file, not both.
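+
+For example, an invocation like the following (with a hypothetical config file name) is rejected with a `ValueError`:
+
+```bash
+# Mixing --config with Hydra-style overrides is not supported.
+python -m extension.llm.export.export_llm --config my_config.yaml debug.verbose=True
+```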
+
+## Example Commands
+
+### Export Qwen3 0.6B with XNNPACK backend and quantization
+```bash
+python -m extension.llm.export.export_llm \
+ base.model_class=qwen3-0_6b \
+ base.params=examples/models/qwen3/0_6b_config.json \
+ base.metadata='{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
+ model.use_kv_cache=true \
+ model.use_sdpa_with_kv_cache=true \
+ model.dtype_override=fp32 \
+ export.max_seq_length=512 \
+ export.output_name=qwen3_0_6b.pte \
+ quantization.qmode=8da4w \
+ backend.xnnpack.enabled=true \
+ backend.xnnpack.extended_ops=true \
+ debug.verbose=true
+```
+
+### Export Phi-4-Mini with custom checkpoint
+```bash
+python -m extension.llm.export.export_llm \
+ base.model_class=phi_4_mini \
+ base.checkpoint=/path/to/phi4_checkpoint.pth \
+ base.params=examples/models/phi-4-mini/config.json \
+ base.metadata='{"get_bos_id":151643, "get_eos_ids":[151643]}' \
+ model.use_kv_cache=true \
+ model.use_sdpa_with_kv_cache=true \
+ export.max_seq_length=256 \
+ export.output_name=phi4_mini.pte \
+ backend.xnnpack.enabled=true \
+ debug.verbose=true
+```
+
+### Export with CoreML backend (iOS optimization)
+```bash
+python -m extension.llm.export.export_llm \
+ base.model_class=llama3 \
+ model.use_kv_cache=true \
+ export.max_seq_length=128 \
+ backend.coreml.enabled=true \
+ backend.coreml.compute_units=all \
+ quantization.pt2e_quantize=coreml_c4w \
+ debug.verbose=true
+```
+
+## Configuration Options
+
+For a complete reference of all available configuration options, see the [LlmConfig class definition](../../../examples/models/llama/config/llm_config.py) which documents all supported parameters for base, model, export, quantization, backend, and debug configurations.
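+
+The export entry point (`export_llama` in `examples/models/llama/export_llama_lib.py`) also accepts an `LlmConfig` object directly, so the same options can be set from Python. The snippet below is only a rough sketch (field values are illustrative, not a recommended recipe):
+
+```python
+from executorch.examples.models.llama.config.llm_config import LlmConfig
+from executorch.examples.models.llama.export_llama_lib import export_llama
+
+# Start from the documented defaults and override a few fields.
+llm_config = LlmConfig()
+llm_config.model.use_kv_cache = True
+llm_config.model.use_sdpa_with_kv_cache = True
+llm_config.backend.xnnpack.enabled = True
+llm_config.export.max_seq_length = 128
+llm_config.export.output_name = "llama3.pte"  # illustrative output name
+
+export_llama(llm_config)  # lowers, delegates, and serializes the .pte file
+```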
+
+## Further Reading
+
+- [Llama Examples](../../../examples/models/llama/README.md) - Comprehensive Llama export guide
+- [LLM Runner](../runner/) - Running exported models
+- [ExecuTorch Documentation](https://pytorch.org/executorch/) - Framework overview
\ No newline at end of file
diff --git a/extension/llm/export/export_llm.py b/extension/llm/export/export_llm.py
index 09a15d6ab58..2af7439b805 100644
--- a/extension/llm/export/export_llm.py
+++ b/extension/llm/export/export_llm.py
@@ -23,23 +23,72 @@
backend.xnnpack.enabled=True \
backend.xnnpack.extended_ops=True \
quantization.qmode="8da4w"
+
+Example usage with a config file:
+python -m extension.llm.export.export_llm \
+ --config example_llm_config.yaml
"""
+import argparse
+import sys
+from typing import List, Optional, Tuple
+
import hydra
+import yaml
from executorch.examples.models.llama.config.llm_config import LlmConfig
from executorch.examples.models.llama.export_llama_lib import export_llama
from hydra.core.config_store import ConfigStore
-from omegaconf import OmegaConf
+from omegaconf import DictConfig, OmegaConf
cs = ConfigStore.instance()
cs.store(name="llm_config", node=LlmConfig)
-@hydra.main(version_base=None, config_path=None, config_name="llm_config")
-def main(llm_config: LlmConfig) -> None:
+def parse_config_arg() -> Tuple[Optional[str], List[str]]:
+ """Parse the optional --config argument, which selects a YAML config file instead of Hydra CLI overrides."""
+ parser = argparse.ArgumentParser(add_help=True)
+ parser.add_argument("--config", type=str, help="Path to the LlmConfig file")
+ args, remaining = parser.parse_known_args()
+ return args.config, remaining
+
+
+def pop_config_arg() -> str:
+ """
+ Removes '--config' and its value from sys.argv.
+ Assumes --config is specified and argparse has already validated the args.
+ """
+ idx = sys.argv.index("--config")
+ value = sys.argv[idx + 1]
+ del sys.argv[idx : idx + 2]
+ return value
+
+
+@hydra.main(version_base=None, config_name="llm_config")
+def hydra_main(llm_config: LlmConfig) -> None:
export_llama(OmegaConf.to_object(llm_config))
+def main() -> None:
+ config, remaining_args = parse_config_arg()
+ if config:
+ # Check if there are any remaining hydra CLI args when --config is specified
+ # This might change in the future to allow overriding config file values
+ if remaining_args:
+ raise ValueError(
+ "Cannot specify additional CLI arguments when using --config. "
+ f"Found: {remaining_args}. Use either --config file or hydra CLI args, not both."
+ )
+
+ config_file_path = pop_config_arg()
+ default_llm_config = LlmConfig()
+ llm_config_from_file = OmegaConf.load(config_file_path)
+ # Override defaults with values specified in the .yaml provided by --config.
+ merged_llm_config = OmegaConf.merge(default_llm_config, llm_config_from_file)
+ export_llama(merged_llm_config)
+ else:
+ hydra_main()
+
+
if __name__ == "__main__":
main()
diff --git a/extension/llm/export/test/test_export_llm.py b/extension/llm/export/test/test_export_llm.py
new file mode 100644
index 00000000000..970a32c9606
--- /dev/null
+++ b/extension/llm/export/test/test_export_llm.py
@@ -0,0 +1,112 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+import sys
+import tempfile
+import unittest
+from unittest.mock import MagicMock, patch
+
+from executorch.examples.models.llama.config.llm_config import LlmConfig
+from executorch.extension.llm.export.export_llm import main, parse_config_arg, pop_config_arg
+
+
+class TestExportLlm(unittest.TestCase):
+ def test_parse_config_arg_with_config(self) -> None:
+ """Test parse_config_arg when --config is provided."""
+ # Mock sys.argv to include --config
+ test_argv = ["script.py", "--config", "test_config.yaml", "extra", "args"]
+ with patch.object(sys, "argv", test_argv):
+ config_path, remaining = parse_config_arg()
+ self.assertEqual(config_path, "test_config.yaml")
+ self.assertEqual(remaining, ["extra", "args"])
+
+ def test_parse_config_arg_without_config(self) -> None:
+ """Test parse_config_arg when --config is not provided."""
+ test_argv = ["script.py", "debug.verbose=True"]
+ with patch.object(sys, "argv", test_argv):
+ config_path, remaining = parse_config_arg()
+ self.assertIsNone(config_path)
+ self.assertEqual(remaining, ["debug.verbose=True"])
+
+ def test_pop_config_arg(self) -> None:
+ """Test pop_config_arg removes --config and its value from sys.argv."""
+ test_argv = ["script.py", "--config", "test_config.yaml", "other", "args"]
+ with patch.object(sys, "argv", test_argv):
+ config_path = pop_config_arg()
+ self.assertEqual(config_path, "test_config.yaml")
+ self.assertEqual(sys.argv, ["script.py", "other", "args"])
+
+ @patch("executorch.extension.llm.export.export_llm.export_llama")
+ def test_with_config(self, mock_export_llama: MagicMock) -> None:
+ """Test main function with --config file and no hydra args."""
+ # Create a temporary config file
+ with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
+ f.write("""
+base:
+ tokenizer_path: /path/to/tokenizer.json
+export:
+ max_seq_length: 256
+""")
+ config_file = f.name
+
+ try:
+ test_argv = ["script.py", "--config", config_file]
+ with patch.object(sys, "argv", test_argv):
+ main()
+
+ # Verify export_llama was called with config
+ mock_export_llama.assert_called_once()
+ called_config = mock_export_llama.call_args[0][0]
+ self.assertEqual(called_config["base"]["tokenizer_path"], "/path/to/tokenizer.json")
+ self.assertEqual(called_config["export"]["max_seq_length"], 256)
+ finally:
+ os.unlink(config_file)
+
+ def test_with_cli_args(self) -> None:
+ """Test main function with only hydra CLI args."""
+ test_argv = ["script.py", "debug.verbose=True"]
+ with patch.object(sys, "argv", test_argv):
+ with patch("executorch.extension.llm.export.export_llm.hydra_main") as mock_hydra:
+ main()
+ mock_hydra.assert_called_once()
+
+ def test_config_with_cli_args_error(self) -> None:
+ """Test that --config rejects additional CLI arguments to prevent mixing approaches."""
+ # Create a temporary config file
+ with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
+ f.write("base:\n checkpoint: /path/to/checkpoint.pth")
+ config_file = f.name
+
+ try:
+ test_argv = ["script.py", "--config", config_file, "debug.verbose=True"]
+ with patch.object(sys, "argv", test_argv):
+ with self.assertRaises(ValueError) as cm:
+ main()
+
+ error_msg = str(cm.exception)
+ self.assertIn("Cannot specify additional CLI arguments when using --config", error_msg)
+ finally:
+ os.unlink(config_file)
+
+ def test_config_rejects_multiple_cli_args(self) -> None:
+ """Test that --config rejects multiple CLI arguments (not just single ones)."""
+ with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
+ f.write("export:\n max_seq_length: 128")
+ config_file = f.name
+
+ try:
+ test_argv = ["script.py", "--config", config_file, "debug.verbose=True", "export.output_dir=/tmp"]
+ with patch.object(sys, "argv", test_argv):
+ with self.assertRaises(ValueError):
+ main()
+ finally:
+ os.unlink(config_file)
+
+
+if __name__ == "__main__":
+ unittest.main()
+
diff --git a/extension/llm/install_requirements.sh b/extension/llm/install_requirements.sh
new file mode 100755
index 00000000000..8f322083c03
--- /dev/null
+++ b/extension/llm/install_requirements.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Install requirements for LLM extension
+pip install "hydra-core>=1.3.0" "omegaconf>=2.3.0"
From 9b8ea72d164fee5817b91d5ebe63ad42c07fc796 Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Wed, 18 Jun 2025 21:08:10 -0700
Subject: [PATCH 2/9] Update
[ghstack-poisoned]
---
examples/models/llama/config/llm_config.py | 169 +++++++++---------
.../models/llama/config/test_llm_config.py | 31 +++-
extension/llm/export/test/test_export_llm.py | 35 +++-
3 files changed, 149 insertions(+), 86 deletions(-)
diff --git a/examples/models/llama/config/llm_config.py b/examples/models/llama/config/llm_config.py
index 201e3a5414a..0504b386f45 100644
--- a/examples/models/llama/config/llm_config.py
+++ b/examples/models/llama/config/llm_config.py
@@ -16,7 +16,6 @@
import ast
import re
from dataclasses import dataclass, field
-from enum import Enum
from typing import ClassVar, List, Optional
@@ -25,32 +24,27 @@
################################################################################
-class ModelType(str, Enum):
- STORIES110M = "stories110m"
- LLAMA2 = "llama2"
- LLAMA3 = "llama3"
- LLAMA3_1 = "llama3_1"
- LLAMA3_2 = "llama3_2"
- LLAMA3_2_VISION = "llama3_2_vision"
- STATIC_LLAMA = "static_llama"
- QWEN2_5 = "qwen2_5"
- QWEN3_0_6B = "qwen3-0_6b"
- QWEN3_1_7B = "qwen3-1_7b"
- QWEN3_4B = "qwen3-4b"
- PHI_4_MINI = "phi_4_mini"
- SMOLLM2 = "smollm2"
+MODEL_TYPE_OPTIONS = [
+ "stories110m",
+ "llama2",
+ "llama3",
+ "llama3_1",
+ "llama3_2",
+ "llama3_2_vision",
+ "static_llama",
+ "qwen2_5",
+ "qwen3-0_6b",
+ "qwen3-1_7b",
+ "qwen3-4b",
+ "phi_4_mini",
+ "smollm2",
+]
-class PreqMode(str, Enum):
- """
- If you are dealing with pre-quantized checkpoints, this used to
- be the way to specify them. Now you don't need to specify these
- options if you use a TorchAo-prequantized checkpoint, but they
- are still around to preserve backward compatibility.
- """
-
- PREQ_8DA4W = "8da4w"
- PREQ_8DA4W_OUT_8DA8W = "8da4w_output_8da8w"
+PREQ_MODE_OPTIONS = [
+ "8da4w",
+ "8da4w_output_8da8w",
+]
@dataclass
@@ -82,7 +76,7 @@ class BaseConfig:
are loaded.
"""
- model_class: ModelType = ModelType.LLAMA3
+ model_class: str = "llama3"
params: Optional[str] = None
checkpoint: Optional[str] = None
checkpoint_dir: Optional[str] = None
@@ -90,26 +84,28 @@ class BaseConfig:
metadata: Optional[str] = None
use_lora: int = 0
fairseq2: bool = False
- preq_mode: Optional[PreqMode] = None
+ preq_mode: Optional[str] = None
preq_group_size: int = 32
preq_embedding_quantize: str = "8,0"
+ def __post_init__(self):
+ if self.model_class not in MODEL_TYPE_OPTIONS:
+ raise ValueError(f"model_class must be one of {MODEL_TYPE_OPTIONS}, got '{self.model_class}'")
+
+ if self.preq_mode is not None and self.preq_mode not in PREQ_MODE_OPTIONS:
+ raise ValueError(f"preq_mode must be one of {PREQ_MODE_OPTIONS}, got '{self.preq_mode}'")
+
################################################################################
################################# ModelConfig ##################################
################################################################################
-class DtypeOverride(str, Enum):
- """
- DType of the model. Highly recommended to use "fp32", unless you want to
- export without a backend, in which case you can also use "bf16". "fp16"
- is not recommended.
- """
-
- FP32 = "fp32"
- FP16 = "fp16"
- BF16 = "bf16"
+DTYPE_OVERRIDE_OPTIONS = [
+ "fp32",
+ "fp16",
+ "bf16",
+]
@dataclass
@@ -147,7 +143,7 @@ class ModelConfig:
[16] pattern specifies all layers have a sliding window of 16.
"""
- dtype_override: DtypeOverride = DtypeOverride.FP32
+ dtype_override: str = "fp32"
enable_dynamic_shape: bool = True
use_shared_embedding: bool = False
use_sdpa_with_kv_cache: bool = False
@@ -160,6 +156,9 @@ class ModelConfig:
local_global_attention: Optional[List[int]] = None
def __post_init__(self):
+ if self.dtype_override not in DTYPE_OVERRIDE_OPTIONS:
+ raise ValueError(f"dtype_override must be one of {DTYPE_OVERRIDE_OPTIONS}, got '{self.dtype_override}'")
+
self._validate_attention_sink()
self._validate_local_global_attention()
@@ -261,31 +260,25 @@ class DebugConfig:
################################################################################
-class Pt2eQuantize(str, Enum):
- """
- Type of backend-specific Pt2e quantization strategy to use.
-
- Pt2e uses a different quantization library that is graph-based
- compared to `qmode`, which is also specified in the QuantizationConfig
- and is source transform-based.
- """
+PT2E_QUANTIZE_OPTIONS = [
+ "xnnpack_dynamic",
+ "xnnpack_dynamic_qc4",
+ "qnn_8a8w",
+ "qnn_16a16w",
+ "qnn_16a4w",
+ "coreml_c4w",
+ "coreml_8a_c8w",
+ "coreml_8a_c4w",
+ "coreml_baseline_8a_c8w",
+ "coreml_baseline_8a_c4w",
+ "vulkan_8w",
+]
- XNNPACK_DYNAMIC = "xnnpack_dynamic"
- XNNPACK_DYNAMIC_QC4 = "xnnpack_dynamic_qc4"
- QNN_8A8W = "qnn_8a8w"
- QNN_16A16W = "qnn_16a16w"
- QNN_16A4W = "qnn_16a4w"
- COREML_C4W = "coreml_c4w"
- COREML_8A_C8W = "coreml_8a_c8w"
- COREML_8A_C4W = "coreml_8a_c4w"
- COREML_BASELINE_8A_C8W = "coreml_baseline_8a_c8w"
- COREML_BASELINE_8A_C4W = "coreml_baseline_8a_c4w"
- VULKAN_8W = "vulkan_8w"
-
-class SpinQuant(str, Enum):
- CUDA = "cuda"
- NATIVE = "native"
+SPIN_QUANT_OPTIONS = [
+ "cuda",
+ "native",
+]
@dataclass
@@ -320,9 +313,9 @@ class QuantizationConfig:
qmode: Optional[str] = None
embedding_quantize: Optional[str] = None
- pt2e_quantize: Optional[Pt2eQuantize] = None
+ pt2e_quantize: Optional[str] = None
group_size: Optional[int] = None
- use_spin_quant: Optional[SpinQuant] = None
+ use_spin_quant: Optional[str] = None
use_qat: bool = False
calibration_tasks: Optional[List[str]] = None
calibration_limit: Optional[int] = None
@@ -330,6 +323,12 @@ class QuantizationConfig:
calibration_data: str = "Once upon a time"
def __post_init__(self):
+ if self.pt2e_quantize is not None and self.pt2e_quantize not in PT2E_QUANTIZE_OPTIONS:
+ raise ValueError(f"pt2e_quantize must be one of {PT2E_QUANTIZE_OPTIONS}, got '{self.pt2e_quantize}'")
+
+ if self.use_spin_quant is not None and self.use_spin_quant not in SPIN_QUANT_OPTIONS:
+ raise ValueError(f"use_spin_quant must be one of {SPIN_QUANT_OPTIONS}, got '{self.use_spin_quant}'")
+
if self.qmode:
self._validate_qmode()
@@ -377,16 +376,18 @@ class XNNPackConfig:
extended_ops: bool = False
-class CoreMLQuantize(str, Enum):
- B4W = "b4w"
- C4W = "c4w"
+COREML_QUANTIZE_OPTIONS = [
+ "b4w",
+ "c4w",
+]
-class CoreMLComputeUnit(str, Enum):
- CPU_ONLY = "cpu_only"
- CPU_AND_GPU = "cpu_and_gpu"
- CPU_AND_NE = "cpu_and_ne"
- ALL = "all"
+COREML_COMPUTE_UNIT_OPTIONS = [
+ "cpu_only",
+ "cpu_and_gpu",
+ "cpu_and_ne",
+ "all",
+]
@dataclass
@@ -398,11 +399,17 @@ class CoreMLConfig:
enabled: bool = False
enable_state: bool = False
preserve_sdpa: bool = False
- quantize: Optional[CoreMLQuantize] = None
+ quantize: Optional[str] = None
ios: int = 15
- compute_units: CoreMLComputeUnit = CoreMLComputeUnit.CPU_ONLY
+ compute_units: str = "cpu_only"
def __post_init__(self):
+ if self.quantize is not None and self.quantize not in COREML_QUANTIZE_OPTIONS:
+ raise ValueError(f"quantize must be one of {COREML_QUANTIZE_OPTIONS}, got '{self.quantize}'")
+
+ if self.compute_units not in COREML_COMPUTE_UNIT_OPTIONS:
+ raise ValueError(f"compute_units must be one of {COREML_COMPUTE_UNIT_OPTIONS}, got '{self.compute_units}'")
+
if self.ios not in (15, 16, 17, 18):
raise ValueError(f"Invalid coreml ios version: {self.ios}")
@@ -481,7 +488,7 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig": # noqa: C901
# BaseConfig
if hasattr(args, "model"):
- llm_config.base.model_class = ModelType(args.model)
+ llm_config.base.model_class = args.model
if hasattr(args, "params"):
llm_config.base.params = args.params
if hasattr(args, "checkpoint"):
@@ -499,7 +506,7 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig": # noqa: C901
# PreqMode settings
if hasattr(args, "preq_mode") and args.preq_mode:
- llm_config.base.preq_mode = PreqMode(args.preq_mode)
+ llm_config.base.preq_mode = args.preq_mode
if hasattr(args, "preq_group_size"):
llm_config.base.preq_group_size = args.preq_group_size
if hasattr(args, "preq_embedding_quantize"):
@@ -507,7 +514,7 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig": # noqa: C901
# ModelConfig
if hasattr(args, "dtype_override"):
- llm_config.model.dtype_override = DtypeOverride(args.dtype_override)
+ llm_config.model.dtype_override = args.dtype_override
if hasattr(args, "enable_dynamic_shape"):
llm_config.model.enable_dynamic_shape = args.enable_dynamic_shape
if hasattr(args, "use_shared_embedding"):
@@ -549,11 +556,11 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig": # noqa: C901
if hasattr(args, "embedding_quantize"):
llm_config.quantization.embedding_quantize = args.embedding_quantize
if hasattr(args, "pt2e_quantize") and args.pt2e_quantize:
- llm_config.quantization.pt2e_quantize = Pt2eQuantize(args.pt2e_quantize)
+ llm_config.quantization.pt2e_quantize = args.pt2e_quantize
if hasattr(args, "group_size"):
llm_config.quantization.group_size = args.group_size
if hasattr(args, "use_spin_quant") and args.use_spin_quant:
- llm_config.quantization.use_spin_quant = SpinQuant(args.use_spin_quant)
+ llm_config.quantization.use_spin_quant = args.use_spin_quant
if hasattr(args, "use_qat"):
llm_config.quantization.use_qat = args.use_qat
if hasattr(args, "calibration_tasks"):
@@ -581,13 +588,11 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig": # noqa: C901
args, "coreml_preserve_sdpa", False
)
if hasattr(args, "coreml_quantize") and args.coreml_quantize:
- llm_config.backend.coreml.quantize = CoreMLQuantize(args.coreml_quantize)
+ llm_config.backend.coreml.quantize = args.coreml_quantize
if hasattr(args, "coreml_ios"):
llm_config.backend.coreml.ios = args.coreml_ios
if hasattr(args, "coreml_compute_units"):
- llm_config.backend.coreml.compute_units = CoreMLComputeUnit(
- args.coreml_compute_units
- )
+ llm_config.backend.coreml.compute_units = args.coreml_compute_units
# Vulkan
if hasattr(args, "vulkan"):
diff --git a/examples/models/llama/config/test_llm_config.py b/examples/models/llama/config/test_llm_config.py
index 0853e9dbbd8..15513bcd6f2 100644
--- a/examples/models/llama/config/test_llm_config.py
+++ b/examples/models/llama/config/test_llm_config.py
@@ -11,7 +11,6 @@
from executorch.examples.models.llama.config.llm_config import (
BackendConfig,
BaseConfig,
- CoreMLComputeUnit,
CoreMLConfig,
DebugConfig,
ExportConfig,
@@ -66,6 +65,34 @@ def test_shared_embedding_without_lowbit(self):
with self.assertRaises(ValueError):
LlmConfig(model=model_cfg, quantization=qcfg)
+ def test_invalid_model_type(self):
+ with self.assertRaises(ValueError):
+ BaseConfig(model_class="invalid_model")
+
+ def test_invalid_dtype_override(self):
+ with self.assertRaises(ValueError):
+ ModelConfig(dtype_override="invalid_dtype")
+
+ def test_invalid_preq_mode(self):
+ with self.assertRaises(ValueError):
+ BaseConfig(preq_mode="invalid_preq")
+
+ def test_invalid_pt2e_quantize(self):
+ with self.assertRaises(ValueError):
+ QuantizationConfig(pt2e_quantize="invalid_pt2e")
+
+ def test_invalid_spin_quant(self):
+ with self.assertRaises(ValueError):
+ QuantizationConfig(use_spin_quant="invalid_spin")
+
+ def test_invalid_coreml_quantize(self):
+ with self.assertRaises(ValueError):
+ CoreMLConfig(quantize="invalid_quantize")
+
+ def test_invalid_coreml_compute_units(self):
+ with self.assertRaises(ValueError):
+ CoreMLConfig(compute_units="invalid_compute_units")
+
class TestValidConstruction(unittest.TestCase):
@@ -94,7 +121,7 @@ def test_valid_llm_config(self):
backend=BackendConfig(
xnnpack=XNNPackConfig(enabled=False),
coreml=CoreMLConfig(
- enabled=True, ios=17, compute_units=CoreMLComputeUnit.ALL
+ enabled=True, ios=17, compute_units="all"
),
),
)
diff --git a/extension/llm/export/test/test_export_llm.py b/extension/llm/export/test/test_export_llm.py
index 970a32c9606..258a867dc6b 100644
--- a/extension/llm/export/test/test_export_llm.py
+++ b/extension/llm/export/test/test_export_llm.py
@@ -47,9 +47,20 @@ def test_with_config(self, mock_export_llama: MagicMock) -> None:
with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
f.write("""
base:
+ model_class: llama3
tokenizer_path: /path/to/tokenizer.json
+ preq_mode: 8da4w
+model:
+ dtype_override: fp32
export:
max_seq_length: 256
+quantization:
+ pt2e_quantize: xnnpack_dynamic
+ use_spin_quant: cuda
+backend:
+ coreml:
+ quantize: c4w
+ compute_units: cpu_and_gpu
""")
config_file = f.name
@@ -61,8 +72,15 @@ def test_with_config(self, mock_export_llama: MagicMock) -> None:
# Verify export_llama was called with config
mock_export_llama.assert_called_once()
called_config = mock_export_llama.call_args[0][0]
+ self.assertEqual(called_config["base"]["model_class"], "llama3")
self.assertEqual(called_config["base"]["tokenizer_path"], "/path/to/tokenizer.json")
+ self.assertEqual(called_config["base"]["preq_mode"], "8da4w")
+ self.assertEqual(called_config["model"]["dtype_override"], "fp32")
self.assertEqual(called_config["export"]["max_seq_length"], 256)
+ self.assertEqual(called_config["quantization"]["pt2e_quantize"], "xnnpack_dynamic")
+ self.assertEqual(called_config["quantization"]["use_spin_quant"], "cuda")
+ self.assertEqual(called_config["backend"]["coreml"]["quantize"], "c4w")
+ self.assertEqual(called_config["backend"]["coreml"]["compute_units"], "cpu_and_gpu")
finally:
os.unlink(config_file)
@@ -78,7 +96,13 @@ def test_config_with_cli_args_error(self) -> None:
"""Test that --config rejects additional CLI arguments to prevent mixing approaches."""
# Create a temporary config file
with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
- f.write("base:\n checkpoint: /path/to/checkpoint.pth")
+ f.write("""
+base:
+ model_class: llama2
+ checkpoint: /path/to/checkpoint.pth
+model:
+ dtype_override: bf16
+""")
config_file = f.name
try:
@@ -95,7 +119,14 @@ def test_config_with_cli_args_error(self) -> None:
def test_config_rejects_multiple_cli_args(self) -> None:
"""Test that --config rejects multiple CLI arguments (not just single ones)."""
with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
- f.write("export:\n max_seq_length: 128")
+ f.write("""
+base:
+ model_class: qwen2_5
+export:
+ max_seq_length: 128
+quantization:
+ pt2e_quantize: qnn_8a8w
+""")
config_file = f.name
try:
From f31059be1fbe183a307ad21cb14fbbdb89ca4e02 Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Wed, 18 Jun 2025 21:08:14 -0700
Subject: [PATCH 3/9] Update
[ghstack-poisoned]
---
.../demo-apps/android/LlamaDemo/README.md | 2 +-
.../docs/delegates/qualcomm_README.md | 18 +--
.../docs/delegates/xnnpack_README.md | 10 +-
.../LLaMA/docs/delegates/mps_README.md | 2 +-
.../LLaMA/docs/delegates/xnnpack_README.md | 8 +-
.../deepseek-r1-distill-llama-8B/README.md | 24 +--
examples/models/llama/README.md | 140 +++++++++---------
examples/models/llama/UTILS.md | 12 +-
examples/models/llama2/README.md | 2 +-
examples/models/phi_4_mini/README.md | 28 ++--
examples/models/qwen2_5/README.md | 28 ++--
examples/models/qwen3/README.md | 76 +++++-----
12 files changed, 175 insertions(+), 175 deletions(-)
diff --git a/examples/demo-apps/android/LlamaDemo/README.md b/examples/demo-apps/android/LlamaDemo/README.md
index 4b8cafd2d4e..8fed04d7ff5 100644
--- a/examples/demo-apps/android/LlamaDemo/README.md
+++ b/examples/demo-apps/android/LlamaDemo/README.md
@@ -154,7 +154,7 @@ curl -C - -Ls "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokeni
# Create params.json file
touch params.json
echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json
-python -m examples.models.llama.export_llama -c stories110M.pt -p params.json -d fp16 -n stories110m_h.pte -kv
+python -m extension.llm.export.export_llm base.checkpoint=stories110M.pt base.params=params.json model.dtype_override="fp16" export.output_name=stories110m_h.pte model.use_kv_cache=True
python -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin
```
### Push model
diff --git a/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md b/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md
index fb9df3c3375..969b6cacab9 100644
--- a/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md
+++ b/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md
@@ -97,7 +97,7 @@ cmake --build cmake-out/examples/models/llama -j16 --config Release
## Export Llama Model
QNN backend currently supports exporting to these data types: fp32, int4/ int8 with PTQ, int4 with SpinQuant (Llama 3 only).
-We also support export for different Qualcomm SoC. We have verified SM8650(V75) and SM8550(V73). To export for different SoC, add “--soc_model SM8550” in your export command. Without setting this flag, the export will default to SM8650.
+We also support export for different Qualcomm SoC. We have verified SM8650(V75) and SM8550(V73). To export for different SoC, add "--soc_model SM8550" in your export command. Without setting this flag, the export will default to SM8650.
### Export with PTQ
We support PTQ by default. The entire export may take ~20 minutes (Llama 3.1 8B). However, there is accuracy regression and we are working on improving it.
@@ -106,12 +106,12 @@ We support PTQ by default. The entire export may take ~20 minutes (Llama 3.1 8B)
Examples:
```
# 4 bits weight only quantize
-python -m examples.models.llama.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" -p "${MODEL_DIR}/params.json" -kv --disable_dynamic_shape --qnn --pt2e_quantize qnn_16a4w -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="test.pte”
+python -m extension.llm.export.export_llm base.checkpoint="${MODEL_DIR}/consolidated.00.pth" base.params="${MODEL_DIR}/params.json" model.use_kv_cache=True model.enable_dynamic_shape=False backend.qnn.enabled=True backend.qnn.quantization="qnn_16a4w" model.dtype_override="fp32" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="test.pte"
```
If the model is really big, it may require model sharding because the Qualcomm DSP is a 32bit system and has a 4GB size limit . For example for Llama 3 8B models, we need to shard the model into 4, but ExecuTorch still packages it into one PTE file. Here is an example:
```
# 8 bits quantization with 4 shards
-python -m examples.models.llama.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" -p "${MODEL_DIR}/params.json" -kv --disable_dynamic_shape --qnn --pt2e_quantize qnn_8a8w -d fp32 --num_sharding 4 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="test.pte”
+python -m extension.llm.export.export_llm base.checkpoint="${MODEL_DIR}/consolidated.00.pth" base.params="${MODEL_DIR}/params.json" model.use_kv_cache=True model.enable_dynamic_shape=False backend.qnn.enabled=True backend.qnn.quantization="qnn_8a8w" model.dtype_override="fp32" backend.qnn.num_sharding=4 base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="test.pte"
```
Note: if you encountered issues below
```
@@ -163,7 +163,7 @@ To export Llama 3 8B instruct with the Qualcomm AI Engine Direct Backend, ensure
* 8B models might need 16GB RAM on the device to run.
```
# Please note that calibration_data must include the prompt template for special tokens.
-python -m examples.models.llama.export_llama -t -p -c --use_kv_cache --qnn --pt2e_quantize qnn_16a4w --disable_dynamic_shape --num_sharding 8 --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --optimized_rotation_path --calibration_data "<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
+python -m extension.llm.export.export_llm base.tokenizer_path= base.params= base.checkpoint= model.use_kv_cache=True backend.qnn.enabled=True backend.qnn.quantization="qnn_16a4w" model.enable_dynamic_shape=False backend.qnn.num_sharding=8 backend.qnn.calibration_tasks="wikitext" backend.qnn.calibration_limit=1 backend.qnn.calibration_seq_length=128 backend.qnn.optimized_rotation_path= backend.qnn.calibration_data="<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
```
## Pushing Model and Tokenizer
@@ -210,17 +210,17 @@ Alternative you can also just run the shell script directly as in the root direc
sh examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh
```
This is running the shell script which configures the required core ExecuTorch, Llama2/3, and Android libraries, builds them into AAR, and copies it to the app.
-Note: If you are building the Android app mentioned in the next section on a separate machine (i.e. MacOS but building and exporting for QNN backend on Linux), make sure you copy the aar file generated from setup-with-qnn script to “examples/demo-apps/android/LlamaDemo/app/libs” before building the Android app.
+Note: If you are building the Android app mentioned in the next section on a separate machine (i.e. MacOS but building and exporting for QNN backend on Linux), make sure you copy the aar file generated from setup-with-qnn script to "examples/demo-apps/android/LlamaDemo/app/libs" before building the Android app.
## Run the Android Demo App
-First, make sure your Android phone’s chipset version is compatible with this demo (SM8650, SM8550). You can find the Qualcomm chipset version here in the [mapping](https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/overview.html).
+First, make sure your Android phone's chipset version is compatible with this demo (SM8650, SM8550). You can find the Qualcomm chipset version here in the [mapping](https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/overview.html).
-If you build and run the setup-with-qnn script on a separate machine rather than where you are building the Android app, make sure you copy the aar file it generated into “examples/demo-apps/android/LlamaDemo/app/libs”
+If you build and run the setup-with-qnn script on a separate machine rather than where you are building the Android app, make sure you copy the aar file it generated into "examples/demo-apps/android/LlamaDemo/app/libs"
### Alternative 1: Android Studio (Recommended)
-Open Android Studio and select “Open an existing Android Studio project” to open examples/demo-apps/android/LlamaDemo.
+Open Android Studio and select "Open an existing Android Studio project" to open examples/demo-apps/android/LlamaDemo.
Run the app (^R). This builds and launches the app on the phone.
### Alternative 2: Command line
@@ -238,4 +238,4 @@ If the app successfully run on your device, you should see something like below:
## Reporting Issues
-If you encountered any bugs or issues following this tutorial please file a bug/issue here on Github.
+If you encountered any bugs or issues following this tutorial please file a bug/issue here on Github.
\ No newline at end of file
diff --git a/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md b/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md
index de99387f82d..c60bd537e6b 100644
--- a/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md
+++ b/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md
@@ -55,7 +55,7 @@ In this demo app, we support text-only inference with up-to-date Llama models an
Meta has released prequantized INT4 SpinQuant Llama 3.2 models that ExecuTorch supports on the XNNPACK backend.
* Export Llama model and generate .pte file as below:
```
-python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --max_context_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte"
+python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="fp32" backend.xnnpack.extended_ops=True base.preq_mode="8da4w_output_8da8w" base.preq_group_size=32 export.max_seq_length=2048 export.max_context_length=2048 base.preq_embedding_quantize="8,0" quantization.use_spin_quant="native" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="llama3_2_spinquant.pte"
```
For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_SpinQuant_INT4_EO8.ipynb).
@@ -63,7 +63,7 @@ For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co
Meta has released prequantized INT4 QAT+LoRA Llama 3.2 models that ExecuTorch supports on the XNNPACK backend.
* Export Llama model and generate .pte file as below:
```
-python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint --params -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --max_context_length 2048--preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte"
+python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= quantization.use_qat=True base.use_lora=16 model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="fp32" backend.xnnpack.extended_ops=True base.preq_mode="8da4w_output_8da8w" base.preq_group_size=32 export.max_seq_length=2048 export.max_context_length=2048 base.preq_embedding_quantize="8,0" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="llama3_2_qat_lora.pte"
```
For convenience, an [exported ExecuTorch QAT+LoRA model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-QLORA_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_QLORA_INT4_EO8.ipynb).
@@ -74,7 +74,7 @@ We have supported BF16 as a data type on the XNNPACK backend for Llama 3.2 1B/3B
* Export Llama model and generate .pte file as below:
```
-python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2_bf16.pte"
+python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="bf16" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="llama3_2_bf16.pte"
```
For convenience, an [exported ExecuTorch bf16 model](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/llama3_2-1B.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/ExportRecipe_1B.ipynb).
@@ -90,7 +90,7 @@ To safeguard your application, you can use our Llama Guard models for prompt cla
* We prepared this model using the following command
```
-python -m examples.models.llama.export_llama --checkpoint --params -d fp32 -kv --use_sdpa_with_kv_cache --quantization_mode 8da4w --group_size 256 --xnnpack --max_seq_length 8193 --max_context_length 8193 --embedding-quantize 4,32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_prune_map --output_name="llama_guard_3_1b_pruned_xnnpack.pte"
+python -m extension.llm.export.export_llm base.checkpoint= base.params= model.dtype_override="fp32" model.use_kv_cache=True model.use_sdpa_with_kv_cache=True quantization.qmode="8da4w" quantization.group_size=256 backend.xnnpack.enabled=True export.max_seq_length=8193 export.max_context_length=8193 quantization.embedding_quantize="4,32" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' base.output_prune_map= export.output_name="llama_guard_3_1b_pruned_xnnpack.pte"
```
@@ -100,7 +100,7 @@
-python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama.pte"
+python -m extension.llm.export.export_llm base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True quantization.qmode="8da4w" quantization.group_size=128 model.dtype_override="fp32" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="llama.pte"
```
You may wonder what the ‘--metadata’ flag is doing. This flag helps export the model with proper special tokens added that the runner can detect EOS tokens easily.
diff --git a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md
index 47352607bca..d6bccc0ef47 100644
--- a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md
+++ b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md
@@ -49,7 +49,7 @@ Install the required packages to export the model
Export the model
```
-python -m examples.models.llama.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" --params "${MODEL_DIR}/params.json" -kv --use_sdpa_with_kv_cache --mps -d fp32 --disable_dynamic_shape -qmode 8da4w -G 32
+python -m extension.llm.export.export_llm base.checkpoint="${MODEL_DIR}/consolidated.00.pth" base.params="${MODEL_DIR}/params.json" model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.mps.enabled=True model.dtype_override="fp32" model.enable_dynamic_shape=False quantization.qmode="8da4w" quantization.group_size=32
```
## Pushing Model and Tokenizer
diff --git a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md
index bb33b50f8b7..d64a119e35f 100644
--- a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md
+++ b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md
@@ -51,7 +51,7 @@ In this demo app, we support text-only inference with up-to-date Llama models an
Meta has released prequantized INT4 SpinQuant Llama 3.2 models that ExecuTorch supports on the XNNPACK backend.
* Export Llama model and generate .pte file as below:
```
-python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --max_context_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte"
+python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="fp32" backend.xnnpack.extended_ops=True base.preq_mode="8da4w_output_8da8w" base.preq_group_size=32 export.max_seq_length=2048 export.max_context_length=2048 base.preq_embedding_quantize="8,0" quantization.use_spin_quant="native" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="llama3_2_spinquant.pte"
```
For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_SpinQuant_INT4_EO8.ipynb).
@@ -59,7 +59,7 @@ For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co
Meta has released prequantized INT4 QAT+LoRA Llama 3.2 models that ExecuTorch supports on the XNNPACK backend.
* Export Llama model and generate .pte file as below:
```
-python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint --params -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --max_context_length 2048 --preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte"
+python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= quantization.use_qat=True base.use_lora=16 model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="fp32" backend.xnnpack.extended_ops=True base.preq_mode="8da4w_output_8da8w" base.preq_group_size=32 export.max_seq_length=2048 export.max_context_length=2048 base.preq_embedding_quantize="8,0" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="llama3_2_qat_lora.pte"
```
For convenience, an [exported ExecuTorch QAT+LoRA model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-QLORA_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_QLORA_INT4_EO8.ipynb).
@@ -69,7 +69,7 @@ We have supported BF16 as a data type on the XNNPACK backend for Llama 3.2 1B/3B
* Export Llama model and generate .pte file as below:
```
-python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2_bf16.pte"
+python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="bf16" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="llama3_2_bf16.pte"
```
For convenience, an [exported ExecuTorch bf16 model](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/llama3_2-1B.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/ExportRecipe_1B.ipynb).
@@ -79,7 +79,7 @@ For more detail using Llama 3.2 lightweight models including prompt template, pl
Export the model
```
-python -m examples.models.llama.export_llama --checkpoint -p -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --embedding-quantize 4,32 --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte"
+python -m extension.llm.export.export_llm base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True quantization.qmode="8da4w" quantization.group_size=128 model.dtype_override="fp32" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' quantization.embedding_quantize="4,32" export.output_name="llama3_kv_sdpa_xnn_qe_4_32.pte"
```
### For LLaVA model
diff --git a/examples/models/deepseek-r1-distill-llama-8B/README.md b/examples/models/deepseek-r1-distill-llama-8B/README.md
index 5fd47ad61ec..7695c678337 100644
--- a/examples/models/deepseek-r1-distill-llama-8B/README.md
+++ b/examples/models/deepseek-r1-distill-llama-8B/README.md
@@ -52,18 +52,18 @@ torch.save(sd, "/tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/checkpoint.pth")
5. Generate a PTE file for use with the Llama runner.
```
-python -m examples.models.llama.export_llama \
- --checkpoint /tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/checkpoint.pth \
- -p params.json \
- -kv \
- --use_sdpa_with_kv_cache \
- -X \
- -qmode 8da4w \
- --group_size 128 \
- -d fp16 \
- --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
- --embedding-quantize 4,32 \
- --output_name="DeepSeek-R1-Distill-Llama-8B.pte"
+python -m extension.llm.export.export_llm \
+ base.checkpoint=/tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/checkpoint.pth \
+ base.params=params.json \
+ model.use_kv_cache=True \
+ model.use_sdpa_with_kv_cache=True \
+ backend.xnnpack.enabled=True \
+ quantization.qmode="8da4w" \
+ quantization.group_size=128 \
+ model.dtype_override="fp16" \
+ base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+ quantization.embedding_quantize="4,32" \
+ export.output_name="DeepSeek-R1-Distill-Llama-8B.pte"
```
6. Run the model on your desktop for validation or integrate with iOS/Android apps. Instructions for these are available in the Llama [README](../llama/README.md) starting at Step 3.
diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md
index c6f0350fff7..23a377a6611 100644
--- a/examples/models/llama/README.md
+++ b/examples/models/llama/README.md
@@ -167,15 +167,15 @@ Llama 3 8B performance was measured on the Samsung Galaxy S22, S24, and OnePlus
LLAMA_CHECKPOINT=path/to/consolidated.00.pth
LLAMA_PARAMS=path/to/params.json
-python -m examples.models.llama.export_llama \
- --model "llama3_2" \
- --checkpoint "${LLAMA_CHECKPOINT:?}" \
- --params "${LLAMA_PARAMS:?}" \
- -kv \
- --use_sdpa_with_kv_cache \
- -d bf16 \
- --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
- --output_name="llama3_2.pte"
+python -m extension.llm.export.export_llm \
+ base.model_class="llama3_2" \
+ base.checkpoint="${LLAMA_CHECKPOINT:?}" \
+ base.params="${LLAMA_PARAMS:?}" \
+ model.use_kv_cache=True \
+ model.use_sdpa_with_kv_cache=True \
+ model.dtype_override="bf16" \
+ base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+ export.output_name="llama3_2.pte"
```
For convenience, an [exported ExecuTorch bf16 model](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/llama3_2-1B.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/ExportRecipe_1B.ipynb).
@@ -189,23 +189,23 @@ For convenience, an [exported ExecuTorch bf16 model](https://huggingface.co/exec
LLAMA_QUANTIZED_CHECKPOINT=path/to/spinquant/consolidated.00.pth.pth
LLAMA_PARAMS=path/to/spinquant/params.json
-python -m examples.models.llama.export_llama \
- --model "llama3_2" \
- --checkpoint "${LLAMA_QUANTIZED_CHECKPOINT:?}" \
- --params "${LLAMA_PARAMS:?}" \
- --use_sdpa_with_kv_cache \
- -X \
- --xnnpack-extended-ops \
- --preq_mode 8da4w_output_8da8w \
- --preq_group_size 32 \
- --max_seq_length 2048 \
- --max_context_length 2048 \
- --output_name "llama3_2.pte" \
- -kv \
- -d fp32 \
- --preq_embedding_quantize 8,0 \
- --use_spin_quant native \
- --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+python -m extension.llm.export.export_llm \
+ base.model_class="llama3_2" \
+ base.checkpoint="${LLAMA_QUANTIZED_CHECKPOINT:?}" \
+ base.params="${LLAMA_PARAMS:?}" \
+ model.use_sdpa_with_kv_cache=True \
+ backend.xnnpack.enabled=True \
+ backend.xnnpack.extended_ops=True \
+ base.preq_mode="8da4w_output_8da8w" \
+ base.preq_group_size=32 \
+ export.max_seq_length=2048 \
+ export.max_context_length=2048 \
+ export.output_name="llama3_2.pte" \
+ model.use_kv_cache=True \
+ model.dtype_override="fp32" \
+ base.preq_embedding_quantize="8,0" \
+ quantization.use_spin_quant="native" \
+ base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
```
For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_SpinQuant_INT4_EO8.ipynb).
@@ -218,24 +218,24 @@ For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co
LLAMA_QUANTIZED_CHECKPOINT=path/to/qlora/consolidated.00.pth.pth
LLAMA_PARAMS=path/to/qlora/params.json
-python -m examples.models.llama.export_llama \
- --model "llama3_2" \
- --checkpoint "${LLAMA_QUANTIZED_CHECKPOINT:?}" \
- --params "${LLAMA_PARAMS:?}" \
- -qat \
- -lora 16 \
- --preq_mode 8da4w_output_8da8w \
- --preq_group_size 32 \
- --preq_embedding_quantize 8,0 \
- --use_sdpa_with_kv_cache \
- -kv \
- -X \
- --xnnpack-extended-ops \
- -d fp32 \
- --max_seq_length 2048 \
- --max_context_length 2048 \
- --output_name "llama3_2.pte" \
- --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+python -m extension.llm.export.export_llm \
+ base.model_class="llama3_2" \
+ base.checkpoint="${LLAMA_QUANTIZED_CHECKPOINT:?}" \
+ base.params="${LLAMA_PARAMS:?}" \
+ quantization.use_qat=True \
+ base.use_lora=16 \
+ base.preq_mode="8da4w_output_8da8w" \
+ base.preq_group_size=32 \
+ base.preq_embedding_quantize="8,0" \
+ model.use_sdpa_with_kv_cache=True \
+ model.use_kv_cache=True \
+ backend.xnnpack.enabled=True \
+ backend.xnnpack.extended_ops=True \
+ model.dtype_override="fp32" \
+ export.max_seq_length=2048 \
+ export.max_context_length=2048 \
+ export.output_name="llama3_2.pte" \
+ base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
```
For convenience, an [exported ExecuTorch QAT+LoRA model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-QLORA_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_QLORA_INT4_EO8.ipynb).
@@ -247,20 +247,20 @@ You can export and run the original Llama 3 8B instruct model.
2. Export model and generate `.pte` file
```
- python -m examples.models.llama.export_llama \
- --checkpoint \
- -p \
- -kv \
- --use_sdpa_with_kv_cache \
- -X \
- -qmode 8da4w \
- --group_size 128 \
- -d fp32 \
- --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
- --embedding-quantize 4,32 \
- --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte"
+ python -m extension.llm.export.export_llm \
+ base.checkpoint= \
+ base.params= \
+ model.use_kv_cache=True \
+ model.use_sdpa_with_kv_cache=True \
+ backend.xnnpack.enabled=True \
+ quantization.qmode="8da4w" \
+ quantization.group_size=128 \
+ model.dtype_override="fp32" \
+ base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+ quantization.embedding_quantize="4,32" \
+ export.output_name="llama3_kv_sdpa_xnn_qe_4_32.pte"
```
- Due to the larger vocabulary size of Llama 3, we recommend quantizing the embeddings with `--embedding-quantize 4,32` as shown above to further reduce the model size.
+ Due to the larger vocabulary size of Llama 3, we recommend quantizing the embeddings with `quantization.embedding_quantize="4,32"` as shown above to further reduce the model size.
If you're interested in deploying on non-CPU backends, [please refer to the non-cpu-backend section](non_cpu_backends.md).
@@ -389,22 +389,22 @@ QLINEAR_GROUP_SIZE=128 # Must be multiple of 16
QEMBEDDING_BITWIDTH=4 # Can be 1-8
QEMBEDDING_GROUP_SIZE=32 # Must be multiple of 16
-python -m examples.models.llama.export_llama \
- --model "llama3_2" \
- --checkpoint "${LLAMA_CHECKPOINT:?}" \
- --params "${LLAMA_PARAMS:?}" \
- -kv \
- --use_sdpa_with_kv_cache \
- --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
- --output_name="llama3_2.pte" \
- -qmode "torchao:8da${QLINEAR_BITWIDTH}w" \
- --group_size ${QLINEAR_GROUP_SIZE} \
- -E "torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \
- -d fp32
+python -m extension.llm.export.export_llm \
+ base.model_class="llama3_2" \
+ base.checkpoint="${LLAMA_CHECKPOINT:?}" \
+ base.params="${LLAMA_PARAMS:?}" \
+ model.use_kv_cache=True \
+ model.use_sdpa_with_kv_cache=True \
+ base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+ export.output_name="llama3_2.pte" \
+ quantization.qmode="torchao:8da${QLINEAR_BITWIDTH}w" \
+ quantization.group_size=${QLINEAR_GROUP_SIZE} \
+ quantization.embedding_quantize="torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \
+ model.dtype_override="fp32"
```
A few notes:
-- If your model shares embedding/unembedding weights (like Llama1B and Llama3B do), you can add `--use_shared_embedding` to take advantage of this and reduce memory. When this option is enabled, you can specify whether embeddings are quantized asymmetrically or not by specifying a third argument. For example, `-E "torchao:4,32,true"` means that the embedding is quantized to 4-bits with group_size=32 and is asymmetric (this is the default behavior if you simply use `-E "torchao:4,32"`), whereas `-E "torchao:4,32,false"` means that the embedding is quantized to 4-bits with group_size=32 and is symmetric. If `--use_shared_embedding` is specified, the unembedding (i.e., the final linear layer) is quantized in the same way, but also uses 8-bit dynamically quantized activations.
+- If your model shares embedding/unembedding weights (like Llama1B and Llama3B do), you can add `model.use_shared_embedding=True` to take advantage of this and reduce memory. When this option is enabled, you can specify whether embeddings are quantized asymmetrically or not by specifying a third argument. For example, `quantization.embedding_quantize="torchao:4,32,true"` means that the embedding is quantized to 4-bits with group_size=32 and is asymmetric (this is the default behavior if you simply use `quantization.embedding_quantize="torchao:4,32"`), whereas `quantization.embedding_quantize="torchao:4,32,false"` means that the embedding is quantized to 4-bits with group_size=32 and is symmetric. If `model.use_shared_embedding=True` is specified, the unembedding (i.e., the final linear layer) is quantized in the same way, but also uses 8-bit dynamically quantized activations.
- To do channelwise quantization, set group_size to 0. This works for both linear and embedding layers.
Once the model is exported, we need to build ExecuTorch and the runner with the low-bit kernels.
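Editorial aside, not part of the patch: the `quantization.embedding_quantize` values above follow a small `torchao:<bitwidth>,<group_size>[,<asymmetric>]` string format. The sketch below unpacks such a string to make the optional third field concrete; `parse_torchao_embedding_spec` is a hypothetical helper written only for illustration, not an ExecuTorch API.
```python
# Illustration of the "torchao:bits,group_size[,asymmetric]" spec described above.
# This helper is hypothetical; ExecuTorch parses the string internally.
from typing import Tuple


def parse_torchao_embedding_spec(spec: str) -> Tuple[int, int, bool]:
    """Return (bitwidth, group_size, asymmetric) from e.g. 'torchao:4,32,true'."""
    assert spec.startswith("torchao:"), "expected a torchao-prefixed spec"
    fields = spec[len("torchao:"):].split(",")
    bitwidth, group_size = int(fields[0]), int(fields[1])
    # Per the notes above, asymmetric quantization is the default when the
    # third field is omitted.
    asymmetric = fields[2].lower() == "true" if len(fields) > 2 else True
    return bitwidth, group_size, asymmetric


print(parse_torchao_embedding_spec("torchao:4,32"))        # (4, 32, True)
print(parse_torchao_embedding_spec("torchao:4,32,false"))  # (4, 32, False)
```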
diff --git a/examples/models/llama/UTILS.md b/examples/models/llama/UTILS.md
index 5f760ad7670..25bd7f77080 100644
--- a/examples/models/llama/UTILS.md
+++ b/examples/models/llama/UTILS.md
@@ -19,7 +19,7 @@ From `executorch` root:
```
3. Export model and generate `.pte` file.
```
- python -m examples.models.llama.export_llama -c stories110M.pt -p params.json -X -kv
+ python -m extension.llm.export.export_llm base.checkpoint=stories110M.pt base.params=params.json backend.xnnpack.enabled=True model.use_kv_cache=True
```
## Smaller model delegated to other backends
@@ -27,15 +27,15 @@ From `executorch` root:
Currently we support lowering the stories model to other backends, including CoreML, MPS, and QNN. Please refer to the instructions
for each backend ([CoreML](https://pytorch.org/executorch/main/backends-coreml), [MPS](https://pytorch.org/executorch/main/backends-mps), [QNN](https://pytorch.org/executorch/main/backends-qualcomm)) before trying to lower them. After the backend library is installed, the script to export a lowered model is:
-- Lower to CoreML: `python -m examples.models.llama.export_llama -kv --disable_dynamic_shape --coreml -c stories110M.pt -p params.json `
-- MPS: `python -m examples.models.llama.export_llama -kv --disable_dynamic_shape --mps -c stories110M.pt -p params.json `
-- QNN: `python -m examples.models.llama.export_llama -kv --disable_dynamic_shape --qnn -c stories110M.pt -p params.json `
+- Lower to CoreML: `python -m extension.llm.export.export_llm model.use_kv_cache=True model.enable_dynamic_shape=False backend.coreml.enabled=True base.checkpoint=stories110M.pt base.params=params.json`
+- MPS: `python -m extension.llm.export.export_llm model.use_kv_cache=True model.enable_dynamic_shape=False backend.mps.enabled=True base.checkpoint=stories110M.pt base.params=params.json`
+- QNN: `python -m extension.llm.export.export_llm model.use_kv_cache=True model.enable_dynamic_shape=False backend.qnn.enabled=True base.checkpoint=stories110M.pt base.params=params.json`
The iOS LLAMA app supports the CoreML and MPS models, and the Android LLAMA app supports the QNN model. On Android, you can also cross compile the llama runner binary, push it to the device, and run it.
For CoreML, there are 2 additional optional arguments:
-* `--coreml-ios`: Specify the minimum iOS version to deploy (and turn on available optimizations). E.g. `--coreml-ios 18` will turn on [in-place KV cache](https://developer.apple.com/documentation/coreml/mlstate?language=objc) and [fused scaled dot product attention kernel](https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS18.transformers.scaled_dot_product_attention) (the resulting model will then need at least iOS 18 to run, though)
-* `--coreml-quantize`: Use [quantization tailored for CoreML](https://apple.github.io/coremltools/docs-guides/source/opt-quantization-overview.html). E.g. `--coreml-quantize b4w` will perform per-block 4-bit weight-only quantization in a way tailored for CoreML
+* `backend.coreml.ios`: Specify the minimum iOS version to deploy (and turn on available optimizations). E.g. `backend.coreml.ios=18` will turn on [in-place KV cache](https://developer.apple.com/documentation/coreml/mlstate?language=objc) and [fused scaled dot product attention kernel](https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS18.transformers.scaled_dot_product_attention) (the resulting model will then need at least iOS 18 to run, though)
+* `backend.coreml.quantize`: Use [quantization tailored for CoreML](https://apple.github.io/coremltools/docs-guides/source/opt-quantization-overview.html). E.g. `backend.coreml.quantize="b4w"` will perform per-block 4-bit weight-only quantization in a way tailored for CoreML
To deploy the large 8B model on the above backends, [please visit this section](non_cpu_backends.md).
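Editorial aside, not part of the patch: besides the hydra CLI shown above, `export_llama()` also accepts an `LlmConfig` instance directly, so the same CoreML options can be set programmatically. This is a minimal sketch; the checkpoint and params paths are placeholders, and the option values simply mirror the CoreML example and optional arguments above rather than recommended settings.
```python
# Illustrative sketch only; paths are placeholders.
from executorch.examples.models.llama.config.llm_config import CoreMLQuantize, LlmConfig
from executorch.examples.models.llama.export_llama_lib import export_llama

cfg = LlmConfig()
cfg.base.checkpoint = "stories110M.pt"   # placeholder checkpoint path
cfg.base.params = "params.json"          # placeholder params path
cfg.model.use_kv_cache = True
cfg.model.enable_dynamic_shape = False
cfg.backend.coreml.enabled = True
cfg.backend.coreml.ios = 18                       # opt in to iOS 18-only optimizations
cfg.backend.coreml.quantize = CoreMLQuantize.b4w  # per-block 4-bit weight-only quantization

export_llama(cfg)
```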
diff --git a/examples/models/llama2/README.md b/examples/models/llama2/README.md
index 615ad3948fc..21f761b7f71 100644
--- a/examples/models/llama2/README.md
+++ b/examples/models/llama2/README.md
@@ -37,7 +37,7 @@ You can export and run the original Llama 2 7B model.
3. Export model and generate `.pte` file:
```
- python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32
+ python -m extension.llm.export.export_llm base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True quantization.qmode="8da4w" quantization.group_size=128 model.dtype_override="fp32"
```
4. Create tokenizer.bin.
```
diff --git a/examples/models/phi_4_mini/README.md b/examples/models/phi_4_mini/README.md
index a23e4f49638..c2b3d515ec0 100644
--- a/examples/models/phi_4_mini/README.md
+++ b/examples/models/phi_4_mini/README.md
@@ -7,9 +7,9 @@ Phi-4-mini uses the same example code as Llama, while the checkpoint, model para
All commands for exporting and running Llama on various backends should also be applicable to Phi-4-mini, by swapping the following args:
```
---model phi_4_mini
---params examples/models/phi-4-mini/config.json
---checkpoint
+base.model_class="phi_4_mini"
+base.params="examples/models/phi-4-mini/config.json"
+base.checkpoint=
```
### Generate the Checkpoint
@@ -32,17 +32,17 @@ Export to XNNPack, no quantization:
# Set these paths to point to the downloaded files
PHI_CHECKPOINT=path/to/checkpoint.pth
-python -m examples.models.llama.export_llama \
- --model phi_4_mini \
- --checkpoint "${PHI_CHECKPOINT=path/to/checkpoint.pth:?}" \
- --params examples/models/phi-4-mini/config.json \
- -kv \
- --use_sdpa_with_kv_cache \
- -d fp32 \
- -X \
- --metadata '{"get_bos_id":151643, "get_eos_ids":[151643]}' \
- --output_name="phi-4-mini.pte"
- --verbose
+python -m extension.llm.export.export_llm \
+ base.model_class="phi_4_mini" \
+  base.checkpoint="${PHI_CHECKPOINT:?}" \
+ base.params="examples/models/phi-4-mini/config.json" \
+ model.use_kv_cache=True \
+ model.use_sdpa_with_kv_cache=True \
+ model.dtype_override="fp32" \
+ backend.xnnpack.enabled=True \
+ base.metadata='{"get_bos_id":151643, "get_eos_ids":[151643]}' \
+ export.output_name="phi-4-mini.pte" \
+ debug.verbose=True
```
Run using the executor runner:
diff --git a/examples/models/qwen2_5/README.md b/examples/models/qwen2_5/README.md
index 9bf791a35ed..b40daaca469 100644
--- a/examples/models/qwen2_5/README.md
+++ b/examples/models/qwen2_5/README.md
@@ -7,9 +7,9 @@ Qwen 2.5 uses the same example code as Llama, while the checkpoint, model params
All commands for exporting and running Llama on various backends should also be applicable to Qwen 2.5, by swapping the following args:
```
---model qwen2_5
---params examples/models/qwen2_5/1_5b_config.json
---checkpoint
+base.model_class="qwen2_5"
+base.params="examples/models/qwen2_5/1_5b_config.json"
+base.checkpoint=
```
### Generate the Checkpoint
@@ -32,17 +32,17 @@ Export to XNNPack, no quantization:
# Set these paths to point to the downloaded files
QWEN_CHECKPOINT=path/to/checkpoint.pth
-python -m examples.models.llama.export_llama \
- --model "qwen2_5" \
- --checkpoint "${QWEN_CHECKPOINT:?}" \
- --params examples/models/qwen2_5/1_5b_config.json \
- -kv \
- --use_sdpa_with_kv_cache \
- -d fp32 \
- -X \
- --metadata '{"get_bos_id":151643, "get_eos_ids":[151643]}' \
- --output_name="qwen2_5-1_5b.pte"
- --verbose
+python -m extension.llm.export.export_llm \
+ base.model_class="qwen2_5" \
+ base.checkpoint="${QWEN_CHECKPOINT:?}" \
+ base.params="examples/models/qwen2_5/1_5b_config.json" \
+ model.use_kv_cache=True \
+ model.use_sdpa_with_kv_cache=True \
+ model.dtype_override="fp32" \
+ backend.xnnpack.enabled=True \
+ base.metadata='{"get_bos_id":151643, "get_eos_ids":[151643]}' \
+ export.output_name="qwen2_5-1_5b.pte" \
+ debug.verbose=True
```
Run using the executor runner:
diff --git a/examples/models/qwen3/README.md b/examples/models/qwen3/README.md
index a589d27c19d..acdd4497503 100644
--- a/examples/models/qwen3/README.md
+++ b/examples/models/qwen3/README.md
@@ -7,8 +7,8 @@ Qwen 3 uses the same example code as our optimized Llama model, while the checkp
All commands for exporting and running Llama on various backends should also be applicable to Qwen 3, by swapping the following args:
```
---model [qwen3-0.6b,qwen3-1_7b,qwen3-4b]
---params [examples/models/qwen3/0_6b_config.json,examples/models/qwen3/1_7b_config.json,examples/models/qwen3/4b_config.json]
+base.model_class=[qwen3-0_6b,qwen3-1_7b,qwen3-4b]
+base.params=[examples/models/qwen3/0_6b_config.json,examples/models/qwen3/1_7b_config.json,examples/models/qwen3/4b_config.json]
```
### Example export
@@ -16,50 +16,50 @@ Here is a basic example for exporting Qwen 3, although please refer to the Llama
Export 0.6b to XNNPack, quantized with 8da4w:
```
-python -m examples.models.llama.export_llama \
- --model qwen3-0_6b \
- --params examples/models/qwen3/0_6b_config.json \
- -kv \
- --use_sdpa_with_kv_cache \
- -d fp32 \
- -X \
- --xnnpack-extended-ops \
- -qmode 8da4w \
- --metadata '{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
- --output_name="qwen3-0_6b.pte" \
- --verbose
+python -m extension.llm.export.export_llm \
+ base.model_class="qwen3-0_6b" \
+ base.params="examples/models/qwen3/0_6b_config.json" \
+ model.use_kv_cache=True \
+ model.use_sdpa_with_kv_cache=True \
+ model.dtype_override="fp32" \
+ backend.xnnpack.enabled=True \
+ backend.xnnpack.extended_ops=True \
+ quantization.qmode="8da4w" \
+ base.metadata='{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
+ export.output_name="qwen3-0_6b.pte" \
+ debug.verbose=True
```
Export 1.7b to XNNPack, quantized with 8da4w:
```
-python -m examples.models.llama.export_llama \
- --model qwen3-1_7b \
- --params examples/models/qwen3/1_7b_config.json \
- -kv \
- --use_sdpa_with_kv_cache \
- -d fp32 \
- -X \
- --xnnpack-extended-ops \
- -qmode 8da4w \
- --metadata '{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
- --output_name="qwen3-1_7b.pte" \
- --verbose
+python -m extension.llm.export.export_llm \
+ base.model_class="qwen3-1_7b" \
+ base.params="examples/models/qwen3/1_7b_config.json" \
+ model.use_kv_cache=True \
+ model.use_sdpa_with_kv_cache=True \
+ model.dtype_override="fp32" \
+ backend.xnnpack.enabled=True \
+ backend.xnnpack.extended_ops=True \
+ quantization.qmode="8da4w" \
+ base.metadata='{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
+ export.output_name="qwen3-1_7b.pte" \
+ debug.verbose=True
```
Export 4b to XNNPack, quantized with 8da4w:
```
-python -m examples.models.llama.export_llama \
- --model qwen3-4b \
- --params examples/models/qwen3/4b_config.json \
- -kv \
- --use_sdpa_with_kv_cache \
- -d fp32 \
- -X \
- --xnnpack-extended-ops \
- -qmode 8da4w \
- --metadata '{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
- --output_name="qwen3-4b.pte" \
- --verbose
+python -m extension.llm.export.export_llm \
+ base.model_class="qwen3-4b" \
+ base.params="examples/models/qwen3/4b_config.json" \
+ model.use_kv_cache=True \
+ model.use_sdpa_with_kv_cache=True \
+ model.dtype_override="fp32" \
+ backend.xnnpack.enabled=True \
+ backend.xnnpack.extended_ops=True \
+ quantization.qmode="8da4w" \
+ base.metadata='{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
+ export.output_name="qwen3-4b.pte" \
+ debug.verbose=True
```
### Example run
From 49d56c40df5380fd6ad4b921535eebcad5883e96 Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Fri, 20 Jun 2025 14:18:45 -0700
Subject: [PATCH 4/9] Update
[ghstack-poisoned]
---
examples/models/llama/config/llm_config.py | 80 +++++++--------
examples/models/llama/export_llama_lib.py | 30 +++---
examples/models/llama/model.py | 14 +--
extension/llm/export/test/test_export_llm.py | 101 ++++++++++++++++++-
4 files changed, 162 insertions(+), 63 deletions(-)
diff --git a/examples/models/llama/config/llm_config.py b/examples/models/llama/config/llm_config.py
index 201e3a5414a..9acd633fb21 100644
--- a/examples/models/llama/config/llm_config.py
+++ b/examples/models/llama/config/llm_config.py
@@ -26,19 +26,19 @@
class ModelType(str, Enum):
- STORIES110M = "stories110m"
- LLAMA2 = "llama2"
- LLAMA3 = "llama3"
- LLAMA3_1 = "llama3_1"
- LLAMA3_2 = "llama3_2"
- LLAMA3_2_VISION = "llama3_2_vision"
- STATIC_LLAMA = "static_llama"
- QWEN2_5 = "qwen2_5"
- QWEN3_0_6B = "qwen3-0_6b"
- QWEN3_1_7B = "qwen3-1_7b"
- QWEN3_4B = "qwen3-4b"
- PHI_4_MINI = "phi_4_mini"
- SMOLLM2 = "smollm2"
+ stories110m = "stories110m"
+ llama2 = "llama2"
+ llama3 = "llama3"
+ llama3_1 = "llama3_1"
+ llama3_2 = "llama3_2"
+ llama3_2_vision = "llama3_2_vision"
+ static_llama = "static_llama"
+ qwen2_5 = "qwen2_5"
+ qwen3_0_6b = "qwen3-0_6b"
+ qwen3_1_7b = "qwen3-1_7b"
+ qwen3_4b = "qwen3-4b"
+ phi_4_mini = "phi_4_mini"
+ smollm2 = "smollm2"
class PreqMode(str, Enum):
@@ -49,8 +49,8 @@ class PreqMode(str, Enum):
are still around to preserve backward compatibility.
"""
- PREQ_8DA4W = "8da4w"
- PREQ_8DA4W_OUT_8DA8W = "8da4w_output_8da8w"
+ preq_8da4w = "8da4w"
+ preq_8da4w_out_8da8w = "8da4w_output_8da8w"
@dataclass
@@ -82,7 +82,7 @@ class BaseConfig:
are loaded.
"""
- model_class: ModelType = ModelType.LLAMA3
+ model_class: ModelType = ModelType.llama3
params: Optional[str] = None
checkpoint: Optional[str] = None
checkpoint_dir: Optional[str] = None
@@ -107,9 +107,9 @@ class DtypeOverride(str, Enum):
is not recommended.
"""
- FP32 = "fp32"
- FP16 = "fp16"
- BF16 = "bf16"
+ fp32 = "fp32"
+ fp16 = "fp16"
+ bf16 = "bf16"
@dataclass
@@ -147,7 +147,7 @@ class ModelConfig:
[16] pattern specifies all layers have a sliding window of 16.
"""
- dtype_override: DtypeOverride = DtypeOverride.FP32
+ dtype_override: DtypeOverride = DtypeOverride.fp32
enable_dynamic_shape: bool = True
use_shared_embedding: bool = False
use_sdpa_with_kv_cache: bool = False
@@ -270,22 +270,22 @@ class Pt2eQuantize(str, Enum):
and is source transform-based.
"""
- XNNPACK_DYNAMIC = "xnnpack_dynamic"
- XNNPACK_DYNAMIC_QC4 = "xnnpack_dynamic_qc4"
- QNN_8A8W = "qnn_8a8w"
- QNN_16A16W = "qnn_16a16w"
- QNN_16A4W = "qnn_16a4w"
- COREML_C4W = "coreml_c4w"
- COREML_8A_C8W = "coreml_8a_c8w"
- COREML_8A_C4W = "coreml_8a_c4w"
- COREML_BASELINE_8A_C8W = "coreml_baseline_8a_c8w"
- COREML_BASELINE_8A_C4W = "coreml_baseline_8a_c4w"
- VULKAN_8W = "vulkan_8w"
+ xnnpack_dynamic = "xnnpack_dynamic"
+ xnnpack_dynamic_qc4 = "xnnpack_dynamic_qc4"
+ qnn_8a8w = "qnn_8a8w"
+ qnn_16a16w = "qnn_16a16w"
+ qnn_16a4w = "qnn_16a4w"
+ coreml_c4w = "coreml_c4w"
+ coreml_8a_c8w = "coreml_8a_c8w"
+ coreml_8a_c4w = "coreml_8a_c4w"
+ coreml_baseline_8a_c8w = "coreml_baseline_8a_c8w"
+ coreml_baseline_8a_c4w = "coreml_baseline_8a_c4w"
+ vulkan_8w = "vulkan_8w"
class SpinQuant(str, Enum):
- CUDA = "cuda"
- NATIVE = "native"
+ cuda = "cuda"
+ native = "native"
@dataclass
@@ -378,15 +378,15 @@ class XNNPackConfig:
class CoreMLQuantize(str, Enum):
- B4W = "b4w"
- C4W = "c4w"
+ b4w = "b4w"
+ c4w = "c4w"
class CoreMLComputeUnit(str, Enum):
- CPU_ONLY = "cpu_only"
- CPU_AND_GPU = "cpu_and_gpu"
- CPU_AND_NE = "cpu_and_ne"
- ALL = "all"
+ cpu_only = "cpu_only"
+ cpu_and_gpu = "cpu_and_gpu"
+ cpu_and_ne = "cpu_and_ne"
+ all = "all"
@dataclass
@@ -400,7 +400,7 @@ class CoreMLConfig:
preserve_sdpa: bool = False
quantize: Optional[CoreMLQuantize] = None
ios: int = 15
- compute_units: CoreMLComputeUnit = CoreMLComputeUnit.CPU_ONLY
+ compute_units: CoreMLComputeUnit = CoreMLComputeUnit.cpu_only
def __post_init__(self):
if self.ios not in (15, 16, 17, 18):
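Editorial aside, not part of the patch: the enum members above are lowercased so that the plain strings users write in YAML or on the hydra CLI resolve to them. Below is a self-contained sketch of the mechanism this relies on, using stand-in types rather than the real config classes, and assuming OmegaConf's usual behavior of resolving strings assigned to Enum-typed fields by member name.
```python
# Stand-in types for illustration; not the real ModelType/BaseConfig.
from dataclasses import dataclass
from enum import Enum

from omegaconf import OmegaConf


class DemoModelType(str, Enum):
    llama3 = "llama3"
    qwen3_0_6b = "qwen3-0_6b"  # member name differs from its value


@dataclass
class DemoBase:
    model_class: DemoModelType = DemoModelType.llama3


cfg = OmegaConf.structured(DemoBase)
cfg.model_class = "qwen3_0_6b"  # resolved by member name, as on the CLI
assert cfg.model_class is DemoModelType.qwen3_0_6b
assert cfg.model_class.value == "qwen3-0_6b"
```
Hydra presumably applies the same OmegaConf conversion when it populates the structured `LlmConfig`, which is why CLI values such as `base.model_class="llama3_2"` keep resolving after the rename.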
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 78c6244abee..6a706e0fa05 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -590,7 +590,7 @@ def export_llama(
# If a checkpoint isn't provided for an HF OSS model, download and convert the
# weights first.
- model_name = llm_config.base.model_class
+ model_name = llm_config.base.model_class.value
if not llm_config.base.checkpoint and model_name in HUGGING_FACE_REPO_IDS:
repo_id = HUGGING_FACE_REPO_IDS[model_name]
if model_name == "qwen2_5":
@@ -668,7 +668,7 @@ def _prepare_for_llama_export(llm_config: LlmConfig) -> LLMEdgeManager:
llm_config.export.output_dir = output_dir_path
# Convert dtype override string to actual type.
- dtype_override = DType[llm_config.model.dtype_override]
+ dtype_override = DType[llm_config.model.dtype_override.value]
edge_manager = _load_llama_model(llm_config)
@@ -702,7 +702,7 @@ def _prepare_for_llama_export(llm_config: LlmConfig) -> LLMEdgeManager:
checkpoint=llm_config.base.checkpoint,
checkpoint_dtype=DType.from_torch_dtype(checkpoint_dtype), # type: ignore
tokenizer_path=llm_config.base.tokenizer_path,
- use_spin_quant=llm_config.quantization.use_spin_quant,
+ use_spin_quant=llm_config.quantization.use_spin_quant.value if llm_config.quantization.use_spin_quant else None,
embedding_quantize=llm_config.quantization.embedding_quantize,
use_shared_embedding=llm_config.model.use_shared_embedding,
quantization_mode=llm_config.quantization.qmode,
@@ -726,7 +726,7 @@ def _prepare_for_llama_export(llm_config: LlmConfig) -> LLMEdgeManager:
vulkan=llm_config.backend.vulkan.enabled,
use_qat=llm_config.quantization.use_qat,
use_lora=llm_config.base.use_lora,
- preq_mode=llm_config.base.preq_mode,
+ preq_mode=llm_config.base.preq_mode.value if llm_config.base.preq_mode else None,
preq_group_size=llm_config.base.preq_group_size,
preq_embedding_quantize=llm_config.base.preq_embedding_quantize,
local_global_attention=llm_config.model.local_global_attention,
@@ -738,25 +738,25 @@ def _prepare_for_llama_export(llm_config: LlmConfig) -> LLMEdgeManager:
def get_quantizer_and_quant_params(llm_config):
pt2e_quant_params = get_pt2e_quantization_params(
- llm_config.quantization.pt2e_quantize, llm_config.quantization.qmode
+ llm_config.quantization.pt2e_quantize.value if llm_config.quantization.pt2e_quantize else None, llm_config.quantization.qmode
)
quantizers = get_pt2e_quantizers(pt2e_quant_params, llm_config.export.so_library)
quant_dtype = None
if llm_config.backend.qnn.enabled and llm_config.quantization.pt2e_quantize:
assert len(quantizers) == 0, "Should not enable both xnnpack and qnn"
qnn_quantizer, quant_dtype = get_qnn_quantizer(
- llm_config.quantization.pt2e_quantize, llm_config.quantization.qmode
+ llm_config.quantization.pt2e_quantize.value, llm_config.quantization.qmode
)
quantizers.append(qnn_quantizer)
if llm_config.backend.coreml.enabled and llm_config.quantization.pt2e_quantize:
assert len(quantizers) == 0, "Should not enable both xnnpack / qnn and coreml"
- coreml_quantizer = get_coreml_quantizer(llm_config.quantization.pt2e_quantize)
+ coreml_quantizer = get_coreml_quantizer(llm_config.quantization.pt2e_quantize.value)
quantizers.append(coreml_quantizer)
if llm_config.backend.vulkan.enabled and llm_config.quantization.pt2e_quantize:
assert (
len(quantizers) == 0
), "Should not enable both vulkan and other quantizers"
- vulkan_quantizer = get_vulkan_quantizer(llm_config.quantization.pt2e_quantize)
+ vulkan_quantizer = get_vulkan_quantizer(llm_config.quantization.pt2e_quantize.value)
quantizers.append(vulkan_quantizer)
logging.info(f"Applying quantizers: {quantizers}")
return pt2e_quant_params, quantizers, quant_dtype
@@ -1033,7 +1033,7 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901
)
additional_passes = []
- if llm_config.base.model_class in TORCHTUNE_DEFINED_MODELS:
+ if llm_config.base.model_class.value in TORCHTUNE_DEFINED_MODELS:
additional_passes = [InitializedMutableBufferPass(["kv_cache_pos"])]
# export_to_edge
@@ -1072,14 +1072,14 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901
mps=llm_config.backend.mps.enabled,
coreml=llm_config.backend.coreml.enabled,
qnn=llm_config.backend.qnn.enabled,
- dtype_override=llm_config.model.dtype_override,
+ dtype_override=llm_config.model.dtype_override.value,
enable_dynamic_shape=llm_config.model.enable_dynamic_shape,
use_kv_cache=llm_config.model.use_kv_cache,
embedding_quantize=llm_config.quantization.embedding_quantize,
- pt2e_quantize=llm_config.quantization.pt2e_quantize,
+ pt2e_quantize=llm_config.quantization.pt2e_quantize.value if llm_config.quantization.pt2e_quantize else None,
coreml_ios=llm_config.backend.coreml.ios,
- coreml_quantize=llm_config.backend.coreml.quantize,
- coreml_compute_units=llm_config.backend.coreml.compute_units,
+ coreml_quantize=llm_config.backend.coreml.quantize.value if llm_config.backend.coreml.quantize else None,
+ coreml_compute_units=llm_config.backend.coreml.compute_units.value,
use_qnn_sha=llm_config.backend.qnn.use_sha,
num_sharding=llm_config.backend.qnn.num_sharding,
soc_model=llm_config.backend.qnn.soc_model,
@@ -1152,7 +1152,7 @@ def _load_llama_model(llm_config: LlmConfig) -> "LLMEdgeManager":
An instance of LLMEdgeManager which contains the eager mode model.
"""
- modelname = llm_config.base.model_class
+ modelname = llm_config.base.model_class.value
if modelname in EXECUTORCH_DEFINED_MODELS:
module_name = "llama"
model_class_name = "Llama2Model" # TODO: Change to "LlamaModel" in examples/models/llama/model.py.
@@ -1173,7 +1173,7 @@ def _load_llama_model(llm_config: LlmConfig) -> "LLMEdgeManager":
)
)
# Convert dtype override string to actual type.
- dtype_override = DType[llm_config.model.dtype_override]
+ dtype_override = DType[llm_config.model.dtype_override.value]
return LLMEdgeManager(
model=model,
diff --git a/examples/models/llama/model.py b/examples/models/llama/model.py
index ec9646be6f4..efea80dde2f 100644
--- a/examples/models/llama/model.py
+++ b/examples/models/llama/model.py
@@ -157,7 +157,7 @@ def __init__(self, llm_config: Optional[LlmConfig] = None):
if model_args.use_scaled_rope:
# Older models don't have use_scaled_rope configuration
- model_name = str(self.llm_config.base.model_class)
+ model_name = self.llm_config.base.model_class.value
assert model_name not in ["llama2", "stories110m"]
# Llama3_2 and newer models in ExecuTorch repo should set larger scale factor
@@ -328,10 +328,10 @@ def get_example_inputs_kvcache_sdpa(self):
def _transform_for_pre_quantization(self, checkpoint, model_args):
assert self.llm_config.base.preq_mode, "preq_mode must be specified"
- assert self.llm_config.base.preq_mode in [
+ assert self.llm_config.base.preq_mode.value in [
"8da4w",
"8da4w_output_8da8w",
- ], f"Quantization mode {self.llm_config.base.preq_mode} is not compatible with SpinQuant."
+ ], f"Quantization mode {self.llm_config.base.preq_mode.value} is not compatible with SpinQuant."
assert self.llm_config.base.preq_group_size, "preq_group_size must be specified"
assert self.llm_config.model.dtype_override, "dtype_override must be specified"
@@ -351,7 +351,7 @@ def _transform_for_pre_quantization(self, checkpoint, model_args):
}
# Transform the output layer first if needed.
- if self.llm_config.base.preq_mode == "8da4w_output_8da8w":
+ if self.llm_config.base.preq_mode.value == "8da4w_output_8da8w":
from .source_transformation.pre_quantization import (
transform_output_linear_for_pre_quantization,
)
@@ -359,14 +359,14 @@ def _transform_for_pre_quantization(self, checkpoint, model_args):
self.model_ = transform_output_linear_for_pre_quantization(
module=self.model_,
checkpoint=checkpoint,
- dtype=mapping[self.llm_config.model.dtype_override],
+ dtype=mapping[self.llm_config.model.dtype_override.value],
)
self.model_ = transform_linear_for_pre_quantization(
self.model_,
checkpoint,
self.llm_config.base.preq_group_size,
- mapping[self.llm_config.model.dtype_override],
+ mapping[self.llm_config.model.dtype_override.value],
)
embedding_bit_width, embedding_group_size = None, None
@@ -390,7 +390,7 @@ def _transform_for_pre_quantization(self, checkpoint, model_args):
self.model_ = transform_embedding_for_pre_quantization(
self.model_,
checkpoint,
- mapping[self.llm_config.model.dtype_override],
+ mapping[self.llm_config.model.dtype_override.value],
int(embedding_bit_width),
embedding_group_size,
)
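Editorial aside, not part of the patch: with the structured config now holding Enum members rather than raw strings, the call sites above unwrap them explicitly with `.value`. A tiny stand-in sketch of the distinction:
```python
# DemoDtype is a stand-in, not the real DtypeOverride.
from enum import Enum


class DemoDtype(str, Enum):
    fp32 = "fp32"
    bf16 = "bf16"


dtype = DemoDtype.bf16
assert dtype.value == "bf16"  # the explicit plain-string form passed downstream
assert dtype == "bf16"        # str-backed members also compare equal to their value
```
Because these enums mix in `str`, equality comparisons would often work without `.value`; unwrapping explicitly keeps the strings handed to downstream helpers unambiguous.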
diff --git a/extension/llm/export/test/test_export_llm.py b/extension/llm/export/test/test_export_llm.py
index 970a32c9606..d36baa6c62c 100644
--- a/extension/llm/export/test/test_export_llm.py
+++ b/extension/llm/export/test/test_export_llm.py
@@ -10,7 +10,16 @@
import unittest
from unittest.mock import MagicMock, patch
-from executorch.examples.models.llama.config.llm_config import LlmConfig
+from executorch.examples.models.llama.config.llm_config import (
+ LlmConfig,
+ ModelType,
+ PreqMode,
+ DtypeOverride,
+ Pt2eQuantize,
+ SpinQuant,
+ CoreMLQuantize,
+ CoreMLComputeUnit
+)
from executorch.extension.llm.export.export_llm import main, parse_config_arg, pop_config_arg
@@ -106,6 +115,96 @@ def test_config_rejects_multiple_cli_args(self) -> None:
finally:
os.unlink(config_file)
+ def test_enum_fields(self) -> None:
+ """Test that all enum fields work correctly with their lowercase keys."""
+ # Test ModelType enum
+ for enum_value in ModelType:
+ self.assertIsNotNone(enum_value.value)
+ self.assertTrue(isinstance(enum_value.value, str))
+
+ # Test specific enum values that were changed from uppercase to lowercase
+ self.assertEqual(ModelType.stories110m.value, "stories110m")
+ self.assertEqual(ModelType.llama2.value, "llama2")
+ self.assertEqual(ModelType.llama3.value, "llama3")
+ self.assertEqual(ModelType.llama3_1.value, "llama3_1")
+ self.assertEqual(ModelType.llama3_2.value, "llama3_2")
+ self.assertEqual(ModelType.llama3_2_vision.value, "llama3_2_vision")
+ self.assertEqual(ModelType.static_llama.value, "static_llama")
+ self.assertEqual(ModelType.qwen2_5.value, "qwen2_5")
+ self.assertEqual(ModelType.qwen3_0_6b.value, "qwen3-0_6b")
+ self.assertEqual(ModelType.qwen3_1_7b.value, "qwen3-1_7b")
+ self.assertEqual(ModelType.qwen3_4b.value, "qwen3-4b")
+ self.assertEqual(ModelType.phi_4_mini.value, "phi_4_mini")
+ self.assertEqual(ModelType.smollm2.value, "smollm2")
+
+ # Test PreqMode enum
+ self.assertEqual(PreqMode.preq_8da4w.value, "8da4w")
+ self.assertEqual(PreqMode.preq_8da4w_out_8da8w.value, "8da4w_output_8da8w")
+
+ # Test DtypeOverride enum
+ self.assertEqual(DtypeOverride.fp32.value, "fp32")
+ self.assertEqual(DtypeOverride.fp16.value, "fp16")
+ self.assertEqual(DtypeOverride.bf16.value, "bf16")
+
+ # Test Pt2eQuantize enum
+ self.assertEqual(Pt2eQuantize.xnnpack_dynamic.value, "xnnpack_dynamic")
+ self.assertEqual(Pt2eQuantize.xnnpack_dynamic_qc4.value, "xnnpack_dynamic_qc4")
+ self.assertEqual(Pt2eQuantize.qnn_8a8w.value, "qnn_8a8w")
+ self.assertEqual(Pt2eQuantize.qnn_16a16w.value, "qnn_16a16w")
+ self.assertEqual(Pt2eQuantize.qnn_16a4w.value, "qnn_16a4w")
+ self.assertEqual(Pt2eQuantize.coreml_c4w.value, "coreml_c4w")
+ self.assertEqual(Pt2eQuantize.coreml_8a_c8w.value, "coreml_8a_c8w")
+ self.assertEqual(Pt2eQuantize.coreml_8a_c4w.value, "coreml_8a_c4w")
+ self.assertEqual(Pt2eQuantize.coreml_baseline_8a_c8w.value, "coreml_baseline_8a_c8w")
+ self.assertEqual(Pt2eQuantize.coreml_baseline_8a_c4w.value, "coreml_baseline_8a_c4w")
+ self.assertEqual(Pt2eQuantize.vulkan_8w.value, "vulkan_8w")
+
+ # Test SpinQuant enum
+ self.assertEqual(SpinQuant.cuda.value, "cuda")
+ self.assertEqual(SpinQuant.native.value, "native")
+
+ # Test CoreMLQuantize enum
+ self.assertEqual(CoreMLQuantize.b4w.value, "b4w")
+ self.assertEqual(CoreMLQuantize.c4w.value, "c4w")
+
+ # Test CoreMLComputeUnit enum
+ self.assertEqual(CoreMLComputeUnit.cpu_only.value, "cpu_only")
+ self.assertEqual(CoreMLComputeUnit.cpu_and_gpu.value, "cpu_and_gpu")
+ self.assertEqual(CoreMLComputeUnit.cpu_and_ne.value, "cpu_and_ne")
+ self.assertEqual(CoreMLComputeUnit.all.value, "all")
+
+ def test_enum_configuration(self) -> None:
+ """Test that enum fields can be properly set in LlmConfig."""
+ config = LlmConfig()
+
+ # Test setting ModelType
+ config.base.model_class = ModelType.llama3
+ self.assertEqual(config.base.model_class.value, "llama3")
+
+ # Test setting DtypeOverride
+ config.model.dtype_override = DtypeOverride.fp16
+ self.assertEqual(config.model.dtype_override.value, "fp16")
+
+ # Test setting PreqMode
+ config.base.preq_mode = PreqMode.preq_8da4w
+ self.assertEqual(config.base.preq_mode.value, "8da4w")
+
+ # Test setting Pt2eQuantize
+ config.quantization.pt2e_quantize = Pt2eQuantize.xnnpack_dynamic
+ self.assertEqual(config.quantization.pt2e_quantize.value, "xnnpack_dynamic")
+
+ # Test setting SpinQuant
+ config.quantization.use_spin_quant = SpinQuant.cuda
+ self.assertEqual(config.quantization.use_spin_quant.value, "cuda")
+
+ # Test setting CoreMLQuantize
+ config.backend.coreml.quantize = CoreMLQuantize.c4w
+ self.assertEqual(config.backend.coreml.quantize.value, "c4w")
+
+ # Test setting CoreMLComputeUnit
+ config.backend.coreml.compute_units = CoreMLComputeUnit.cpu_and_gpu
+ self.assertEqual(config.backend.coreml.compute_units.value, "cpu_and_gpu")
+
if __name__ == "__main__":
unittest.main()
From eedd8333e8ea8bc86bb08c24586f2309f2af6252 Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Fri, 20 Jun 2025 14:22:21 -0700
Subject: [PATCH 5/9] Update
[ghstack-poisoned]
---
extension/llm/export/README.md | 8 --------
extension/llm/install_requirements.sh | 9 ---------
requirements-dev.txt | 2 ++
3 files changed, 2 insertions(+), 17 deletions(-)
delete mode 100755 extension/llm/install_requirements.sh
diff --git a/extension/llm/export/README.md b/extension/llm/export/README.md
index e15c7fd7f77..1ac27306c86 100644
--- a/extension/llm/export/README.md
+++ b/extension/llm/export/README.md
@@ -21,14 +21,6 @@ The LLM export process transforms a model from its original format to an optimiz
- **Stories**: Stories110M (educational model)
- **SmolLM**: SmolLM2
-## Installation
-
-First, install the required dependencies:
-
-```bash
-./extension/llm/install_requirements.sh
-```
-
## Usage
The export API supports two configuration approaches:
diff --git a/extension/llm/install_requirements.sh b/extension/llm/install_requirements.sh
deleted file mode 100755
index 8f322083c03..00000000000
--- a/extension/llm/install_requirements.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-# Install requirements for LLM extension
-pip install hydra-core>=1.3.0 omegaconf>=2.3.0
diff --git a/requirements-dev.txt b/requirements-dev.txt
index a4ed212fb65..07c63101eb8 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -9,3 +9,5 @@ wheel # For building the pip package archive.
zstd # Imported by resolve_buck.py.
lintrunner==0.12.7
lintrunner-adapters==0.12.4
+hydra-core>=1.3.0
+omegaconf>=2.3.0
From 0cffae8114eb35f298762c9308da6f82f95ce693 Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Fri, 20 Jun 2025 14:27:38 -0700
Subject: [PATCH 6/9] Update
[ghstack-poisoned]
---
extension/llm/export/test/test_export_llm.py | 119 +++----------------
1 file changed, 19 insertions(+), 100 deletions(-)
diff --git a/extension/llm/export/test/test_export_llm.py b/extension/llm/export/test/test_export_llm.py
index d36baa6c62c..c4390050235 100644
--- a/extension/llm/export/test/test_export_llm.py
+++ b/extension/llm/export/test/test_export_llm.py
@@ -10,16 +10,7 @@
import unittest
from unittest.mock import MagicMock, patch
-from executorch.examples.models.llama.config.llm_config import (
- LlmConfig,
- ModelType,
- PreqMode,
- DtypeOverride,
- Pt2eQuantize,
- SpinQuant,
- CoreMLQuantize,
- CoreMLComputeUnit
-)
+from executorch.examples.models.llama.config.llm_config import LlmConfig
from executorch.extension.llm.export.export_llm import main, parse_config_arg, pop_config_arg
@@ -56,9 +47,20 @@ def test_with_config(self, mock_export_llama: MagicMock) -> None:
with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
f.write("""
base:
+ model_class: llama2
tokenizer_path: /path/to/tokenizer.json
+ preq_mode: preq_8da4w
+model:
+ dtype_override: fp16
export:
max_seq_length: 256
+quantization:
+ pt2e_quantize: xnnpack_dynamic
+ use_spin_quant: cuda
+backend:
+ coreml:
+ quantize: c4w
+ compute_units: cpu_and_gpu
""")
config_file = f.name
@@ -71,7 +73,14 @@ def test_with_config(self, mock_export_llama: MagicMock) -> None:
mock_export_llama.assert_called_once()
called_config = mock_export_llama.call_args[0][0]
self.assertEqual(called_config["base"]["tokenizer_path"], "/path/to/tokenizer.json")
+ self.assertEqual(called_config["base"]["model_class"], "llama2")
+ self.assertEqual(called_config["base"]["preq_mode"], "preq_8da4w")
+ self.assertEqual(called_config["model"]["dtype_override"], "fp16")
self.assertEqual(called_config["export"]["max_seq_length"], 256)
+ self.assertEqual(called_config["quantization"]["pt2e_quantize"], "xnnpack_dynamic")
+ self.assertEqual(called_config["quantization"]["use_spin_quant"], "cuda")
+ self.assertEqual(called_config["backend"]["coreml"]["quantize"], "c4w")
+ self.assertEqual(called_config["backend"]["coreml"]["compute_units"], "cpu_and_gpu")
finally:
os.unlink(config_file)
@@ -115,96 +124,6 @@ def test_config_rejects_multiple_cli_args(self) -> None:
finally:
os.unlink(config_file)
- def test_enum_fields(self) -> None:
- """Test that all enum fields work correctly with their lowercase keys."""
- # Test ModelType enum
- for enum_value in ModelType:
- self.assertIsNotNone(enum_value.value)
- self.assertTrue(isinstance(enum_value.value, str))
-
- # Test specific enum values that were changed from uppercase to lowercase
- self.assertEqual(ModelType.stories110m.value, "stories110m")
- self.assertEqual(ModelType.llama2.value, "llama2")
- self.assertEqual(ModelType.llama3.value, "llama3")
- self.assertEqual(ModelType.llama3_1.value, "llama3_1")
- self.assertEqual(ModelType.llama3_2.value, "llama3_2")
- self.assertEqual(ModelType.llama3_2_vision.value, "llama3_2_vision")
- self.assertEqual(ModelType.static_llama.value, "static_llama")
- self.assertEqual(ModelType.qwen2_5.value, "qwen2_5")
- self.assertEqual(ModelType.qwen3_0_6b.value, "qwen3-0_6b")
- self.assertEqual(ModelType.qwen3_1_7b.value, "qwen3-1_7b")
- self.assertEqual(ModelType.qwen3_4b.value, "qwen3-4b")
- self.assertEqual(ModelType.phi_4_mini.value, "phi_4_mini")
- self.assertEqual(ModelType.smollm2.value, "smollm2")
-
- # Test PreqMode enum
- self.assertEqual(PreqMode.preq_8da4w.value, "8da4w")
- self.assertEqual(PreqMode.preq_8da4w_out_8da8w.value, "8da4w_output_8da8w")
-
- # Test DtypeOverride enum
- self.assertEqual(DtypeOverride.fp32.value, "fp32")
- self.assertEqual(DtypeOverride.fp16.value, "fp16")
- self.assertEqual(DtypeOverride.bf16.value, "bf16")
-
- # Test Pt2eQuantize enum
- self.assertEqual(Pt2eQuantize.xnnpack_dynamic.value, "xnnpack_dynamic")
- self.assertEqual(Pt2eQuantize.xnnpack_dynamic_qc4.value, "xnnpack_dynamic_qc4")
- self.assertEqual(Pt2eQuantize.qnn_8a8w.value, "qnn_8a8w")
- self.assertEqual(Pt2eQuantize.qnn_16a16w.value, "qnn_16a16w")
- self.assertEqual(Pt2eQuantize.qnn_16a4w.value, "qnn_16a4w")
- self.assertEqual(Pt2eQuantize.coreml_c4w.value, "coreml_c4w")
- self.assertEqual(Pt2eQuantize.coreml_8a_c8w.value, "coreml_8a_c8w")
- self.assertEqual(Pt2eQuantize.coreml_8a_c4w.value, "coreml_8a_c4w")
- self.assertEqual(Pt2eQuantize.coreml_baseline_8a_c8w.value, "coreml_baseline_8a_c8w")
- self.assertEqual(Pt2eQuantize.coreml_baseline_8a_c4w.value, "coreml_baseline_8a_c4w")
- self.assertEqual(Pt2eQuantize.vulkan_8w.value, "vulkan_8w")
-
- # Test SpinQuant enum
- self.assertEqual(SpinQuant.cuda.value, "cuda")
- self.assertEqual(SpinQuant.native.value, "native")
-
- # Test CoreMLQuantize enum
- self.assertEqual(CoreMLQuantize.b4w.value, "b4w")
- self.assertEqual(CoreMLQuantize.c4w.value, "c4w")
-
- # Test CoreMLComputeUnit enum
- self.assertEqual(CoreMLComputeUnit.cpu_only.value, "cpu_only")
- self.assertEqual(CoreMLComputeUnit.cpu_and_gpu.value, "cpu_and_gpu")
- self.assertEqual(CoreMLComputeUnit.cpu_and_ne.value, "cpu_and_ne")
- self.assertEqual(CoreMLComputeUnit.all.value, "all")
-
- def test_enum_configuration(self) -> None:
- """Test that enum fields can be properly set in LlmConfig."""
- config = LlmConfig()
-
- # Test setting ModelType
- config.base.model_class = ModelType.llama3
- self.assertEqual(config.base.model_class.value, "llama3")
-
- # Test setting DtypeOverride
- config.model.dtype_override = DtypeOverride.fp16
- self.assertEqual(config.model.dtype_override.value, "fp16")
-
- # Test setting PreqMode
- config.base.preq_mode = PreqMode.preq_8da4w
- self.assertEqual(config.base.preq_mode.value, "8da4w")
-
- # Test setting Pt2eQuantize
- config.quantization.pt2e_quantize = Pt2eQuantize.xnnpack_dynamic
- self.assertEqual(config.quantization.pt2e_quantize.value, "xnnpack_dynamic")
-
- # Test setting SpinQuant
- config.quantization.use_spin_quant = SpinQuant.cuda
- self.assertEqual(config.quantization.use_spin_quant.value, "cuda")
-
- # Test setting CoreMLQuantize
- config.backend.coreml.quantize = CoreMLQuantize.c4w
- self.assertEqual(config.backend.coreml.quantize.value, "c4w")
-
- # Test setting CoreMLComputeUnit
- config.backend.coreml.compute_units = CoreMLComputeUnit.cpu_and_gpu
- self.assertEqual(config.backend.coreml.compute_units.value, "cpu_and_gpu")
-
if __name__ == "__main__":
unittest.main()
From 1af1b27274ddc736608dd7566f26d98c2596dbe6 Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Fri, 20 Jun 2025 18:44:29 -0700
Subject: [PATCH 7/9] Update
[ghstack-poisoned]
---
extension/llm/export/test/test_export_llm.py | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/extension/llm/export/test/test_export_llm.py b/extension/llm/export/test/test_export_llm.py
index c4390050235..0932d3b1bd6 100644
--- a/extension/llm/export/test/test_export_llm.py
+++ b/extension/llm/export/test/test_export_llm.py
@@ -74,13 +74,13 @@ def test_with_config(self, mock_export_llama: MagicMock) -> None:
called_config = mock_export_llama.call_args[0][0]
self.assertEqual(called_config["base"]["tokenizer_path"], "/path/to/tokenizer.json")
self.assertEqual(called_config["base"]["model_class"], "llama2")
- self.assertEqual(called_config["base"]["preq_mode"], "preq_8da4w")
- self.assertEqual(called_config["model"]["dtype_override"], "fp16")
+ self.assertEqual(called_config["base"]["preq_mode"].value, "8da4w")
+ self.assertEqual(called_config["model"]["dtype_override"].value, "fp16")
self.assertEqual(called_config["export"]["max_seq_length"], 256)
- self.assertEqual(called_config["quantization"]["pt2e_quantize"], "xnnpack_dynamic")
- self.assertEqual(called_config["quantization"]["use_spin_quant"], "cuda")
- self.assertEqual(called_config["backend"]["coreml"]["quantize"], "c4w")
- self.assertEqual(called_config["backend"]["coreml"]["compute_units"], "cpu_and_gpu")
+ self.assertEqual(called_config["quantization"]["pt2e_quantize"].value, "xnnpack_dynamic")
+ self.assertEqual(called_config["quantization"]["use_spin_quant"].value, "cuda")
+ self.assertEqual(called_config["backend"]["coreml"]["quantize"].value, "c4w")
+ self.assertEqual(called_config["backend"]["coreml"]["compute_units"].value, "cpu_and_gpu")
finally:
os.unlink(config_file)
From acd2079be7d10dfd7afc60b2fbf1a3821dfce6dd Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Fri, 20 Jun 2025 20:17:46 -0700
Subject: [PATCH 8/9] Update
[ghstack-poisoned]
---
extension/llm/export/export_llm.py | 5 ++-
extension/llm/export/test/test_export_llm.py | 37 ++++++++++++++------
2 files changed, 29 insertions(+), 13 deletions(-)
diff --git a/extension/llm/export/export_llm.py b/extension/llm/export/export_llm.py
index 2af7439b805..e995b329f30 100644
--- a/extension/llm/export/export_llm.py
+++ b/extension/llm/export/export_llm.py
@@ -34,12 +34,11 @@
from typing import Any, List, Tuple
import hydra
-import yaml
from executorch.examples.models.llama.config.llm_config import LlmConfig
from executorch.examples.models.llama.export_llama_lib import export_llama
from hydra.core.config_store import ConfigStore
-from omegaconf import DictConfig, OmegaConf
+from omegaconf import OmegaConf
cs = ConfigStore.instance()
cs.store(name="llm_config", node=LlmConfig)
@@ -79,7 +78,7 @@ def main() -> None:
"Cannot specify additional CLI arguments when using --config. "
f"Found: {remaining_args}. Use either --config file or hydra CLI args, not both."
)
-
+
config_file_path = pop_config_arg()
default_llm_config = LlmConfig()
llm_config_from_file = OmegaConf.load(config_file_path)
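Editorial aside, not part of the patch: the hunk above ends just before the defaults and the loaded file are combined, but the general `--config` pattern is to merge the YAML file over the structured defaults so the same typed/enum validation applies to file-provided values. A self-contained sketch with stand-in dataclasses (not the real `LlmConfig`):
```python
# Stand-in config classes for illustration only.
from dataclasses import dataclass, field

from omegaconf import OmegaConf


@dataclass
class DemoExport:
    max_seq_length: int = 128
    output_name: str = "model.pte"


@dataclass
class DemoConfig:
    export: DemoExport = field(default_factory=DemoExport)


yaml_text = """
export:
  max_seq_length: 256
"""

defaults = OmegaConf.structured(DemoConfig)
overrides = OmegaConf.create(yaml_text)          # stands in for OmegaConf.load(path)
merged = OmegaConf.merge(defaults, overrides)

assert merged.export.max_seq_length == 256       # value from the "file"
assert merged.export.output_name == "model.pte"  # default preserved
```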
diff --git a/extension/llm/export/test/test_export_llm.py b/extension/llm/export/test/test_export_llm.py
index 970a32c9606..1f230233867 100644
--- a/extension/llm/export/test/test_export_llm.py
+++ b/extension/llm/export/test/test_export_llm.py
@@ -10,8 +10,11 @@
import unittest
from unittest.mock import MagicMock, patch
-from executorch.examples.models.llama.config.llm_config import LlmConfig
-from executorch.extension.llm.export.export_llm import main, parse_config_arg, pop_config_arg
+from executorch.extension.llm.export.export_llm import (
+ main,
+ parse_config_arg,
+ pop_config_arg,
+)
class TestExportLlm(unittest.TestCase):
@@ -45,12 +48,14 @@ def test_with_config(self, mock_export_llama: MagicMock) -> None:
"""Test main function with --config file and no hydra args."""
# Create a temporary config file
with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
- f.write("""
+ f.write(
+ """
base:
tokenizer_path: /path/to/tokenizer.json
export:
max_seq_length: 256
-""")
+"""
+ )
config_file = f.name
try:
@@ -61,7 +66,9 @@ def test_with_config(self, mock_export_llama: MagicMock) -> None:
# Verify export_llama was called with config
mock_export_llama.assert_called_once()
called_config = mock_export_llama.call_args[0][0]
- self.assertEqual(called_config["base"]["tokenizer_path"], "/path/to/tokenizer.json")
+ self.assertEqual(
+ called_config["base"]["tokenizer_path"], "/path/to/tokenizer.json"
+ )
self.assertEqual(called_config["export"]["max_seq_length"], 256)
finally:
os.unlink(config_file)
@@ -70,7 +77,9 @@ def test_with_cli_args(self) -> None:
"""Test main function with only hydra CLI args."""
test_argv = ["script.py", "debug.verbose=True"]
with patch.object(sys, "argv", test_argv):
- with patch("executorch.extension.llm.export.export_llm.hydra_main") as mock_hydra:
+ with patch(
+ "executorch.extension.llm.export.export_llm.hydra_main"
+ ) as mock_hydra:
main()
mock_hydra.assert_called_once()
@@ -86,9 +95,12 @@ def test_config_with_cli_args_error(self) -> None:
with patch.object(sys, "argv", test_argv):
with self.assertRaises(ValueError) as cm:
main()
-
+
error_msg = str(cm.exception)
- self.assertIn("Cannot specify additional CLI arguments when using --config", error_msg)
+ self.assertIn(
+ "Cannot specify additional CLI arguments when using --config",
+ error_msg,
+ )
finally:
os.unlink(config_file)
@@ -99,7 +111,13 @@ def test_config_rejects_multiple_cli_args(self) -> None:
config_file = f.name
try:
- test_argv = ["script.py", "--config", config_file, "debug.verbose=True", "export.output_dir=/tmp"]
+ test_argv = [
+ "script.py",
+ "--config",
+ config_file,
+ "debug.verbose=True",
+ "export.output_dir=/tmp",
+ ]
with patch.object(sys, "argv", test_argv):
with self.assertRaises(ValueError):
main()
@@ -109,4 +127,3 @@ def test_config_rejects_multiple_cli_args(self) -> None:
if __name__ == "__main__":
unittest.main()
-
From 0118873d79a6997fae75252a880804ed73c2c6cc Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Mon, 23 Jun 2025 11:57:35 -0700
Subject: [PATCH 9/9] Update
[ghstack-poisoned]
---
.../docs/delegates/qualcomm_README.md | 4 ++--
.../docs/delegates/xnnpack_README.md | 10 +++++-----
.../LLaMA/docs/delegates/xnnpack_README.md | 8 ++++----
.../deepseek-r1-distill-llama-8B/README.md | 4 ++--
examples/models/llama/README.md | 20 +++++++++----------
examples/models/phi_4_mini/README.md | 2 +-
examples/models/qwen2_5/README.md | 2 +-
examples/models/qwen3/README.md | 6 +++---
8 files changed, 28 insertions(+), 28 deletions(-)
diff --git a/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md b/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md
index 969b6cacab9..360e92a5f30 100644
--- a/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md
+++ b/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md
@@ -106,12 +106,12 @@ We support PTQ by default. The entire export may take ~20 minutes (Llama 3.1 8B)
Examples:
```
# 4 bits weight only quantize
-python -m extension.llm.export.export_llm base.checkpoint="${MODEL_DIR}/consolidated.00.pth" base.params="${MODEL_DIR}/params.json" model.use_kv_cache=True model.enable_dynamic_shape=False backend.qnn.enabled=True backend.qnn.quantization="qnn_16a4w" model.dtype_override="fp32" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="test.pte"
+python -m extension.llm.export.export_llm base.checkpoint="${MODEL_DIR}/consolidated.00.pth" base.params="${MODEL_DIR}/params.json" model.use_kv_cache=True model.enable_dynamic_shape=False backend.qnn.enabled=True backend.qnn.quantization="qnn_16a4w" model.dtype_override="fp32" base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="test.pte"
```
If the model is very large, it may require model sharding because the Qualcomm DSP is a 32-bit system with a 4GB size limit. For example, for Llama 3 8B models, we need to shard the model into 4 parts, but ExecuTorch still packages it into one PTE file. Here is an example:
```
# 8 bits quantization with 4 shards
-python -m extension.llm.export.export_llm base.checkpoint="${MODEL_DIR}/consolidated.00.pth" base.params="${MODEL_DIR}/params.json" model.use_kv_cache=True model.enable_dynamic_shape=False backend.qnn.enabled=True backend.qnn.quantization="qnn_8a8w" model.dtype_override="fp32" backend.qnn.num_sharding=4 base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="test.pte"
+python -m extension.llm.export.export_llm base.checkpoint="${MODEL_DIR}/consolidated.00.pth" base.params="${MODEL_DIR}/params.json" model.use_kv_cache=True model.enable_dynamic_shape=False backend.qnn.enabled=True backend.qnn.quantization="qnn_8a8w" model.dtype_override="fp32" backend.qnn.num_sharding=4 base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="test.pte"
```
Note: if you encounter the issues below
```
diff --git a/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md b/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md
index c60bd537e6b..baf8ffb7071 100644
--- a/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md
+++ b/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md
@@ -55,7 +55,7 @@ In this demo app, we support text-only inference with up-to-date Llama models an
Meta has released prequantized INT4 SpinQuant Llama 3.2 models that ExecuTorch supports on the XNNPACK backend.
* Export Llama model and generate .pte file as below:
```
-python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="fp32" backend.xnnpack.extended_ops=True base.preq_mode="8da4w_output_8da8w" base.preq_group_size=32 export.max_seq_length=2048 export.max_context_length=2048 base.preq_embedding_quantize="8,0" quantization.use_spin_quant="native" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="llama3_2_spinquant.pte"
+python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="fp32" backend.xnnpack.extended_ops=True base.preq_mode="8da4w_output_8da8w" base.preq_group_size=32 export.max_seq_length=2048 export.max_context_length=2048 base.preq_embedding_quantize=\'8,0\' quantization.use_spin_quant="native" base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="llama3_2_spinquant.pte"
```
For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_SpinQuant_INT4_EO8.ipynb).
@@ -63,7 +63,7 @@ For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co
Meta has released prequantized INT4 QAT+LoRA Llama 3.2 models that ExecuTorch supports on the XNNPACK backend.
* Export Llama model and generate .pte file as below:
```
-python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= quantization.use_qat=True base.use_lora=16 model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="fp32" backend.xnnpack.extended_ops=True base.preq_mode="8da4w_output_8da8w" base.preq_group_size=32 export.max_seq_length=2048 export.max_context_length=2048 base.preq_embedding_quantize="8,0" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="llama3_2_qat_lora.pte"
+python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= quantization.use_qat=True base.use_lora=16 model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="fp32" backend.xnnpack.extended_ops=True base.preq_mode="8da4w_output_8da8w" base.preq_group_size=32 export.max_seq_length=2048 export.max_context_length=2048 base.preq_embedding_quantize=\'8,0\' base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="llama3_2_qat_lora.pte"
```
For convenience, an [exported ExecuTorch QAT+LoRA model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-QLORA_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_QLORA_INT4_EO8.ipynb).
@@ -74,7 +74,7 @@ We have supported BF16 as a data type on the XNNPACK backend for Llama 3.2 1B/3B
* Export Llama model and generate .pte file as below:
```
-python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="bf16" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="llama3_2_bf16.pte"
+python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="bf16" base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="llama3_2_bf16.pte"
```
For convenience, an [exported ExecuTorch bf16 model](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/llama3_2-1B.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/ExportRecipe_1B.ipynb).
@@ -90,7 +90,7 @@ To safeguard your application, you can use our Llama Guard models for prompt cla
* We prepared this model using the following command
```
-python -m extension.llm.export.export_llm base.checkpoint= base.params= model.dtype_override="fp32" model.use_kv_cache=True model.use_sdpa_with_kv_cache=True quantization.qmode="8da4w" quantization.group_size=256 backend.xnnpack.enabled=True export.max_seq_length=8193 export.max_context_length=8193 quantization.embedding_quantize="4,32" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' base.output_prune_map= export.output_name="llama_guard_3_1b_pruned_xnnpack.pte"
+python -m extension.llm.export.export_llm base.checkpoint= base.params= model.dtype_override="fp32" model.use_kv_cache=True model.use_sdpa_with_kv_cache=True quantization.qmode="8da4w" quantization.group_size=256 backend.xnnpack.enabled=True export.max_seq_length=8193 export.max_context_length=8193 quantization.embedding_quantize=\'4,32\' base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' base.output_prune_map= export.output_name="llama_guard_3_1b_pruned_xnnpack.pte"
```
@@ -100,7 +100,7 @@
-python -m extension.llm.export.export_llm base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True quantization.qmode="8da4w" quantization.group_size=128 model.dtype_override="fp32" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="llama.pte"
+python -m extension.llm.export.export_llm base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True quantization.qmode="8da4w" quantization.group_size=128 model.dtype_override="fp32" base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="llama.pte"
```
You may wonder what the `base.metadata` argument does. It embeds the proper special-token information in the exported model so that the runner can easily detect EOS tokens.
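To make that concrete, here is a small, hypothetical sketch (the helper name is illustrative, not the ExecuTorch runner API) of how the BOS/EOS ids baked into the exported metadata can be used to stop generation:
```
import json

# The metadata string, once unquoted, is plain JSON describing special tokens.
metadata = json.loads('{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}')
eos_ids = set(metadata["get_eos_ids"])

def is_end_of_generation(token_id: int) -> bool:
    # Hypothetical helper (not the ExecuTorch runner API): generation stops
    # as soon as any registered EOS id is produced.
    return token_id in eos_ids

assert is_end_of_generation(128009)
assert not is_end_of_generation(42)
```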
diff --git a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md
index d64a119e35f..6cca65339da 100644
--- a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md
+++ b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md
@@ -51,7 +51,7 @@ In this demo app, we support text-only inference with up-to-date Llama models an
Meta has released prequantized INT4 SpinQuant Llama 3.2 models that ExecuTorch supports on the XNNPACK backend.
* Export the Llama model and generate a .pte file as shown below:
```
-python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="fp32" backend.xnnpack.extended_ops=True base.preq_mode="8da4w_output_8da8w" base.preq_group_size=32 export.max_seq_length=2048 export.max_context_length=2048 base.preq_embedding_quantize="8,0" quantization.use_spin_quant="native" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="llama3_2_spinquant.pte"
+python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="fp32" backend.xnnpack.extended_ops=True base.preq_mode="8da4w_output_8da8w" base.preq_group_size=32 export.max_seq_length=2048 export.max_context_length=2048 base.preq_embedding_quantize=\'8,0\' quantization.use_spin_quant="native" base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="llama3_2_spinquant.pte"
```
For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_SpinQuant_INT4_EO8.ipynb).
@@ -59,7 +59,7 @@ For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co
Meta has released prequantized INT4 QAT+LoRA Llama 3.2 models that ExecuTorch supports on the XNNPACK backend.
* Export the Llama model and generate a .pte file as shown below:
```
-python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= quantization.use_qat=True base.use_lora=16 model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="fp32" backend.xnnpack.extended_ops=True base.preq_mode="8da4w_output_8da8w" base.preq_group_size=32 export.max_seq_length=2048 export.max_context_length=2048 base.preq_embedding_quantize="8,0" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="llama3_2_qat_lora.pte"
+python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= quantization.use_qat=True base.use_lora=16 model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="fp32" backend.xnnpack.extended_ops=True base.preq_mode="8da4w_output_8da8w" base.preq_group_size=32 export.max_seq_length=2048 export.max_context_length=2048 base.preq_embedding_quantize=\'8,0\' base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="llama3_2_qat_lora.pte"
```
For convenience, an [exported ExecuTorch QAT+LoRA model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-QLORA_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_QLORA_INT4_EO8.ipynb).
@@ -69,7 +69,7 @@ We have supported BF16 as a data type on the XNNPACK backend for Llama 3.2 1B/3B
* Export the Llama model and generate a .pte file as shown below:
```
-python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="bf16" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="llama3_2_bf16.pte"
+python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="bf16" base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="llama3_2_bf16.pte"
```
For convenience, an [exported ExecuTorch bf16 model](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/llama3_2-1B.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/ExportRecipe_1B.ipynb).
@@ -79,7 +79,7 @@ For more detail using Llama 3.2 lightweight models including prompt template, pl
Export the model
```
-python -m extension.llm.export.export_llm base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True quantization.qmode="8da4w" quantization.group_size=128 model.dtype_override="fp32" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' quantization.embedding_quantize="4,32" export.output_name="llama3_kv_sdpa_xnn_qe_4_32.pte"
+python -m extension.llm.export.export_llm base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True quantization.qmode="8da4w" quantization.group_size=128 model.dtype_override="fp32" base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' quantization.embedding_quantize=\'4,32\' export.output_name="llama3_kv_sdpa_xnn_qe_4_32.pte"
```
### For LLaVA model
diff --git a/examples/models/deepseek-r1-distill-llama-8B/README.md b/examples/models/deepseek-r1-distill-llama-8B/README.md
index 7695c678337..f05dd9990a2 100644
--- a/examples/models/deepseek-r1-distill-llama-8B/README.md
+++ b/examples/models/deepseek-r1-distill-llama-8B/README.md
@@ -61,8 +61,8 @@ python -m extension.llm.export.export_llm \
quantization.qmode="8da4w" \
quantization.group_size=128 \
model.dtype_override="fp16" \
- base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
- quantization.embedding_quantize="4,32" \
+ base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' \
+ quantization.embedding_quantize=\'4,32\' \
export.output_name="DeepSeek-R1-Distill-Llama-8B.pte"
```
diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md
index 23a377a6611..e555043c44d 100644
--- a/examples/models/llama/README.md
+++ b/examples/models/llama/README.md
@@ -174,7 +174,7 @@ python -m extension.llm.export.export_llm \
model.use_kv_cache=True \
model.use_sdpa_with_kv_cache=True \
model.dtype_override="bf16" \
- base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+ base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' \
export.output_name="llama3_2.pte"
```
For convenience, an [exported ExecuTorch bf16 model](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/llama3_2-1B.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/ExportRecipe_1B.ipynb).
@@ -203,9 +203,9 @@ python -m extension.llm.export.export_llm \
export.output_name="llama3_2.pte" \
model.use_kv_cache=True \
model.dtype_override="fp32" \
- base.preq_embedding_quantize="8,0" \
+ base.preq_embedding_quantize=\'8,0\' \
quantization.use_spin_quant="native" \
- base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+ base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"'
```
For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_SpinQuant_INT4_EO8.ipynb).
@@ -226,7 +226,7 @@ python -m extension.llm.export.export_llm \
base.use_lora=16 \
base.preq_mode="8da4w_output_8da8w" \
base.preq_group_size=32 \
- base.preq_embedding_quantize="8,0" \
+ base.preq_embedding_quantize=\'8,0\' \
model.use_sdpa_with_kv_cache=True \
model.use_kv_cache=True \
backend.xnnpack.enabled=True \
@@ -235,7 +235,7 @@ python -m extension.llm.export.export_llm \
export.max_seq_length=2048 \
export.max_context_length=2048 \
export.output_name="llama3_2.pte" \
- base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+ base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"'
```
For convenience, an [exported ExecuTorch QAT+LoRA model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-QLORA_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_QLORA_INT4_EO8.ipynb).
@@ -256,11 +256,11 @@ You can export and run the original Llama 3 8B instruct model.
quantization.qmode="8da4w" \
quantization.group_size=128 \
model.dtype_override="fp32" \
- base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
- quantization.embedding_quantize="4,32" \
+ base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' \
+ quantization.embedding_quantize=\'4,32\' \
export.output_name="llama3_kv_sdpa_xnn_qe_4_32.pte"
```
- Due to the larger vocabulary size of Llama 3, we recommend quantizing the embeddings with `quantization.embedding_quantize="4,32"` as shown above to further reduce the model size.
+ Due to the larger vocabulary size of Llama 3, we recommend quantizing the embeddings with `quantization.embedding_quantize=\'4,32\'` as shown above to further reduce the model size.
If you're interested in deploying on non-CPU backends, [please refer to the non-cpu-backend section](non_cpu_backends.md).
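To see why the embedding-quantization recommendation above matters, here is a back-of-the-envelope estimate under assumed Llama 3 8B shapes (128256-token vocabulary, hidden size 4096) and an assumed fp16 scale per 32-weight group; the exact numbers depend on the quantization scheme and are illustrative only.
```
# Rough estimate under assumed Llama 3 8B shapes (vocab 128256, hidden 4096)
# and an assumed fp16 scale per 32-weight group; numbers are illustrative only.
vocab, dim, group = 128_256, 4_096, 32

fp32_bytes = vocab * dim * 4               # unquantized fp32 embedding table
int4_bytes = vocab * dim // 2              # packed 4-bit weights
scale_bytes = (vocab * dim // group) * 2   # one fp16 scale per 32-weight group

print(f"fp32 embedding table   : {fp32_bytes / 2**20:7.1f} MiB")                  # ~2004 MiB
print(f"4-bit weights + scales : {(int4_bytes + scale_bytes) / 2**20:7.1f} MiB")   # ~282 MiB
```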
@@ -395,11 +395,11 @@ python -m extension.llm.export.export_llm \
base.params="${LLAMA_PARAMS:?}" \
model.use_kv_cache=True \
model.use_sdpa_with_kv_cache=True \
- base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+ base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' \
export.output_name="llama3_2.pte" \
quantization.qmode="torchao:8da${QLINEAR_BITWIDTH}w" \
quantization.group_size=${QLINEAR_GROUP_SIZE} \
- quantization.embedding_quantize="torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \
+ quantization.embedding_quantize=\'torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}\' \
model.dtype_override="fp32"
```
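For reference, a small sketch (with illustrative placeholder values; the environment variables are set outside this snippet in the original instructions) of the strings the shell hands to Hydra for the torchao overrides above:
```
# Illustrative only: what the shell passes to Hydra after substituting the
# environment variables used above (the values below are placeholders).
import os

os.environ.setdefault("QLINEAR_BITWIDTH", "4")
os.environ.setdefault("QLINEAR_GROUP_SIZE", "128")
os.environ.setdefault("QEMBEDDING_BITWIDTH", "4")
os.environ.setdefault("QEMBEDDING_GROUP_SIZE", "32")

qmode = f"torchao:8da{os.environ['QLINEAR_BITWIDTH']}w"
group_size = int(os.environ["QLINEAR_GROUP_SIZE"])
embedding_quantize = (
    f"torchao:{os.environ['QEMBEDDING_BITWIDTH']},{os.environ['QEMBEDDING_GROUP_SIZE']}"
)
print(qmode)               # torchao:8da4w
print(group_size)          # 128
print(embedding_quantize)  # torchao:4,32
```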
diff --git a/examples/models/phi_4_mini/README.md b/examples/models/phi_4_mini/README.md
index c2b3d515ec0..d168d54226e 100644
--- a/examples/models/phi_4_mini/README.md
+++ b/examples/models/phi_4_mini/README.md
@@ -40,7 +40,7 @@ python -m extension.llm.export.export_llm \
model.use_sdpa_with_kv_cache=True \
model.dtype_override="fp32" \
backend.xnnpack.enabled=True \
- base.metadata='{"get_bos_id":151643, "get_eos_ids":[151643]}' \
+ base.metadata='"{\"get_bos_id\":151643, \"get_eos_ids\":[151643]}"' \
export.output_name="phi-4-mini.pte" \
debug.verbose=True
```
diff --git a/examples/models/qwen2_5/README.md b/examples/models/qwen2_5/README.md
index b40daaca469..57784169ece 100644
--- a/examples/models/qwen2_5/README.md
+++ b/examples/models/qwen2_5/README.md
@@ -40,7 +40,7 @@ python -m extension.llm.export.export_llm \
model.use_sdpa_with_kv_cache=True \
model.dtype_override="fp32" \
backend.xnnpack.enabled=True \
- base.metadata='{"get_bos_id":151643, "get_eos_ids":[151643]}' \
+ base.metadata='"{\"get_bos_id\":151643, \"get_eos_ids\":[151643]}"' \
export.output_name="qwen2_5-1_5b.pte" \
debug.verbose=True
```
diff --git a/examples/models/qwen3/README.md b/examples/models/qwen3/README.md
index acdd4497503..d31d491adf2 100644
--- a/examples/models/qwen3/README.md
+++ b/examples/models/qwen3/README.md
@@ -25,7 +25,7 @@ python -m extension.llm.export.export_llm \
backend.xnnpack.enabled=True \
backend.xnnpack.extended_ops=True \
quantization.qmode="8da4w" \
- base.metadata='{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
+ base.metadata='"{\"get_bos_id\": 151644, \"get_eos_ids\":[151645]}"' \
export.output_name="qwen3-0_6b.pte" \
debug.verbose=True
```
@@ -41,7 +41,7 @@ python -m extension.llm.export.export_llm \
backend.xnnpack.enabled=True \
backend.xnnpack.extended_ops=True \
quantization.qmode="8da4w" \
- base.metadata='{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
+ base.metadata='"{\"get_bos_id\": 151644, \"get_eos_ids\":[151645]}"' \
export.output_name="qwen3-1_7b.pte" \
debug.verbose=True
```
@@ -57,7 +57,7 @@ python -m extension.llm.export.export_llm \
backend.xnnpack.enabled=True \
backend.xnnpack.extended_ops=True \
quantization.qmode="8da4w" \
- base.metadata='{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
+ base.metadata='"{\"get_bos_id\": 151644, \"get_eos_ids\":[151645]}"' \
export.output_name="qwen3-4b.pte" \
debug.verbose=True
```