diff --git a/examples/demo-apps/android/LlamaDemo/README.md b/examples/demo-apps/android/LlamaDemo/README.md index 4b8cafd2d4e..8fed04d7ff5 100644 --- a/examples/demo-apps/android/LlamaDemo/README.md +++ b/examples/demo-apps/android/LlamaDemo/README.md @@ -154,7 +154,7 @@ curl -C - -Ls "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokeni # Create params.json file touch params.json echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json -python -m examples.models.llama.export_llama -c stories110M.pt -p params.json -d fp16 -n stories110m_h.pte -kv +python -m extension.llm.export.export_llm base.checkpoint=stories110M.pt base.params=params.json model.dtype_override="fp16" export.output_name=stories110m_h.pte model.use_kv_cache=True python -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin ``` ### Push model diff --git a/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md b/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md index fb9df3c3375..360e92a5f30 100644 --- a/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md +++ b/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md @@ -97,7 +97,7 @@ cmake --build cmake-out/examples/models/llama -j16 --config Release ## Export Llama Model QNN backend currently supports exporting to these data types: fp32, int4/ int8 with PTQ, int4 with SpinQuant (Llama 3 only). -We also support export for different Qualcomm SoC. We have verified SM8650(V75) and SM8550(V73). To export for different SoC, add “--soc_model SM8550” in your export command. Without setting this flag, the export will default to SM8650. +We also support export for different Qualcomm SoC. We have verified SM8650(V75) and SM8550(V73). To export for different SoC, add "--soc_model SM8550" in your export command. Without setting this flag, the export will default to SM8650. ### Export with PTQ We support PTQ by default. The entire export may take ~20 minutes (Llama 3.1 8B). However, there is accuracy regression and we are working on improving it. @@ -106,12 +106,12 @@ We support PTQ by default. The entire export may take ~20 minutes (Llama 3.1 8B) Examples: ``` # 4 bits weight only quantize -python -m examples.models.llama.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" -p "${MODEL_DIR}/params.json" -kv --disable_dynamic_shape --qnn --pt2e_quantize qnn_16a4w -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="test.pte” +python -m extension.llm.export.export_llm base.checkpoint="${MODEL_DIR}/consolidated.00.pth" base.params="${MODEL_DIR}/params.json" model.use_kv_cache=True model.enable_dynamic_shape=False backend.qnn.enabled=True backend.qnn.quantization="qnn_16a4w" model.dtype_override="fp32" base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="test.pte" ``` If the model is really big, it may require model sharding because the Qualcomm DSP is a 32bit system and has a 4GB size limit . For example for Llama 3 8B models, we need to shard the model into 4, but ExecuTorch still packages it into one PTE file. 
Here is an example: ``` # 8 bits quantization with 4 shards -python -m examples.models.llama.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" -p "${MODEL_DIR}/params.json" -kv --disable_dynamic_shape --qnn --pt2e_quantize qnn_8a8w -d fp32 --num_sharding 4 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="test.pte” +python -m extension.llm.export.export_llm base.checkpoint="${MODEL_DIR}/consolidated.00.pth" base.params="${MODEL_DIR}/params.json" model.use_kv_cache=True model.enable_dynamic_shape=False backend.qnn.enabled=True backend.qnn.quantization="qnn_8a8w" model.dtype_override="fp32" backend.qnn.num_sharding=4 base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="test.pte" ``` Note: if you encountered issues below ``` @@ -163,7 +163,7 @@ To export Llama 3 8B instruct with the Qualcomm AI Engine Direct Backend, ensure * 8B models might need 16GB RAM on the device to run. ``` # Please note that calibration_data must include the prompt template for special tokens. -python -m examples.models.llama.export_llama -t -p -c --use_kv_cache --qnn --pt2e_quantize qnn_16a4w --disable_dynamic_shape --num_sharding 8 --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --optimized_rotation_path --calibration_data "<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" +python -m extension.llm.export.export_llm base.tokenizer= base.params= base.checkpoint= model.use_kv_cache=True backend.qnn.enabled=True backend.qnn.quantization="qnn_16a4w" model.enable_dynamic_shape=False backend.qnn.num_sharding=8 backend.qnn.calibration_tasks="wikitext" backend.qnn.calibration_limit=1 backend.qnn.calibration_seq_length=128 backend.qnn.optimized_rotation_path= backend.qnn.calibration_data="<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" ``` ## Pushing Model and Tokenizer @@ -210,17 +210,17 @@ Alternative you can also just run the shell script directly as in the root direc sh examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh ``` This is running the shell script which configures the required core ExecuTorch, Llama2/3, and Android libraries, builds them into AAR, and copies it to the app. -Note: If you are building the Android app mentioned in the next section on a separate machine (i.e. MacOS but building and exporting for QNN backend on Linux), make sure you copy the aar file generated from setup-with-qnn script to “examples/demo-apps/android/LlamaDemo/app/libs” before building the Android app. +Note: If you are building the Android app mentioned in the next section on a separate machine (i.e. MacOS but building and exporting for QNN backend on Linux), make sure you copy the aar file generated from setup-with-qnn script to "examples/demo-apps/android/LlamaDemo/app/libs" before building the Android app. ## Run the Android Demo App -First, make sure your Android phone’s chipset version is compatible with this demo (SM8650, SM8550). You can find the Qualcomm chipset version here in the [mapping](https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/overview.html). +First, make sure your Android phone's chipset version is compatible with this demo (SM8650, SM8550). 
You can find the Qualcomm chipset version here in the [mapping](https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/overview.html). -If you build and run the setup-with-qnn script on a separate machine rather than where you are building the Android app, make sure you copy the aar file it generated into “examples/demo-apps/android/LlamaDemo/app/libs” +If you build and run the setup-with-qnn script on a separate machine rather than where you are building the Android app, make sure you copy the aar file it generated into "examples/demo-apps/android/LlamaDemo/app/libs" ### Alternative 1: Android Studio (Recommended) -Open Android Studio and select “Open an existing Android Studio project” to open examples/demo-apps/android/LlamaDemo. +Open Android Studio and select "Open an existing Android Studio project" to open examples/demo-apps/android/LlamaDemo. Run the app (^R). This builds and launches the app on the phone. ### Alternative 2: Command line @@ -238,4 +238,4 @@ If the app successfully run on your device, you should see something like below:

## Reporting Issues -If you encountered any bugs or issues following this tutorial please file a bug/issue here on Github. +If you encountered any bugs or issues following this tutorial please file a bug/issue here on Github. \ No newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md b/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md index de99387f82d..baf8ffb7071 100644 --- a/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md +++ b/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md @@ -55,7 +55,7 @@ In this demo app, we support text-only inference with up-to-date Llama models an Meta has released prequantized INT4 SpinQuant Llama 3.2 models that ExecuTorch supports on the XNNPACK backend. * Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --max_context_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte" +python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="fp32" backend.xnnpack.extended_ops=True base.preq_mode="8da4w_output_8da8w" base.preq_group_size=32 export.max_seq_length=2048 export.max_context_length=2048 base.preq_embedding_quantize=\'8,0\' quantization.use_spin_quant="native" base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="llama3_2_spinquant.pte" ``` For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_SpinQuant_INT4_EO8.ipynb). @@ -63,7 +63,7 @@ For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co Meta has released prequantized INT4 QAT+LoRA Llama 3.2 models that ExecuTorch supports on the XNNPACK backend. 
* Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint --params -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --max_context_length 2048--preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte" +python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= quantization.use_qat=True base.use_lora=16 model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="fp32" backend.xnnpack.extended_ops=True base.preq_mode="8da4w_output_8da8w" base.preq_group_size=32 export.max_seq_length=2048 export.max_context_length=2048 base.preq_embedding_quantize=\'8,0\' base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="llama3_2_qat_lora.pte" ``` For convenience, an [exported ExecuTorch QAT+LoRA model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-QLORA_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_QLORA_INT4_EO8.ipynb). @@ -74,7 +74,7 @@ We have supported BF16 as a data type on the XNNPACK backend for Llama 3.2 1B/3B * Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2_bf16.pte" +python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="bf16" base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="llama3_2_bf16.pte" ``` For convenience, an [exported ExecuTorch bf16 model](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/llama3_2-1B.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/ExportRecipe_1B.ipynb). 
@@ -90,7 +90,7 @@ To safeguard your application, you can use our Llama Guard models for prompt cla * We prepared this model using the following command ``` -python -m examples.models.llama.export_llama --checkpoint --params -d fp32 -kv --use_sdpa_with_kv_cache --quantization_mode 8da4w --group_size 256 --xnnpack --max_seq_length 8193 --max_context_length 8193 --embedding-quantize 4,32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_prune_map --output_name="llama_guard_3_1b_pruned_xnnpack.pte" +python -m extension.llm.export.export_llm base.checkpoint= base.params= model.dtype_override="fp32" model.use_kv_cache=True model.use_sdpa_with_kv_cache=True quantization.qmode="8da4w" quantization.group_size=256 backend.xnnpack.enabled=True export.max_seq_length=8193 export.max_context_length=8193 quantization.embedding_quantize=\'4,32\' base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' base.output_prune_map= export.output_name="llama_guard_3_1b_pruned_xnnpack.pte" ``` @@ -100,7 +100,7 @@ python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama.pte" +python -m extension.llm.export.export_llm base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True quantization.qmode="8da4w" quantization.group_size=128 model.dtype_override="fp32" base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="llama.pte" ``` You may wonder what the ‘--metadata’ flag is doing. This flag helps export the model with proper special tokens added that the runner can detect EOS tokens easily. diff --git a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md index 47352607bca..d6bccc0ef47 100644 --- a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md +++ b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md @@ -49,7 +49,7 @@ Install the required packages to export the model Export the model ``` -python -m examples.models.llama.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" --params "${MODEL_DIR}/params.json" -kv --use_sdpa_with_kv_cache --mps -d fp32 --disable_dynamic_shape -qmode 8da4w -G 32 +python -m extension.llm.export.export_llm base.checkpoint="${MODEL_DIR}/consolidated.00.pth" base.params="${MODEL_DIR}/params.json" model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.mps.enabled=True model.dtype_override="fp32" model.enable_dynamic_shape=False quantization.qmode="8da4w" quantization.group_size=32 ``` ## Pushing Model and Tokenizer diff --git a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md index bb33b50f8b7..6cca65339da 100644 --- a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md +++ b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md @@ -51,7 +51,7 @@ In this demo app, we support text-only inference with up-to-date Llama models an Meta has released prequantized INT4 SpinQuant Llama 3.2 models that ExecuTorch supports on the XNNPACK backend. 
* Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --max_context_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte" +python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="fp32" backend.xnnpack.extended_ops=True base.preq_mode="8da4w_output_8da8w" base.preq_group_size=32 export.max_seq_length=2048 export.max_context_length=2048 base.preq_embedding_quantize=\'8,0\' quantization.use_spin_quant="native" base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="llama3_2_spinquant.pte" ``` For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_SpinQuant_INT4_EO8.ipynb). @@ -59,7 +59,7 @@ For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co Meta has released prequantized INT4 QAT+LoRA Llama 3.2 models that ExecuTorch supports on the XNNPACK backend. * Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint --params -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --max_context_length 2048 --preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte" +python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= quantization.use_qat=True base.use_lora=16 model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="fp32" backend.xnnpack.extended_ops=True base.preq_mode="8da4w_output_8da8w" base.preq_group_size=32 export.max_seq_length=2048 export.max_context_length=2048 base.preq_embedding_quantize=\'8,0\' base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="llama3_2_qat_lora.pte" ``` For convenience, an [exported ExecuTorch QAT+LoRA model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-QLORA_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_QLORA_INT4_EO8.ipynb). 
@@ -69,7 +69,7 @@ We have supported BF16 as a data type on the XNNPACK backend for Llama 3.2 1B/3B * Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2_bf16.pte" +python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="bf16" base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="llama3_2_bf16.pte" ``` For convenience, an [exported ExecuTorch bf16 model](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/llama3_2-1B.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/ExportRecipe_1B.ipynb). @@ -79,7 +79,7 @@ For more detail using Llama 3.2 lightweight models including prompt template, pl Export the model ``` -python -m examples.models.llama.export_llama --checkpoint -p -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --embedding-quantize 4,32 --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" +python -m extension.llm.export.export_llm base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True quantization.qmode="8da4w" quantization.group_size=128 model.dtype_override="fp32" base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' quantization.embedding_quantize=\'4,32\' export.output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" ``` ### For LLaVA model diff --git a/examples/models/deepseek-r1-distill-llama-8B/README.md b/examples/models/deepseek-r1-distill-llama-8B/README.md index 5fd47ad61ec..f05dd9990a2 100644 --- a/examples/models/deepseek-r1-distill-llama-8B/README.md +++ b/examples/models/deepseek-r1-distill-llama-8B/README.md @@ -52,18 +52,18 @@ torch.save(sd, "/tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/checkpoint.pth") 5. Generate a PTE file for use with the Llama runner. ``` -python -m examples.models.llama.export_llama \ - --checkpoint /tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/checkpoint.pth \ - -p params.json \ - -kv \ - --use_sdpa_with_kv_cache \ - -X \ - -qmode 8da4w \ - --group_size 128 \ - -d fp16 \ - --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \ - --embedding-quantize 4,32 \ - --output_name="DeepSeek-R1-Distill-Llama-8B.pte" +python -m extension.llm.export.export_llm \ + base.checkpoint=/tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/checkpoint.pth \ + base.params=params.json \ + model.use_kv_cache=True \ + model.use_sdpa_with_kv_cache=True \ + backend.xnnpack.enabled=True \ + quantization.qmode="8da4w" \ + quantization.group_size=128 \ + model.dtype_override="fp16" \ + base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' \ + quantization.embedding_quantize=\'4,32\' \ + export.output_name="DeepSeek-R1-Distill-Llama-8B.pte" ``` 6. Run the model on your desktop for validation or integrate with iOS/Android apps. Instructions for these are available in the Llama [README](../llama/README.md) starting at Step 3. 
diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md index c6f0350fff7..e555043c44d 100644 --- a/examples/models/llama/README.md +++ b/examples/models/llama/README.md @@ -167,15 +167,15 @@ Llama 3 8B performance was measured on the Samsung Galaxy S22, S24, and OnePlus LLAMA_CHECKPOINT=path/to/consolidated.00.pth LLAMA_PARAMS=path/to/params.json -python -m examples.models.llama.export_llama \ - --model "llama3_2" \ - --checkpoint "${LLAMA_CHECKPOINT:?}" \ - --params "${LLAMA_PARAMS:?}" \ - -kv \ - --use_sdpa_with_kv_cache \ - -d bf16 \ - --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \ - --output_name="llama3_2.pte" +python -m extension.llm.export.export_llm \ + base.model_class="llama3_2" \ + base.checkpoint="${LLAMA_CHECKPOINT:?}" \ + base.params="${LLAMA_PARAMS:?}" \ + model.use_kv_cache=True \ + model.use_sdpa_with_kv_cache=True \ + model.dtype_override="bf16" \ + base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' \ + export.output_name="llama3_2.pte" ``` For convenience, an [exported ExecuTorch bf16 model](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/llama3_2-1B.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/ExportRecipe_1B.ipynb). @@ -189,23 +189,23 @@ For convenience, an [exported ExecuTorch bf16 model](https://huggingface.co/exec LLAMA_QUANTIZED_CHECKPOINT=path/to/spinquant/consolidated.00.pth.pth LLAMA_PARAMS=path/to/spinquant/params.json -python -m examples.models.llama.export_llama \ - --model "llama3_2" \ - --checkpoint "${LLAMA_QUANTIZED_CHECKPOINT:?}" \ - --params "${LLAMA_PARAMS:?}" \ - --use_sdpa_with_kv_cache \ - -X \ - --xnnpack-extended-ops \ - --preq_mode 8da4w_output_8da8w \ - --preq_group_size 32 \ - --max_seq_length 2048 \ - --max_context_length 2048 \ - --output_name "llama3_2.pte" \ - -kv \ - -d fp32 \ - --preq_embedding_quantize 8,0 \ - --use_spin_quant native \ - --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' +python -m extension.llm.export.export_llm \ + base.model_class="llama3_2" \ + base.checkpoint="${LLAMA_QUANTIZED_CHECKPOINT:?}" \ + base.params="${LLAMA_PARAMS:?}" \ + model.use_sdpa_with_kv_cache=True \ + backend.xnnpack.enabled=True \ + backend.xnnpack.extended_ops=True \ + base.preq_mode="8da4w_output_8da8w" \ + base.preq_group_size=32 \ + export.max_seq_length=2048 \ + export.max_context_length=2048 \ + export.output_name="llama3_2.pte" \ + model.use_kv_cache=True \ + model.dtype_override="fp32" \ + base.preq_embedding_quantize=\'8,0\' \ + quantization.use_spin_quant="native" \ + base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' ``` For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_SpinQuant_INT4_EO8.ipynb). 
@@ -218,24 +218,24 @@ For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co LLAMA_QUANTIZED_CHECKPOINT=path/to/qlora/consolidated.00.pth.pth LLAMA_PARAMS=path/to/qlora/params.json -python -m examples.models.llama.export_llama \ - --model "llama3_2" \ - --checkpoint "${LLAMA_QUANTIZED_CHECKPOINT:?}" \ - --params "${LLAMA_PARAMS:?}" \ - -qat \ - -lora 16 \ - --preq_mode 8da4w_output_8da8w \ - --preq_group_size 32 \ - --preq_embedding_quantize 8,0 \ - --use_sdpa_with_kv_cache \ - -kv \ - -X \ - --xnnpack-extended-ops \ - -d fp32 \ - --max_seq_length 2048 \ - --max_context_length 2048 \ - --output_name "llama3_2.pte" \ - --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' +python -m extension.llm.export.export_llm \ + base.model_class="llama3_2" \ + base.checkpoint="${LLAMA_QUANTIZED_CHECKPOINT:?}" \ + base.params="${LLAMA_PARAMS:?}" \ + quantization.use_qat=True \ + base.use_lora=16 \ + base.preq_mode="8da4w_output_8da8w" \ + base.preq_group_size=32 \ + base.preq_embedding_quantize=\'8,0\' \ + model.use_sdpa_with_kv_cache=True \ + model.use_kv_cache=True \ + backend.xnnpack.enabled=True \ + backend.xnnpack.extended_ops=True \ + model.dtype_override="fp32" \ + export.max_seq_length=2048 \ + export.max_context_length=2048 \ + export.output_name="llama3_2.pte" \ + base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' ``` For convenience, an [exported ExecuTorch QAT+LoRA model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-QLORA_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_QLORA_INT4_EO8.ipynb). @@ -247,20 +247,20 @@ You can export and run the original Llama 3 8B instruct model. 2. Export model and generate `.pte` file ``` - python -m examples.models.llama.export_llama \ - --checkpoint \ - -p \ - -kv \ - --use_sdpa_with_kv_cache \ - -X \ - -qmode 8da4w \ - --group_size 128 \ - -d fp32 \ - --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \ - --embedding-quantize 4,32 \ - --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" + python -m extension.llm.export.export_llm \ + base.checkpoint= \ + base.params= \ + model.use_kv_cache=True \ + model.use_sdpa_with_kv_cache=True \ + backend.xnnpack.enabled=True \ + quantization.qmode="8da4w" \ + quantization.group_size=128 \ + model.dtype_override="fp32" \ + base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' \ + quantization.embedding_quantize=\'4,32\' \ + export.output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" ``` - Due to the larger vocabulary size of Llama 3, we recommend quantizing the embeddings with `--embedding-quantize 4,32` as shown above to further reduce the model size. + Due to the larger vocabulary size of Llama 3, we recommend quantizing the embeddings with `quantization.embedding_quantize=\'4,32\'` as shown above to further reduce the model size. 
If you're interested in deploying on non-CPU backends, [please refer the non-cpu-backend section](non_cpu_backends.md) @@ -389,22 +389,22 @@ QLINEAR_GROUP_SIZE=128 # Must be multiple of 16 QEMBEDDING_BITWIDTH=4 # Can be 1-8 QEMBEDDING_GROUP_SIZE=32 # Must be multiple of 16 -python -m examples.models.llama.export_llama \ - --model "llama3_2" \ - --checkpoint "${LLAMA_CHECKPOINT:?}" \ - --params "${LLAMA_PARAMS:?}" \ - -kv \ - --use_sdpa_with_kv_cache \ - --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \ - --output_name="llama3_2.pte" \ - -qmode "torchao:8da${QLINEAR_BITWIDTH}w" \ - --group_size ${QLINEAR_GROUP_SIZE} \ - -E "torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \ - -d fp32 +python -m extension.llm.export.export_llm \ + base.model_class="llama3_2" \ + base.checkpoint="${LLAMA_CHECKPOINT:?}" \ + base.params="${LLAMA_PARAMS:?}" \ + model.use_kv_cache=True \ + model.use_sdpa_with_kv_cache=True \ + base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' \ + export.output_name="llama3_2.pte" \ + quantization.qmode="torchao:8da${QLINEAR_BITWIDTH}w" \ + quantization.group_size=${QLINEAR_GROUP_SIZE} \ + quantization.embedding_quantize=\'torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}\' \ + model.dtype_override="fp32" ``` A few notes: -- If your model shares embedding/unembedding weights (like Llama1B and Llama3B do), you can add `--use_shared_embedding` to take advantage of this and reduce memory. When this option is enabled, you can specify whether embeddings are quantized asymmetrically or not by specifying a third argument. For example, `-E "torchao:4,32,true"` means that the embedding is quantized to 4-bits with group_size=32 and is asymmetric (this is the default behavior if you simply use `-E "torchao:4,32"`), whereas `-E "torchao:4,32,false"` means that the embedding is quantized to 4-bits with group_size=32 and is symmetric. If `--use_shared_embedding` is specified, the unembedding (i.e., the final linear layer) is quantized in the same way, but also uses 8-bit dynamically quantized activations. +- If your model shares embedding/unembedding weights (like Llama1B and Llama3B do), you can add `model.use_shared_embedding=True` to take advantage of this and reduce memory. When this option is enabled, you can specify whether embeddings are quantized asymmetrically or not by specifying a third argument. For example, `quantization.embedding_quantize="torchao:4,32,true"` means that the embedding is quantized to 4-bits with group_size=32 and is asymmetric (this is the default behavior if you simply use `quantization.embedding_quantize="torchao:4,32"`), whereas `quantization.embedding_quantize="torchao:4,32,false"` means that the embedding is quantized to 4-bits with group_size=32 and is symmetric. If `model.use_shared_embedding=True` is specified, the unembedding (i.e., the final linear layer) is quantized in the same way, but also uses 8-bit dynamically quantized activations. - To do channelwise quantization, specify group_size to 0. This works for both linear and embedding layers. Once the model is exported, we need to build ExecuTorch and the runner with the low-bit kernels. diff --git a/examples/models/llama/UTILS.md b/examples/models/llama/UTILS.md index 5f760ad7670..25bd7f77080 100644 --- a/examples/models/llama/UTILS.md +++ b/examples/models/llama/UTILS.md @@ -19,7 +19,7 @@ From `executorch` root: ``` 3. Export model and generate `.pte` file. 
``` - python -m examples.models.llama.export_llama -c stories110M.pt -p params.json -X -kv + python -m extension.llm.export.export_llm base.checkpoint=stories110M.pt base.params=params.json backend.xnnpack.enabled=True model.use_kv_cache=True ``` ## Smaller model delegated to other backends @@ -27,15 +27,15 @@ From `executorch` root: Currently we supported lowering the stories model to other backends, including, CoreML, MPS and QNN. Please refer to the instruction for each backend ([CoreML](https://pytorch.org/executorch/main/backends-coreml), [MPS](https://pytorch.org/executorch/main/backends-mps), [QNN](https://pytorch.org/executorch/main/backends-qualcomm)) before trying to lower them. After the backend library is installed, the script to export a lowered model is -- Lower to CoreML: `python -m examples.models.llama.export_llama -kv --disable_dynamic_shape --coreml -c stories110M.pt -p params.json ` -- MPS: `python -m examples.models.llama.export_llama -kv --disable_dynamic_shape --mps -c stories110M.pt -p params.json ` -- QNN: `python -m examples.models.llama.export_llama -kv --disable_dynamic_shape --qnn -c stories110M.pt -p params.json ` +- Lower to CoreML: `python -m extension.llm.export.export_llm model.use_kv_cache=True model.enable_dynamic_shape=False backend.coreml.enabled=True base.checkpoint=stories110M.pt base.params=params.json` +- MPS: `python -m extension.llm.export.export_llm model.use_kv_cache=True model.enable_dynamic_shape=False backend.mps.enabled=True base.checkpoint=stories110M.pt base.params=params.json` +- QNN: `python -m extension.llm.export.export_llm model.use_kv_cache=True model.enable_dynamic_shape=False backend.qnn.enabled=True base.checkpoint=stories110M.pt base.params=params.json` The iOS LLAMA app supports the CoreML and MPS model and the Android LLAMA app supports the QNN model. On Android, it also allow to cross compiler the llama runner binary, push to the device and run. For CoreML, there are 2 additional optional arguments: -* `--coreml-ios`: Specify the minimum iOS version to deploy (and turn on available optimizations). E.g. `--coreml-ios 18` will turn on [in-place KV cache](https://developer.apple.com/documentation/coreml/mlstate?language=objc) and [fused scaled dot product attention kernel](https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS18.transformers.scaled_dot_product_attention) (the resulting model will then need at least iOS 18 to run, though) -* `--coreml-quantize`: Use [quantization tailored for CoreML](https://apple.github.io/coremltools/docs-guides/source/opt-quantization-overview.html). E.g. `--coreml-quantize b4w` will perform per-block 4-bit weight-only quantization in a way tailored for CoreML +* `backend.coreml.ios`: Specify the minimum iOS version to deploy (and turn on available optimizations). E.g. `backend.coreml.ios=18` will turn on [in-place KV cache](https://developer.apple.com/documentation/coreml/mlstate?language=objc) and [fused scaled dot product attention kernel](https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS18.transformers.scaled_dot_product_attention) (the resulting model will then need at least iOS 18 to run, though) +* `backend.coreml.quantize`: Use [quantization tailored for CoreML](https://apple.github.io/coremltools/docs-guides/source/opt-quantization-overview.html). E.g. 
`backend.coreml.quantize="b4w"` will perform per-block 4-bit weight-only quantization in a way tailored for CoreML To deploy the large 8B model on the above backends, [please visit this section](non_cpu_backends.md). diff --git a/examples/models/llama/config/llm_config.py b/examples/models/llama/config/llm_config.py index 201e3a5414a..9acd633fb21 100644 --- a/examples/models/llama/config/llm_config.py +++ b/examples/models/llama/config/llm_config.py @@ -26,19 +26,19 @@ class ModelType(str, Enum): - STORIES110M = "stories110m" - LLAMA2 = "llama2" - LLAMA3 = "llama3" - LLAMA3_1 = "llama3_1" - LLAMA3_2 = "llama3_2" - LLAMA3_2_VISION = "llama3_2_vision" - STATIC_LLAMA = "static_llama" - QWEN2_5 = "qwen2_5" - QWEN3_0_6B = "qwen3-0_6b" - QWEN3_1_7B = "qwen3-1_7b" - QWEN3_4B = "qwen3-4b" - PHI_4_MINI = "phi_4_mini" - SMOLLM2 = "smollm2" + stories110m = "stories110m" + llama2 = "llama2" + llama3 = "llama3" + llama3_1 = "llama3_1" + llama3_2 = "llama3_2" + llama3_2_vision = "llama3_2_vision" + static_llama = "static_llama" + qwen2_5 = "qwen2_5" + qwen3_0_6b = "qwen3-0_6b" + qwen3_1_7b = "qwen3-1_7b" + qwen3_4b = "qwen3-4b" + phi_4_mini = "phi_4_mini" + smollm2 = "smollm2" class PreqMode(str, Enum): @@ -49,8 +49,8 @@ class PreqMode(str, Enum): are still around to preserve backward compatibility. """ - PREQ_8DA4W = "8da4w" - PREQ_8DA4W_OUT_8DA8W = "8da4w_output_8da8w" + preq_8da4w = "8da4w" + preq_8da4w_out_8da8w = "8da4w_output_8da8w" @dataclass @@ -82,7 +82,7 @@ class BaseConfig: are loaded. """ - model_class: ModelType = ModelType.LLAMA3 + model_class: ModelType = ModelType.llama3 params: Optional[str] = None checkpoint: Optional[str] = None checkpoint_dir: Optional[str] = None @@ -107,9 +107,9 @@ class DtypeOverride(str, Enum): is not recommended. """ - FP32 = "fp32" - FP16 = "fp16" - BF16 = "bf16" + fp32 = "fp32" + fp16 = "fp16" + bf16 = "bf16" @dataclass @@ -147,7 +147,7 @@ class ModelConfig: [16] pattern specifies all layers have a sliding window of 16. """ - dtype_override: DtypeOverride = DtypeOverride.FP32 + dtype_override: DtypeOverride = DtypeOverride.fp32 enable_dynamic_shape: bool = True use_shared_embedding: bool = False use_sdpa_with_kv_cache: bool = False @@ -270,22 +270,22 @@ class Pt2eQuantize(str, Enum): and is source transform-based. 
""" - XNNPACK_DYNAMIC = "xnnpack_dynamic" - XNNPACK_DYNAMIC_QC4 = "xnnpack_dynamic_qc4" - QNN_8A8W = "qnn_8a8w" - QNN_16A16W = "qnn_16a16w" - QNN_16A4W = "qnn_16a4w" - COREML_C4W = "coreml_c4w" - COREML_8A_C8W = "coreml_8a_c8w" - COREML_8A_C4W = "coreml_8a_c4w" - COREML_BASELINE_8A_C8W = "coreml_baseline_8a_c8w" - COREML_BASELINE_8A_C4W = "coreml_baseline_8a_c4w" - VULKAN_8W = "vulkan_8w" + xnnpack_dynamic = "xnnpack_dynamic" + xnnpack_dynamic_qc4 = "xnnpack_dynamic_qc4" + qnn_8a8w = "qnn_8a8w" + qnn_16a16w = "qnn_16a16w" + qnn_16a4w = "qnn_16a4w" + coreml_c4w = "coreml_c4w" + coreml_8a_c8w = "coreml_8a_c8w" + coreml_8a_c4w = "coreml_8a_c4w" + coreml_baseline_8a_c8w = "coreml_baseline_8a_c8w" + coreml_baseline_8a_c4w = "coreml_baseline_8a_c4w" + vulkan_8w = "vulkan_8w" class SpinQuant(str, Enum): - CUDA = "cuda" - NATIVE = "native" + cuda = "cuda" + native = "native" @dataclass @@ -378,15 +378,15 @@ class XNNPackConfig: class CoreMLQuantize(str, Enum): - B4W = "b4w" - C4W = "c4w" + b4w = "b4w" + c4w = "c4w" class CoreMLComputeUnit(str, Enum): - CPU_ONLY = "cpu_only" - CPU_AND_GPU = "cpu_and_gpu" - CPU_AND_NE = "cpu_and_ne" - ALL = "all" + cpu_only = "cpu_only" + cpu_and_gpu = "cpu_and_gpu" + cpu_and_ne = "cpu_and_ne" + all = "all" @dataclass @@ -400,7 +400,7 @@ class CoreMLConfig: preserve_sdpa: bool = False quantize: Optional[CoreMLQuantize] = None ios: int = 15 - compute_units: CoreMLComputeUnit = CoreMLComputeUnit.CPU_ONLY + compute_units: CoreMLComputeUnit = CoreMLComputeUnit.cpu_only def __post_init__(self): if self.ios not in (15, 16, 17, 18): diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 88b79d30eb2..334f3ace712 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -590,7 +590,7 @@ def export_llama( # If a checkpoint isn't provided for an HF OSS model, download and convert the # weights first. - model_name = llm_config.base.model_class + model_name = llm_config.base.model_class.value if not llm_config.base.checkpoint and model_name in HUGGING_FACE_REPO_IDS: repo_id = HUGGING_FACE_REPO_IDS[model_name] if model_name == "qwen2_5": @@ -668,7 +668,7 @@ def _prepare_for_llama_export(llm_config: LlmConfig) -> LLMEdgeManager: llm_config.export.output_dir = output_dir_path # Convert dtype override string to actual type. 
- dtype_override = DType[llm_config.model.dtype_override] + dtype_override = DType[llm_config.model.dtype_override.value] edge_manager = _load_llama_model(llm_config) @@ -702,7 +702,11 @@ def _prepare_for_llama_export(llm_config: LlmConfig) -> LLMEdgeManager: checkpoint=llm_config.base.checkpoint, checkpoint_dtype=DType.from_torch_dtype(checkpoint_dtype), # type: ignore tokenizer_path=llm_config.base.tokenizer_path, - use_spin_quant=llm_config.quantization.use_spin_quant, + use_spin_quant=( + llm_config.quantization.use_spin_quant.value + if llm_config.quantization.use_spin_quant + else None + ), embedding_quantize=llm_config.quantization.embedding_quantize, use_shared_embedding=llm_config.model.use_shared_embedding, quantization_mode=llm_config.quantization.qmode, @@ -726,7 +730,9 @@ def _prepare_for_llama_export(llm_config: LlmConfig) -> LLMEdgeManager: vulkan=llm_config.backend.vulkan.enabled, use_qat=llm_config.quantization.use_qat, use_lora=llm_config.base.use_lora, - preq_mode=llm_config.base.preq_mode, + preq_mode=( + llm_config.base.preq_mode.value if llm_config.base.preq_mode else None + ), preq_group_size=llm_config.base.preq_group_size, preq_embedding_quantize=llm_config.base.preq_embedding_quantize, local_global_attention=llm_config.model.local_global_attention, @@ -738,25 +744,34 @@ def _prepare_for_llama_export(llm_config: LlmConfig) -> LLMEdgeManager: def get_quantizer_and_quant_params(llm_config): pt2e_quant_params = get_pt2e_quantization_params( - llm_config.quantization.pt2e_quantize, llm_config.quantization.qmode + ( + llm_config.quantization.pt2e_quantize.value + if llm_config.quantization.pt2e_quantize + else None + ), + llm_config.quantization.qmode, ) quantizers = get_pt2e_quantizers(pt2e_quant_params, llm_config.export.so_library) quant_dtype = None if llm_config.backend.qnn.enabled and llm_config.quantization.pt2e_quantize: assert len(quantizers) == 0, "Should not enable both xnnpack and qnn" qnn_quantizer, quant_dtype = get_qnn_quantizer( - llm_config.quantization.pt2e_quantize, llm_config.quantization.qmode + llm_config.quantization.pt2e_quantize.value, llm_config.quantization.qmode ) quantizers.append(qnn_quantizer) if llm_config.backend.coreml.enabled and llm_config.quantization.pt2e_quantize: assert len(quantizers) == 0, "Should not enable both xnnpack / qnn and coreml" - coreml_quantizer = get_coreml_quantizer(llm_config.quantization.pt2e_quantize) + coreml_quantizer = get_coreml_quantizer( + llm_config.quantization.pt2e_quantize.value + ) quantizers.append(coreml_quantizer) if llm_config.backend.vulkan.enabled and llm_config.quantization.pt2e_quantize: assert ( len(quantizers) == 0 ), "Should not enable both vulkan and other quantizers" - vulkan_quantizer = get_vulkan_quantizer(llm_config.quantization.pt2e_quantize) + vulkan_quantizer = get_vulkan_quantizer( + llm_config.quantization.pt2e_quantize.value + ) quantizers.append(vulkan_quantizer) logging.info(f"Applying quantizers: {quantizers}") return pt2e_quant_params, quantizers, quant_dtype @@ -1035,7 +1050,7 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901 ) additional_passes = [] - if llm_config.base.model_class in TORCHTUNE_DEFINED_MODELS: + if llm_config.base.model_class.value in TORCHTUNE_DEFINED_MODELS: additional_passes = [InitializedMutableBufferPass(["kv_cache_pos"])] # export_to_edge @@ -1074,14 +1089,22 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901 mps=llm_config.backend.mps.enabled, coreml=llm_config.backend.coreml.enabled, 
qnn=llm_config.backend.qnn.enabled, - dtype_override=llm_config.model.dtype_override, + dtype_override=llm_config.model.dtype_override.value, enable_dynamic_shape=llm_config.model.enable_dynamic_shape, use_kv_cache=llm_config.model.use_kv_cache, embedding_quantize=llm_config.quantization.embedding_quantize, - pt2e_quantize=llm_config.quantization.pt2e_quantize, + pt2e_quantize=( + llm_config.quantization.pt2e_quantize.value + if llm_config.quantization.pt2e_quantize + else None + ), coreml_ios=llm_config.backend.coreml.ios, - coreml_quantize=llm_config.backend.coreml.quantize, - coreml_compute_units=llm_config.backend.coreml.compute_units, + coreml_quantize=( + llm_config.backend.coreml.quantize.value + if llm_config.backend.coreml.quantize + else None + ), + coreml_compute_units=llm_config.backend.coreml.compute_units.value, use_qnn_sha=llm_config.backend.qnn.use_sha, num_sharding=llm_config.backend.qnn.num_sharding, soc_model=llm_config.backend.qnn.soc_model, @@ -1154,7 +1177,7 @@ def _load_llama_model(llm_config: LlmConfig) -> "LLMEdgeManager": An instance of LLMEdgeManager which contains the eager mode model. """ - modelname = llm_config.base.model_class + modelname = llm_config.base.model_class.value if modelname in EXECUTORCH_DEFINED_MODELS: module_name = "llama" model_class_name = "Llama2Model" # TODO: Change to "LlamaModel" in examples/models/llama/model.py. @@ -1175,7 +1198,7 @@ def _load_llama_model(llm_config: LlmConfig) -> "LLMEdgeManager": ) ) # Convert dtype override string to actual type. - dtype_override = DType[llm_config.model.dtype_override] + dtype_override = DType[llm_config.model.dtype_override.value] return LLMEdgeManager( model=model, diff --git a/examples/models/llama/model.py b/examples/models/llama/model.py index ec9646be6f4..efea80dde2f 100644 --- a/examples/models/llama/model.py +++ b/examples/models/llama/model.py @@ -157,7 +157,7 @@ def __init__(self, llm_config: Optional[LlmConfig] = None): if model_args.use_scaled_rope: # Older models don't have use_scaled_rope configuration - model_name = str(self.llm_config.base.model_class) + model_name = self.llm_config.base.model_class.value assert model_name not in ["llama2", "stories110m"] # Llama3_2 and newer models in ExecuTorch repo should set larger scale factor @@ -328,10 +328,10 @@ def get_example_inputs_kvcache_sdpa(self): def _transform_for_pre_quantization(self, checkpoint, model_args): assert self.llm_config.base.preq_mode, "preq_mode must be specified" - assert self.llm_config.base.preq_mode in [ + assert self.llm_config.base.preq_mode.value in [ "8da4w", "8da4w_output_8da8w", - ], f"Quantization mode {self.llm_config.base.preq_mode} is not compatible with SpinQuant." + ], f"Quantization mode {self.llm_config.base.preq_mode.value} is not compatible with SpinQuant." assert self.llm_config.base.preq_group_size, "preq_group_size must be specified" assert self.llm_config.model.dtype_override, "dtype_override must be specified" @@ -351,7 +351,7 @@ def _transform_for_pre_quantization(self, checkpoint, model_args): } # Transform the output layer first if needed. 
- if self.llm_config.base.preq_mode == "8da4w_output_8da8w": + if self.llm_config.base.preq_mode.value == "8da4w_output_8da8w": from .source_transformation.pre_quantization import ( transform_output_linear_for_pre_quantization, ) @@ -359,14 +359,14 @@ def _transform_for_pre_quantization(self, checkpoint, model_args): self.model_ = transform_output_linear_for_pre_quantization( module=self.model_, checkpoint=checkpoint, - dtype=mapping[self.llm_config.model.dtype_override], + dtype=mapping[self.llm_config.model.dtype_override.value], ) self.model_ = transform_linear_for_pre_quantization( self.model_, checkpoint, self.llm_config.base.preq_group_size, - mapping[self.llm_config.model.dtype_override], + mapping[self.llm_config.model.dtype_override.value], ) embedding_bit_width, embedding_group_size = None, None @@ -390,7 +390,7 @@ def _transform_for_pre_quantization(self, checkpoint, model_args): self.model_ = transform_embedding_for_pre_quantization( self.model_, checkpoint, - mapping[self.llm_config.model.dtype_override], + mapping[self.llm_config.model.dtype_override.value], int(embedding_bit_width), embedding_group_size, ) diff --git a/examples/models/llama2/README.md b/examples/models/llama2/README.md index 615ad3948fc..21f761b7f71 100644 --- a/examples/models/llama2/README.md +++ b/examples/models/llama2/README.md @@ -37,7 +37,7 @@ You can export and run the original Llama 2 7B model. 3. Export model and generate `.pte` file: ``` - python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 + python -m extension.llm.export.export_llm base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True quantization.qmode="8da4w" quantization.group_size=128 model.dtype_override="fp32" ``` 4. Create tokenizer.bin. 
``` diff --git a/examples/models/phi_4_mini/README.md b/examples/models/phi_4_mini/README.md index a23e4f49638..d168d54226e 100644 --- a/examples/models/phi_4_mini/README.md +++ b/examples/models/phi_4_mini/README.md @@ -7,9 +7,9 @@ Phi-4-mini uses the same example code as Llama, while the checkpoint, model para All commands for exporting and running Llama on various backends should also be applicable to Phi-4-mini, by swapping the following args: ``` ---model phi_4_mini ---params examples/models/phi-4-mini/config.json ---checkpoint +base.model_class="phi_4_mini" +base.params="examples/models/phi-4-mini/config.json" +base.checkpoint= ``` ### Generate the Checkpoint @@ -32,17 +32,17 @@ Export to XNNPack, no quantization: # Set these paths to point to the downloaded files PHI_CHECKPOINT=path/to/checkpoint.pth -python -m examples.models.llama.export_llama \ - --model phi_4_mini \ - --checkpoint "${PHI_CHECKPOINT=path/to/checkpoint.pth:?}" \ - --params examples/models/phi-4-mini/config.json \ - -kv \ - --use_sdpa_with_kv_cache \ - -d fp32 \ - -X \ - --metadata '{"get_bos_id":151643, "get_eos_ids":[151643]}' \ - --output_name="phi-4-mini.pte" - --verbose +python -m extension.llm.export.export_llm \ + base.model_class="phi_4_mini" \ + base.checkpoint="${PHI_CHECKPOINT=path/to/checkpoint.pth:?}" \ + base.params="examples/models/phi-4-mini/config.json" \ + model.use_kv_cache=True \ + model.use_sdpa_with_kv_cache=True \ + model.dtype_override="fp32" \ + backend.xnnpack.enabled=True \ + base.metadata='"{\"get_bos_id\":151643, \"get_eos_ids\":[151643]}"' \ + export.output_name="phi-4-mini.pte" \ + debug.verbose=True ``` Run using the executor runner: diff --git a/examples/models/qwen2_5/README.md b/examples/models/qwen2_5/README.md index 9bf791a35ed..57784169ece 100644 --- a/examples/models/qwen2_5/README.md +++ b/examples/models/qwen2_5/README.md @@ -7,9 +7,9 @@ Qwen 2.5 uses the same example code as Llama, while the checkpoint, model params All commands for exporting and running Llama on various backends should also be applicable to Qwen 2.5, by swapping the following args: ``` ---model qwen2_5 ---params examples/models/qwen2_5/1_5b_config.json ---checkpoint +base.model_class="qwen2_5" +base.params="examples/models/qwen2_5/1_5b_config.json" +base.checkpoint= ``` ### Generate the Checkpoint @@ -32,17 +32,17 @@ Export to XNNPack, no quantization: # Set these paths to point to the downloaded files QWEN_CHECKPOINT=path/to/checkpoint.pth -python -m examples.models.llama.export_llama \ - --model "qwen2_5" \ - --checkpoint "${QWEN_CHECKPOINT:?}" \ - --params examples/models/qwen2_5/1_5b_config.json \ - -kv \ - --use_sdpa_with_kv_cache \ - -d fp32 \ - -X \ - --metadata '{"get_bos_id":151643, "get_eos_ids":[151643]}' \ - --output_name="qwen2_5-1_5b.pte" - --verbose +python -m extension.llm.export.export_llm \ + base.model_class="qwen2_5" \ + base.checkpoint="${QWEN_CHECKPOINT:?}" \ + base.params="examples/models/qwen2_5/1_5b_config.json" \ + model.use_kv_cache=True \ + model.use_sdpa_with_kv_cache=True \ + model.dtype_override="fp32" \ + backend.xnnpack.enabled=True \ + base.metadata='"{\"get_bos_id\":151643, \"get_eos_ids\":[151643]}"' \ + export.output_name="qwen2_5-1_5b.pte" \ + debug.verbose=True ``` Run using the executor runner: diff --git a/examples/models/qwen3/README.md b/examples/models/qwen3/README.md index a589d27c19d..d31d491adf2 100644 --- a/examples/models/qwen3/README.md +++ b/examples/models/qwen3/README.md @@ -7,8 +7,8 @@ Qwen 3 uses the same example code as our optimized Llama model, 
while the checkp All commands for exporting and running Llama on various backends should also be applicable to Qwen 3, by swapping the following args: ``` ---model [qwen3-0.6b,qwen3-1_7b,qwen3-4b] ---params [examples/models/qwen3/0_6b_config.json,examples/models/qwen3/1_7b_config.json,examples/models/qwen3/4b_config.json] +base.model_class=[qwen3-0_6b,qwen3-1_7b,qwen3-4b] +base.params=[examples/models/qwen3/0_6b_config.json,examples/models/qwen3/1_7b_config.json,examples/models/qwen3/4b_config.json] ``` ### Example export @@ -16,50 +16,50 @@ Here is a basic example for exporting Qwen 3, although please refer to the Llama Export 0.6b to XNNPack, quantized with 8da4w: ``` -python -m examples.models.llama.export_llama \ - --model qwen3-0_6b \ - --params examples/models/qwen3/0_6b_config.json \ - -kv \ - --use_sdpa_with_kv_cache \ - -d fp32 \ - -X \ - --xnnpack-extended-ops \ - -qmode 8da4w \ - --metadata '{"get_bos_id": 151644, "get_eos_ids":[151645]}' \ - --output_name="qwen3-0_6b.pte" \ - --verbose +python -m extension.llm.export.export_llm \ + base.model_class="qwen3-0_6b" \ + base.params="examples/models/qwen3/0_6b_config.json" \ + model.use_kv_cache=True \ + model.use_sdpa_with_kv_cache=True \ + model.dtype_override="fp32" \ + backend.xnnpack.enabled=True \ + backend.xnnpack.extended_ops=True \ + quantization.qmode="8da4w" \ + base.metadata='"{\"get_bos_id\": 151644, \"get_eos_ids\":[151645]}"' \ + export.output_name="qwen3-0_6b.pte" \ + debug.verbose=True ``` Export 1.7b to XNNPack, quantized with 8da4w: ``` -python -m examples.models.llama.export_llama \ - --model qwen3-1_7b \ - --params examples/models/qwen3/1_7b_config.json \ - -kv \ - --use_sdpa_with_kv_cache \ - -d fp32 \ - -X \ - --xnnpack-extended-ops \ - -qmode 8da4w \ - --metadata '{"get_bos_id": 151644, "get_eos_ids":[151645]}' \ - --output_name="qwen3-1_7b.pte" \ - --verbose +python -m extension.llm.export.export_llm \ + base.model_class="qwen3-1_7b" \ + base.params="examples/models/qwen3/1_7b_config.json" \ + model.use_kv_cache=True \ + model.use_sdpa_with_kv_cache=True \ + model.dtype_override="fp32" \ + backend.xnnpack.enabled=True \ + backend.xnnpack.extended_ops=True \ + quantization.qmode="8da4w" \ + base.metadata='"{\"get_bos_id\": 151644, \"get_eos_ids\":[151645]}"' \ + export.output_name="qwen3-1_7b.pte" \ + debug.verbose=True ``` Export 4b to XNNPack, quantized with 8da4w: ``` -python -m examples.models.llama.export_llama \ - --model qwen3-4b \ - --params examples/models/qwen3/4b_config.json \ - -kv \ - --use_sdpa_with_kv_cache \ - -d fp32 \ - -X \ - --xnnpack-extended-ops \ - -qmode 8da4w \ - --metadata '{"get_bos_id": 151644, "get_eos_ids":[151645]}' \ - --output_name="qwen3-4b.pte" \ - --verbose +python -m extension.llm.export.export_llm \ + base.model_class="qwen3-4b" \ + base.params="examples/models/qwen3/4b_config.json" \ + model.use_kv_cache=True \ + model.use_sdpa_with_kv_cache=True \ + model.dtype_override="fp32" \ + backend.xnnpack.enabled=True \ + backend.xnnpack.extended_ops=True \ + quantization.qmode="8da4w" \ + base.metadata='"{\"get_bos_id\": 151644, \"get_eos_ids\":[151645]}"' \ + export.output_name="qwen3-4b.pte" \ + debug.verbose=True ``` ### Example run diff --git a/extension/llm/export/test/test_export_llm.py b/extension/llm/export/test/test_export_llm.py index 1f230233867..7d17b7819d3 100644 --- a/extension/llm/export/test/test_export_llm.py +++ b/extension/llm/export/test/test_export_llm.py @@ -51,9 +51,20 @@ def test_with_config(self, mock_export_llama: MagicMock) -> None: f.write( """ base: 
+ model_class: llama2 tokenizer_path: /path/to/tokenizer.json + preq_mode: preq_8da4w +model: + dtype_override: fp16 export: max_seq_length: 256 +quantization: + pt2e_quantize: xnnpack_dynamic + use_spin_quant: cuda +backend: + coreml: + quantize: c4w + compute_units: cpu_and_gpu """ ) config_file = f.name @@ -69,7 +80,22 @@ def test_with_config(self, mock_export_llama: MagicMock) -> None: self.assertEqual( called_config["base"]["tokenizer_path"], "/path/to/tokenizer.json" ) + self.assertEqual(called_config["base"]["model_class"], "llama2") + self.assertEqual(called_config["base"]["preq_mode"].value, "8da4w") + self.assertEqual(called_config["model"]["dtype_override"].value, "fp16") self.assertEqual(called_config["export"]["max_seq_length"], 256) + self.assertEqual( + called_config["quantization"]["pt2e_quantize"].value, "xnnpack_dynamic" + ) + self.assertEqual( + called_config["quantization"]["use_spin_quant"].value, "cuda" + ) + self.assertEqual( + called_config["backend"]["coreml"]["quantize"].value, "c4w" + ) + self.assertEqual( + called_config["backend"]["coreml"]["compute_units"].value, "cpu_and_gpu" + ) finally: os.unlink(config_file)
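
For reference, here is a minimal sketch of the pattern the Python changes above depend on: the config dataclasses now use lowercase, `str`-subclassed enum members whose *names* match the user-facing strings (e.g. `DtypeOverride.fp16`, `ModelType.llama3_2`), Hydra/OmegaConf resolves the YAML or command-line string to the member by name (as the `preq_mode: preq_8da4w` case in the test above shows), and any consumer that needs the plain string reads `.value` from the field. The enum below is copied from the `llm_config.py` hunk and redefined locally so the snippet is self-contained; treat the surrounding usage as illustrative rather than the exact library API.

```python
# Minimal, self-contained sketch of the lowercase-enum / .value pattern used above.
# DtypeOverride mirrors examples/models/llama/config/llm_config.py; the usage is
# illustrative, not a verbatim copy of export_llama_lib.py.
from enum import Enum


class DtypeOverride(str, Enum):
    fp32 = "fp32"
    fp16 = "fp16"
    bf16 = "bf16"


# The config layer resolves the YAML/CLI string "fp16" to the enum member by
# member *name*, which is why the members were renamed to lowercase strings
# that users actually type (model.dtype_override="fp16").
dtype = DtypeOverride["fp16"]
assert dtype is DtypeOverride.fp16

# Consumers that need a plain string (e.g. the DType[...] lookups added in
# export_llama_lib.py) read `.value` instead of passing the enum object along.
assert dtype.value == "fp16"
assert isinstance(dtype, str)  # str-subclassed, so it still compares/prints as a string
```

This is also why the README commands above pass plain strings such as `model.dtype_override="fp32"` or `quantization.use_spin_quant="native"`: the string on the command line must match an enum member name, and the `.value` conversions keep the downstream code working with ordinary strings.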