diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh index 9f183528719..f92a983a340 100644 --- a/.ci/scripts/test_llama.sh +++ b/.ci/scripts/test_llama.sh @@ -54,10 +54,7 @@ PT2E_QUANTIZE="${PT2E_QUANTIZE:-}" # Default CMake Build Type to release mode CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} -if [[ $# -lt 4 ]]; then # Assuming 4 mandatory args - echo "Expecting atleast 4 positional arguments" - echo "Usage: [...]" -fi +# Argument validation is done individually below for each required parameter if [[ -z "${MODEL_NAME:-}" ]]; then echo "Missing model name, exiting..." exit 1 @@ -224,34 +221,34 @@ fi # Export model. EXPORTED_MODEL_NAME="${EXPORTED_MODEL_NAME}.pte" echo "Exporting ${EXPORTED_MODEL_NAME}" -EXPORT_ARGS="-c ${CHECKPOINT_FILE_NAME} -p ${PARAMS} -d ${DTYPE} -n ${EXPORTED_MODEL_NAME} -kv" +EXPORT_ARGS="base.checkpoint=${CHECKPOINT_FILE_NAME} base.params=${PARAMS} model.dtype_override=${DTYPE} export.output_name=${EXPORTED_MODEL_NAME} model.use_kv_cache=true" if [[ "${XNNPACK}" == "ON" ]]; then - EXPORT_ARGS="${EXPORT_ARGS} -X --xnnpack-extended-ops -qmode 8da4w -G 128" + EXPORT_ARGS="${EXPORT_ARGS} backend.xnnpack.enabled=true backend.xnnpack.extended_ops=true quantization.qmode=8da4w quantization.group_size=128" fi if [[ "${CUSTOM}" == "ON" ]]; then - EXPORT_ARGS="${EXPORT_ARGS} --use_sdpa_with_kv_cache" + EXPORT_ARGS="${EXPORT_ARGS} model.use_sdpa_with_kv_cache=true" fi if [[ "${QE}" == "ON" ]]; then - EXPORT_ARGS="${EXPORT_ARGS} --embedding-quantize 8,1024" + EXPORT_ARGS="${EXPORT_ARGS} quantization.embedding_quantize=\"8,1024\"" fi if [[ "${MPS}" == "ON" ]]; then - EXPORT_ARGS="${EXPORT_ARGS} -kv -v --mps --disable_dynamic_shape" + EXPORT_ARGS="${EXPORT_ARGS} backend.mps.enabled=true model.enable_dynamic_shape=false debug.verbose=true" fi if [[ "${COREML}" == "ON" ]]; then - EXPORT_ARGS="${EXPORT_ARGS} -kv -v --coreml --disable_dynamic_shape" + EXPORT_ARGS="${EXPORT_ARGS} backend.coreml.enabled=true model.enable_dynamic_shape=false debug.verbose=true" fi if [[ "${QNN}" == "ON" ]]; then - EXPORT_ARGS="${EXPORT_ARGS} -kv -v --qnn --disable_dynamic_shape" + EXPORT_ARGS="${EXPORT_ARGS} backend.qnn.enabled=true model.enable_dynamic_shape=false debug.verbose=true" echo "PT2E_QUANTIZE is ${PT2E_QUANTIZE}" if [[ "${PT2E_QUANTIZE}" == "qnn_16a16w" ]]; then - EXPORT_ARGS+=" --tokenizer_path tokenizer.model --pt2e_quantize qnn_16a16w --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --calibration_data Once " + EXPORT_ARGS+=" base.tokenizer_path=tokenizer.model quantization.pt2e_quantize=qnn_16a16w quantization.calibration_tasks=[\"wikitext\"] quantization.calibration_limit=1 quantization.calibration_seq_length=128 quantization.calibration_data=\"Once\"" fi fi if [[ "${QUANTIZE_KV_CACHE}" == "ON" ]]; then - EXPORT_ARGS="${EXPORT_ARGS} --quantize_kv_cache" + EXPORT_ARGS="${EXPORT_ARGS} model.quantize_kv_cache=true" fi # Add dynamically linked library location -$PYTHON_EXECUTABLE -m examples.models.llama.export_llama ${EXPORT_ARGS} +$PYTHON_EXECUTABLE -m extension.llm.export.export_llm ${EXPORT_ARGS} # Create tokenizer.bin. 
echo "Creating tokenizer.bin" diff --git a/.ci/scripts/test_llama_torchao_lowbit.sh b/.ci/scripts/test_llama_torchao_lowbit.sh index ac603cc5e83..21989d26770 100644 --- a/.ci/scripts/test_llama_torchao_lowbit.sh +++ b/.ci/scripts/test_llama_torchao_lowbit.sh @@ -70,16 +70,16 @@ QLINEAR_GROUP_SIZE=128 # Must be multiple of 16 QEMBEDDING_BITWIDTH=4 # Can be 1-8 QEMBEDDING_GROUP_SIZE=32 # Must be multiple of 16 -${PYTHON_EXECUTABLE} -m examples.models.llama.export_llama \ - --checkpoint "${LLAMA_CHECKPOINT:?}" \ - --params "${LLAMA_PARAMS:?}" \ - -kv \ - --use_sdpa_with_kv_cache \ - --output_name=${MODEL_OUT} \ - -qmode "torchao:8da${QLINEAR_BITWIDTH}w" \ - --group_size ${QLINEAR_GROUP_SIZE} \ - -E "torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \ - -d fp32 +${PYTHON_EXECUTABLE} -m extension.llm.export.export_llm \ + base.checkpoint="${LLAMA_CHECKPOINT:?}" \ + base.params="${LLAMA_PARAMS:?}" \ + model.use_kv_cache=true \ + model.use_sdpa_with_kv_cache=true \ + export.output_name="${MODEL_OUT}" \ + quantization.qmode="torchao:8da${QLINEAR_BITWIDTH}w" \ + quantization.group_size=${QLINEAR_GROUP_SIZE} \ + quantization.embedding_quantize=\"torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}\" \ + model.dtype_override=fp32 # Test run ./cmake-out/examples/models/llama/llama_main --model_path=$MODEL_OUT --tokenizer_path=$TOKENIZER --prompt="Once upon a time," diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh index 4f8dc7a30e5..bbf879295ae 100755 --- a/.ci/scripts/test_model.sh +++ b/.ci/scripts/test_model.sh @@ -86,8 +86,8 @@ test_model() { if [[ "${MODEL_NAME}" == "llama2" ]]; then # Install requirements for export_llama bash examples/models/llama/install_requirements.sh - # Test export_llama script: python3 -m examples.models.llama.export_llama - "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama --model "${MODEL_NAME}" -c examples/models/llama/params/demo_rand_params.pth -p examples/models/llama/params/demo_config.json + # Test export_llm script: python3 -m extension.llm.export.export_llm + "${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.checkpoint=examples/models/llama/params/demo_rand_params.pth base.params=examples/models/llama/params/demo_config.json run_portable_executor_runner rm "./${MODEL_NAME}.pte" fi @@ -100,17 +100,17 @@ test_model() { if [[ "${MODEL_NAME}" == "qwen2_5" ]]; then # Install requirements for export_llama bash examples/models/llama/install_requirements.sh - # Test export_llama script: python3 -m examples.models.llama.export_llama. + # Test export_llm script: python3 -m extension.llm.export.export_llm. # Use Llama random checkpoint with Qwen 2.5 1.5b model configuration. - "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama --model "${MODEL_NAME}" -p examples/models/qwen2_5/1_5b_config.json + "${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/qwen2_5/1_5b_config.json rm "./${MODEL_NAME}.pte" return # Skip running with portable executor runnner since portable doesn't support Qwen's biased linears. fi if [[ "${MODEL_NAME}" == "phi_4_mini" ]]; then # Install requirements for export_llama bash examples/models/llama/install_requirements.sh - # Test export_llama script: python3 -m examples.models.llama.export_llama. - "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama --model "${MODEL_NAME}" -p examples/models/phi_4_mini/config.json + # Test export_llm script: python3 -m extension.llm.export.export_llm. 
+ "${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/phi_4_mini/config.json run_portable_executor_runner rm "./${MODEL_NAME}.pte" return diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index 1a6d63f1bd1..1bede7c3f27 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -214,23 +214,23 @@ jobs: --files "tokenizer.model" "params.json" "consolidated.00.pth" ) # Export using ExecuTorch's model definition - python -m examples.models.llama.export_llama \ - --model "llama3_2" \ - --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \ - --params "${DOWNLOADED_PATH}/params.json" \ - --use_sdpa_with_kv_cache \ - -X \ - --xnnpack-extended-ops \ - --preq_mode 8da4w_output_8da8w \ - --preq_group_size 32 \ - --max_seq_length 2048 \ - --max_context_length 2048 \ - --output_name "${OUT_ET_MODEL_NAME}.pte" \ - -kv \ - -d fp32 \ - --preq_embedding_quantize 8,0 \ - --use_spin_quant native \ - --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' + python -m extension.llm.export.export_llm \ + base.model_class="llama3_2" \ + base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \ + base.params="${DOWNLOADED_PATH}/params.json" \ + model.use_sdpa_with_kv_cache=true \ + backend.xnnpack.enabled=true \ + backend.xnnpack.extended_ops=true \ + base.preq_mode="8da4w_output_8da8w" \ + base.preq_group_size=32 \ + export.max_seq_length=2048 \ + export.max_context_length=2048 \ + export.output_name="${OUT_ET_MODEL_NAME}.pte" \ + model.use_kv_cache=true \ + model.dtype_override=fp32 \ + base.preq_embedding_quantize=\'8,0\' \ + quantization.use_spin_quant=native \ + base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}' ls -lh "${OUT_ET_MODEL_NAME}.pte" elif [[ ${{ matrix.config }} == "llama3_qlora" ]]; then # QAT + LoRA @@ -241,53 +241,55 @@ jobs: --files "tokenizer.model" "params.json" "consolidated.00.pth" ) # Export using ExecuTorch's model definition - python -m examples.models.llama.export_llama \ - --model "llama3_2" \ - --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \ - --params "${DOWNLOADED_PATH}/params.json" \ - -qat \ - -lora 16 \ - --preq_mode 8da4w_output_8da8w \ - --preq_group_size 32 \ - --preq_embedding_quantize 8,0 \ - --use_sdpa_with_kv_cache \ - -kv \ - -X \ - --xnnpack-extended-ops \ - -d fp32 \ - --max_seq_length 2048 \ - --max_context_length 2048 \ - --output_name "${OUT_ET_MODEL_NAME}.pte" \ - --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' + python -m extension.llm.export.export_llm \ + base.model_class="llama3_2" \ + base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \ + base.params="${DOWNLOADED_PATH}/params.json" \ + quantization.use_qat=true \ + base.use_lora=16 \ + base.preq_mode="8da4w_output_8da8w" \ + base.preq_group_size=32 \ + base.preq_embedding_quantize=\'8,0\' \ + model.use_sdpa_with_kv_cache=true \ + model.use_kv_cache=true \ + backend.xnnpack.enabled=true \ + backend.xnnpack.extended_ops=true \ + model.dtype_override=fp32 \ + export.max_seq_length=2048 \ + export.max_context_length=2048 \ + export.output_name="${OUT_ET_MODEL_NAME}.pte" \ + base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}' ls -lh "${OUT_ET_MODEL_NAME}.pte" elif [[ ${{ matrix.config }} == "llama3_fb16" ]]; then # Original BF16 version, without any quantization DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" 
"params.json" "consolidated.00.pth") - python -m examples.models.llama.export_llama \ - --model "llama3_2" \ - --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \ - --params "${DOWNLOADED_PATH}/params.json" \ - -kv \ - --use_sdpa_with_kv_cache \ - -X \ - -d bf16 \ - --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \ - --output_name="${OUT_ET_MODEL_NAME}.pte" + python -m extension.llm.export.export_llm \ + base.model_class="llama3_2" \ + base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \ + base.params="${DOWNLOADED_PATH}/params.json" \ + model.use_kv_cache=true \ + model.use_sdpa_with_kv_cache=true \ + backend.xnnpack.enabled=true \ + model.dtype_override=bf16 \ + base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}' \ + export.output_name="${OUT_ET_MODEL_NAME}.pte" ls -lh "${OUT_ET_MODEL_NAME}.pte" elif [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth") - python -m examples.models.llama.export_llama \ - --model llama3_2 \ - --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \ - --params "${DOWNLOADED_PATH}/params.json" \ - -kv \ - --use_sdpa_with_kv_cache \ - -d fp32 \ - -X \ - --xnnpack-extended-ops \ - -qmode 8da4w -G 32 -E 8,0 \ - --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \ - --output_name="${OUT_ET_MODEL_NAME}.pte" + python -m extension.llm.export.export_llm \ + base.model_class=llama3_2 \ + base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \ + base.params="${DOWNLOADED_PATH}/params.json" \ + model.use_kv_cache=true \ + model.use_sdpa_with_kv_cache=true \ + model.dtype_override=fp32 \ + backend.xnnpack.enabled=true \ + backend.xnnpack.extended_ops=true \ + quantization.qmode=8da4w \ + quantization.group_size=32 \ + quantization.embedding_quantize=\'8,0\' \ + base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}' \ + export.output_name="${OUT_ET_MODEL_NAME}.pte" ls -lh "${OUT_ET_MODEL_NAME}.pte" elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029 @@ -313,19 +315,19 @@ jobs: elif [[ "$HF_MODEL_REPO" == "Qwen/Qwen3-0.6B" ]]; then if [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." 
--files "tokenizer.json") - python -m examples.models.llama.export_llama \ - --model qwen3-0_6b \ - --params examples/models/qwen3/0_6b_config.json \ - -kv \ - --use_sdpa_with_kv_cache \ - -d fp32 \ - -X \ - --xnnpack-extended-ops \ - -qmode 8da4w \ - -G 32 \ - -E 8,0 \ - --metadata '{"get_bos_id": 151644, "get_eos_ids":[151645]}' \ - --output_name="${OUT_ET_MODEL_NAME}.pte" + python -m extension.llm.export.export_llm \ + base.model_class=qwen3-0_6b \ + base.params=examples/models/qwen3/0_6b_config.json \ + model.use_kv_cache=true \ + model.use_sdpa_with_kv_cache=true \ + model.dtype_override=fp32 \ + backend.xnnpack.enabled=true \ + backend.xnnpack.extended_ops=true \ + quantization.qmode=8da4w \ + quantization.group_size=32 \ + quantization.embedding_quantize=\'8,0\' \ + base.metadata='{"get_bos_id":151644,"get_eos_ids":[151645]}' \ + export.output_name="${OUT_ET_MODEL_NAME}.pte" ls -lh "${OUT_ET_MODEL_NAME}.pte" fi fi diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml index 0c03f55f82e..1155ef2a7b2 100644 --- a/.github/workflows/apple-perf.yml +++ b/.github/workflows/apple-perf.yml @@ -223,23 +223,23 @@ jobs: --files "tokenizer.model" "params.json" "consolidated.00.pth" ) # Export using ExecuTorch's model definition - ${CONDA_RUN} python -m examples.models.llama.export_llama \ - --model "llama3_2" \ - --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \ - --params "${DOWNLOADED_PATH}/params.json" \ - --use_sdpa_with_kv_cache \ - -X \ - --xnnpack-extended-ops \ - --preq_mode 8da4w_output_8da8w \ - --preq_group_size 32 \ - --max_seq_length 2048 \ - --max_context_length 2048 \ - --output_name "${OUT_ET_MODEL_NAME}.pte" \ - -kv \ - -d fp32 \ - --preq_embedding_quantize 8,0 \ - --use_spin_quant native \ - --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' + ${CONDA_RUN} python -m extension.llm.export.export_llm \ + base.model_class="llama3_2" \ + base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \ + base.params="${DOWNLOADED_PATH}/params.json" \ + model.use_sdpa_with_kv_cache=true \ + backend.xnnpack.enabled=true \ + backend.xnnpack.extended_ops=true \ + base.preq_mode="8da4w_output_8da8w" \ + base.preq_group_size=32 \ + export.max_seq_length=2048 \ + export.max_context_length=2048 \ + export.output_name="${OUT_ET_MODEL_NAME}.pte" \ + model.use_kv_cache=true \ + model.dtype_override=fp32 \ + base.preq_embedding_quantize=\'8,0\' \ + quantization.use_spin_quant=native \ + base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}' ls -lh "${OUT_ET_MODEL_NAME}.pte" elif [[ ${{ matrix.config }} == "llama3_qlora" ]]; then # QAT + LoRA @@ -250,87 +250,89 @@ jobs: --files "tokenizer.model" "params.json" "consolidated.00.pth" ) # Export using ExecuTorch's model definition - ${CONDA_RUN} python -m examples.models.llama.export_llama \ - --model "llama3_2" \ - --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \ - --params "${DOWNLOADED_PATH}/params.json" \ - -qat \ - -lora 16 \ - --preq_mode 8da4w_output_8da8w \ - --preq_group_size 32 \ - --preq_embedding_quantize 8,0 \ - --use_sdpa_with_kv_cache \ - -kv \ - -X \ - --xnnpack-extended-ops \ - -d fp32 \ - --max_seq_length 2048 \ - --max_context_length 2048 \ - --output_name "${OUT_ET_MODEL_NAME}.pte" \ - --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' + ${CONDA_RUN} python -m extension.llm.export.export_llm \ + base.model_class="llama3_2" \ + base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \ + base.params="${DOWNLOADED_PATH}/params.json" \ + 
quantization.use_qat=true \ + base.use_lora=16 \ + base.preq_mode="8da4w_output_8da8w" \ + base.preq_group_size=32 \ + base.preq_embedding_quantize=\'8,0\' \ + model.use_sdpa_with_kv_cache=true \ + model.use_kv_cache=true \ + backend.xnnpack.enabled=true \ + backend.xnnpack.extended_ops=true \ + model.dtype_override=fp32 \ + export.max_seq_length=2048 \ + export.max_context_length=2048 \ + export.output_name="${OUT_ET_MODEL_NAME}.pte" \ + base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}' ls -lh "${OUT_ET_MODEL_NAME}.pte" elif [[ ${{ matrix.config }} == "llama3_fb16" ]]; then # Original BF16 version, without any quantization DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth") - ${CONDA_RUN} python -m examples.models.llama.export_llama \ - --model "llama3_2" \ - --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \ - --params "${DOWNLOADED_PATH}/params.json" \ - -kv \ - --use_sdpa_with_kv_cache \ - -X \ - -d bf16 \ - --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \ - --output_name="${OUT_ET_MODEL_NAME}.pte" + ${CONDA_RUN} python -m extension.llm.export.export_llm \ + base.model_class="llama3_2" \ + base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \ + base.params="${DOWNLOADED_PATH}/params.json" \ + model.use_kv_cache=true \ + model.use_sdpa_with_kv_cache=true \ + backend.xnnpack.enabled=true \ + model.dtype_override=bf16 \ + base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}' \ + export.output_name="${OUT_ET_MODEL_NAME}.pte" ls -lh "${OUT_ET_MODEL_NAME}.pte" elif [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth") - ${CONDA_RUN} python -m examples.models.llama.export_llama \ - --model llama3_2 \ - --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \ - --params "${DOWNLOADED_PATH}/params.json" \ - -kv \ - --use_sdpa_with_kv_cache \ - -d fp32 \ - -X \ - --xnnpack-extended-ops \ - -qmode 8da4w -G 32 -E 8,0 \ - --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \ - --output_name="${OUT_ET_MODEL_NAME}.pte" + ${CONDA_RUN} python -m extension.llm.export.export_llm \ + base.model_class=llama3_2 \ + base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \ + base.params="${DOWNLOADED_PATH}/params.json" \ + model.use_kv_cache=true \ + model.use_sdpa_with_kv_cache=true \ + model.dtype_override=fp32 \ + backend.xnnpack.enabled=true \ + backend.xnnpack.extended_ops=true \ + quantization.qmode=8da4w \ + quantization.group_size=32 \ + quantization.embedding_quantize=\'8,0\' \ + base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}' \ + export.output_name="${OUT_ET_MODEL_NAME}.pte" ls -lh "${OUT_ET_MODEL_NAME}.pte" elif [[ ${{ matrix.config }} == "llama3_coreml_ane" ]]; then # ANE DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth") - ${CONDA_RUN} python -m examples.models.llama.export_llama \ - --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \ - --params "${DOWNLOADED_PATH}/params.json" \ - -E "4,32" \ - -kv \ - --disable_dynamic_shape \ - --coreml \ - --coreml-ios 18 \ - --coreml-quantize c4w \ - --coreml-compute-units cpu_and_ne \ - --output_name="${OUT_ET_MODEL_NAME}.pte" + ${CONDA_RUN} 
python -m extension.llm.export.export_llm \ + base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \ + base.params="${DOWNLOADED_PATH}/params.json" \ + quantization.embedding_quantize="4,32" \ + model.use_kv_cache=true \ + model.enable_dynamic_shape=false \ + backend.coreml.enabled=true \ + backend.coreml.ios=18 \ + backend.coreml.quantize=c4w \ + backend.coreml.compute_units=cpu_and_ne \ + export.output_name="${OUT_ET_MODEL_NAME}.pte" ls -lh "${OUT_ET_MODEL_NAME}.pte" fi elif [[ "$HF_MODEL_REPO" == "Qwen/Qwen3-0.6B" ]]; then OUT_ET_MODEL_NAME=${OUT_ET_MODEL_NAME}_llm if [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json") - ${CONDA_RUN} python -m examples.models.llama.export_llama \ - --model qwen3-0_6b \ - --params examples/models/qwen3/0_6b_config.json \ - -kv \ - --use_sdpa_with_kv_cache \ - -d fp32 \ - -X \ - --xnnpack-extended-ops \ - -qmode 8da4w \ - -G 32 \ - -E 8,0 \ - --metadata '{"get_bos_id": 151644, "get_eos_ids":[151645]}' \ - --output_name="${OUT_ET_MODEL_NAME}.pte" + ${CONDA_RUN} python -m extension.llm.export.export_llm \ + base.model_class=qwen3-0_6b \ + base.params=examples/models/qwen3/0_6b_config.json \ + model.use_kv_cache=true \ + model.use_sdpa_with_kv_cache=true \ + model.dtype_override=fp32 \ + backend.xnnpack.enabled=true \ + backend.xnnpack.extended_ops=true \ + quantization.qmode=8da4w \ + quantization.group_size=32 \ + quantization.embedding_quantize=\'8,0\' \ + base.metadata='{"get_bos_id":151644,"get_eos_ids":[151645]}' \ + export.output_name="${OUT_ET_MODEL_NAME}.pte" ls -lh "${OUT_ET_MODEL_NAME}.pte" fi fi diff --git a/backends/arm/test/models/test_llama.py b/backends/arm/test/models/test_llama.py index 2b524fe0cc9..7658ad40fe6 100644 --- a/backends/arm/test/models/test_llama.py +++ b/backends/arm/test/models/test_llama.py @@ -23,7 +23,7 @@ TosaPipelineMI, ) -from executorch.examples.models.llama.config.llm_config import LlmConfig +from executorch.extension.llm.export.config.llm_config import LlmConfig from executorch.examples.models.llama.export_llama_lib import ( build_args_parser, get_llama_model, diff --git a/examples/apple/mps/scripts/mps_example.py b/examples/apple/mps/scripts/mps_example.py index 4aa34fce9f8..5c69b2cd115 100644 --- a/examples/apple/mps/scripts/mps_example.py +++ b/examples/apple/mps/scripts/mps_example.py @@ -20,7 +20,7 @@ serialize_from_bundled_program_to_flatbuffer, ) -from executorch.examples.models.llama.config.llm_config import LlmConfig +from executorch.extension.llm.export.config.llm_config import LlmConfig from executorch.exir import ( EdgeCompileConfig, EdgeProgramManager, diff --git a/examples/demo-apps/android/LlamaDemo/README.md b/examples/demo-apps/android/LlamaDemo/README.md index 4b8cafd2d4e..8fed04d7ff5 100644 --- a/examples/demo-apps/android/LlamaDemo/README.md +++ b/examples/demo-apps/android/LlamaDemo/README.md @@ -154,7 +154,7 @@ curl -C - -Ls "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokeni # Create params.json file touch params.json echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json -python -m examples.models.llama.export_llama -c stories110M.pt -p params.json -d fp16 -n stories110m_h.pte -kv +python -m extension.llm.export.export_llm base.checkpoint=stories110M.pt base.params=params.json model.dtype_override="fp16" export.output_name=stories110m_h.pte 
model.use_kv_cache=True python -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin ``` ### Push model diff --git a/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md b/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md index fb9df3c3375..360e92a5f30 100644 --- a/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md +++ b/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md @@ -97,7 +97,7 @@ cmake --build cmake-out/examples/models/llama -j16 --config Release ## Export Llama Model QNN backend currently supports exporting to these data types: fp32, int4/ int8 with PTQ, int4 with SpinQuant (Llama 3 only). -We also support export for different Qualcomm SoC. We have verified SM8650(V75) and SM8550(V73). To export for different SoC, add “--soc_model SM8550” in your export command. Without setting this flag, the export will default to SM8650. +We also support export for different Qualcomm SoC. We have verified SM8650(V75) and SM8550(V73). To export for different SoC, add "--soc_model SM8550" in your export command. Without setting this flag, the export will default to SM8650. ### Export with PTQ We support PTQ by default. The entire export may take ~20 minutes (Llama 3.1 8B). However, there is accuracy regression and we are working on improving it. @@ -106,12 +106,12 @@ We support PTQ by default. The entire export may take ~20 minutes (Llama 3.1 8B) Examples: ``` # 4 bits weight only quantize -python -m examples.models.llama.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" -p "${MODEL_DIR}/params.json" -kv --disable_dynamic_shape --qnn --pt2e_quantize qnn_16a4w -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="test.pte” +python -m extension.llm.export.export_llm base.checkpoint="${MODEL_DIR}/consolidated.00.pth" base.params="${MODEL_DIR}/params.json" model.use_kv_cache=True model.enable_dynamic_shape=False backend.qnn.enabled=True backend.qnn.quantization="qnn_16a4w" model.dtype_override="fp32" base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="test.pte" ``` If the model is really big, it may require model sharding because the Qualcomm DSP is a 32bit system and has a 4GB size limit . For example for Llama 3 8B models, we need to shard the model into 4, but ExecuTorch still packages it into one PTE file. Here is an example: ``` # 8 bits quantization with 4 shards -python -m examples.models.llama.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" -p "${MODEL_DIR}/params.json" -kv --disable_dynamic_shape --qnn --pt2e_quantize qnn_8a8w -d fp32 --num_sharding 4 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="test.pte” +python -m extension.llm.export.export_llm base.checkpoint="${MODEL_DIR}/consolidated.00.pth" base.params="${MODEL_DIR}/params.json" model.use_kv_cache=True model.enable_dynamic_shape=False backend.qnn.enabled=True backend.qnn.quantization="qnn_8a8w" model.dtype_override="fp32" backend.qnn.num_sharding=4 base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="test.pte" ``` Note: if you encountered issues below ``` @@ -163,7 +163,7 @@ To export Llama 3 8B instruct with the Qualcomm AI Engine Direct Backend, ensure * 8B models might need 16GB RAM on the device to run. ``` # Please note that calibration_data must include the prompt template for special tokens. 
-python -m examples.models.llama.export_llama -t -p -c --use_kv_cache --qnn --pt2e_quantize qnn_16a4w --disable_dynamic_shape --num_sharding 8 --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --optimized_rotation_path --calibration_data "<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" +python -m extension.llm.export.export_llm base.tokenizer= base.params= base.checkpoint= model.use_kv_cache=True backend.qnn.enabled=True backend.qnn.quantization="qnn_16a4w" model.enable_dynamic_shape=False backend.qnn.num_sharding=8 backend.qnn.calibration_tasks="wikitext" backend.qnn.calibration_limit=1 backend.qnn.calibration_seq_length=128 backend.qnn.optimized_rotation_path= backend.qnn.calibration_data="<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" ``` ## Pushing Model and Tokenizer @@ -210,17 +210,17 @@ Alternative you can also just run the shell script directly as in the root direc sh examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh ``` This is running the shell script which configures the required core ExecuTorch, Llama2/3, and Android libraries, builds them into AAR, and copies it to the app. -Note: If you are building the Android app mentioned in the next section on a separate machine (i.e. MacOS but building and exporting for QNN backend on Linux), make sure you copy the aar file generated from setup-with-qnn script to “examples/demo-apps/android/LlamaDemo/app/libs” before building the Android app. +Note: If you are building the Android app mentioned in the next section on a separate machine (i.e. MacOS but building and exporting for QNN backend on Linux), make sure you copy the aar file generated from setup-with-qnn script to "examples/demo-apps/android/LlamaDemo/app/libs" before building the Android app. ## Run the Android Demo App -First, make sure your Android phone’s chipset version is compatible with this demo (SM8650, SM8550). You can find the Qualcomm chipset version here in the [mapping](https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/overview.html). +First, make sure your Android phone's chipset version is compatible with this demo (SM8650, SM8550). You can find the Qualcomm chipset version here in the [mapping](https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/overview.html). -If you build and run the setup-with-qnn script on a separate machine rather than where you are building the Android app, make sure you copy the aar file it generated into “examples/demo-apps/android/LlamaDemo/app/libs” +If you build and run the setup-with-qnn script on a separate machine rather than where you are building the Android app, make sure you copy the aar file it generated into "examples/demo-apps/android/LlamaDemo/app/libs" ### Alternative 1: Android Studio (Recommended) -Open Android Studio and select “Open an existing Android Studio project” to open examples/demo-apps/android/LlamaDemo. +Open Android Studio and select "Open an existing Android Studio project" to open examples/demo-apps/android/LlamaDemo. Run the app (^R). This builds and launches the app on the phone. 
### Alternative 2: Command line @@ -238,4 +238,4 @@ If the app successfully run on your device, you should see something like below:

## Reporting Issues -If you encountered any bugs or issues following this tutorial please file a bug/issue here on Github. +If you encountered any bugs or issues following this tutorial please file a bug/issue here on Github. \ No newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md b/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md index de99387f82d..baf8ffb7071 100644 --- a/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md +++ b/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md @@ -55,7 +55,7 @@ In this demo app, we support text-only inference with up-to-date Llama models an Meta has released prequantized INT4 SpinQuant Llama 3.2 models that ExecuTorch supports on the XNNPACK backend. * Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --max_context_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte" +python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="fp32" backend.xnnpack.extended_ops=True base.preq_mode="8da4w_output_8da8w" base.preq_group_size=32 export.max_seq_length=2048 export.max_context_length=2048 base.preq_embedding_quantize=\'8,0\' quantization.use_spin_quant="native" base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="llama3_2_spinquant.pte" ``` For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_SpinQuant_INT4_EO8.ipynb). @@ -63,7 +63,7 @@ For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co Meta has released prequantized INT4 QAT+LoRA Llama 3.2 models that ExecuTorch supports on the XNNPACK backend. 
* Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint --params -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --max_context_length 2048--preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte" +python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= quantization.use_qat=True base.use_lora=16 model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="fp32" backend.xnnpack.extended_ops=True base.preq_mode="8da4w_output_8da8w" base.preq_group_size=32 export.max_seq_length=2048 export.max_context_length=2048 base.preq_embedding_quantize=\'8,0\' base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="llama3_2_qat_lora.pte" ``` For convenience, an [exported ExecuTorch QAT+LoRA model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-QLORA_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_QLORA_INT4_EO8.ipynb). @@ -74,7 +74,7 @@ We have supported BF16 as a data type on the XNNPACK backend for Llama 3.2 1B/3B * Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2_bf16.pte" +python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="bf16" base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="llama3_2_bf16.pte" ``` For convenience, an [exported ExecuTorch bf16 model](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/llama3_2-1B.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/ExportRecipe_1B.ipynb). 
@@ -90,7 +90,7 @@ To safeguard your application, you can use our Llama Guard models for prompt cla * We prepared this model using the following command ``` -python -m examples.models.llama.export_llama --checkpoint --params -d fp32 -kv --use_sdpa_with_kv_cache --quantization_mode 8da4w --group_size 256 --xnnpack --max_seq_length 8193 --max_context_length 8193 --embedding-quantize 4,32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_prune_map --output_name="llama_guard_3_1b_pruned_xnnpack.pte" +python -m extension.llm.export.export_llm base.checkpoint= base.params= model.dtype_override="fp32" model.use_kv_cache=True model.use_sdpa_with_kv_cache=True quantization.qmode="8da4w" quantization.group_size=256 backend.xnnpack.enabled=True export.max_seq_length=8193 export.max_context_length=8193 quantization.embedding_quantize=\'4,32\' base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' base.output_prune_map= export.output_name="llama_guard_3_1b_pruned_xnnpack.pte" ``` @@ -100,7 +100,7 @@ python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama.pte" +python -m extension.llm.export.export_llm base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True quantization.qmode="8da4w" quantization.group_size=128 model.dtype_override="fp32" base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="llama.pte" ``` You may wonder what the ‘--metadata’ flag is doing. This flag helps export the model with proper special tokens added that the runner can detect EOS tokens easily. 
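Every export command in the documentation hunks above follows the same mechanical mapping: the old argparse flags of `examples.models.llama.export_llama` become dotted overrides for `extension.llm.export.export_llm`, grouped under `base.*`, `model.*`, `quantization.*`, `backend.*`, and `export.*`. As a reading aid only (not part of the patch), here is a minimal sketch of one representative export in both forms; `model.pth`, `params.json`, and `model.pte` are placeholder paths:

```sh
# Old CLI (being removed in this diff), kept only as a comment for comparison:
#   python -m examples.models.llama.export_llama \
#     -c model.pth -p params.json -kv --use_sdpa_with_kv_cache -X \
#     -qmode 8da4w -G 128 -E 4,32 -d fp32 -n model.pte
#
# Equivalent invocation using the dotted overrides introduced throughout this diff.
python -m extension.llm.export.export_llm \
  base.checkpoint=model.pth \
  base.params=params.json \
  model.use_kv_cache=true \
  model.use_sdpa_with_kv_cache=true \
  backend.xnnpack.enabled=true \
  quantization.qmode=8da4w \
  quantization.group_size=128 \
  quantization.embedding_quantize=\'4,32\' \
  model.dtype_override=fp32 \
  export.output_name=model.pte
```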
diff --git a/examples/demo-apps/android/LlamaDemo/run_instrumentation_test.sh b/examples/demo-apps/android/LlamaDemo/run_instrumentation_test.sh index ff59fc56b2c..8c1ad52ef8b 100644 --- a/examples/demo-apps/android/LlamaDemo/run_instrumentation_test.sh +++ b/examples/demo-apps/android/LlamaDemo/run_instrumentation_test.sh @@ -14,7 +14,7 @@ curl -C - -Ls "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokeni # Create params.json file touch params.json echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json -python -m examples.models.llama.export_llama -c stories110M.pt -p params.json -d fp16 -n stories110m_h.pte -kv +python -m extension.llm.export.export_llm base.checkpoint=stories110M.pt base.params=params.json model.dtype_override=fp16 export.output_name=stories110m_h.pte model.use_kv_cache=true python -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin adb mkdir -p /data/local/tmp/llama diff --git a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md index 47352607bca..d6bccc0ef47 100644 --- a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md +++ b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md @@ -49,7 +49,7 @@ Install the required packages to export the model Export the model ``` -python -m examples.models.llama.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" --params "${MODEL_DIR}/params.json" -kv --use_sdpa_with_kv_cache --mps -d fp32 --disable_dynamic_shape -qmode 8da4w -G 32 +python -m extension.llm.export.export_llm base.checkpoint="${MODEL_DIR}/consolidated.00.pth" base.params="${MODEL_DIR}/params.json" model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.mps.enabled=True model.dtype_override="fp32" model.enable_dynamic_shape=False quantization.qmode="8da4w" quantization.group_size=32 ``` ## Pushing Model and Tokenizer diff --git a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md index bb33b50f8b7..6cca65339da 100644 --- a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md +++ b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md @@ -51,7 +51,7 @@ In this demo app, we support text-only inference with up-to-date Llama models an Meta has released prequantized INT4 SpinQuant Llama 3.2 models that ExecuTorch supports on the XNNPACK backend. 
* Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --max_context_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte" +python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="fp32" backend.xnnpack.extended_ops=True base.preq_mode="8da4w_output_8da8w" base.preq_group_size=32 export.max_seq_length=2048 export.max_context_length=2048 base.preq_embedding_quantize=\'8,0\' quantization.use_spin_quant="native" base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="llama3_2_spinquant.pte" ``` For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_SpinQuant_INT4_EO8.ipynb). @@ -59,7 +59,7 @@ For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co Meta has released prequantized INT4 QAT+LoRA Llama 3.2 models that ExecuTorch supports on the XNNPACK backend. * Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint --params -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --max_context_length 2048 --preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte" +python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= quantization.use_qat=True base.use_lora=16 model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="fp32" backend.xnnpack.extended_ops=True base.preq_mode="8da4w_output_8da8w" base.preq_group_size=32 export.max_seq_length=2048 export.max_context_length=2048 base.preq_embedding_quantize=\'8,0\' base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="llama3_2_qat_lora.pte" ``` For convenience, an [exported ExecuTorch QAT+LoRA model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-QLORA_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_QLORA_INT4_EO8.ipynb). 
@@ -69,7 +69,7 @@ We have supported BF16 as a data type on the XNNPACK backend for Llama 3.2 1B/3B * Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2_bf16.pte" +python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="bf16" base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="llama3_2_bf16.pte" ``` For convenience, an [exported ExecuTorch bf16 model](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/llama3_2-1B.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/ExportRecipe_1B.ipynb). @@ -79,7 +79,7 @@ For more detail using Llama 3.2 lightweight models including prompt template, pl Export the model ``` -python -m examples.models.llama.export_llama --checkpoint -p -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --embedding-quantize 4,32 --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" +python -m extension.llm.export.export_llm base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True quantization.qmode="8da4w" quantization.group_size=128 model.dtype_override="fp32" base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' quantization.embedding_quantize=\'4,32\' export.output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" ``` ### For LLaVA model diff --git a/examples/models/deepseek-r1-distill-llama-8B/README.md b/examples/models/deepseek-r1-distill-llama-8B/README.md index 5fd47ad61ec..f05dd9990a2 100644 --- a/examples/models/deepseek-r1-distill-llama-8B/README.md +++ b/examples/models/deepseek-r1-distill-llama-8B/README.md @@ -52,18 +52,18 @@ torch.save(sd, "/tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/checkpoint.pth") 5. Generate a PTE file for use with the Llama runner. ``` -python -m examples.models.llama.export_llama \ - --checkpoint /tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/checkpoint.pth \ - -p params.json \ - -kv \ - --use_sdpa_with_kv_cache \ - -X \ - -qmode 8da4w \ - --group_size 128 \ - -d fp16 \ - --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \ - --embedding-quantize 4,32 \ - --output_name="DeepSeek-R1-Distill-Llama-8B.pte" +python -m extension.llm.export.export_llm \ + base.checkpoint=/tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/checkpoint.pth \ + base.params=params.json \ + model.use_kv_cache=True \ + model.use_sdpa_with_kv_cache=True \ + backend.xnnpack.enabled=True \ + quantization.qmode="8da4w" \ + quantization.group_size=128 \ + model.dtype_override="fp16" \ + base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' \ + quantization.embedding_quantize=\'4,32\' \ + export.output_name="DeepSeek-R1-Distill-Llama-8B.pte" ``` 6. Run the model on your desktop for validation or integrate with iOS/Android apps. Instructions for these are available in the Llama [README](../llama/README.md) starting at Step 3. 
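A recurring detail in these hunks is how comma-separated and JSON-valued overrides are quoted so that the shell passes each one through as a single argument. The sketch below is illustration only (it is not part of any script in the patch) and simply prints what the shell hands to the exporter for the two quoting styles used above:

```sh
# Escaped single quotes survive word splitting, so the override value arrives
# still wrapped in literal quotes and the comma-separated "8,0" stays one string.
printf '%s\n' base.preq_embedding_quantize=\'8,0\'
# prints: base.preq_embedding_quantize='8,0'

# A single-quoted JSON literal with no spaces is passed through verbatim,
# so the metadata override arrives as one brace-delimited value.
printf '%s\n' base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}'
# prints: base.metadata={"get_bos_id":128000,"get_eos_ids":[128009,128001]}
```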
diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md index c6f0350fff7..e555043c44d 100644 --- a/examples/models/llama/README.md +++ b/examples/models/llama/README.md @@ -167,15 +167,15 @@ Llama 3 8B performance was measured on the Samsung Galaxy S22, S24, and OnePlus LLAMA_CHECKPOINT=path/to/consolidated.00.pth LLAMA_PARAMS=path/to/params.json -python -m examples.models.llama.export_llama \ - --model "llama3_2" \ - --checkpoint "${LLAMA_CHECKPOINT:?}" \ - --params "${LLAMA_PARAMS:?}" \ - -kv \ - --use_sdpa_with_kv_cache \ - -d bf16 \ - --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \ - --output_name="llama3_2.pte" +python -m extension.llm.export.export_llm \ + base.model_class="llama3_2" \ + base.checkpoint="${LLAMA_CHECKPOINT:?}" \ + base.params="${LLAMA_PARAMS:?}" \ + model.use_kv_cache=True \ + model.use_sdpa_with_kv_cache=True \ + model.dtype_override="bf16" \ + base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' \ + export.output_name="llama3_2.pte" ``` For convenience, an [exported ExecuTorch bf16 model](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/llama3_2-1B.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/ExportRecipe_1B.ipynb). @@ -189,23 +189,23 @@ For convenience, an [exported ExecuTorch bf16 model](https://huggingface.co/exec LLAMA_QUANTIZED_CHECKPOINT=path/to/spinquant/consolidated.00.pth.pth LLAMA_PARAMS=path/to/spinquant/params.json -python -m examples.models.llama.export_llama \ - --model "llama3_2" \ - --checkpoint "${LLAMA_QUANTIZED_CHECKPOINT:?}" \ - --params "${LLAMA_PARAMS:?}" \ - --use_sdpa_with_kv_cache \ - -X \ - --xnnpack-extended-ops \ - --preq_mode 8da4w_output_8da8w \ - --preq_group_size 32 \ - --max_seq_length 2048 \ - --max_context_length 2048 \ - --output_name "llama3_2.pte" \ - -kv \ - -d fp32 \ - --preq_embedding_quantize 8,0 \ - --use_spin_quant native \ - --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' +python -m extension.llm.export.export_llm \ + base.model_class="llama3_2" \ + base.checkpoint="${LLAMA_QUANTIZED_CHECKPOINT:?}" \ + base.params="${LLAMA_PARAMS:?}" \ + model.use_sdpa_with_kv_cache=True \ + backend.xnnpack.enabled=True \ + backend.xnnpack.extended_ops=True \ + base.preq_mode="8da4w_output_8da8w" \ + base.preq_group_size=32 \ + export.max_seq_length=2048 \ + export.max_context_length=2048 \ + export.output_name="llama3_2.pte" \ + model.use_kv_cache=True \ + model.dtype_override="fp32" \ + base.preq_embedding_quantize=\'8,0\' \ + quantization.use_spin_quant="native" \ + base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' ``` For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_SpinQuant_INT4_EO8.ipynb). 
@@ -218,24 +218,24 @@ For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co LLAMA_QUANTIZED_CHECKPOINT=path/to/qlora/consolidated.00.pth.pth LLAMA_PARAMS=path/to/qlora/params.json -python -m examples.models.llama.export_llama \ - --model "llama3_2" \ - --checkpoint "${LLAMA_QUANTIZED_CHECKPOINT:?}" \ - --params "${LLAMA_PARAMS:?}" \ - -qat \ - -lora 16 \ - --preq_mode 8da4w_output_8da8w \ - --preq_group_size 32 \ - --preq_embedding_quantize 8,0 \ - --use_sdpa_with_kv_cache \ - -kv \ - -X \ - --xnnpack-extended-ops \ - -d fp32 \ - --max_seq_length 2048 \ - --max_context_length 2048 \ - --output_name "llama3_2.pte" \ - --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' +python -m extension.llm.export.export_llm \ + base.model_class="llama3_2" \ + base.checkpoint="${LLAMA_QUANTIZED_CHECKPOINT:?}" \ + base.params="${LLAMA_PARAMS:?}" \ + quantization.use_qat=True \ + base.use_lora=16 \ + base.preq_mode="8da4w_output_8da8w" \ + base.preq_group_size=32 \ + base.preq_embedding_quantize=\'8,0\' \ + model.use_sdpa_with_kv_cache=True \ + model.use_kv_cache=True \ + backend.xnnpack.enabled=True \ + backend.xnnpack.extended_ops=True \ + model.dtype_override="fp32" \ + export.max_seq_length=2048 \ + export.max_context_length=2048 \ + export.output_name="llama3_2.pte" \ + base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' ``` For convenience, an [exported ExecuTorch QAT+LoRA model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-QLORA_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_QLORA_INT4_EO8.ipynb). @@ -247,20 +247,20 @@ You can export and run the original Llama 3 8B instruct model. 2. Export model and generate `.pte` file ``` - python -m examples.models.llama.export_llama \ - --checkpoint \ - -p \ - -kv \ - --use_sdpa_with_kv_cache \ - -X \ - -qmode 8da4w \ - --group_size 128 \ - -d fp32 \ - --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \ - --embedding-quantize 4,32 \ - --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" + python -m extension.llm.export.export_llm \ + base.checkpoint= \ + base.params= \ + model.use_kv_cache=True \ + model.use_sdpa_with_kv_cache=True \ + backend.xnnpack.enabled=True \ + quantization.qmode="8da4w" \ + quantization.group_size=128 \ + model.dtype_override="fp32" \ + base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' \ + quantization.embedding_quantize=\'4,32\' \ + export.output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" ``` - Due to the larger vocabulary size of Llama 3, we recommend quantizing the embeddings with `--embedding-quantize 4,32` as shown above to further reduce the model size. + Due to the larger vocabulary size of Llama 3, we recommend quantizing the embeddings with `quantization.embedding_quantize=\'4,32\'` as shown above to further reduce the model size. 
If you're interested in deploying on non-CPU backends, [please refer the non-cpu-backend section](non_cpu_backends.md) @@ -389,22 +389,22 @@ QLINEAR_GROUP_SIZE=128 # Must be multiple of 16 QEMBEDDING_BITWIDTH=4 # Can be 1-8 QEMBEDDING_GROUP_SIZE=32 # Must be multiple of 16 -python -m examples.models.llama.export_llama \ - --model "llama3_2" \ - --checkpoint "${LLAMA_CHECKPOINT:?}" \ - --params "${LLAMA_PARAMS:?}" \ - -kv \ - --use_sdpa_with_kv_cache \ - --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \ - --output_name="llama3_2.pte" \ - -qmode "torchao:8da${QLINEAR_BITWIDTH}w" \ - --group_size ${QLINEAR_GROUP_SIZE} \ - -E "torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \ - -d fp32 +python -m extension.llm.export.export_llm \ + base.model_class="llama3_2" \ + base.checkpoint="${LLAMA_CHECKPOINT:?}" \ + base.params="${LLAMA_PARAMS:?}" \ + model.use_kv_cache=True \ + model.use_sdpa_with_kv_cache=True \ + base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' \ + export.output_name="llama3_2.pte" \ + quantization.qmode="torchao:8da${QLINEAR_BITWIDTH}w" \ + quantization.group_size=${QLINEAR_GROUP_SIZE} \ + quantization.embedding_quantize=\'torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}\' \ + model.dtype_override="fp32" ``` A few notes: -- If your model shares embedding/unembedding weights (like Llama1B and Llama3B do), you can add `--use_shared_embedding` to take advantage of this and reduce memory. When this option is enabled, you can specify whether embeddings are quantized asymmetrically or not by specifying a third argument. For example, `-E "torchao:4,32,true"` means that the embedding is quantized to 4-bits with group_size=32 and is asymmetric (this is the default behavior if you simply use `-E "torchao:4,32"`), whereas `-E "torchao:4,32,false"` means that the embedding is quantized to 4-bits with group_size=32 and is symmetric. If `--use_shared_embedding` is specified, the unembedding (i.e., the final linear layer) is quantized in the same way, but also uses 8-bit dynamically quantized activations. +- If your model shares embedding/unembedding weights (like Llama1B and Llama3B do), you can add `model.use_shared_embedding=True` to take advantage of this and reduce memory. When this option is enabled, you can specify whether embeddings are quantized asymmetrically or not by specifying a third argument. For example, `quantization.embedding_quantize="torchao:4,32,true"` means that the embedding is quantized to 4-bits with group_size=32 and is asymmetric (this is the default behavior if you simply use `quantization.embedding_quantize="torchao:4,32"`), whereas `quantization.embedding_quantize="torchao:4,32,false"` means that the embedding is quantized to 4-bits with group_size=32 and is symmetric. If `model.use_shared_embedding=True` is specified, the unembedding (i.e., the final linear layer) is quantized in the same way, but also uses 8-bit dynamically quantized activations. - To do channelwise quantization, specify group_size to 0. This works for both linear and embedding layers. Once the model is exported, we need to build ExecuTorch and the runner with the low-bit kernels. 
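The notes at the end of that README hunk mention two options that never appear together in the commands above: `model.use_shared_embedding=True` and channelwise quantization via a group size of 0. The following is a hypothetical combination, shown only as a sketch; the checkpoint/params paths and bit widths are placeholders, not values taken from the patch:

```sh
# Hypothetical torchao low-bit export combining shared embedding/unembedding
# weights with channelwise (group_size=0) linear quantization, using the option
# names documented in the README hunk above. Paths are placeholders.
python -m extension.llm.export.export_llm \
  base.model_class="llama3_2" \
  base.checkpoint=consolidated.00.pth \
  base.params=params.json \
  model.use_kv_cache=True \
  model.use_sdpa_with_kv_cache=True \
  model.use_shared_embedding=True \
  quantization.qmode="torchao:8da4w" \
  quantization.group_size=0 \
  quantization.embedding_quantize=\'torchao:4,32,true\' \
  model.dtype_override="fp32" \
  export.output_name="llama3_2_lowbit.pte"
```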
diff --git a/examples/models/llama/TARGETS b/examples/models/llama/TARGETS index d2caccd5897..95d57e12f5a 100644 --- a/examples/models/llama/TARGETS +++ b/examples/models/llama/TARGETS @@ -67,7 +67,7 @@ runtime.python_library( "//caffe2:torch", "//executorch/examples/models:model_base", "//executorch/examples/models/llama:llama_transformer", - "//executorch/examples/models/llama/config:llm_config", + "//executorch/extension/llm/export/config:llm_config", "//executorch/examples/models:checkpoint", ], ) @@ -150,7 +150,7 @@ runtime.python_library( ":source_transformation", "//ai_codesign/gen_ai/fast_hadamard_transform:fast_hadamard_transform", "//caffe2:torch", - "//executorch/examples/models/llama/config:llm_config", + "//executorch/extension/llm/export/config:llm_config", "//executorch/backends/vulkan/_passes:vulkan_passes", "//executorch/exir/passes:init_mutable_pass", "//executorch/examples/models:model_base", diff --git a/examples/models/llama/UTILS.md b/examples/models/llama/UTILS.md index 5f760ad7670..25bd7f77080 100644 --- a/examples/models/llama/UTILS.md +++ b/examples/models/llama/UTILS.md @@ -19,7 +19,7 @@ From `executorch` root: ``` 3. Export model and generate `.pte` file. ``` - python -m examples.models.llama.export_llama -c stories110M.pt -p params.json -X -kv + python -m extension.llm.export.export_llm base.checkpoint=stories110M.pt base.params=params.json backend.xnnpack.enabled=True model.use_kv_cache=True ``` ## Smaller model delegated to other backends @@ -27,15 +27,15 @@ From `executorch` root: Currently we supported lowering the stories model to other backends, including, CoreML, MPS and QNN. Please refer to the instruction for each backend ([CoreML](https://pytorch.org/executorch/main/backends-coreml), [MPS](https://pytorch.org/executorch/main/backends-mps), [QNN](https://pytorch.org/executorch/main/backends-qualcomm)) before trying to lower them. After the backend library is installed, the script to export a lowered model is -- Lower to CoreML: `python -m examples.models.llama.export_llama -kv --disable_dynamic_shape --coreml -c stories110M.pt -p params.json ` -- MPS: `python -m examples.models.llama.export_llama -kv --disable_dynamic_shape --mps -c stories110M.pt -p params.json ` -- QNN: `python -m examples.models.llama.export_llama -kv --disable_dynamic_shape --qnn -c stories110M.pt -p params.json ` +- Lower to CoreML: `python -m extension.llm.export.export_llm model.use_kv_cache=True model.enable_dynamic_shape=False backend.coreml.enabled=True base.checkpoint=stories110M.pt base.params=params.json` +- MPS: `python -m extension.llm.export.export_llm model.use_kv_cache=True model.enable_dynamic_shape=False backend.mps.enabled=True base.checkpoint=stories110M.pt base.params=params.json` +- QNN: `python -m extension.llm.export.export_llm model.use_kv_cache=True model.enable_dynamic_shape=False backend.qnn.enabled=True base.checkpoint=stories110M.pt base.params=params.json` The iOS LLAMA app supports the CoreML and MPS model and the Android LLAMA app supports the QNN model. On Android, it also allow to cross compiler the llama runner binary, push to the device and run. For CoreML, there are 2 additional optional arguments: -* `--coreml-ios`: Specify the minimum iOS version to deploy (and turn on available optimizations). E.g. 
`--coreml-ios 18` will turn on [in-place KV cache](https://developer.apple.com/documentation/coreml/mlstate?language=objc) and [fused scaled dot product attention kernel](https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS18.transformers.scaled_dot_product_attention) (the resulting model will then need at least iOS 18 to run, though) -* `--coreml-quantize`: Use [quantization tailored for CoreML](https://apple.github.io/coremltools/docs-guides/source/opt-quantization-overview.html). E.g. `--coreml-quantize b4w` will perform per-block 4-bit weight-only quantization in a way tailored for CoreML +* `backend.coreml.ios`: Specify the minimum iOS version to deploy (and turn on available optimizations). E.g. `backend.coreml.ios=18` will turn on [in-place KV cache](https://developer.apple.com/documentation/coreml/mlstate?language=objc) and [fused scaled dot product attention kernel](https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS18.transformers.scaled_dot_product_attention) (the resulting model will then need at least iOS 18 to run, though) +* `backend.coreml.quantize`: Use [quantization tailored for CoreML](https://apple.github.io/coremltools/docs-guides/source/opt-quantization-overview.html). E.g. `backend.coreml.quantize="b4w"` will perform per-block 4-bit weight-only quantization in a way tailored for CoreML To deploy the large 8B model on the above backends, [please visit this section](non_cpu_backends.md). diff --git a/examples/models/llama/config/targets.bzl b/examples/models/llama/config/targets.bzl index 8b85ce6d107..a72ffa6c8e0 100644 --- a/examples/models/llama/config/targets.bzl +++ b/examples/models/llama/config/targets.bzl @@ -2,25 +2,12 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") def define_common_targets(): - runtime.python_library( - name = "llm_config", - srcs = [ - "llm_config.py", - ], - _is_external_target = True, - base_module = "executorch.examples.models.llama.config", - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - ) - python_unittest( name = "test_llm_config", srcs = [ "test_llm_config.py", ], deps = [ - ":llm_config", + "//executorch/extension/llm/export/config:llm_config", ], ) diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 1f055d65822..a82703043ae 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -28,7 +28,7 @@ from executorch.devtools.etrecord import generate_etrecord as generate_etrecord_func -from executorch.examples.models.llama.config.llm_config import LlmConfig +from executorch.extension.llm.export.config.llm_config import LlmConfig from executorch.examples.models.llama.hf_download import ( download_and_convert_hf_checkpoint, ) @@ -53,6 +53,8 @@ ) from executorch.util.activation_memory_profiler import generate_memory_trace +from omegaconf import DictConfig + from ..model_factory import EagerModelFactory from .source_transformation.apply_spin_quant_r1_r2 import ( fuse_layer_norms, @@ -102,9 +104,9 @@ "llama3_2", "static_llama", "qwen2_5", - "qwen3-0_6b", - "qwen3-1_7b", - "qwen3-4b", + "qwen3_0_6b", + "qwen3_1_7b", + "qwen3_4b", "phi_4_mini", "smollm2", ] @@ -113,9 +115,9 @@ "qwen2_5": "Qwen/Qwen2.5-1.5B", "phi_4_mini": "microsoft/Phi-4-mini-instruct", "smollm2": 
"HuggingFaceTB/SmolLM-135M", - "qwen3-0_6b": "Qwen/Qwen3-0.6B", - "qwen3-1_7b": "Qwen/Qwen3-1.7B", - "qwen3-4b": "Qwen/Qwen3-4B", + "qwen3_0_6b": "Qwen/Qwen3-0.6B", + "qwen3_1_7b": "Qwen/Qwen3-1.7B", + "qwen3_4b": "Qwen/Qwen3-4B", } @@ -571,12 +573,14 @@ def canonical_path(path: Union[str, Path], *, dir: bool = False) -> str: def export_llama( - export_options: Union[argparse.Namespace, LlmConfig], + export_options: Union[argparse.Namespace, LlmConfig, DictConfig], ) -> str: if isinstance(export_options, argparse.Namespace): # Legacy CLI. llm_config = LlmConfig.from_args(export_options) - elif isinstance(export_options, LlmConfig): + elif isinstance(export_options, LlmConfig) or isinstance( + export_options, DictConfig + ): # Hydra CLI. llm_config = export_options else: @@ -586,7 +590,7 @@ def export_llama( # If a checkpoint isn't provided for an HF OSS model, download and convert the # weights first. - model_name = llm_config.base.model_class + model_name = llm_config.base.model_class.value if not llm_config.base.checkpoint and model_name in HUGGING_FACE_REPO_IDS: repo_id = HUGGING_FACE_REPO_IDS[model_name] if model_name == "qwen2_5": @@ -664,7 +668,7 @@ def _prepare_for_llama_export(llm_config: LlmConfig) -> LLMEdgeManager: llm_config.export.output_dir = output_dir_path # Convert dtype override string to actual type. - dtype_override = DType[llm_config.model.dtype_override] + dtype_override = DType[llm_config.model.dtype_override.value] edge_manager = _load_llama_model(llm_config) @@ -698,7 +702,11 @@ def _prepare_for_llama_export(llm_config: LlmConfig) -> LLMEdgeManager: checkpoint=llm_config.base.checkpoint, checkpoint_dtype=DType.from_torch_dtype(checkpoint_dtype), # type: ignore tokenizer_path=llm_config.base.tokenizer_path, - use_spin_quant=llm_config.quantization.use_spin_quant, + use_spin_quant=( + llm_config.quantization.use_spin_quant.value + if llm_config.quantization.use_spin_quant + else None + ), embedding_quantize=llm_config.quantization.embedding_quantize, use_shared_embedding=llm_config.model.use_shared_embedding, quantization_mode=llm_config.quantization.qmode, @@ -722,7 +730,9 @@ def _prepare_for_llama_export(llm_config: LlmConfig) -> LLMEdgeManager: vulkan=llm_config.backend.vulkan.enabled, use_qat=llm_config.quantization.use_qat, use_lora=llm_config.base.use_lora, - preq_mode=llm_config.base.preq_mode, + preq_mode=( + llm_config.base.preq_mode.value if llm_config.base.preq_mode else None + ), preq_group_size=llm_config.base.preq_group_size, preq_embedding_quantize=llm_config.base.preq_embedding_quantize, local_global_attention=llm_config.model.local_global_attention, @@ -734,25 +744,34 @@ def _prepare_for_llama_export(llm_config: LlmConfig) -> LLMEdgeManager: def get_quantizer_and_quant_params(llm_config): pt2e_quant_params = get_pt2e_quantization_params( - llm_config.quantization.pt2e_quantize, llm_config.quantization.qmode + ( + llm_config.quantization.pt2e_quantize.value + if llm_config.quantization.pt2e_quantize + else None + ), + llm_config.quantization.qmode, ) quantizers = get_pt2e_quantizers(pt2e_quant_params, llm_config.export.so_library) quant_dtype = None if llm_config.backend.qnn.enabled and llm_config.quantization.pt2e_quantize: assert len(quantizers) == 0, "Should not enable both xnnpack and qnn" qnn_quantizer, quant_dtype = get_qnn_quantizer( - llm_config.quantization.pt2e_quantize, llm_config.quantization.qmode + llm_config.quantization.pt2e_quantize.value, llm_config.quantization.qmode ) quantizers.append(qnn_quantizer) if 
llm_config.backend.coreml.enabled and llm_config.quantization.pt2e_quantize: assert len(quantizers) == 0, "Should not enable both xnnpack / qnn and coreml" - coreml_quantizer = get_coreml_quantizer(llm_config.quantization.pt2e_quantize) + coreml_quantizer = get_coreml_quantizer( + llm_config.quantization.pt2e_quantize.value + ) quantizers.append(coreml_quantizer) if llm_config.backend.vulkan.enabled and llm_config.quantization.pt2e_quantize: assert ( len(quantizers) == 0 ), "Should not enable both vulkan and other quantizers" - vulkan_quantizer = get_vulkan_quantizer(llm_config.quantization.pt2e_quantize) + vulkan_quantizer = get_vulkan_quantizer( + llm_config.quantization.pt2e_quantize.value + ) quantizers.append(vulkan_quantizer) logging.info(f"Applying quantizers: {quantizers}") return pt2e_quant_params, quantizers, quant_dtype @@ -1029,7 +1048,7 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901 ) additional_passes = [] - if llm_config.base.model_class in TORCHTUNE_DEFINED_MODELS: + if llm_config.base.model_class.value in TORCHTUNE_DEFINED_MODELS: additional_passes = [InitializedMutableBufferPass(["kv_cache_pos"])] # export_to_edge @@ -1068,14 +1087,22 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901 mps=llm_config.backend.mps.enabled, coreml=llm_config.backend.coreml.enabled, qnn=llm_config.backend.qnn.enabled, - dtype_override=llm_config.model.dtype_override, + dtype_override=llm_config.model.dtype_override.value, enable_dynamic_shape=llm_config.model.enable_dynamic_shape, use_kv_cache=llm_config.model.use_kv_cache, embedding_quantize=llm_config.quantization.embedding_quantize, - pt2e_quantize=llm_config.quantization.pt2e_quantize, + pt2e_quantize=( + llm_config.quantization.pt2e_quantize.value + if llm_config.quantization.pt2e_quantize + else None + ), coreml_ios=llm_config.backend.coreml.ios, - coreml_quantize=llm_config.backend.coreml.quantize, - coreml_compute_units=llm_config.backend.coreml.compute_units, + coreml_quantize=( + llm_config.backend.coreml.quantize.value + if llm_config.backend.coreml.quantize + else None + ), + coreml_compute_units=llm_config.backend.coreml.compute_units.value, use_qnn_sha=llm_config.backend.qnn.use_sha, num_sharding=llm_config.backend.qnn.num_sharding, soc_model=llm_config.backend.qnn.soc_model, @@ -1148,7 +1175,7 @@ def _load_llama_model(llm_config: LlmConfig) -> "LLMEdgeManager": An instance of LLMEdgeManager which contains the eager mode model. """ - modelname = llm_config.base.model_class + modelname = llm_config.base.model_class.value if modelname in EXECUTORCH_DEFINED_MODELS: module_name = "llama" model_class_name = "Llama2Model" # TODO: Change to "LlamaModel" in examples/models/llama/model.py. @@ -1169,7 +1196,7 @@ def _load_llama_model(llm_config: LlmConfig) -> "LLMEdgeManager": ) ) # Convert dtype override string to actual type. 
- dtype_override = DType[llm_config.model.dtype_override] + dtype_override = DType[llm_config.model.dtype_override.value] return LLMEdgeManager( model=model, diff --git a/examples/models/llama/model.py b/examples/models/llama/model.py index ec9646be6f4..f3b59fbcb4f 100644 --- a/examples/models/llama/model.py +++ b/examples/models/llama/model.py @@ -16,7 +16,7 @@ get_default_model_resource_dir, ) -from executorch.examples.models.llama.config.llm_config import LlmConfig +from executorch.extension.llm.export.config.llm_config import LlmConfig from executorch.examples.models.llama.llama_transformer import construct_transformer from executorch.examples.models.llama.model_args import ModelArgs from executorch.examples.models.llama.rope import Rope @@ -157,7 +157,7 @@ def __init__(self, llm_config: Optional[LlmConfig] = None): if model_args.use_scaled_rope: # Older models don't have use_scaled_rope configuration - model_name = str(self.llm_config.base.model_class) + model_name = self.llm_config.base.model_class.value assert model_name not in ["llama2", "stories110m"] # Llama3_2 and newer models in ExecuTorch repo should set larger scale factor @@ -328,10 +328,10 @@ def get_example_inputs_kvcache_sdpa(self): def _transform_for_pre_quantization(self, checkpoint, model_args): assert self.llm_config.base.preq_mode, "preq_mode must be specified" - assert self.llm_config.base.preq_mode in [ + assert self.llm_config.base.preq_mode.value in [ "8da4w", "8da4w_output_8da8w", - ], f"Quantization mode {self.llm_config.base.preq_mode} is not compatible with SpinQuant." + ], f"Quantization mode {self.llm_config.base.preq_mode.value} is not compatible with SpinQuant." assert self.llm_config.base.preq_group_size, "preq_group_size must be specified" assert self.llm_config.model.dtype_override, "dtype_override must be specified" @@ -351,7 +351,7 @@ def _transform_for_pre_quantization(self, checkpoint, model_args): } # Transform the output layer first if needed. 
- if self.llm_config.base.preq_mode == "8da4w_output_8da8w": + if self.llm_config.base.preq_mode.value == "8da4w_output_8da8w": from .source_transformation.pre_quantization import ( transform_output_linear_for_pre_quantization, ) @@ -359,14 +359,14 @@ def _transform_for_pre_quantization(self, checkpoint, model_args): self.model_ = transform_output_linear_for_pre_quantization( module=self.model_, checkpoint=checkpoint, - dtype=mapping[self.llm_config.model.dtype_override], + dtype=mapping[self.llm_config.model.dtype_override.value], ) self.model_ = transform_linear_for_pre_quantization( self.model_, checkpoint, self.llm_config.base.preq_group_size, - mapping[self.llm_config.model.dtype_override], + mapping[self.llm_config.model.dtype_override.value], ) embedding_bit_width, embedding_group_size = None, None @@ -390,7 +390,7 @@ def _transform_for_pre_quantization(self, checkpoint, model_args): self.model_ = transform_embedding_for_pre_quantization( self.model_, checkpoint, - mapping[self.llm_config.model.dtype_override], + mapping[self.llm_config.model.dtype_override.value], int(embedding_bit_width), embedding_group_size, ) diff --git a/examples/models/llama/runner/eager.py b/examples/models/llama/runner/eager.py index c55ad0eea28..5521b2edc94 100644 --- a/examples/models/llama/runner/eager.py +++ b/examples/models/llama/runner/eager.py @@ -10,7 +10,7 @@ import torch -from executorch.examples.models.llama.config.llm_config import LlmConfig +from executorch.extension.llm.export.config.llm_config import LlmConfig from executorch.examples.models.llama.export_llama_lib import ( _prepare_for_llama_export, build_args_parser as _build_args_parser, diff --git a/examples/models/llama/tests/test_export_llama_lib.py b/examples/models/llama/tests/test_export_llama_lib.py index f2ac9497604..74dea3eb536 100644 --- a/examples/models/llama/tests/test_export_llama_lib.py +++ b/examples/models/llama/tests/test_export_llama_lib.py @@ -7,7 +7,7 @@ import unittest from executorch.devtools.backend_debug import get_delegation_info -from executorch.examples.models.llama.config.llm_config import LlmConfig +from executorch.extension.llm.export.config.llm_config import LlmConfig from executorch.examples.models.llama.export_llama_lib import ( _export_llama, build_args_parser, diff --git a/examples/models/llama2/README.md b/examples/models/llama2/README.md index 615ad3948fc..21f761b7f71 100644 --- a/examples/models/llama2/README.md +++ b/examples/models/llama2/README.md @@ -37,7 +37,7 @@ You can export and run the original Llama 2 7B model. 3. Export model and generate `.pte` file: ``` - python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 + python -m extension.llm.export.export_llm base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True quantization.qmode="8da4w" quantization.group_size=128 model.dtype_override="fp32" ``` 4. Create tokenizer.bin. 
``` diff --git a/examples/models/llama3_2_vision/runner/eager.py b/examples/models/llama3_2_vision/runner/eager.py index 5e68a43bf8e..66591c3dcdd 100644 --- a/examples/models/llama3_2_vision/runner/eager.py +++ b/examples/models/llama3_2_vision/runner/eager.py @@ -8,7 +8,7 @@ from typing import Optional import torch -from executorch.examples.models.llama.config.llm_config import LlmConfig +from executorch.extension.llm.export.config.llm_config import LlmConfig from executorch.examples.models.llama.export_llama_lib import _prepare_for_llama_export from executorch.examples.models.llama.runner.eager import execute_runner diff --git a/examples/models/llava/export_llava.py b/examples/models/llava/export_llava.py index 32b3ff448ac..84361d6f64b 100644 --- a/examples/models/llava/export_llava.py +++ b/examples/models/llava/export_llava.py @@ -16,7 +16,7 @@ get_symmetric_quantization_config, XNNPACKQuantizer, ) -from executorch.examples.models.llama.config.llm_config import LlmConfig +from executorch.extension.llm.export.config.llm_config import LlmConfig from executorch.examples.models.llama.export_llama_lib import ( get_quantizer_and_quant_params, ) diff --git a/examples/models/phi_4_mini/README.md b/examples/models/phi_4_mini/README.md index a23e4f49638..d168d54226e 100644 --- a/examples/models/phi_4_mini/README.md +++ b/examples/models/phi_4_mini/README.md @@ -7,9 +7,9 @@ Phi-4-mini uses the same example code as Llama, while the checkpoint, model para All commands for exporting and running Llama on various backends should also be applicable to Phi-4-mini, by swapping the following args: ``` ---model phi_4_mini ---params examples/models/phi-4-mini/config.json ---checkpoint +base.model_class="phi_4_mini" +base.params="examples/models/phi-4-mini/config.json" +base.checkpoint= ``` ### Generate the Checkpoint @@ -32,17 +32,17 @@ Export to XNNPack, no quantization: # Set these paths to point to the downloaded files PHI_CHECKPOINT=path/to/checkpoint.pth -python -m examples.models.llama.export_llama \ - --model phi_4_mini \ - --checkpoint "${PHI_CHECKPOINT=path/to/checkpoint.pth:?}" \ - --params examples/models/phi-4-mini/config.json \ - -kv \ - --use_sdpa_with_kv_cache \ - -d fp32 \ - -X \ - --metadata '{"get_bos_id":151643, "get_eos_ids":[151643]}' \ - --output_name="phi-4-mini.pte" - --verbose +python -m extension.llm.export.export_llm \ + base.model_class="phi_4_mini" \ + base.checkpoint="${PHI_CHECKPOINT=path/to/checkpoint.pth:?}" \ + base.params="examples/models/phi-4-mini/config.json" \ + model.use_kv_cache=True \ + model.use_sdpa_with_kv_cache=True \ + model.dtype_override="fp32" \ + backend.xnnpack.enabled=True \ + base.metadata='"{\"get_bos_id\":151643, \"get_eos_ids\":[151643]}"' \ + export.output_name="phi-4-mini.pte" \ + debug.verbose=True ``` Run using the executor runner: diff --git a/examples/models/qwen2_5/README.md b/examples/models/qwen2_5/README.md index 9bf791a35ed..57784169ece 100644 --- a/examples/models/qwen2_5/README.md +++ b/examples/models/qwen2_5/README.md @@ -7,9 +7,9 @@ Qwen 2.5 uses the same example code as Llama, while the checkpoint, model params All commands for exporting and running Llama on various backends should also be applicable to Qwen 2.5, by swapping the following args: ``` ---model qwen2_5 ---params examples/models/qwen2_5/1_5b_config.json ---checkpoint +base.model_class="qwen2_5" +base.params="examples/models/qwen2_5/1_5b_config.json" +base.checkpoint= ``` ### Generate the Checkpoint @@ -32,17 +32,17 @@ Export to XNNPack, no quantization: # Set these paths 
to point to the downloaded files QWEN_CHECKPOINT=path/to/checkpoint.pth -python -m examples.models.llama.export_llama \ - --model "qwen2_5" \ - --checkpoint "${QWEN_CHECKPOINT:?}" \ - --params examples/models/qwen2_5/1_5b_config.json \ - -kv \ - --use_sdpa_with_kv_cache \ - -d fp32 \ - -X \ - --metadata '{"get_bos_id":151643, "get_eos_ids":[151643]}' \ - --output_name="qwen2_5-1_5b.pte" - --verbose +python -m extension.llm.export.export_llm \ + base.model_class="qwen2_5" \ + base.checkpoint="${QWEN_CHECKPOINT:?}" \ + base.params="examples/models/qwen2_5/1_5b_config.json" \ + model.use_kv_cache=True \ + model.use_sdpa_with_kv_cache=True \ + model.dtype_override="fp32" \ + backend.xnnpack.enabled=True \ + base.metadata='"{\"get_bos_id\":151643, \"get_eos_ids\":[151643]}"' \ + export.output_name="qwen2_5-1_5b.pte" \ + debug.verbose=True ``` Run using the executor runner: diff --git a/examples/models/qwen3/README.md b/examples/models/qwen3/README.md index a589d27c19d..e24d8da2637 100644 --- a/examples/models/qwen3/README.md +++ b/examples/models/qwen3/README.md @@ -7,8 +7,8 @@ Qwen 3 uses the same example code as our optimized Llama model, while the checkp All commands for exporting and running Llama on various backends should also be applicable to Qwen 3, by swapping the following args: ``` ---model [qwen3-0.6b,qwen3-1_7b,qwen3-4b] ---params [examples/models/qwen3/0_6b_config.json,examples/models/qwen3/1_7b_config.json,examples/models/qwen3/4b_config.json] +base.model_class=[qwen3_0_6b,qwen3_1_7b,qwen3_4b] +base.params=[examples/models/qwen3/0_6b_config.json,examples/models/qwen3/1_7b_config.json,examples/models/qwen3/4b_config.json] ``` ### Example export @@ -16,58 +16,58 @@ Here is a basic example for exporting Qwen 3, although please refer to the Llama Export 0.6b to XNNPack, quantized with 8da4w: ``` -python -m examples.models.llama.export_llama \ - --model qwen3-0_6b \ - --params examples/models/qwen3/0_6b_config.json \ - -kv \ - --use_sdpa_with_kv_cache \ - -d fp32 \ - -X \ - --xnnpack-extended-ops \ - -qmode 8da4w \ - --metadata '{"get_bos_id": 151644, "get_eos_ids":[151645]}' \ - --output_name="qwen3-0_6b.pte" \ - --verbose +python -m extension.llm.export.export_llm \ + base.model_class="qwen3_0_6b" \ + base.params="examples/models/qwen3/0_6b_config.json" \ + model.use_kv_cache=True \ + model.use_sdpa_with_kv_cache=True \ + model.dtype_override="fp32" \ + backend.xnnpack.enabled=True \ + backend.xnnpack.extended_ops=True \ + quantization.qmode="8da4w" \ + base.metadata='"{\"get_bos_id\": 151644, \"get_eos_ids\":[151645]}"' \ + export.output_name="qwen3_0_6b.pte" \ + debug.verbose=True ``` Export 1.7b to XNNPack, quantized with 8da4w: ``` -python -m examples.models.llama.export_llama \ - --model qwen3-1_7b \ - --params examples/models/qwen3/1_7b_config.json \ - -kv \ - --use_sdpa_with_kv_cache \ - -d fp32 \ - -X \ - --xnnpack-extended-ops \ - -qmode 8da4w \ - --metadata '{"get_bos_id": 151644, "get_eos_ids":[151645]}' \ - --output_name="qwen3-1_7b.pte" \ - --verbose +python -m extension.llm.export.export_llm \ + base.model_class="qwen3_1_7b" \ + base.params="examples/models/qwen3/1_7b_config.json" \ + model.use_kv_cache=True \ + model.use_sdpa_with_kv_cache=True \ + model.dtype_override="fp32" \ + backend.xnnpack.enabled=True \ + backend.xnnpack.extended_ops=True \ + quantization.qmode="8da4w" \ + base.metadata='"{\"get_bos_id\": 151644, \"get_eos_ids\":[151645]}"' \ + export.output_name="qwen3_1_7b.pte" \ + debug.verbose=True ``` Export 4b to XNNPack, quantized with 8da4w: ``` -python 
-m examples.models.llama.export_llama \ - --model qwen3-4b \ - --params examples/models/qwen3/4b_config.json \ - -kv \ - --use_sdpa_with_kv_cache \ - -d fp32 \ - -X \ - --xnnpack-extended-ops \ - -qmode 8da4w \ - --metadata '{"get_bos_id": 151644, "get_eos_ids":[151645]}' \ - --output_name="qwen3-4b.pte" \ - --verbose +python -m extension.llm.export.export_llm \ + base.model_class="qwen3_4b" \ + base.params="examples/models/qwen3/4b_config.json" \ + model.use_kv_cache=True \ + model.use_sdpa_with_kv_cache=True \ + model.dtype_override="fp32" \ + backend.xnnpack.enabled=True \ + backend.xnnpack.extended_ops=True \ + quantization.qmode="8da4w" \ + base.metadata='"{\"get_bos_id\": 151644, \"get_eos_ids\":[151645]}"' \ + export.output_name="qwen3_4b.pte" \ + debug.verbose=True ``` ### Example run With ExecuTorch pybindings: ``` python -m examples.models.llama.runner.native - --model qwen3-0_6b \ - --pte qwen3-0_6b.pte \ + --model qwen3_0_6b \ + --pte qwen3_0_6b.pte \ --tokenizer ~/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/a9c98e602b9d36d2a2f7ba1eb0f5f31e4e8e5143/tokenizer.json \ --tokenizer_config ~/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/a9c98e602b9d36d2a2f7ba1eb0f5f31e4e8e5143/tokenizer_config.json \ --prompt "Who is the president of the US?" \ @@ -80,7 +80,7 @@ python -m examples.models.llama.runner.native With ExecuTorch's sample c++ runner (see the Llama README's [Step 3: Run on your computer to validate](../llama/README.md#step-3-run-on-your-computer-to-validate) to build the runner): ``` cmake-out/examples/models/llama/llama_main - --model_path qwen3-0_6b.pte + --model_path qwen3_0_6b.pte --tokenizer_path ~/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/a9c98e602b9d36d2a2f7ba1eb0f5f31e4e8e5143/tokenizer.json --prompt="Who is the president of the US?" ``` diff --git a/extension/android/executorch_android/android_test_setup.sh b/extension/android/executorch_android/android_test_setup.sh index 682a1d16787..f521dac30c5 100644 --- a/extension/android/executorch_android/android_test_setup.sh +++ b/extension/android/executorch_android/android_test_setup.sh @@ -25,7 +25,7 @@ prepare_tinyllama() { # Create params.json file touch params.json echo '{"dim": 288, "multiple_of": 32, "n_heads": 6, "n_layers": 6, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json - python -m examples.models.llama.export_llama -c stories15M.pt -p params.json -d fp16 -n stories15m_h.pte -kv + python -m extension.llm.export.export_llm base.checkpoint=stories15M.pt base.params=params.json model.dtype_override=fp16 export.output_name=stories15m_h.pte model.use_kv_cache=true python -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin cp stories15m_h.pte "${BASEDIR}/src/androidTest/resources/stories.pte" diff --git a/extension/llm/export/README.md b/extension/llm/export/README.md new file mode 100644 index 00000000000..6bf73dc7990 --- /dev/null +++ b/extension/llm/export/README.md @@ -0,0 +1,137 @@ +# LLM Export API + +This directory contains the unified API for exporting Large Language Models (LLMs) to ExecuTorch. The `export_llm` module provides a streamlined interface to convert various LLM architectures to optimized `.pte` files for on-device inference. + +## Overview + +The LLM export process transforms a model from its original format to an optimized representation suitable for mobile and edge devices. This involves several key steps: + +1. **Model Instantiation**: Load the model architecture and weights from sources like Hugging Face +2. 
**Source Transformations**: Apply model-specific optimizations and quantization +3. **IR Export**: Convert to intermediate representations (EXIR, Edge dialect) +4. **Graph Transformations**: Apply backend-specific optimizations and PT2E quantization +5. **Backend Delegation**: Partition operations to hardware-specific backends (XNNPACK, CoreML, QNN, etc.) +6. **Serialization**: Export to final ExecuTorch `.pte` format + +## Supported Models + +- **Llama**: Llama 2, Llama 3, Llama 3.1, Llama 3.2 (1B, 3B, 8B variants) +- **Qwen**: Qwen 2.5, Qwen 3 (0.6B, 1.7B, 4B variants) +- **Phi**: Phi-3-Mini, Phi-4-Mini +- **Stories**: Stories110M (educational model) +- **SmolLM**: SmolLM2 + +## Usage + +The export API supports two configuration approaches: + +### Option 1: Hydra CLI Arguments + +Use structured configuration arguments directly on the command line: + +```bash +python -m extension.llm.export.export_llm \ + base.model_class=llama3 \ + model.use_sdpa_with_kv_cache=True \ + model.use_kv_cache=True \ + export.max_seq_length=128 \ + debug.verbose=True \ + backend.xnnpack.enabled=True \ + backend.xnnpack.extended_ops=True \ + quantization.qmode=8da4w +``` + +### Option 2: Configuration File + +Create a YAML configuration file and reference it: + +```bash +python -m extension.llm.export.export_llm --config my_config.yaml +``` + +Example `my_config.yaml`: +```yaml +base: + model_class: llama3 + tokenizer_path: /path/to/tokenizer.json + +model: + use_kv_cache: true + use_sdpa_with_kv_cache: true + enable_dynamic_shape: true + +export: + max_seq_length: 512 + output_dir: ./exported_models + output_name: llama3_optimized.pte + +quantization: + qmode: 8da4w + group_size: 32 + +backend: + xnnpack: + enabled: true + extended_ops: true + +debug: + verbose: true +``` + +**Important**: You cannot mix both approaches. Use either CLI arguments OR a config file, not both. 
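As a quick, hedged illustration of this constraint (`my_config.yaml` here is the hypothetical file from the example above):

```bash
# Either pure Hydra CLI overrides...
python -m extension.llm.export.export_llm base.model_class=llama3 model.use_kv_cache=True

# ...or a config file on its own. Combining --config with extra CLI overrides
# is rejected by export_llm with a ValueError.
python -m extension.llm.export.export_llm --config my_config.yaml
```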
+ +## Example Commands + +### Export Qwen3 0.6B with XNNPACK backend and quantization +```bash +python -m extension.llm.export.export_llm \ + base.model_class=qwen3_0_6b \ + base.params=examples/models/qwen3/0_6b_config.json \ + base.metadata='{"get_bos_id": 151644, "get_eos_ids":[151645]}' \ + model.use_kv_cache=true \ + model.use_sdpa_with_kv_cache=true \ + model.dtype_override=fp32 \ + export.max_seq_length=512 \ + export.output_name=qwen3_0_6b.pte \ + quantization.qmode=8da4w \ + backend.xnnpack.enabled=true \ + backend.xnnpack.extended_ops=true \ + debug.verbose=true +``` + +### Export Phi-4-Mini with custom checkpoint +```bash +python -m extension.llm.export.export_llm \ + base.model_class=phi_4_mini \ + base.checkpoint=/path/to/phi4_checkpoint.pth \ + base.params=examples/models/phi-4-mini/config.json \ + base.metadata='{"get_bos_id":151643, "get_eos_ids":[151643]}' \ + model.use_kv_cache=true \ + model.use_sdpa_with_kv_cache=true \ + export.max_seq_length=256 \ + export.output_name=phi4_mini.pte \ + backend.xnnpack.enabled=true \ + debug.verbose=true +``` + +### Export with CoreML backend (iOS optimization) +```bash +python -m extension.llm.export.export_llm \ + base.model_class=llama3 \ + model.use_kv_cache=true \ + export.max_seq_length=128 \ + backend.coreml.enabled=true \ + backend.coreml.compute_units=all \ + quantization.pt2e_quantize=coreml_c4w \ + debug.verbose=true +``` + +## Configuration Options + +For a complete reference of all available configuration options, see the [LlmConfig class definition](config/llm_config.py) which documents all supported parameters for base, model, export, quantization, backend, and debug configurations. + +## Further Reading + +- [Llama Examples](../../../examples/models/llama/README.md) - Comprehensive Llama export guide +- [LLM Runner](../runner/) - Running exported models +- [ExecuTorch Documentation](https://pytorch.org/executorch/) - Framework overview \ No newline at end of file diff --git a/extension/llm/export/config/TARGETS b/extension/llm/export/config/TARGETS new file mode 100644 index 00000000000..bf8d13dcf37 --- /dev/null +++ b/extension/llm/export/config/TARGETS @@ -0,0 +1,5 @@ +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() \ No newline at end of file diff --git a/examples/models/llama/config/llm_config.py b/extension/llm/export/config/llm_config.py similarity index 93% rename from examples/models/llama/config/llm_config.py rename to extension/llm/export/config/llm_config.py index 034d8af7562..a4d1dcc2840 100644 --- a/examples/models/llama/config/llm_config.py +++ b/extension/llm/export/config/llm_config.py @@ -10,13 +10,17 @@ Configurations for exporting Llama. Uses dataclasses, which integrate with OmegaConf and Hydra. + +Note: +- Hydra is a bit finicky with string values that include quotations; please +refer to https://hydra.cc/docs/1.2/advanced/override_grammar/basic/#quoted-values +for more information.
""" import argparse import ast import re from dataclasses import dataclass, field -from enum import Enum from typing import ClassVar, List, Optional @@ -26,19 +30,19 @@ class ModelType(str, Enum): - STORIES110M = "stories110m" - LLAMA2 = "llama2" - LLAMA3 = "llama3" - LLAMA3_1 = "llama3_1" - LLAMA3_2 = "llama3_2" - LLAMA3_2_VISION = "llama3_2_vision" - STATIC_LLAMA = "static_llama" - QWEN2_5 = "qwen2_5" - QWEN3_0_6B = "qwen3-0_6b" - QWEN3_1_7B = "qwen3-1_7b" - QWEN3_4B = "qwen3-4b" - PHI_4_MINI = "phi_4_mini" - SMOLLM2 = "smollm2" + stories110m = "stories110m" + llama2 = "llama2" + llama3 = "llama3" + llama3_1 = "llama3_1" + llama3_2 = "llama3_2" + llama3_2_vision = "llama3_2_vision" + static_llama = "static_llama" + qwen2_5 = "qwen2_5" + qwen3_0_6b = "qwen3_0_6b" + qwen3_1_7b = "qwen3_1_7b" + qwen3_4b = "qwen3_4b" + phi_4_mini = "phi_4_mini" + smollm2 = "smollm2" class PreqMode(str, Enum): @@ -49,8 +53,8 @@ class PreqMode(str, Enum): are still around to preserve backward compatibility. """ - PREQ_8DA4W = "8da4w" - PREQ_8DA4W_OUT_8DA8W = "8da4w_output_8da8w" + preq_8da4w = "8da4w" + preq_8da4w_out_8da8w = "8da4w_output_8da8w" @dataclass @@ -65,11 +69,13 @@ class BaseConfig: params: Model parameters, such as n_layers, hidden_size, etc. If left empty will use defaults specified in model_args.py. checkpoint: Path to the checkpoint file. - If left empty, the model will be initialized with random weights. + If left empty, the model will either be initialized with random weights + if it is a Llama model or the weights will be downloaded from HuggingFace + if it is a non-Llama model. checkpoint_dir: Path to directory containing sharded checkpoint files. tokenizer_path: Path to the tokenizer file. metadata: Json string containing metadata information. - e.g. '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' + e.g. '"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' use_lora: Rank of the LoRA, if set to 0 then this means no LoRA. For use with QAT. fairseq2: For legacy internal use cases, this is safe to ignore. preq_mode: Legacy option to specify how prequantized weights are loaded. @@ -80,7 +86,7 @@ class BaseConfig: are loaded. """ - model_class: ModelType = ModelType.LLAMA3 + model_class: ModelType = ModelType.llama3 params: Optional[str] = None checkpoint: Optional[str] = None checkpoint_dir: Optional[str] = None @@ -105,9 +111,9 @@ class DtypeOverride(str, Enum): is not recommended. """ - FP32 = "fp32" - FP16 = "fp16" - BF16 = "bf16" + fp32 = "fp32" + fp16 = "fp16" + bf16 = "bf16" @dataclass @@ -145,7 +151,7 @@ class ModelConfig: [16] pattern specifies all layers have a sliding window of 16. """ - dtype_override: DtypeOverride = DtypeOverride.FP32 + dtype_override: DtypeOverride = DtypeOverride.fp32 enable_dynamic_shape: bool = True use_shared_embedding: bool = False use_sdpa_with_kv_cache: bool = False @@ -268,22 +274,22 @@ class Pt2eQuantize(str, Enum): and is source transform-based. 
""" - XNNPACK_DYNAMIC = "xnnpack_dynamic" - XNNPACK_DYNAMIC_QC4 = "xnnpack_dynamic_qc4" - QNN_8A8W = "qnn_8a8w" - QNN_16A16W = "qnn_16a16w" - QNN_16A4W = "qnn_16a4w" - COREML_C4W = "coreml_c4w" - COREML_8A_C8W = "coreml_8a_c8w" - COREML_8A_C4W = "coreml_8a_c4w" - COREML_BASELINE_8A_C8W = "coreml_baseline_8a_c8w" - COREML_BASELINE_8A_C4W = "coreml_baseline_8a_c4w" - VULKAN_8W = "vulkan_8w" + xnnpack_dynamic = "xnnpack_dynamic" + xnnpack_dynamic_qc4 = "xnnpack_dynamic_qc4" + qnn_8a8w = "qnn_8a8w" + qnn_16a16w = "qnn_16a16w" + qnn_16a4w = "qnn_16a4w" + coreml_c4w = "coreml_c4w" + coreml_8a_c8w = "coreml_8a_c8w" + coreml_8a_c4w = "coreml_8a_c4w" + coreml_baseline_8a_c8w = "coreml_baseline_8a_c8w" + coreml_baseline_8a_c4w = "coreml_baseline_8a_c4w" + vulkan_8w = "vulkan_8w" class SpinQuant(str, Enum): - CUDA = "cuda" - NATIVE = "native" + cuda = "cuda" + native = "native" @dataclass @@ -376,15 +382,15 @@ class XNNPackConfig: class CoreMLQuantize(str, Enum): - B4W = "b4w" - C4W = "c4w" + b4w = "b4w" + c4w = "c4w" class CoreMLComputeUnit(str, Enum): - CPU_ONLY = "cpu_only" - CPU_AND_GPU = "cpu_and_gpu" - CPU_AND_NE = "cpu_and_ne" - ALL = "all" + cpu_only = "cpu_only" + cpu_and_gpu = "cpu_and_gpu" + cpu_and_ne = "cpu_and_ne" + all = "all" @dataclass @@ -398,7 +404,7 @@ class CoreMLConfig: preserve_sdpa: bool = False quantize: Optional[CoreMLQuantize] = None ios: int = 15 - compute_units: CoreMLComputeUnit = CoreMLComputeUnit.CPU_ONLY + compute_units: CoreMLComputeUnit = CoreMLComputeUnit.cpu_only def __post_init__(self): if self.ios not in (15, 16, 17, 18): diff --git a/extension/llm/export/config/targets.bzl b/extension/llm/export/config/targets.bzl new file mode 100644 index 00000000000..4135b336fbd --- /dev/null +++ b/extension/llm/export/config/targets.bzl @@ -0,0 +1,15 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + runtime.python_library( + name = "llm_config", + srcs = [ + "llm_config.py", + ], + _is_external_target = True, + base_module = "executorch.extension.llm.export.config", + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + ) \ No newline at end of file diff --git a/extension/llm/export/export_llm.py b/extension/llm/export/export_llm.py index 09a15d6ab58..9caaf7b6ad0 100644 --- a/extension/llm/export/export_llm.py +++ b/extension/llm/export/export_llm.py @@ -23,11 +23,19 @@ backend.xnnpack.enabled=True \ backend.xnnpack.extended_ops=True \ quantization.qmode="8da4w" + +Example usage using config file: +python -m extension.llm.export.export_llm \ + --config example_llm_config.yaml """ +import argparse +import sys +from typing import Any, List, Tuple + import hydra -from executorch.examples.models.llama.config.llm_config import LlmConfig +from executorch.extension.llm.export.config.llm_config import LlmConfig from executorch.examples.models.llama.export_llama_lib import export_llama from hydra.core.config_store import ConfigStore from omegaconf import OmegaConf @@ -36,10 +44,50 @@ cs.store(name="llm_config", node=LlmConfig) -@hydra.main(version_base=None, config_path=None, config_name="llm_config") -def main(llm_config: LlmConfig) -> None: +def parse_config_arg() -> Tuple[str, List[Any]]: + """First parse out the arg for whether to use Hydra or the old CLI.""" + parser = argparse.ArgumentParser(add_help=True) + parser.add_argument("--config", type=str, help="Path to the LlmConfig file") + args, remaining = parser.parse_known_args() + return args.config, remaining + + +def pop_config_arg() -> str: + """ + 
Removes '--config' and its value from sys.argv. + Assumes --config is specified and argparse has already validated the args. + """ + idx = sys.argv.index("--config") + value = sys.argv[idx + 1] + del sys.argv[idx : idx + 2] + return value + + +@hydra.main(version_base=None, config_name="llm_config") +def hydra_main(llm_config: LlmConfig) -> None: export_llama(OmegaConf.to_object(llm_config)) +def main() -> None: + config, remaining_args = parse_config_arg() + if config: + # Check if there are any remaining hydra CLI args when --config is specified + # This might change in the future to allow overriding config file values + if remaining_args: + raise ValueError( + "Cannot specify additional CLI arguments when using --config. " + f"Found: {remaining_args}. Use either --config file or hydra CLI args, not both." + ) + + config_file_path = pop_config_arg() + default_llm_config = LlmConfig() + llm_config_from_file = OmegaConf.load(config_file_path) + # Override defaults with values specified in the .yaml provided by --config. + merged_llm_config = OmegaConf.merge(default_llm_config, llm_config_from_file) + export_llama(merged_llm_config) + else: + hydra_main() + + if __name__ == "__main__": main() diff --git a/extension/llm/export/test/test_export_llm.py b/extension/llm/export/test/test_export_llm.py new file mode 100644 index 00000000000..7d17b7819d3 --- /dev/null +++ b/extension/llm/export/test/test_export_llm.py @@ -0,0 +1,155 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import os +import sys +import tempfile +import unittest +from unittest.mock import MagicMock, patch + +from executorch.extension.llm.export.export_llm import ( + main, + parse_config_arg, + pop_config_arg, +) + + +class TestExportLlm(unittest.TestCase): + def test_parse_config_arg_with_config(self) -> None: + """Test parse_config_arg when --config is provided.""" + # Mock sys.argv to include --config + test_argv = ["script.py", "--config", "test_config.yaml", "extra", "args"] + with patch.object(sys, "argv", test_argv): + config_path, remaining = parse_config_arg() + self.assertEqual(config_path, "test_config.yaml") + self.assertEqual(remaining, ["extra", "args"]) + + def test_parse_config_arg_without_config(self) -> None: + """Test parse_config_arg when --config is not provided.""" + test_argv = ["script.py", "debug.verbose=True"] + with patch.object(sys, "argv", test_argv): + config_path, remaining = parse_config_arg() + self.assertIsNone(config_path) + self.assertEqual(remaining, ["debug.verbose=True"]) + + def test_pop_config_arg(self) -> None: + """Test pop_config_arg removes --config and its value from sys.argv.""" + test_argv = ["script.py", "--config", "test_config.yaml", "other", "args"] + with patch.object(sys, "argv", test_argv): + config_path = pop_config_arg() + self.assertEqual(config_path, "test_config.yaml") + self.assertEqual(sys.argv, ["script.py", "other", "args"]) + + @patch("executorch.extension.llm.export.export_llm.export_llama") + def test_with_config(self, mock_export_llama: MagicMock) -> None: + """Test main function with --config file and no hydra args.""" + # Create a temporary config file + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + f.write( + """ +base: + model_class: llama2 + tokenizer_path: /path/to/tokenizer.json + preq_mode: preq_8da4w +model: + dtype_override: fp16 +export: + 
max_seq_length: 256 +quantization: + pt2e_quantize: xnnpack_dynamic + use_spin_quant: cuda +backend: + coreml: + quantize: c4w + compute_units: cpu_and_gpu +""" + ) + config_file = f.name + + try: + test_argv = ["script.py", "--config", config_file] + with patch.object(sys, "argv", test_argv): + main() + + # Verify export_llama was called with config + mock_export_llama.assert_called_once() + called_config = mock_export_llama.call_args[0][0] + self.assertEqual( + called_config["base"]["tokenizer_path"], "/path/to/tokenizer.json" + ) + self.assertEqual(called_config["base"]["model_class"], "llama2") + self.assertEqual(called_config["base"]["preq_mode"].value, "8da4w") + self.assertEqual(called_config["model"]["dtype_override"].value, "fp16") + self.assertEqual(called_config["export"]["max_seq_length"], 256) + self.assertEqual( + called_config["quantization"]["pt2e_quantize"].value, "xnnpack_dynamic" + ) + self.assertEqual( + called_config["quantization"]["use_spin_quant"].value, "cuda" + ) + self.assertEqual( + called_config["backend"]["coreml"]["quantize"].value, "c4w" + ) + self.assertEqual( + called_config["backend"]["coreml"]["compute_units"].value, "cpu_and_gpu" + ) + finally: + os.unlink(config_file) + + def test_with_cli_args(self) -> None: + """Test main function with only hydra CLI args.""" + test_argv = ["script.py", "debug.verbose=True"] + with patch.object(sys, "argv", test_argv): + with patch( + "executorch.extension.llm.export.export_llm.hydra_main" + ) as mock_hydra: + main() + mock_hydra.assert_called_once() + + def test_config_with_cli_args_error(self) -> None: + """Test that --config rejects additional CLI arguments to prevent mixing approaches.""" + # Create a temporary config file + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + f.write("base:\n checkpoint: /path/to/checkpoint.pth") + config_file = f.name + + try: + test_argv = ["script.py", "--config", config_file, "debug.verbose=True"] + with patch.object(sys, "argv", test_argv): + with self.assertRaises(ValueError) as cm: + main() + + error_msg = str(cm.exception) + self.assertIn( + "Cannot specify additional CLI arguments when using --config", + error_msg, + ) + finally: + os.unlink(config_file) + + def test_config_rejects_multiple_cli_args(self) -> None: + """Test that --config rejects multiple CLI arguments (not just single ones).""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + f.write("export:\n max_seq_length: 128") + config_file = f.name + + try: + test_argv = [ + "script.py", + "--config", + config_file, + "debug.verbose=True", + "export.output_dir=/tmp", + ] + with patch.object(sys, "argv", test_argv): + with self.assertRaises(ValueError): + main() + finally: + os.unlink(config_file) + + +if __name__ == "__main__": + unittest.main() diff --git a/requirements-dev.txt b/requirements-dev.txt index a4ed212fb65..07c63101eb8 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -9,3 +9,5 @@ wheel # For building the pip package archive. zstd # Imported by resolve_buck.py. lintrunner==0.12.7 lintrunner-adapters==0.12.4 +hydra-core>=1.3.0 +omegaconf>=2.3.0
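A closing note on trying this change locally: the new Hydra/OmegaConf dependencies come in through `requirements-dev.txt`, and the `--config` / CLI-argument handling above is covered by the new test file. A minimal sketch, assuming `pytest` is available in your environment (it is not pinned by this diff):

```bash
# Pull in the dev requirements, which now include hydra-core and omegaconf.
pip install -r requirements-dev.txt

# Run the new export_llm config-handling tests (pytest assumed to be installed).
pytest extension/llm/export/test/test_export_llm.py -v
```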