diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh
index 9f183528719..f92a983a340 100644
--- a/.ci/scripts/test_llama.sh
+++ b/.ci/scripts/test_llama.sh
@@ -54,10 +54,7 @@ PT2E_QUANTIZE="${PT2E_QUANTIZE:-}"
 # Default CMake Build Type to release mode
 CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}
 
-if [[ $# -lt 4 ]]; then # Assuming 4 mandatory args
-  echo "Expecting atleast 4 positional arguments"
-  echo "Usage: [...]"
-fi
+# Argument validation is done individually below for each required parameter
 if [[ -z "${MODEL_NAME:-}" ]]; then
   echo "Missing model name, exiting..."
   exit 1
@@ -224,34 +221,34 @@ fi
 
 # Export model.
 EXPORTED_MODEL_NAME="${EXPORTED_MODEL_NAME}.pte"
 echo "Exporting ${EXPORTED_MODEL_NAME}"
-EXPORT_ARGS="-c ${CHECKPOINT_FILE_NAME} -p ${PARAMS} -d ${DTYPE} -n ${EXPORTED_MODEL_NAME} -kv"
+EXPORT_ARGS="base.checkpoint=${CHECKPOINT_FILE_NAME} base.params=${PARAMS} model.dtype_override=${DTYPE} export.output_name=${EXPORTED_MODEL_NAME} model.use_kv_cache=true"
 if [[ "${XNNPACK}" == "ON" ]]; then
-  EXPORT_ARGS="${EXPORT_ARGS} -X --xnnpack-extended-ops -qmode 8da4w -G 128"
+  EXPORT_ARGS="${EXPORT_ARGS} backend.xnnpack.enabled=true backend.xnnpack.extended_ops=true quantization.qmode=8da4w quantization.group_size=128"
 fi
 if [[ "${CUSTOM}" == "ON" ]]; then
-  EXPORT_ARGS="${EXPORT_ARGS} --use_sdpa_with_kv_cache"
+  EXPORT_ARGS="${EXPORT_ARGS} model.use_sdpa_with_kv_cache=true"
 fi
 if [[ "${QE}" == "ON" ]]; then
-  EXPORT_ARGS="${EXPORT_ARGS} --embedding-quantize 8,1024"
+  EXPORT_ARGS="${EXPORT_ARGS} quantization.embedding_quantize=\"8,1024\""
 fi
 if [[ "${MPS}" == "ON" ]]; then
-  EXPORT_ARGS="${EXPORT_ARGS} -kv -v --mps --disable_dynamic_shape"
+  EXPORT_ARGS="${EXPORT_ARGS} backend.mps.enabled=true model.enable_dynamic_shape=false debug.verbose=true"
 fi
 if [[ "${COREML}" == "ON" ]]; then
-  EXPORT_ARGS="${EXPORT_ARGS} -kv -v --coreml --disable_dynamic_shape"
+  EXPORT_ARGS="${EXPORT_ARGS} backend.coreml.enabled=true model.enable_dynamic_shape=false debug.verbose=true"
 fi
 if [[ "${QNN}" == "ON" ]]; then
-  EXPORT_ARGS="${EXPORT_ARGS} -kv -v --qnn --disable_dynamic_shape"
+  EXPORT_ARGS="${EXPORT_ARGS} backend.qnn.enabled=true model.enable_dynamic_shape=false debug.verbose=true"
   echo "PT2E_QUANTIZE is ${PT2E_QUANTIZE}"
   if [[ "${PT2E_QUANTIZE}" == "qnn_16a16w" ]]; then
-    EXPORT_ARGS+=" --tokenizer_path tokenizer.model --pt2e_quantize qnn_16a16w --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --calibration_data Once "
+    EXPORT_ARGS+=" base.tokenizer_path=tokenizer.model quantization.pt2e_quantize=qnn_16a16w quantization.calibration_tasks=[\"wikitext\"] quantization.calibration_limit=1 quantization.calibration_seq_length=128 quantization.calibration_data=\"Once\""
   fi
 fi
 if [[ "${QUANTIZE_KV_CACHE}" == "ON" ]]; then
-  EXPORT_ARGS="${EXPORT_ARGS} --quantize_kv_cache"
+  EXPORT_ARGS="${EXPORT_ARGS} model.quantize_kv_cache=true"
 fi
 
 # Add dynamically linked library location
-$PYTHON_EXECUTABLE -m examples.models.llama.export_llama ${EXPORT_ARGS}
+$PYTHON_EXECUTABLE -m extension.llm.export.export_llm ${EXPORT_ARGS}
 # Create tokenizer.bin.
 echo "Creating tokenizer.bin"
diff --git a/.ci/scripts/test_llama_torchao_lowbit.sh b/.ci/scripts/test_llama_torchao_lowbit.sh
index ac603cc5e83..21989d26770 100644
--- a/.ci/scripts/test_llama_torchao_lowbit.sh
+++ b/.ci/scripts/test_llama_torchao_lowbit.sh
@@ -70,16 +70,16 @@ QLINEAR_GROUP_SIZE=128 # Must be multiple of 16
 QEMBEDDING_BITWIDTH=4 # Can be 1-8
 QEMBEDDING_GROUP_SIZE=32 # Must be multiple of 16
 
-${PYTHON_EXECUTABLE} -m examples.models.llama.export_llama \
-    --checkpoint "${LLAMA_CHECKPOINT:?}" \
-    --params "${LLAMA_PARAMS:?}" \
-    -kv \
-    --use_sdpa_with_kv_cache \
-    --output_name=${MODEL_OUT} \
-    -qmode "torchao:8da${QLINEAR_BITWIDTH}w" \
-    --group_size ${QLINEAR_GROUP_SIZE} \
-    -E "torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \
-    -d fp32
+${PYTHON_EXECUTABLE} -m extension.llm.export.export_llm \
+    base.checkpoint="${LLAMA_CHECKPOINT:?}" \
+    base.params="${LLAMA_PARAMS:?}" \
+    model.use_kv_cache=true \
+    model.use_sdpa_with_kv_cache=true \
+    export.output_name="${MODEL_OUT}" \
+    quantization.qmode="torchao:8da${QLINEAR_BITWIDTH}w" \
+    quantization.group_size=${QLINEAR_GROUP_SIZE} \
+    quantization.embedding_quantize=\"torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}\" \
+    model.dtype_override=fp32
 
 # Test run
 ./cmake-out/examples/models/llama/llama_main --model_path=$MODEL_OUT --tokenizer_path=$TOKENIZER --prompt="Once upon a time,"
diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh
index 4f8dc7a30e5..bbf879295ae 100755
--- a/.ci/scripts/test_model.sh
+++ b/.ci/scripts/test_model.sh
@@ -86,8 +86,8 @@ test_model() {
   if [[ "${MODEL_NAME}" == "llama2" ]]; then
     # Install requirements for export_llama
     bash examples/models/llama/install_requirements.sh
-    # Test export_llama script: python3 -m examples.models.llama.export_llama
-    "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama --model "${MODEL_NAME}" -c examples/models/llama/params/demo_rand_params.pth -p examples/models/llama/params/demo_config.json
+    # Test export_llm script: python3 -m extension.llm.export.export_llm
+    "${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.checkpoint=examples/models/llama/params/demo_rand_params.pth base.params=examples/models/llama/params/demo_config.json
     run_portable_executor_runner
     rm "./${MODEL_NAME}.pte"
   fi
@@ -100,17 +100,17 @@ test_model() {
   if [[ "${MODEL_NAME}" == "qwen2_5" ]]; then
     # Install requirements for export_llama
     bash examples/models/llama/install_requirements.sh
-    # Test export_llama script: python3 -m examples.models.llama.export_llama.
+    # Test export_llm script: python3 -m extension.llm.export.export_llm.
     # Use Llama random checkpoint with Qwen 2.5 1.5b model configuration.
-    "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama --model "${MODEL_NAME}" -p examples/models/qwen2_5/1_5b_config.json
+    "${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/qwen2_5/1_5b_config.json
     rm "./${MODEL_NAME}.pte"
     return # Skip running with portable executor runnner since portable doesn't support Qwen's biased linears.
   fi
   if [[ "${MODEL_NAME}" == "phi_4_mini" ]]; then
     # Install requirements for export_llama
     bash examples/models/llama/install_requirements.sh
-    # Test export_llama script: python3 -m examples.models.llama.export_llama.
-    "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama --model "${MODEL_NAME}" -p examples/models/phi_4_mini/config.json
+    # Test export_llm script: python3 -m extension.llm.export.export_llm.
+    "${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/phi_4_mini/config.json
     run_portable_executor_runner
     rm "./${MODEL_NAME}.pte"
     return
diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml
index 1a6d63f1bd1..a7c2b9ca14c 100644
--- a/.github/workflows/android-perf.yml
+++ b/.github/workflows/android-perf.yml
@@ -214,23 +214,23 @@ jobs:
             --files "tokenizer.model" "params.json" "consolidated.00.pth"
           )
           # Export using ExecuTorch's model definition
-          python -m examples.models.llama.export_llama \
-            --model "llama3_2" \
-            --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
-            --params "${DOWNLOADED_PATH}/params.json" \
-            --use_sdpa_with_kv_cache \
-            -X \
-            --xnnpack-extended-ops \
-            --preq_mode 8da4w_output_8da8w \
-            --preq_group_size 32 \
-            --max_seq_length 2048 \
-            --max_context_length 2048 \
-            --output_name "${OUT_ET_MODEL_NAME}.pte" \
-            -kv \
-            -d fp32 \
-            --preq_embedding_quantize 8,0 \
-            --use_spin_quant native \
-            --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+          python -m extension.llm.export.export_llm \
+            base.model_class="llama3_2" \
+            base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+            base.params="${DOWNLOADED_PATH}/params.json" \
+            model.use_sdpa_with_kv_cache=true \
+            backend.xnnpack.enabled=true \
+            backend.xnnpack.extended_ops=true \
+            base.preq_mode="8da4w_output_8da8w" \
+            base.preq_group_size=32 \
+            export.max_seq_length=2048 \
+            export.max_context_length=2048 \
+            export.output_name="${OUT_ET_MODEL_NAME}.pte" \
+            model.use_kv_cache=true \
+            model.dtype_override=fp32 \
+            base.preq_embedding_quantize=\'8,0\' \
+            quantization.use_spin_quant=native \
+            base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"'
           ls -lh "${OUT_ET_MODEL_NAME}.pte"
         elif [[ ${{ matrix.config }} == "llama3_qlora" ]]; then
           # QAT + LoRA
@@ -241,53 +241,55 @@ jobs:
             --files "tokenizer.model" "params.json" "consolidated.00.pth"
           )
          # Export using ExecuTorch's model definition
-          python -m examples.models.llama.export_llama \
-            --model "llama3_2" \
-            --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
-            --params "${DOWNLOADED_PATH}/params.json" \
-            -qat \
-            -lora 16 \
-            --preq_mode 8da4w_output_8da8w \
-            --preq_group_size 32 \
-            --preq_embedding_quantize 8,0 \
-            --use_sdpa_with_kv_cache \
-            -kv \
-            -X \
-            --xnnpack-extended-ops \
-            -d fp32 \
-            --max_seq_length 2048 \
-            --max_context_length 2048 \
-            --output_name "${OUT_ET_MODEL_NAME}.pte" \
-            --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+          python -m extension.llm.export.export_llm \
+            base.model_class="llama3_2" \
+            base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+            base.params="${DOWNLOADED_PATH}/params.json" \
+            quantization.use_qat=true \
+            base.use_lora=16 \
+            base.preq_mode="8da4w_output_8da8w" \
+            base.preq_group_size=32 \
+            base.preq_embedding_quantize=\'8,0\' \
+            model.use_sdpa_with_kv_cache=true \
+            model.use_kv_cache=true \
+            backend.xnnpack.enabled=true \
+            backend.xnnpack.extended_ops=true \
+            model.dtype_override=fp32 \
+            export.max_seq_length=2048 \
+            export.max_context_length=2048 \
+            export.output_name="${OUT_ET_MODEL_NAME}.pte" \
+            base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"'
           ls -lh "${OUT_ET_MODEL_NAME}.pte"
         elif [[ ${{ matrix.config }} == "llama3_fb16" ]]; then
           # Original BF16 version, without any quantization
           DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
-          python -m examples.models.llama.export_llama \
-            --model "llama3_2" \
-            --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
-            --params "${DOWNLOADED_PATH}/params.json" \
-            -kv \
-            --use_sdpa_with_kv_cache \
-            -X \
-            -d bf16 \
-            --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
-            --output_name="${OUT_ET_MODEL_NAME}.pte"
+          python -m extension.llm.export.export_llm \
+            base.model_class="llama3_2" \
+            base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+            base.params="${DOWNLOADED_PATH}/params.json" \
+            model.use_kv_cache=true \
+            model.use_sdpa_with_kv_cache=true \
+            backend.xnnpack.enabled=true \
+            model.dtype_override=bf16 \
+            base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"' \
+            export.output_name="${OUT_ET_MODEL_NAME}.pte"
           ls -lh "${OUT_ET_MODEL_NAME}.pte"
         elif [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
           DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
-          python -m examples.models.llama.export_llama \
-            --model llama3_2 \
-            --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
-            --params "${DOWNLOADED_PATH}/params.json" \
-            -kv \
-            --use_sdpa_with_kv_cache \
-            -d fp32 \
-            -X \
-            --xnnpack-extended-ops \
-            -qmode 8da4w -G 32 -E 8,0 \
-            --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
-            --output_name="${OUT_ET_MODEL_NAME}.pte"
+          python -m extension.llm.export.export_llm \
+            base.model_class=llama3_2 \
+            base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+            base.params="${DOWNLOADED_PATH}/params.json" \
+            model.use_kv_cache=true \
+            model.use_sdpa_with_kv_cache=true \
+            model.dtype_override=fp32 \
+            backend.xnnpack.enabled=true \
+            backend.xnnpack.extended_ops=true \
+            quantization.qmode=8da4w \
+            quantization.group_size=32 \
+            quantization.embedding_quantize=\'8,0\' \
+            base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"' \
+            export.output_name="${OUT_ET_MODEL_NAME}.pte"
           ls -lh "${OUT_ET_MODEL_NAME}.pte"
         elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then
           export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
@@ -313,19 +315,19 @@ jobs:
       elif [[ "$HF_MODEL_REPO" == "Qwen/Qwen3-0.6B" ]]; then
        if [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
           DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json")
-          python -m examples.models.llama.export_llama \
-            --model qwen3-0_6b \
-            --params examples/models/qwen3/0_6b_config.json \
-            -kv \
-            --use_sdpa_with_kv_cache \
-            -d fp32 \
-            -X \
-            --xnnpack-extended-ops \
-            -qmode 8da4w \
-            -G 32 \
-            -E 8,0 \
-            --metadata '{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
-            --output_name="${OUT_ET_MODEL_NAME}.pte"
+          python -m extension.llm.export.export_llm \
+            base.model_class=qwen3_0_6b \
+            base.params=examples/models/qwen3/0_6b_config.json \
+            model.use_kv_cache=true \
+            model.use_sdpa_with_kv_cache=true \
+            model.dtype_override=fp32 \
+            backend.xnnpack.enabled=true \
+            backend.xnnpack.extended_ops=true \
+            quantization.qmode=8da4w \
+            quantization.group_size=32 \
+            quantization.embedding_quantize=\'8,0\' \
+            base.metadata='"{\"get_bos_id\":151644,\"get_eos_ids\":[151645]}"' \
+            export.output_name="${OUT_ET_MODEL_NAME}.pte"
           ls -lh "${OUT_ET_MODEL_NAME}.pte"
         fi
       fi
diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml
index 0c03f55f82e..6b1666da642 100644
--- a/.github/workflows/apple-perf.yml
+++ b/.github/workflows/apple-perf.yml
@@ -223,23 +223,23 @@ jobs:
             --files "tokenizer.model" "params.json" "consolidated.00.pth"
           )
           # Export using ExecuTorch's model definition
-          ${CONDA_RUN} python -m examples.models.llama.export_llama \
-            --model "llama3_2" \
-            --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
-            --params "${DOWNLOADED_PATH}/params.json" \
-            --use_sdpa_with_kv_cache \
-            -X \
-            --xnnpack-extended-ops \
-            --preq_mode 8da4w_output_8da8w \
-            --preq_group_size 32 \
-            --max_seq_length 2048 \
-            --max_context_length 2048 \
-            --output_name "${OUT_ET_MODEL_NAME}.pte" \
-            -kv \
-            -d fp32 \
-            --preq_embedding_quantize 8,0 \
-            --use_spin_quant native \
-            --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+          ${CONDA_RUN} python -m extension.llm.export.export_llm \
+            base.model_class="llama3_2" \
+            base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+            base.params="${DOWNLOADED_PATH}/params.json" \
+            model.use_sdpa_with_kv_cache=true \
+            backend.xnnpack.enabled=true \
+            backend.xnnpack.extended_ops=true \
+            base.preq_mode="8da4w_output_8da8w" \
+            base.preq_group_size=32 \
+            export.max_seq_length=2048 \
+            export.max_context_length=2048 \
+            export.output_name="${OUT_ET_MODEL_NAME}.pte" \
+            model.use_kv_cache=true \
+            model.dtype_override=fp32 \
+            base.preq_embedding_quantize=\'8,0\' \
+            quantization.use_spin_quant=native \
+            base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"'
           ls -lh "${OUT_ET_MODEL_NAME}.pte"
         elif [[ ${{ matrix.config }} == "llama3_qlora" ]]; then
           # QAT + LoRA
@@ -250,87 +250,89 @@ jobs:
             --files "tokenizer.model" "params.json" "consolidated.00.pth"
           )
           # Export using ExecuTorch's model definition
-          ${CONDA_RUN} python -m examples.models.llama.export_llama \
-            --model "llama3_2" \
-            --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
-            --params "${DOWNLOADED_PATH}/params.json" \
-            -qat \
-            -lora 16 \
-            --preq_mode 8da4w_output_8da8w \
-            --preq_group_size 32 \
-            --preq_embedding_quantize 8,0 \
-            --use_sdpa_with_kv_cache \
-            -kv \
-            -X \
-            --xnnpack-extended-ops \
-            -d fp32 \
-            --max_seq_length 2048 \
-            --max_context_length 2048 \
-            --output_name "${OUT_ET_MODEL_NAME}.pte" \
-            --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+          ${CONDA_RUN} python -m extension.llm.export.export_llm \
+            base.model_class="llama3_2" \
+            base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+            base.params="${DOWNLOADED_PATH}/params.json" \
+            quantization.use_qat=true \
+            base.use_lora=16 \
+            base.preq_mode="8da4w_output_8da8w" \
+            base.preq_group_size=32 \
+            base.preq_embedding_quantize=\'8,0\' \
+            model.use_sdpa_with_kv_cache=true \
+            model.use_kv_cache=true \
+            backend.xnnpack.enabled=true \
+            backend.xnnpack.extended_ops=true \
+            model.dtype_override=fp32 \
+            export.max_seq_length=2048 \
+            export.max_context_length=2048 \
+            export.output_name="${OUT_ET_MODEL_NAME}.pte" \
+            base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"'
           ls -lh "${OUT_ET_MODEL_NAME}.pte"
         elif [[ ${{ matrix.config }} == "llama3_fb16" ]]; then
           # Original BF16 version, without any quantization
           DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
-          ${CONDA_RUN} python -m examples.models.llama.export_llama \
-            --model "llama3_2" \
-            --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
-            --params "${DOWNLOADED_PATH}/params.json" \
-            -kv \
-            --use_sdpa_with_kv_cache \
-            -X \
-            -d bf16 \
-            --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
-            --output_name="${OUT_ET_MODEL_NAME}.pte"
+          ${CONDA_RUN} python -m extension.llm.export.export_llm \
+            base.model_class="llama3_2" \
+            base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+            base.params="${DOWNLOADED_PATH}/params.json" \
+            model.use_kv_cache=true \
+            model.use_sdpa_with_kv_cache=true \
+            backend.xnnpack.enabled=true \
+            model.dtype_override=bf16 \
+            base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"' \
+            export.output_name="${OUT_ET_MODEL_NAME}.pte"
           ls -lh "${OUT_ET_MODEL_NAME}.pte"
         elif [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
           DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
-          ${CONDA_RUN} python -m examples.models.llama.export_llama \
-            --model llama3_2 \
-            --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
-            --params "${DOWNLOADED_PATH}/params.json" \
-            -kv \
-            --use_sdpa_with_kv_cache \
-            -d fp32 \
-            -X \
-            --xnnpack-extended-ops \
-            -qmode 8da4w -G 32 -E 8,0 \
-            --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
-            --output_name="${OUT_ET_MODEL_NAME}.pte"
+          ${CONDA_RUN} python -m extension.llm.export.export_llm \
+            base.model_class=llama3_2 \
+            base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+            base.params="${DOWNLOADED_PATH}/params.json" \
+            model.use_kv_cache=true \
+            model.use_sdpa_with_kv_cache=true \
+            model.dtype_override=fp32 \
+            backend.xnnpack.enabled=true \
+            backend.xnnpack.extended_ops=true \
+            quantization.qmode=8da4w \
+            quantization.group_size=32 \
+            quantization.embedding_quantize=\'8,0\' \
+            base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"' \
+            export.output_name="${OUT_ET_MODEL_NAME}.pte"
           ls -lh "${OUT_ET_MODEL_NAME}.pte"
         elif [[ ${{ matrix.config }} == "llama3_coreml_ane" ]]; then
           # ANE
           DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
-          ${CONDA_RUN} python -m examples.models.llama.export_llama \
-            --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
-            --params "${DOWNLOADED_PATH}/params.json" \
-            -E "4,32" \
-            -kv \
-            --disable_dynamic_shape \
-            --coreml \
-            --coreml-ios 18 \
-            --coreml-quantize c4w \
-            --coreml-compute-units cpu_and_ne \
-            --output_name="${OUT_ET_MODEL_NAME}.pte"
+          ${CONDA_RUN} python -m extension.llm.export.export_llm \
+            base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+            base.params="${DOWNLOADED_PATH}/params.json" \
+            quantization.embedding_quantize=\'4,32\' \
+            model.use_kv_cache=true \
+            model.enable_dynamic_shape=false \
+            backend.coreml.enabled=true \
+            backend.coreml.ios=18 \
+            backend.coreml.quantize=c4w \
+            backend.coreml.compute_units=cpu_and_ne \
+            export.output_name="${OUT_ET_MODEL_NAME}.pte"
           ls -lh "${OUT_ET_MODEL_NAME}.pte"
         fi
       elif [[ "$HF_MODEL_REPO" == "Qwen/Qwen3-0.6B" ]]; then
         OUT_ET_MODEL_NAME=${OUT_ET_MODEL_NAME}_llm
         if [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
           DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json")
-          ${CONDA_RUN} python -m examples.models.llama.export_llama \
-            --model qwen3-0_6b \
-            --params examples/models/qwen3/0_6b_config.json \
-            -kv \
-            --use_sdpa_with_kv_cache \
-            -d fp32 \
-            -X \
-            --xnnpack-extended-ops \
-            -qmode 8da4w \
-            -G 32 \
-            -E 8,0 \
-            --metadata '{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
-            --output_name="${OUT_ET_MODEL_NAME}.pte"
+          ${CONDA_RUN} python -m extension.llm.export.export_llm \
+            base.model_class=qwen3_0_6b \
+            base.params=examples/models/qwen3/0_6b_config.json \
+            model.use_kv_cache=true \
+            model.use_sdpa_with_kv_cache=true \
+            model.dtype_override=fp32 \
+            backend.xnnpack.enabled=true \
+            backend.xnnpack.extended_ops=true \
+            quantization.qmode=8da4w \
+            quantization.group_size=32 \
+            quantization.embedding_quantize=\'8,0\' \
+            base.metadata='"{\"get_bos_id\":151644,\"get_eos_ids\":[151645]}"' \
+            export.output_name="${OUT_ET_MODEL_NAME}.pte"
           ls -lh "${OUT_ET_MODEL_NAME}.pte"
         fi
       fi
diff --git a/examples/demo-apps/android/LlamaDemo/run_instrumentation_test.sh b/examples/demo-apps/android/LlamaDemo/run_instrumentation_test.sh
index ff59fc56b2c..8c1ad52ef8b 100644
--- a/examples/demo-apps/android/LlamaDemo/run_instrumentation_test.sh
+++ b/examples/demo-apps/android/LlamaDemo/run_instrumentation_test.sh
@@ -14,7 +14,7 @@ curl -C - -Ls "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokeni
 # Create params.json file
 touch params.json
 echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json
-python -m examples.models.llama.export_llama -c stories110M.pt -p params.json -d fp16 -n stories110m_h.pte -kv
+python -m extension.llm.export.export_llm base.checkpoint=stories110M.pt base.params=params.json model.dtype_override=fp16 export.output_name=stories110m_h.pte model.use_kv_cache=true
 python -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin
 
 adb mkdir -p /data/local/tmp/llama
diff --git a/examples/models/llama/config/llm_config.py b/examples/models/llama/config/llm_config.py
index 9acd633fb21..72342199dfd 100644
--- a/examples/models/llama/config/llm_config.py
+++ b/examples/models/llama/config/llm_config.py
@@ -10,6 +10,11 @@
 Configurations for exporting Llama.
 Uses dataclasses, which integrate with OmegaConf and Hydra.
+
+Note:
+- Hydra is a bit finicky with string values that include quotation marks; please
+refer to https://hydra.cc/docs/1.2/advanced/override_grammar/basic/#quoted-values
+for more information.
 """
 
 import argparse
@@ -34,9 +39,9 @@ class ModelType(str, Enum):
     llama3_2_vision = "llama3_2_vision"
     static_llama = "static_llama"
     qwen2_5 = "qwen2_5"
-    qwen3_0_6b = "qwen3-0_6b"
-    qwen3_1_7b = "qwen3-1_7b"
-    qwen3_4b = "qwen3-4b"
+    qwen3_0_6b = "qwen3_0_6b"
+    qwen3_1_7b = "qwen3_1_7b"
+    qwen3_4b = "qwen3_4b"
     phi_4_mini = "phi_4_mini"
     smollm2 = "smollm2"
 
@@ -71,7 +76,7 @@ class BaseConfig:
         checkpoint_dir: Path to directory containing sharded checkpoint files.
         tokenizer_path: Path to the tokenizer file.
         metadata: Json string containing metadata information.
-            e.g. '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+            e.g. '"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"'
         use_lora: Rank of the LoRA, if set to 0 then this means no LoRA. For use with QAT.
         fairseq2: For legacy internal use cases, this is safe to ignore.
         preq_mode: Legacy option to specify how prequantized weights are loaded.
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 334f3ace712..685e9de9a2e 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -104,9 +104,9 @@
     "llama3_2",
     "static_llama",
     "qwen2_5",
-    "qwen3-0_6b",
-    "qwen3-1_7b",
-    "qwen3-4b",
+    "qwen3_0_6b",
+    "qwen3_1_7b",
+    "qwen3_4b",
     "phi_4_mini",
     "smollm2",
 ]
@@ -115,9 +115,9 @@
     "qwen2_5": "Qwen/Qwen2.5-1.5B",
     "phi_4_mini": "microsoft/Phi-4-mini-instruct",
     "smollm2": "HuggingFaceTB/SmolLM-135M",
-    "qwen3-0_6b": "Qwen/Qwen3-0.6B",
-    "qwen3-1_7b": "Qwen/Qwen3-1.7B",
-    "qwen3-4b": "Qwen/Qwen3-4B",
+    "qwen3_0_6b": "Qwen/Qwen3-0.6B",
+    "qwen3_1_7b": "Qwen/Qwen3-1.7B",
+    "qwen3_4b": "Qwen/Qwen3-4B",
 }
diff --git a/examples/models/qwen3/README.md b/examples/models/qwen3/README.md
index d31d491adf2..e24d8da2637 100644
--- a/examples/models/qwen3/README.md
+++ b/examples/models/qwen3/README.md
@@ -7,7 +7,7 @@ Qwen 3 uses the same example code as our optimized Llama model, while the checkp
 All commands for exporting and running Llama on various backends should also be applicable to Qwen 3, by swapping the following args:
 ```
-base.model_class=[qwen3-0_6b,qwen3-1_7b,qwen3-4b]
+base.model_class=[qwen3_0_6b,qwen3_1_7b,qwen3_4b]
 base.params=[examples/models/qwen3/0_6b_config.json,examples/models/qwen3/1_7b_config.json,examples/models/qwen3/4b_config.json]
 ```
 
@@ -17,7 +17,7 @@ Here is a basic example for exporting Qwen 3, although please refer to the Llama
 Export 0.6b to XNNPack, quantized with 8da4w:
 ```
 python -m extension.llm.export.export_llm \
-    base.model_class="qwen3-0_6b" \
+    base.model_class="qwen3_0_6b" \
     base.params="examples/models/qwen3/0_6b_config.json" \
     model.use_kv_cache=True \
     model.use_sdpa_with_kv_cache=True \
@@ -26,14 +26,14 @@ python -m extension.llm.export.export_llm \
     backend.xnnpack.extended_ops=True \
     quantization.qmode="8da4w" \
     base.metadata='"{\"get_bos_id\": 151644, \"get_eos_ids\":[151645]}"' \
-    export.output_name="qwen3-0_6b.pte" \
+    export.output_name="qwen3_0_6b.pte" \
     debug.verbose=True
 ```
 
 Export 1.7b to XNNPack, quantized with 8da4w:
 ```
 python -m extension.llm.export.export_llm \
-    base.model_class="qwen3-1_7b" \
+    base.model_class="qwen3_1_7b" \
     base.params="examples/models/qwen3/1_7b_config.json" \
     model.use_kv_cache=True \
     model.use_sdpa_with_kv_cache=True \
@@ -42,14 +42,14 @@ python -m extension.llm.export.export_llm \
     backend.xnnpack.extended_ops=True \
     quantization.qmode="8da4w" \
     base.metadata='"{\"get_bos_id\": 151644, \"get_eos_ids\":[151645]}"' \
-    export.output_name="qwen3-1_7b.pte" \
+    export.output_name="qwen3_1_7b.pte" \
     debug.verbose=True
 ```
 
 Export 4b to XNNPack, quantized with 8da4w:
 ```
 python -m extension.llm.export.export_llm \
-    base.model_class="qwen3-4b" \
+    base.model_class="qwen3_4b" \
     base.params="examples/models/qwen3/4b_config.json" \
     model.use_kv_cache=True \
     model.use_sdpa_with_kv_cache=True \
@@ -58,7 +58,7 @@ python -m extension.llm.export.export_llm \
     backend.xnnpack.extended_ops=True \
     quantization.qmode="8da4w" \
     base.metadata='"{\"get_bos_id\": 151644, \"get_eos_ids\":[151645]}"' \
-    export.output_name="qwen3-4b.pte" \
+    export.output_name="qwen3_4b.pte" \
     debug.verbose=True
 ```
 
@@ -66,8 +66,8 @@ python -m extension.llm.export.export_llm \
 With ExecuTorch pybindings:
 ```
 python -m examples.models.llama.runner.native
-    --model qwen3-0_6b \
-    --pte qwen3-0_6b.pte \
+    --model qwen3_0_6b \
+    --pte qwen3_0_6b.pte \
     --tokenizer ~/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/a9c98e602b9d36d2a2f7ba1eb0f5f31e4e8e5143/tokenizer.json \
     --tokenizer_config ~/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/a9c98e602b9d36d2a2f7ba1eb0f5f31e4e8e5143/tokenizer_config.json \
     --prompt "Who is the president of the US?" \
@@ -80,7 +80,7 @@ python -m examples.models.llama.runner.native
 With ExecuTorch's sample c++ runner (see the Llama README's [Step 3: Run on your computer to validate](../llama/README.md#step-3-run-on-your-computer-to-validate) to build the runner):
 ```
 cmake-out/examples/models/llama/llama_main
-    --model_path qwen3-0_6b.pte
+    --model_path qwen3_0_6b.pte
     --tokenizer_path ~/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/a9c98e602b9d36d2a2f7ba1eb0f5f31e4e8e5143/tokenizer.json
     --prompt="Who is the president of the US?"
 ```
diff --git a/extension/android/executorch_android/android_test_setup.sh b/extension/android/executorch_android/android_test_setup.sh
index 682a1d16787..f521dac30c5 100644
--- a/extension/android/executorch_android/android_test_setup.sh
+++ b/extension/android/executorch_android/android_test_setup.sh
@@ -25,7 +25,7 @@ prepare_tinyllama() {
   # Create params.json file
   touch params.json
   echo '{"dim": 288, "multiple_of": 32, "n_heads": 6, "n_layers": 6, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json
-  python -m examples.models.llama.export_llama -c stories15M.pt -p params.json -d fp16 -n stories15m_h.pte -kv
+  python -m extension.llm.export.export_llm base.checkpoint=stories15M.pt base.params=params.json model.dtype_override=fp16 export.output_name=stories15m_h.pte model.use_kv_cache=true
   python -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin
 
   cp stories15m_h.pte "${BASEDIR}/src/androidTest/resources/stories.pte"
diff --git a/extension/llm/export/README.md b/extension/llm/export/README.md
index 1ac27306c86..96f36acc1b4 100644
--- a/extension/llm/export/README.md
+++ b/extension/llm/export/README.md
@@ -85,7 +85,7 @@ debug:
 ### Export Qwen3 0.6B with XNNPACK backend and quantization
 ```bash
 python -m extension.llm.export.export_llm \
-  base.model_class=qwen3-0_6b \
+  base.model_class=qwen3_0_6b \
   base.params=examples/models/qwen3/0_6b_config.json \
   base.metadata='{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
   model.use_kv_cache=true \