Use export_llm in CI #11836

Merged: 35 commits, Jun 24, 2025
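In short: every CI callsite that previously invoked the argparse-based `examples.models.llama.export_llama` now invokes the config-driven `extension.llm.export.export_llm`, with flags translated into dotted `key=value` overrides. A minimal before/after sketch (the checkpoint and params paths are placeholders, not taken from the PR):

```bash
# Old argparse style:
python -m examples.models.llama.export_llama \
  -c model.pth -p params.json -d fp32 -kv -X

# New config-override style; the same settings as dotted key=value pairs:
python -m extension.llm.export.export_llm \
  base.checkpoint=model.pth \
  base.params=params.json \
  model.dtype_override=fp32 \
  model.use_kv_cache=true \
  backend.xnnpack.enabled=true
```

The recurring translations in the hunks below: `-c` becomes `base.checkpoint`, `-p` becomes `base.params`, `-d` becomes `model.dtype_override`, `-kv` becomes `model.use_kv_cache=true`, `-X` becomes `backend.xnnpack.enabled=true`, `-qmode` becomes `quantization.qmode`, `-G` becomes `quantization.group_size`, and `-E` becomes `quantization.embedding_quantize`.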
25 changes: 11 additions & 14 deletions .ci/scripts/test_llama.sh
@@ -54,10 +54,7 @@ PT2E_QUANTIZE="${PT2E_QUANTIZE:-}"
 # Default CMake Build Type to release mode
 CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}
 
-if [[ $# -lt 4 ]]; then # Assuming 4 mandatory args
-  echo "Expecting atleast 4 positional arguments"
-  echo "Usage: [...]"
-fi
+# Argument validation is done individually below for each required parameter
 if [[ -z "${MODEL_NAME:-}" ]]; then
   echo "Missing model name, exiting..."
   exit 1
@@ -224,34 +221,34 @@ fi
 # Export model.
 EXPORTED_MODEL_NAME="${EXPORTED_MODEL_NAME}.pte"
 echo "Exporting ${EXPORTED_MODEL_NAME}"
-EXPORT_ARGS="-c ${CHECKPOINT_FILE_NAME} -p ${PARAMS} -d ${DTYPE} -n ${EXPORTED_MODEL_NAME} -kv"
+EXPORT_ARGS="base.checkpoint=${CHECKPOINT_FILE_NAME} base.params=${PARAMS} model.dtype_override=${DTYPE} export.output_name=${EXPORTED_MODEL_NAME} model.use_kv_cache=true"
 if [[ "${XNNPACK}" == "ON" ]]; then
-  EXPORT_ARGS="${EXPORT_ARGS} -X --xnnpack-extended-ops -qmode 8da4w -G 128"
+  EXPORT_ARGS="${EXPORT_ARGS} backend.xnnpack.enabled=true backend.xnnpack.extended_ops=true quantization.qmode=8da4w quantization.group_size=128"
 fi
 if [[ "${CUSTOM}" == "ON" ]]; then
-  EXPORT_ARGS="${EXPORT_ARGS} --use_sdpa_with_kv_cache"
+  EXPORT_ARGS="${EXPORT_ARGS} model.use_sdpa_with_kv_cache=true"
 fi
 if [[ "${QE}" == "ON" ]]; then
-  EXPORT_ARGS="${EXPORT_ARGS} --embedding-quantize 8,1024"
+  EXPORT_ARGS="${EXPORT_ARGS} quantization.embedding_quantize=\"8,1024\""
 fi
 if [[ "${MPS}" == "ON" ]]; then
-  EXPORT_ARGS="${EXPORT_ARGS} -kv -v --mps --disable_dynamic_shape"
+  EXPORT_ARGS="${EXPORT_ARGS} backend.mps.enabled=true model.enable_dynamic_shape=false debug.verbose=true"
 fi
 if [[ "${COREML}" == "ON" ]]; then
-  EXPORT_ARGS="${EXPORT_ARGS} -kv -v --coreml --disable_dynamic_shape"
+  EXPORT_ARGS="${EXPORT_ARGS} backend.coreml.enabled=true model.enable_dynamic_shape=false debug.verbose=true"
 fi
 if [[ "${QNN}" == "ON" ]]; then
-  EXPORT_ARGS="${EXPORT_ARGS} -kv -v --qnn --disable_dynamic_shape"
+  EXPORT_ARGS="${EXPORT_ARGS} backend.qnn.enabled=true model.enable_dynamic_shape=false debug.verbose=true"
   echo "PT2E_QUANTIZE is ${PT2E_QUANTIZE}"
   if [[ "${PT2E_QUANTIZE}" == "qnn_16a16w" ]]; then
-    EXPORT_ARGS+=" --tokenizer_path tokenizer.model --pt2e_quantize qnn_16a16w --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --calibration_data Once "
+    EXPORT_ARGS+=" base.tokenizer_path=tokenizer.model quantization.pt2e_quantize=qnn_16a16w quantization.calibration_tasks=[\"wikitext\"] quantization.calibration_limit=1 quantization.calibration_seq_length=128 quantization.calibration_data=\"Once\""
   fi
 fi
 if [[ "${QUANTIZE_KV_CACHE}" == "ON" ]]; then
-  EXPORT_ARGS="${EXPORT_ARGS} --quantize_kv_cache"
+  EXPORT_ARGS="${EXPORT_ARGS} model.quantize_kv_cache=true"
 fi
 # Add dynamically linked library location
-$PYTHON_EXECUTABLE -m examples.models.llama.export_llama ${EXPORT_ARGS}
+$PYTHON_EXECUTABLE -m extension.llm.export.export_llm ${EXPORT_ARGS}
 
 # Create tokenizer.bin.
 echo "Creating tokenizer.bin"
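One subtlety in this hunk: `EXPORT_ARGS` is assembled as a single string and later expanded unquoted, so the escaped quotes around comma-containing values survive word splitting and reach the new CLI intact, presumably so those values parse as one string rather than a list. A minimal sketch of that shell behavior (hypothetical value):

```bash
# EXPORT_ARGS is expanded without quotes, so each space-separated token
# becomes its own argument, while the escaped \" stay inside the token:
EXPORT_ARGS="quantization.embedding_quantize=\"8,1024\""
printf '%s\n' ${EXPORT_ARGS}
# -> quantization.embedding_quantize="8,1024"
```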
20 changes: 10 additions & 10 deletions .ci/scripts/test_llama_torchao_lowbit.sh
@@ -70,16 +70,16 @@ QLINEAR_GROUP_SIZE=128 # Must be multiple of 16
 QEMBEDDING_BITWIDTH=4 # Can be 1-8
 QEMBEDDING_GROUP_SIZE=32 # Must be multiple of 16
 
-${PYTHON_EXECUTABLE} -m examples.models.llama.export_llama \
-  --checkpoint "${LLAMA_CHECKPOINT:?}" \
-  --params "${LLAMA_PARAMS:?}" \
-  -kv \
-  --use_sdpa_with_kv_cache \
-  --output_name=${MODEL_OUT} \
-  -qmode "torchao:8da${QLINEAR_BITWIDTH}w" \
-  --group_size ${QLINEAR_GROUP_SIZE} \
-  -E "torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \
-  -d fp32
+${PYTHON_EXECUTABLE} -m extension.llm.export.export_llm \
+  base.checkpoint="${LLAMA_CHECKPOINT:?}" \
+  base.params="${LLAMA_PARAMS:?}" \
+  model.use_kv_cache=true \
+  model.use_sdpa_with_kv_cache=true \
+  export.output_name="${MODEL_OUT}" \
+  quantization.qmode="torchao:8da${QLINEAR_BITWIDTH}w" \
+  quantization.group_size=${QLINEAR_GROUP_SIZE} \
+  quantization.embedding_quantize=\"torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}\" \
+  model.dtype_override=fp32
 
 # Test run
 ./cmake-out/examples/models/llama/llama_main --model_path=$MODEL_OUT --tokenizer_path=$TOKENIZER --prompt="Once upon a time,"
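The `:?` expansions make the checkpoint and params paths mandatory: bash aborts if the variable is unset or empty, so the export fails fast rather than running with a blank path. A quick illustration of that bash behavior:

```bash
# ${VAR:?} aborts a non-interactive script when VAR is unset or empty:
unset LLAMA_CHECKPOINT
echo "${LLAMA_CHECKPOINT:?}"
# -> bash: LLAMA_CHECKPOINT: parameter null or not set (script exits)
```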
12 changes: 6 additions & 6 deletions .ci/scripts/test_model.sh
@@ -86,8 +86,8 @@ test_model() {
   if [[ "${MODEL_NAME}" == "llama2" ]]; then
     # Install requirements for export_llama
     bash examples/models/llama/install_requirements.sh
-    # Test export_llama script: python3 -m examples.models.llama.export_llama
-    "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama --model "${MODEL_NAME}" -c examples/models/llama/params/demo_rand_params.pth -p examples/models/llama/params/demo_config.json
+    # Test export_llm script: python3 -m extension.llm.export.export_llm
+    "${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.checkpoint=examples/models/llama/params/demo_rand_params.pth base.params=examples/models/llama/params/demo_config.json
     run_portable_executor_runner
     rm "./${MODEL_NAME}.pte"
   fi
@@ -100,17 +100,17 @@ test_model() {
   if [[ "${MODEL_NAME}" == "qwen2_5" ]]; then
     # Install requirements for export_llama
     bash examples/models/llama/install_requirements.sh
-    # Test export_llama script: python3 -m examples.models.llama.export_llama.
+    # Test export_llm script: python3 -m extension.llm.export.export_llm.
     # Use Llama random checkpoint with Qwen 2.5 1.5b model configuration.
-    "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama --model "${MODEL_NAME}" -p examples/models/qwen2_5/1_5b_config.json
+    "${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/qwen2_5/1_5b_config.json
     rm "./${MODEL_NAME}.pte"
     return # Skip running with portable executor runner since portable doesn't support Qwen's biased linears.
   fi
   if [[ "${MODEL_NAME}" == "phi_4_mini" ]]; then
     # Install requirements for export_llama
     bash examples/models/llama/install_requirements.sh
-    # Test export_llama script: python3 -m examples.models.llama.export_llama.
-    "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama --model "${MODEL_NAME}" -p examples/models/phi_4_mini/config.json
+    # Test export_llm script: python3 -m extension.llm.export.export_llm.
+    "${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/phi_4_mini/config.json
     run_portable_executor_runner
     rm "./${MODEL_NAME}.pte"
     return
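Note that the new invocations in this hunk pass no `export.output_name`, yet the script still removes `./${MODEL_NAME}.pte` afterward. This relies on what appears to be the default: the output file is named after the model class. A sketch of that apparent contract (an assumption inferred from the script, not confirmed by the PR):

```bash
# Assumption: with no export.output_name, the artifact lands at ./<model_class>.pte
python -m extension.llm.export.export_llm \
  base.model_class=qwen2_5 \
  base.params=examples/models/qwen2_5/1_5b_config.json
ls ./qwen2_5.pte && rm ./qwen2_5.pte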
142 changes: 72 additions & 70 deletions .github/workflows/android-perf.yml
@@ -214,23 +214,23 @@ jobs:
 --files "tokenizer.model" "params.json" "consolidated.00.pth"
 )
 # Export using ExecuTorch's model definition
-python -m examples.models.llama.export_llama \
-  --model "llama3_2" \
-  --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
-  --params "${DOWNLOADED_PATH}/params.json" \
-  --use_sdpa_with_kv_cache \
-  -X \
-  --xnnpack-extended-ops \
-  --preq_mode 8da4w_output_8da8w \
-  --preq_group_size 32 \
-  --max_seq_length 2048 \
-  --max_context_length 2048 \
-  --output_name "${OUT_ET_MODEL_NAME}.pte" \
-  -kv \
-  -d fp32 \
-  --preq_embedding_quantize 8,0 \
-  --use_spin_quant native \
-  --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+python -m extension.llm.export.export_llm \
+  base.model_class="llama3_2" \
+  base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+  base.params="${DOWNLOADED_PATH}/params.json" \
+  model.use_sdpa_with_kv_cache=true \
+  backend.xnnpack.enabled=true \
+  backend.xnnpack.extended_ops=true \
+  base.preq_mode="8da4w_output_8da8w" \
+  base.preq_group_size=32 \
+  export.max_seq_length=2048 \
+  export.max_context_length=2048 \
+  export.output_name="${OUT_ET_MODEL_NAME}.pte" \
+  model.use_kv_cache=true \
+  model.dtype_override=fp32 \
+  base.preq_embedding_quantize=\'8,0\' \
+  quantization.use_spin_quant=native \
+  base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}'
 ls -lh "${OUT_ET_MODEL_NAME}.pte"
 elif [[ ${{ matrix.config }} == "llama3_qlora" ]]; then
 # QAT + LoRA
@@ -241,53 +241,55 @@
 --files "tokenizer.model" "params.json" "consolidated.00.pth"
 )
 # Export using ExecuTorch's model definition
-python -m examples.models.llama.export_llama \
-  --model "llama3_2" \
-  --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
-  --params "${DOWNLOADED_PATH}/params.json" \
-  -qat \
-  -lora 16 \
-  --preq_mode 8da4w_output_8da8w \
-  --preq_group_size 32 \
-  --preq_embedding_quantize 8,0 \
-  --use_sdpa_with_kv_cache \
-  -kv \
-  -X \
-  --xnnpack-extended-ops \
-  -d fp32 \
-  --max_seq_length 2048 \
-  --max_context_length 2048 \
-  --output_name "${OUT_ET_MODEL_NAME}.pte" \
-  --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+python -m extension.llm.export.export_llm \
+  base.model_class="llama3_2" \
+  base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+  base.params="${DOWNLOADED_PATH}/params.json" \
+  quantization.use_qat=true \
+  base.use_lora=16 \
+  base.preq_mode="8da4w_output_8da8w" \
+  base.preq_group_size=32 \
+  base.preq_embedding_quantize=\'8,0\' \
+  model.use_sdpa_with_kv_cache=true \
+  model.use_kv_cache=true \
+  backend.xnnpack.enabled=true \
+  backend.xnnpack.extended_ops=true \
+  model.dtype_override=fp32 \
+  export.max_seq_length=2048 \
+  export.max_context_length=2048 \
+  export.output_name="${OUT_ET_MODEL_NAME}.pte" \
+  base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}'
 ls -lh "${OUT_ET_MODEL_NAME}.pte"
 elif [[ ${{ matrix.config }} == "llama3_fb16" ]]; then
 # Original BF16 version, without any quantization
 DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
-python -m examples.models.llama.export_llama \
-  --model "llama3_2" \
-  --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
-  --params "${DOWNLOADED_PATH}/params.json" \
-  -kv \
-  --use_sdpa_with_kv_cache \
-  -X \
-  -d bf16 \
-  --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
-  --output_name="${OUT_ET_MODEL_NAME}.pte"
+python -m extension.llm.export.export_llm \
+  base.model_class="llama3_2" \
+  base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+  base.params="${DOWNLOADED_PATH}/params.json" \
+  model.use_kv_cache=true \
+  model.use_sdpa_with_kv_cache=true \
+  backend.xnnpack.enabled=true \
+  model.dtype_override=bf16 \
+  base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}' \
+  export.output_name="${OUT_ET_MODEL_NAME}.pte"
 ls -lh "${OUT_ET_MODEL_NAME}.pte"
 elif [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
 DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
-python -m examples.models.llama.export_llama \
-  --model llama3_2 \
-  --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
-  --params "${DOWNLOADED_PATH}/params.json" \
-  -kv \
-  --use_sdpa_with_kv_cache \
-  -d fp32 \
-  -X \
-  --xnnpack-extended-ops \
-  -qmode 8da4w -G 32 -E 8,0 \
-  --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
-  --output_name="${OUT_ET_MODEL_NAME}.pte"
+python -m extension.llm.export.export_llm \
+  base.model_class=llama3_2 \
+  base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+  base.params="${DOWNLOADED_PATH}/params.json" \
+  model.use_kv_cache=true \
+  model.use_sdpa_with_kv_cache=true \
+  model.dtype_override=fp32 \
+  backend.xnnpack.enabled=true \
+  backend.xnnpack.extended_ops=true \
+  quantization.qmode=8da4w \
+  quantization.group_size=32 \
+  quantization.embedding_quantize=\'8,0\' \
+  base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}' \
+  export.output_name="${OUT_ET_MODEL_NAME}.pte"
 ls -lh "${OUT_ET_MODEL_NAME}.pte"
 elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then
 export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
@@ -313,19 +315,19 @@ jobs:
 elif [[ "$HF_MODEL_REPO" == "Qwen/Qwen3-0.6B" ]]; then
 if [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
 DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json")
-python -m examples.models.llama.export_llama \
-  --model qwen3-0_6b \
-  --params examples/models/qwen3/0_6b_config.json \
-  -kv \
-  --use_sdpa_with_kv_cache \
-  -d fp32 \
-  -X \
-  --xnnpack-extended-ops \
-  -qmode 8da4w \
-  -G 32 \
-  -E 8,0 \
-  --metadata '{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
-  --output_name="${OUT_ET_MODEL_NAME}.pte"
+python -m extension.llm.export.export_llm \
+  base.model_class=qwen3-0_6b \
+  base.params=examples/models/qwen3/0_6b_config.json \
+  model.use_kv_cache=true \
+  model.use_sdpa_with_kv_cache=true \
+  model.dtype_override=fp32 \
+  backend.xnnpack.enabled=true \
+  backend.xnnpack.extended_ops=true \
+  quantization.qmode=8da4w \
+  quantization.group_size=32 \
+  quantization.embedding_quantize=\'8,0\' \
+  base.metadata='{"get_bos_id":151644,"get_eos_ids":[151645]}' \
+  export.output_name="${OUT_ET_MODEL_NAME}.pte"
 ls -lh "${OUT_ET_MODEL_NAME}.pte"
 fi
 fi
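A note on the quoting in these workflow hunks: the override values pass through two layers, the shell in the `run:` step and then the new CLI's override parser. Single quotes protect the JSON metadata from the shell, while the escaped `\'8,0\'` passes literal quote characters through, presumably so the parser reads `8,0` as one string rather than a comma-separated list. An illustrative sketch (not from the PR):

```bash
# Layer 1: single quotes keep the JSON intact through the shell:
#   base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}'
# Layer 2: \' produces literal quote characters in the delivered argument:
printf '%s\n' base.preq_embedding_quantize=\'8,0\'
# -> base.preq_embedding_quantize='8,0'
```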