@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -53,22 +53,38 @@ function prepare_tensorrtllm() {
     # FIXME: Remove when testing TRT-LLM containers built from source
     pip install -r requirements.txt

-    MODEL="llama-3-8b-instruct"
+    MODEL="meta-llama/Meta-Llama-3.1-8B-Instruct"
     MODEL_REPO="tests/tensorrtllm_models"
-    rm -rf ${MODEL_REPO}
-
-    # FIXME: This may require an upgrade each release to match the TRT-LLM version,
-    # and would likely be easier to use trtllm-build directly for test purposes.
-    # Use Triton CLI to prepare model repository for testing
-    pip install git+https://github.com/triton-inference-server/triton_cli.git@0.1.1
-    # NOTE: Could use ENGINE_DEST_PATH set to NFS mount for pre-built engines in future
-    triton import \
-        --model ${MODEL} \
-        --backend tensorrtllm \
-        --model-repository "${MODEL_REPO}"
-
-    # WAR for tests expecting default name of "tensorrt_llm_bls"
-    mv "${MODEL_REPO}/${MODEL}" "${MODEL_REPO}/tensorrt_llm_bls"
+    mkdir -p ${MODEL_REPO}
+    cp /app/all_models/inflight_batcher_llm/* "${MODEL_REPO}" -r
+    # Ensemble model is not needed for the test
+    rm -rf ${MODEL_REPO}/ensemble
+
+    # 1. Download model from HF
+    huggingface-cli download ${MODEL}
+
+    HF_LLAMA_MODEL=`python3 -c "from pathlib import Path; from huggingface_hub import hf_hub_download; print(Path(hf_hub_download('${MODEL}', filename='config.json')).parent)"`
+    CKPT_PATH=/tmp/ckpt/llama/3.1-8b-instruct/
+    ENGINE_PATH=/tmp/engines/llama/3.1-8b-instruct/
+
+    # 2. Convert weights
+    python3 /app/examples/llama/convert_checkpoint.py --model_dir ${HF_LLAMA_MODEL} \
+        --output_dir ${CKPT_PATH} \
+        --dtype float16
+
+    # 3. Build engine
+    # max_batch_size set to 128 to avoid OOM errors
+    trtllm-build --checkpoint_dir ${CKPT_PATH} \
+        --gemm_plugin auto \
+        --max_batch_size 128 \
+        --output_dir ${ENGINE_PATH}
+
+    # 4. Prepare model repository
+    FILL_TEMPLATE="/app/tools/fill_template.py"
+    python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/preprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1,max_queue_size:0
+    python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/postprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1
+    python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:True,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32
+    python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:True,max_beam_width:1,engine_dir:${ENGINE_PATH},batching_strategy:inflight_fused_batching,max_queue_size:0,max_queue_delay_microseconds:1000,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32,exclude_input_in_output:True
 }

 function pre_test() {
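Not part of the change above, but as an illustration of how the prepared repository could be exercised: the sketch below assumes tritonserver is on PATH inside the container, that the default HTTP port 8000 is used, and that the BLS model keeps its default name tensorrt_llm_bls. The request shape follows Triton's generate endpoint and the usual text_input/max_tokens inputs of the BLS model; adapt it to whatever the actual test harness does.

    # Hypothetical smoke test (not part of this commit)
    # Serve the freshly built repository in the background
    tritonserver --model-repository=tests/tensorrtllm_models &
    # Give the server time to load the engine, then send one generate request
    sleep 60
    curl -s -X POST localhost:8000/v2/models/tensorrt_llm_bls/generate \
        -d '{"text_input": "What is machine learning?", "max_tokens": 16}'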