
Commit 17314ee

[XPU] Update doc and add scripts for downloading dependencies (#2845)

* [XPU] update xvllm download
* update supported models
* fix xpu model runner in huge memory with small model
* update doc

1 parent 101ad33 commit 17314ee

File tree

8 files changed: +289 −254 lines changed

.gitignore

Lines changed: 2 additions & 0 deletions

@@ -162,3 +162,5 @@ custom_ops/tmp*
 build
 
 .ccls-cache
+
+third_party
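
As a quick sanity check of the new ignore rule, `git check-ignore` can confirm that the downloaded toolchains are excluded; the path below assumes the layout created by the dependency script added in this commit:

```bash
# Prints the matching .gitignore rule and the path if it is ignored.
git check-ignore -v custom_ops/xpu_ops/src/third_party/xtdk
```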
custom_ops/xpu_ops/src/download_dependencies.sh

Lines changed: 54 additions & 0 deletions

@@ -0,0 +1,54 @@
+#!/bin/bash
+
+if [ $# -ne 1 ] || { [ "$1" != "stable" ] && [ "$1" != "develop" ]; }; then
+    echo "Usage: $0 <stable|develop>"
+    exit 1
+fi
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+THIRDPARTY_DIR="$SCRIPT_DIR/third_party"
+
+rm -rf "$THIRDPARTY_DIR"
+mkdir -p "$THIRDPARTY_DIR" || exit 1
+
+if [ "$1" == "stable" ]; then
+    version_xvllm="20250710"
+    version_xtdk="3.2.40.1"
+else
+    version_xvllm="latest"
+    version_xtdk="latest"
+fi
+
+(
+    cd "$THIRDPARTY_DIR" || exit 1
+
+    # Clean previous installation
+    rm -rf output* xvllm* xtdk-llvm* output.tar.gz xtdk-llvm*tar.gz
+
+    # Download and install xvllm
+    if ! wget "https://klx-sdk-release-public.su.bcebos.com/xinfer/daily/eb/${version_xvllm}/output.tar.gz"; then
+        echo "Error downloading xvllm"
+        exit 2
+    fi
+    tar -zxf output.tar.gz && mv output xvllm && rm output.tar.gz
+
+    # Download and install xtdk
+    if ! wget "https://klx-sdk-release-public.su.bcebos.com/xtdk_15fusion/dev/${version_xtdk}/xtdk-llvm15-ubuntu2004_x86_64.tar.gz"; then
+        echo "Error downloading xtdk"
+        exit 3
+    fi
+    tar -zxf xtdk-llvm15-ubuntu2004_x86_64.tar.gz && \
+        mv xtdk-llvm15-ubuntu2004_x86_64 xtdk && \
+        rm xtdk-llvm15-ubuntu2004_x86_64.tar.gz
+)
+
+if [ $? -ne 0 ]; then
+    echo "Installation failed"
+    exit 4
+fi
+
+echo "Installation completed in: $THIRDPARTY_DIR"
+echo "You can set environment variables as follows to use XVLLM and XTDK:"
+echo "  export CLANG_PATH=$THIRDPARTY_DIR/xtdk"
+echo "  export XVLLM_PATH=$THIRDPARTY_DIR/xvllm"
+echo ""

dockerfiles/Dockerfile.xpu

Lines changed: 7 additions & 12 deletions
@@ -17,23 +17,18 @@ RUN python -m pip uninstall paddlepaddle-gpu paddlepaddle-xpu -y
 # install paddlepaddle
 RUN python -m pip install --no-cache-dir --progress-bar off paddlepaddle-xpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
 
+COPY . /workspace/FastDeploy
+
 # get xtdk and xvllm and xre
-RUN mkdir -p /workspace/deps && cd /workspace/deps && wget https://klx-sdk-release-public.su.bcebos.com/xinfer/daily/eb/20250710/output.tar.gz && \
-    tar -zxf output.tar.gz && mv output xvllm && \
-    wget https://klx-sdk-release-public.su.bcebos.com/xtdk_15fusion/dev/3.2.40.1/xtdk-llvm15-ubuntu2004_x86_64.tar.gz && \
-    tar -zxf xtdk-llvm15-ubuntu2004_x86_64.tar.gz && mv xtdk-llvm15-ubuntu2004_x86_64 xtdk && \
+RUN mkdir -p /workspace/deps && cd /workspace/deps && \
     wget https://klx-sdk-release-public.su.bcebos.com/xre/kl3-release/5.0.21.21/xre-Linux-x86_64-5.0.21.21.tar.gz && \
-    tar -zxf xre-Linux-x86_64-5.0.21.21.tar.gz && mv xre-Linux-x86_64-5.0.21.21 xre
+    tar -zxf xre-Linux-x86_64-5.0.21.21.tar.gz && mv xre-Linux-x86_64-5.0.21.21 xre && \
+    cd /workspace/FastDeploy && bash custom_ops/xpu_ops/src/download_dependencies.sh stable
 
 ENV PATH=/workspace/deps/xre/bin:$PATH
-ENV CLANG_PATH=/workspace/deps/xtdk
-ENV XVLLM_PATH=/workspace/deps/xvllm
+ENV CLANG_PATH=/workspace/FastDeploy/custom_ops/xpu_ops/src/third_party/xtdk
+ENV XVLLM_PATH=/workspace/FastDeploy/custom_ops/xpu_ops/src/third_party/xvllm
 
-ENV OPENBLAS_NUM_THREADS=1
-ENV OMP_NUM_THREADS=1
-ENV MKL_NUM_THREADS=1
-USER root
-COPY . /workspace/FastDeploy
 # build and install FastDeploy
 RUN cd /workspace/FastDeploy && bash build.sh && python -m pip install --no-cache-dir dist/* && rm -rf /workspace/FastDeploy
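
For context, an image can be built from this Dockerfile in the usual way; the tag below is illustrative and not part of this commit:

```bash
# Run from the FastDeploy repository root, which is both the build
# context (COPY . /workspace/FastDeploy) and where dockerfiles/ lives.
docker build -f dockerfiles/Dockerfile.xpu -t fastdeploy-xpu:latest .
```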

docs/get_started/installation/kunlunxin_xpu.md

Lines changed: 20 additions & 117 deletions
@@ -72,148 +72,51 @@ Alternatively, you can install the latest version of PaddlePaddle (Not recommended)
 python -m pip install --pre paddlepaddle-xpu -i https://www.paddlepaddle.org.cn/packages/nightly/xpu-p800/
 ```
 
-### Download Kunlunxin Toolkit (XTDK) and XVLLM library, then set their paths.
-
-```bash
-# XTDK
-wget https://klx-sdk-release-public.su.bcebos.com/xtdk_15fusion/dev/3.2.40.1/xtdk-llvm15-ubuntu2004_x86_64.tar.gz
-tar -xvf xtdk-llvm15-ubuntu2004_x86_64.tar.gz && mv xtdk-llvm15-ubuntu2004_x86_64 xtdk
-export CLANG_PATH=$(pwd)/xtdk
-
-# XVLLM
-wget https://klx-sdk-release-public.su.bcebos.com/xinfer/daily/eb/20250624/output.tar.gz
-tar -xvf output.tar.gz && mv output xvllm
-export XVLLM_PATH=$(pwd)/xvllm
-```
-
-Alternatively, you can download the latest versions of XTDK and XVLLM (Not recommended)
-
-```bash
-XTDK: https://klx-sdk-release-public.su.bcebos.com/xtdk_15fusion/dev/latest/xtdk-llvm15-ubuntu2004_x86_64.tar.gz
-XVLLM: https://klx-sdk-release-public.su.bcebos.com/xinfer/daily/eb/latest/output.tar.gz
-```
-
-### Download FastDeploy source code, checkout the stable branch/TAG, then compile and install.
+### Download FastDeploy source code, checkout the stable branch/TAG
 
 ```bash
 git clone https://github.com/PaddlePaddle/FastDeploy
 cd FastDeploy
+git checkout <tag or branch>
-bash build.sh
 ```
 
-The compiled outputs will be located in the ```FastDeploy/dist``` directory.
-
-## Installation verification
+### Download Kunlunxin Compilation Dependencies
 
 ```bash
-python -c "import paddle; paddle.version.show()"
-python -c "import paddle; paddle.utils.run_check()"
-python -c "from paddle.jit.marker import unified"
-python -c "from fastdeploy.model_executor.ops.xpu import block_attn"
+bash custom_ops/xpu_ops/src/download_dependencies.sh stable
 ```
 
-If all the above steps execute successfully, FastDeploy is installed correctly.
-
-## Quick start
-
-The P800 supports the deployment of the ```ERNIE-4.5-300B-A47B-Paddle``` model using the following configurations (Note: Different configurations may result in variations in performance).
-- 32K WINT4 with 8 XPUs (Recommended)
-- 128K WINT4 with 8 XPUs
-- 32K WINT4 with 4 XPUs
-
-### Online serving (OpenAI API-Compatible server)
-
-Deploy an OpenAI API-compatible server using FastDeploy with the following commands:
-
-#### Start service
-
-**Deploy the ERNIE-4.5-300B-A47B-Paddle model with WINT4 precision and 32K context length on 8 XPUs(Recommended)**
+Alternatively, you can download the latest versions of XTDK and XVLLM (Not recommended)
 
 ```bash
-python -m fastdeploy.entrypoints.openai.api_server \
-    --model baidu/ERNIE-4.5-300B-A47B-Paddle \
-    --port 8188 \
-    --tensor-parallel-size 8 \
-    --max-model-len 32768 \
-    --max-num-seqs 64 \
-    --quantization "wint4" \
-    --gpu-memory-utilization 0.9
+bash custom_ops/xpu_ops/src/download_dependencies.sh develop
 ```
 
-**Deploy the ERNIE-4.5-300B-A47B-Paddle model with WINT4 precision and 128K context length on 8 XPUs**
+Set the environment variables:
 
 ```bash
-python -m fastdeploy.entrypoints.openai.api_server \
-    --model baidu/ERNIE-4.5-300B-A47B-Paddle \
-    --port 8188 \
-    --tensor-parallel-size 8 \
-    --max-model-len 131072 \
-    --max-num-seqs 64 \
-    --quantization "wint4" \
-    --gpu-memory-utilization 0.9
+export CLANG_PATH=$(pwd)/custom_ops/xpu_ops/src/third_party/xtdk
+export XVLLM_PATH=$(pwd)/custom_ops/xpu_ops/src/third_party/xvllm
 ```
 
-**Deploy the ERNIE-4.5-300B-A47B-Paddle model with WINT4 precision and 32K context length on 4 XPUs**
+### Compile and Install
 
 ```bash
-export XPU_VISIBLE_DEVICES="0,1,2,3" # Specify which cards to be used
-python -m fastdeploy.entrypoints.openai.api_server \
-    --model baidu/ERNIE-4.5-300B-A47B-Paddle \
-    --port 8188 \
-    --tensor-parallel-size 4 \
-    --max-model-len 32768 \
-    --max-num-seqs 64 \
-    --quantization "wint4" \
-    --gpu-memory-utilization 0.9
+bash build.sh
 ```
 
-**Note:** When deploying on 4 XPUs, only two configurations are supported which constrained by hardware limitations such as interconnect capabilities.
-`export XPU_VISIBLE_DEVICES="0,1,2,3"`
-or
-`export XPU_VISIBLE_DEVICES="4,5,6,7"`
-
-Refer to [Parameters](../../parameters.md) for more options.
-
-#### Send requests
+The compiled outputs will be located in the ```FastDeploy/dist``` directory.
 
-Send requests using either curl or Python
+## Installation verification
 
 ```bash
-curl -X POST "http://0.0.0.0:8188/v1/chat/completions" \
-    -H "Content-Type: application/json" \
-    -d '{
-      "messages": [
-        {"role": "user", "content": "Where is the capital of China?"}
-      ]
-    }'
+python -c "import paddle; paddle.version.show()"
+python -c "import paddle; paddle.utils.run_check()"
+python -c "from paddle.jit.marker import unified"
+python -c "from fastdeploy.model_executor.ops.xpu import block_attn"
 ```
 
-```python
-import openai
-host = "0.0.0.0"
-port = "8188"
-client = openai.Client(base_url=f"http://{host}:{port}/v1", api_key="null")
-
-response = client.completions.create(
-    model="null",
-    prompt="Where is the capital of China?",
-    stream=True,
-)
-for chunk in response:
-    print(chunk.choices[0].text, end='')
-print('\n')
-
-response = client.chat.completions.create(
-    model="null",
-    messages=[
-        {"role": "user", "content": "Where is the capital of China?"},
-    ],
-    stream=True,
-)
-for chunk in response:
-    if chunk.choices[0].delta:
-        print(chunk.choices[0].delta.content, end='')
-print('\n')
-```
+If all the above steps execute successfully, FastDeploy is installed correctly.
 
-For detailed OpenAI protocol specifications, see [OpenAI Chat Compeltion API](https://platform.openai.com/docs/api-reference/chat/create). Differences from the standard OpenAI protocol are documented in [OpenAI Protocol-Compatible API Server](../../online_serving/README.md).
+## How to deploy services on Kunlunxin XPU
+Refer to [**Supported Models and Service Deployment**](../../usage/kunlunxin_xpu_deployment.md) for details about the supported models and how to deploy services on Kunlunxin XPU.
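
Taken together, the revised page condenses installation into a short sequence; a sketch of the documented flow, relative to the cloned repository root:

```bash
git clone https://github.com/PaddlePaddle/FastDeploy
cd FastDeploy
git checkout <tag or branch>

# Fetch XTDK and XVLLM via the new helper script, then export their paths.
bash custom_ops/xpu_ops/src/download_dependencies.sh stable
export CLANG_PATH=$(pwd)/custom_ops/xpu_ops/src/third_party/xtdk
export XVLLM_PATH=$(pwd)/custom_ops/xpu_ops/src/third_party/xvllm

# Compile; the wheel lands in FastDeploy/dist.
bash build.sh
```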
docs/usage/kunlunxin_xpu_deployment.md

Lines changed: 92 additions & 0 deletions

@@ -0,0 +1,92 @@
+## Supported Models
+|Model Name|Context Length|Quantization|XPUs Required|Deployment Commands|
+|-|-|-|-|-|
+|ERNIE-4.5-300B-A47B|32K|WINT8|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|
+|ERNIE-4.5-300B-A47B|32K|WINT4|4 (recommended)|export XPU_VISIBLE_DEVICES="0,1,2,3" or "4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 4 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|
+|ERNIE-4.5-300B-A47B|32K|WINT4|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|
+|ERNIE-4.5-300B-A47B|128K|WINT4|8 (recommended)|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|
+|ERNIE-4.5-21B-A3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|
+|ERNIE-4.5-21B-A3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|
+|ERNIE-4.5-21B-A3B|32K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|
+|ERNIE-4.5-21B-A3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|
+|ERNIE-4.5-21B-A3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|
+|ERNIE-4.5-21B-A3B|128K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|
+|ERNIE-4.5-0.3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|
+|ERNIE-4.5-0.3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|
+|ERNIE-4.5-0.3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|
+|ERNIE-4.5-0.3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|
+
+## Quick start
+
+### Online serving (OpenAI API-Compatible server)
+
+Deploy an OpenAI API-compatible server using FastDeploy with the following commands:
+
+#### Start service
+
+**Deploy the ERNIE-4.5-300B-A47B-Paddle model with WINT4 precision and 32K context length on 4 XPUs**
+
+```bash
+export XPU_VISIBLE_DEVICES="0,1,2,3" # Specify which cards to be used
+python -m fastdeploy.entrypoints.openai.api_server \
+    --model baidu/ERNIE-4.5-300B-A47B-Paddle \
+    --port 8188 \
+    --tensor-parallel-size 4 \
+    --max-model-len 32768 \
+    --max-num-seqs 64 \
+    --quantization "wint4" \
+    --gpu-memory-utilization 0.9
+```
+
+**Note:** When deploying on 4 XPUs, only two device configurations are supported, constrained by hardware limitations such as interconnect capabilities:
+`export XPU_VISIBLE_DEVICES="0,1,2,3"`
+or
+`export XPU_VISIBLE_DEVICES="4,5,6,7"`
+
+Refer to [Parameters](../../parameters.md) for more options.
+
+All supported models can be found in the *Supported Models* section above.
+
+#### Send requests
+
+Send requests using either curl or Python.
+
+```bash
+curl -X POST "http://0.0.0.0:8188/v1/chat/completions" \
+    -H "Content-Type: application/json" \
+    -d '{
+      "messages": [
+        {"role": "user", "content": "Where is the capital of China?"}
+      ]
+    }'
+```
+
+```python
+import openai
+host = "0.0.0.0"
+port = "8188"
+client = openai.Client(base_url=f"http://{host}:{port}/v1", api_key="null")
+
+response = client.completions.create(
+    model="null",
+    prompt="Where is the capital of China?",
+    stream=True,
+)
+for chunk in response:
+    print(chunk.choices[0].text, end='')
+print('\n')
+
+response = client.chat.completions.create(
+    model="null",
+    messages=[
+        {"role": "user", "content": "Where is the capital of China?"},
+    ],
+    stream=True,
+)
+for chunk in response:
+    if chunk.choices[0].delta:
+        print(chunk.choices[0].delta.content, end='')
+print('\n')
+```
+
+For detailed OpenAI protocol specifications, see [OpenAI Chat Completion API](https://platform.openai.com/docs/api-reference/chat/create). Differences from the standard OpenAI protocol are documented in [OpenAI Protocol-Compatible API Server](../../online_serving/README.md).
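
The Python snippet above exercises both the completions and chat endpoints through the OpenAI client; an equivalent raw completions request with curl, under the same host and port assumptions as the chat example, would look roughly like this:

```bash
# Assumes the server started above is listening on 0.0.0.0:8188.
curl -X POST "http://0.0.0.0:8188/v1/completions" \
    -H "Content-Type: application/json" \
    -d '{
      "prompt": "Where is the capital of China?"
    }'
```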
