
Commit 17314ee

[XPU] Update doc and add scripts for downloading dependencies (#2845)

* [XPU] update xvllm download
* update supported models
* fix xpu model runner in huge memory with small model
* update doc

1 parent 101ad33 commit 17314ee

File tree

8 files changed: +289 −254 lines changed

.gitignore

Lines changed: 2 additions & 0 deletions

@@ -162,3 +162,5 @@ custom_ops/tmp*
 build
 
 .ccls-cache
+
+third_party
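
As a quick sanity check of the new ignore rule, `git check-ignore` can confirm that the downloaded toolchains are excluded; the path below assumes the layout created by the dependency script added in this commit:

```bash
# Prints the matching .gitignore rule and the path if it is ignored.
git check-ignore -v custom_ops/xpu_ops/src/third_party/xtdk
```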
custom_ops/xpu_ops/src/download_dependencies.sh

Lines changed: 54 additions & 0 deletions

@@ -0,0 +1,54 @@
+#!/bin/bash
+
+if [ $# -ne 1 ] || { [ "$1" != "stable" ] && [ "$1" != "develop" ]; }; then
+    echo "Usage: $0 <stable|develop>"
+    exit 1
+fi
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+THIRDPARTY_DIR="$SCRIPT_DIR/third_party"
+
+rm -rf "$THIRDPARTY_DIR"
+mkdir -p "$THIRDPARTY_DIR" || exit 1
+
+if [ "$1" == "stable" ]; then
+    version_xvllm="20250710"
+    version_xtdk="3.2.40.1"
+else
+    version_xvllm="latest"
+    version_xtdk="latest"
+fi
+
+(
+    cd "$THIRDPARTY_DIR" || exit 1
+
+    # Clean previous installation
+    rm -rf output* xvllm* xtdk-llvm* output.tar.gz xtdk-llvm*tar.gz
+
+    # Download and install xvllm
+    if ! wget "https://klx-sdk-release-public.su.bcebos.com/xinfer/daily/eb/${version_xvllm}/output.tar.gz"; then
+        echo "Error downloading xvllm"
+        exit 2
+    fi
+    tar -zxf output.tar.gz && mv output xvllm && rm output.tar.gz
+
+    # Download and install xtdk
+    if ! wget "https://klx-sdk-release-public.su.bcebos.com/xtdk_15fusion/dev/${version_xtdk}/xtdk-llvm15-ubuntu2004_x86_64.tar.gz"; then
+        echo "Error downloading xtdk"
+        exit 3
+    fi
+    tar -zxf xtdk-llvm15-ubuntu2004_x86_64.tar.gz && \
+        mv xtdk-llvm15-ubuntu2004_x86_64 xtdk && \
+        rm xtdk-llvm15-ubuntu2004_x86_64.tar.gz
+)
+
+if [ $? -ne 0 ]; then
+    echo "Installation failed"
+    exit 4
+fi
+
+echo "Installation completed in: $THIRDPARTY_DIR"
+echo "You can set environment variables as follows to use XVLLM and XTDK:"
+echo "  export CLANG_PATH=$THIRDPARTY_DIR/xtdk"
+echo "  export XVLLM_PATH=$THIRDPARTY_DIR/xvllm"
+echo ""

dockerfiles/Dockerfile.xpu

Lines changed: 7 additions & 12 deletions
@@ -17,23 +17,18 @@ RUN python -m pip uninstall paddlepaddle-gpu paddlepaddle-xpu -y
 # install paddlepaddle
 RUN python -m pip install --no-cache-dir --progress-bar off paddlepaddle-xpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
 
+COPY . /workspace/FastDeploy
+
 # get xtdk and xvllm and xre
-RUN mkdir -p /workspace/deps && cd /workspace/deps && wget https://klx-sdk-release-public.su.bcebos.com/xinfer/daily/eb/20250710/output.tar.gz && \
-    tar -zxf output.tar.gz && mv output xvllm && \
-    wget https://klx-sdk-release-public.su.bcebos.com/xtdk_15fusion/dev/3.2.40.1/xtdk-llvm15-ubuntu2004_x86_64.tar.gz && \
-    tar -zxf xtdk-llvm15-ubuntu2004_x86_64.tar.gz && mv xtdk-llvm15-ubuntu2004_x86_64 xtdk && \
+RUN mkdir -p /workspace/deps && cd /workspace/deps && \
     wget https://klx-sdk-release-public.su.bcebos.com/xre/kl3-release/5.0.21.21/xre-Linux-x86_64-5.0.21.21.tar.gz && \
-    tar -zxf xre-Linux-x86_64-5.0.21.21.tar.gz && mv xre-Linux-x86_64-5.0.21.21 xre
+    tar -zxf xre-Linux-x86_64-5.0.21.21.tar.gz && mv xre-Linux-x86_64-5.0.21.21 xre && \
+    cd /workspace/FastDeploy && bash custom_ops/xpu_ops/src/download_dependencies.sh stable
 
 ENV PATH=/workspace/deps/xre/bin:$PATH
-ENV CLANG_PATH=/workspace/deps/xtdk
-ENV XVLLM_PATH=/workspace/deps/xvllm
+ENV CLANG_PATH=/workspace/FastDeploy/custom_ops/xpu_ops/src/third_party/xtdk
+ENV XVLLM_PATH=/workspace/FastDeploy/custom_ops/xpu_ops/src/third_party/xvllm
 
-ENV OPENBLAS_NUM_THREADS=1
-ENV OMP_NUM_THREADS=1
-ENV MKL_NUM_THREADS=1
-USER root
-COPY . /workspace/FastDeploy
 # build and install FastDeploy
 RUN cd /workspace/FastDeploy && bash build.sh && python -m pip install --no-cache-dir dist/* && rm -rf /workspace/FastDeploy
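
For context, an image can be built from this Dockerfile in the usual way; the tag below is illustrative and not part of this commit:

```bash
# Run from the FastDeploy repository root, which is both the build
# context (COPY . /workspace/FastDeploy) and where dockerfiles/ lives.
docker build -f dockerfiles/Dockerfile.xpu -t fastdeploy-xpu:latest .
```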

docs/get_started/installation/kunlunxin_xpu.md

Lines changed: 20 additions & 117 deletions
@@ -72,148 +72,51 @@ Alternatively, you can install the latest version of PaddlePaddle (Not recommended)
 python -m pip install --pre paddlepaddle-xpu -i https://www.paddlepaddle.org.cn/packages/nightly/xpu-p800/
 ```
 
-### Download Kunlunxin Toolkit (XTDK) and XVLLM library, then set their paths.
-
-```bash
-# XTDK
-wget https://klx-sdk-release-public.su.bcebos.com/xtdk_15fusion/dev/3.2.40.1/xtdk-llvm15-ubuntu2004_x86_64.tar.gz
-tar -xvf xtdk-llvm15-ubuntu2004_x86_64.tar.gz && mv xtdk-llvm15-ubuntu2004_x86_64 xtdk
-export CLANG_PATH=$(pwd)/xtdk
-
-# XVLLM
-wget https://klx-sdk-release-public.su.bcebos.com/xinfer/daily/eb/20250624/output.tar.gz
-tar -xvf output.tar.gz && mv output xvllm
-export XVLLM_PATH=$(pwd)/xvllm
-```
-
-Alternatively, you can download the latest versions of XTDK and XVLLM (Not recommended)
-
-```bash
-XTDK: https://klx-sdk-release-public.su.bcebos.com/xtdk_15fusion/dev/latest/xtdk-llvm15-ubuntu2004_x86_64.tar.gz
-XVLLM: https://klx-sdk-release-public.su.bcebos.com/xinfer/daily/eb/latest/output.tar.gz
-```
-
-### Download FastDeploy source code, checkout the stable branch/TAG, then compile and install.
+### Download FastDeploy source code, checkout the stable branch/TAG
 
 ```bash
 git clone https://github.com/PaddlePaddle/FastDeploy
 cd FastDeploy
+git checkout <tag or branch>
-bash build.sh
 ```
 
-The compiled outputs will be located in the ```FastDeploy/dist``` directory.
-
-## Installation verification
+### Download Kunlunxin Compilation Dependencies
 
 ```bash
-python -c "import paddle; paddle.version.show()"
-python -c "import paddle; paddle.utils.run_check()"
-python -c "from paddle.jit.marker import unified"
-python -c "from fastdeploy.model_executor.ops.xpu import block_attn"
+bash custom_ops/xpu_ops/src/download_dependencies.sh stable
 ```
 
-If all the above steps execute successfully, FastDeploy is installed correctly.
-
-## Quick start
-
-The P800 supports the deployment of the ```ERNIE-4.5-300B-A47B-Paddle``` model using the following configurations (Note: Different configurations may result in variations in performance).
-- 32K WINT4 with 8 XPUs (Recommended)
-- 128K WINT4 with 8 XPUs
-- 32K WINT4 with 4 XPUs
-
-### Online serving (OpenAI API-Compatible server)
-
-Deploy an OpenAI API-compatible server using FastDeploy with the following commands:
-
-#### Start service
-
-**Deploy the ERNIE-4.5-300B-A47B-Paddle model with WINT4 precision and 32K context length on 8 XPUs(Recommended)**
+Alternatively, you can download the latest versions of XTDK and XVLLM (Not recommended)
 
 ```bash
-python -m fastdeploy.entrypoints.openai.api_server \
-    --model baidu/ERNIE-4.5-300B-A47B-Paddle \
-    --port 8188 \
-    --tensor-parallel-size 8 \
-    --max-model-len 32768 \
-    --max-num-seqs 64 \
-    --quantization "wint4" \
-    --gpu-memory-utilization 0.9
+bash custom_ops/xpu_ops/src/download_dependencies.sh develop
 ```
 
-**Deploy the ERNIE-4.5-300B-A47B-Paddle model with WINT4 precision and 128K context length on 8 XPUs**
+Set the environment variables:
 
 ```bash
-python -m fastdeploy.entrypoints.openai.api_server \
-    --model baidu/ERNIE-4.5-300B-A47B-Paddle \
-    --port 8188 \
-    --tensor-parallel-size 8 \
-    --max-model-len 131072 \
-    --max-num-seqs 64 \
-    --quantization "wint4" \
-    --gpu-memory-utilization 0.9
+export CLANG_PATH=$(pwd)/custom_ops/xpu_ops/src/third_party/xtdk
+export XVLLM_PATH=$(pwd)/custom_ops/xpu_ops/src/third_party/xvllm
 ```
 
-**Deploy the ERNIE-4.5-300B-A47B-Paddle model with WINT4 precision and 32K context length on 4 XPUs**
+### Compile and Install
 
 ```bash
-export XPU_VISIBLE_DEVICES="0,1,2,3" # Specify which cards to be used
-python -m fastdeploy.entrypoints.openai.api_server \
-    --model baidu/ERNIE-4.5-300B-A47B-Paddle \
-    --port 8188 \
-    --tensor-parallel-size 4 \
-    --max-model-len 32768 \
-    --max-num-seqs 64 \
-    --quantization "wint4" \
-    --gpu-memory-utilization 0.9
+bash build.sh
 ```
 
-**Note:** When deploying on 4 XPUs, only two configurations are supported which constrained by hardware limitations such as interconnect capabilities.
-`export XPU_VISIBLE_DEVICES="0,1,2,3"`
-or
-`export XPU_VISIBLE_DEVICES="4,5,6,7"`
-
-Refer to [Parameters](../../parameters.md) for more options.
-
-#### Send requests
+The compiled outputs will be located in the ```FastDeploy/dist``` directory.
 
-Send requests using either curl or Python
+## Installation verification
 
 ```bash
-curl -X POST "http://0.0.0.0:8188/v1/chat/completions" \
-    -H "Content-Type: application/json" \
-    -d '{
-      "messages": [
-        {"role": "user", "content": "Where is the capital of China?"}
-      ]
-    }'
+python -c "import paddle; paddle.version.show()"
+python -c "import paddle; paddle.utils.run_check()"
+python -c "from paddle.jit.marker import unified"
+python -c "from fastdeploy.model_executor.ops.xpu import block_attn"
 ```
 
-```python
-import openai
-host = "0.0.0.0"
-port = "8188"
-client = openai.Client(base_url=f"http://{host}:{port}/v1", api_key="null")
-
-response = client.completions.create(
-    model="null",
-    prompt="Where is the capital of China?",
-    stream=True,
-)
-for chunk in response:
-    print(chunk.choices[0].text, end='')
-print('\n')
-
-response = client.chat.completions.create(
-    model="null",
-    messages=[
-        {"role": "user", "content": "Where is the capital of China?"},
-    ],
-    stream=True,
-)
-for chunk in response:
-    if chunk.choices[0].delta:
-        print(chunk.choices[0].delta.content, end='')
-print('\n')
-```
+If all the above steps execute successfully, FastDeploy is installed correctly.
 
-For detailed OpenAI protocol specifications, see [OpenAI Chat Compeltion API](https://platform.openai.com/docs/api-reference/chat/create). Differences from the standard OpenAI protocol are documented in [OpenAI Protocol-Compatible API Server](../../online_serving/README.md).
+## How to deploy services on Kunlunxin XPU
+Refer to [**Supported Models and Service Deployment**](../../usage/kunlunxin_xpu_deployment.md) for details about the supported models and how to deploy services on Kunlunxin XPU.
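
Taken together, the revised page condenses installation into a short sequence; a sketch of the documented flow, relative to the cloned repository root:

```bash
git clone https://github.com/PaddlePaddle/FastDeploy
cd FastDeploy
git checkout <tag or branch>

# Fetch XTDK and XVLLM via the new helper script, then export their paths.
bash custom_ops/xpu_ops/src/download_dependencies.sh stable
export CLANG_PATH=$(pwd)/custom_ops/xpu_ops/src/third_party/xtdk
export XVLLM_PATH=$(pwd)/custom_ops/xpu_ops/src/third_party/xvllm

# Compile; the wheel lands in FastDeploy/dist.
bash build.sh
```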
docs/usage/kunlunxin_xpu_deployment.md

Lines changed: 92 additions & 0 deletions

@@ -0,0 +1,92 @@
+## Supported Models
+|Model Name|Context Length|Quantization|XPUs Required|Deployment Commands|
+|-|-|-|-|-|
+|ERNIE-4.5-300B-A47B|32K|WINT8|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|
+|ERNIE-4.5-300B-A47B|32K|WINT4|4 (recommended)|export XPU_VISIBLE_DEVICES="0,1,2,3" or "4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 4 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|
+|ERNIE-4.5-300B-A47B|32K|WINT4|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|
+|ERNIE-4.5-300B-A47B|128K|WINT4|8 (recommended)|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|
+|ERNIE-4.5-21B-A3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|
+|ERNIE-4.5-21B-A3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|
+|ERNIE-4.5-21B-A3B|32K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|
+|ERNIE-4.5-21B-A3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|
+|ERNIE-4.5-21B-A3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|
+|ERNIE-4.5-21B-A3B|128K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|
+|ERNIE-4.5-0.3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|
+|ERNIE-4.5-0.3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|
+|ERNIE-4.5-0.3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|
+|ERNIE-4.5-0.3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|
+
+## Quick start
+
+### Online serving (OpenAI API-Compatible server)
+
+Deploy an OpenAI API-compatible server using FastDeploy with the following commands:
+
+#### Start service
+
+**Deploy the ERNIE-4.5-300B-A47B-Paddle model with WINT4 precision and 32K context length on 4 XPUs**
+
+```bash
+export XPU_VISIBLE_DEVICES="0,1,2,3" # Specify which cards to be used
+python -m fastdeploy.entrypoints.openai.api_server \
+    --model baidu/ERNIE-4.5-300B-A47B-Paddle \
+    --port 8188 \
+    --tensor-parallel-size 4 \
+    --max-model-len 32768 \
+    --max-num-seqs 64 \
+    --quantization "wint4" \
+    --gpu-memory-utilization 0.9
+```
+
+**Note:** When deploying on 4 XPUs, only two device configurations are supported, constrained by hardware limitations such as interconnect capabilities:
+`export XPU_VISIBLE_DEVICES="0,1,2,3"`
+or
+`export XPU_VISIBLE_DEVICES="4,5,6,7"`
+
+Refer to [Parameters](../../parameters.md) for more options.
+
+All supported models can be found in the *Supported Models* section above.
+
+#### Send requests
+
+Send requests using either curl or Python.
+
+```bash
+curl -X POST "http://0.0.0.0:8188/v1/chat/completions" \
+    -H "Content-Type: application/json" \
+    -d '{
+      "messages": [
+        {"role": "user", "content": "Where is the capital of China?"}
+      ]
+    }'
+```
+
+```python
+import openai
+host = "0.0.0.0"
+port = "8188"
+client = openai.Client(base_url=f"http://{host}:{port}/v1", api_key="null")
+
+response = client.completions.create(
+    model="null",
+    prompt="Where is the capital of China?",
+    stream=True,
+)
+for chunk in response:
+    print(chunk.choices[0].text, end='')
+print('\n')
+
+response = client.chat.completions.create(
+    model="null",
+    messages=[
+        {"role": "user", "content": "Where is the capital of China?"},
+    ],
+    stream=True,
+)
+for chunk in response:
+    if chunk.choices[0].delta:
+        print(chunk.choices[0].delta.content, end='')
+print('\n')
+```
+
+For detailed OpenAI protocol specifications, see [OpenAI Chat Completion API](https://platform.openai.com/docs/api-reference/chat/create). Differences from the standard OpenAI protocol are documented in [OpenAI Protocol-Compatible API Server](../../online_serving/README.md).
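
The Python snippet above exercises both the completions and chat endpoints through the OpenAI client; an equivalent raw completions request with curl, under the same host and port assumptions as the chat example, would look roughly like this:

```bash
# Assumes the server started above is listening on 0.0.0.0:8188.
curl -X POST "http://0.0.0.0:8188/v1/completions" \
    -H "Content-Type: application/json" \
    -d '{
      "prompt": "Where is the capital of China?"
    }'
```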
