From bb6912a817e6ff805f07b752f8696da1c2e5d14d Mon Sep 17 00:00:00 2001 From: mayongqiang Date: Wed, 2 Jul 2025 11:02:28 +0800 Subject: [PATCH 1/2] update iluvatar gpu fastdeploy whl --- docs/get_started/installation/iluvatar_gpu.md | 2 +- docs/zh/get_started/installation/iluvatar_gpu.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/get_started/installation/iluvatar_gpu.md b/docs/get_started/installation/iluvatar_gpu.md index 85c907e324..dd288d638e 100644 --- a/docs/get_started/installation/iluvatar_gpu.md +++ b/docs/get_started/installation/iluvatar_gpu.md @@ -30,7 +30,7 @@ docker exec -it paddle_infer bash ```bash pip3 install paddlepaddle==3.1.0a0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/ pip3 install paddle-iluvatar-gpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ -pip3 install fastdeploy -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simplels +pip3 install fastdeploy_iluvatar_gpu -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple pip3 install aistudio-sdk==0.2.6 ``` diff --git a/docs/zh/get_started/installation/iluvatar_gpu.md b/docs/zh/get_started/installation/iluvatar_gpu.md index baeeaa6c63..7f110ef02e 100644 --- a/docs/zh/get_started/installation/iluvatar_gpu.md +++ b/docs/zh/get_started/installation/iluvatar_gpu.md @@ -29,7 +29,7 @@ docker exec -it paddle_infer bash ```bash pip3 install paddlepaddle==3.1.0a0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/ pip3 install paddle-iluvatar-gpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ -pip3 install fastdeploy -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simplels +pip3 install fastdeploy_iluvatar_gpu -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ --extra-index-url 
https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple pip3 install aistudio-sdk==0.2.6 ``` From 8d4e1ccd370c0742978818187042431a233379a6 Mon Sep 17 00:00:00 2001 From: mayongqiang Date: Wed, 2 Jul 2025 11:07:11 +0800 Subject: [PATCH 2/2] update iluvatar gpu fastdeploy whl --- docs/get_started/installation/iluvatar_gpu.md | 4 ++-- docs/zh/get_started/installation/iluvatar_gpu.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/get_started/installation/iluvatar_gpu.md b/docs/get_started/installation/iluvatar_gpu.md index dd288d638e..6f9514b2dc 100644 --- a/docs/get_started/installation/iluvatar_gpu.md +++ b/docs/get_started/installation/iluvatar_gpu.md @@ -1,4 +1,4 @@ -# Run ERNIE-4.5-300B-A47B model on iluvatar machine +# Run ERNIE-4.5-300B-A47B & ERNIE-4.5-21B-A3B model on iluvatar machine The current version of the software merely serves as a demonstration demo for the Iluvatar CoreX combined with the Fastdeploy inference framework for large models. There may be issues when running the latest ERNIE4.5 model, and we will conduct repairs and performance optimization in the future. Subsequent versions will provide customers with a more stable version. 
## Machine Preparation @@ -62,7 +62,7 @@ prompts = [ sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256) # load the model -llm = LLM(model="/home/paddle/ernie-4_5-300b-a47b-bf16-paddle", tensor_parallel_size=16, max_model_len=8192) +llm = LLM(model="/home/paddle/ernie-4_5-300b-a47b-bf16-paddle", tensor_parallel_size=16, max_model_len=8192, static_decode_blocks=0, quantization='wint8') # Perform batch inference outputs = llm.generate(prompts, sampling_params) diff --git a/docs/zh/get_started/installation/iluvatar_gpu.md b/docs/zh/get_started/installation/iluvatar_gpu.md index 7f110ef02e..2c5c8e9c45 100644 --- a/docs/zh/get_started/installation/iluvatar_gpu.md +++ b/docs/zh/get_started/installation/iluvatar_gpu.md @@ -1,4 +1,4 @@ -# 如何在天数机器上运行 ERNIE-4.5-300B-A47B-BF16 +# 如何在天数机器上运行 ERNIE-4.5-300B-A47B-BF16 & ERNIE-4.5-21B-A3B 当前版本软件只是作为天数芯片 + Fastdeploy 推理大模型的一个演示 demo,跑最新ERNIE4.5模型可能存在问题,后续进行修复和性能优化,给客户提供一个更稳定的版本。 ## 准备机器 @@ -62,7 +62,7 @@ prompts = [ sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256) # 加载模型 -llm = LLM(model="/home/paddle/ernie-4_5-300b-a47b-bf16-paddle", tensor_parallel_size=16, max_model_len=8192) +llm = LLM(model="/home/paddle/ernie-4_5-300b-a47b-bf16-paddle", tensor_parallel_size=16, max_model_len=8192, static_decode_blocks=0, quantization='wint8') # 批量进行推理(llm内部基于资源情况进行请求排队、动态插入处理) outputs = llm.generate(prompts, sampling_params)