
Commit 1f28bdf

dcu adapter ernie45t (#2756)
Co-authored-by: lifu <lifu@sugon.com>
Co-authored-by: yongqiangma <xing.wo@163.com>
1 parent 03a7499 commit 1f28bdf

30 files changed: +1133 −41 lines

build.sh

Lines changed: 3 additions & 1 deletion
@@ -77,8 +77,10 @@ function copy_ops(){
     is_rocm=`$python -c "import paddle; print(paddle.is_compiled_with_rocm())"`
     if [ "$is_rocm" = "True" ]; then
         DEVICE_TYPE="rocm"
+        mkdir -p ../fastdeploy/model_executor/ops/base
+        cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
         cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/gpu
-        echo -e "ROCM ops have been copy to fastdeploy"
+        echo -e "BASE and ROCM ops have been copy to fastdeploy"
         return
     fi
     mkdir -p ../fastdeploy/model_executor/ops/base

custom_ops/gpu_ops/helper.h

Lines changed: 15 additions & 0 deletions
@@ -214,11 +214,19 @@ HOSTDEVICE inline void Store(const AlignedVector<T, Size> &vec, T *addr) {
   *addr_vec = vec;
 }
 
+#ifdef PADDLE_WITH_HIP
+template <int Size>
+HOSTDEVICE inline void Store(const AlignedVector<hip_bfloat16, Size> &vec,
+                             int8_t *addr) {
+  printf("Error: Store hip_bfloat16 to int8_t is not supported!");
+}
+#else
 template <int Size>
 HOSTDEVICE inline void Store(const AlignedVector<__nv_bfloat16, Size> &vec,
                              int8_t *addr) {
   printf("Error: Store __nv_bfloat16 to int8_t is not supported!");
 }
+#endif
 
 template <int Size>
 HOSTDEVICE inline void Store(const AlignedVector<half, Size> &vec,
@@ -478,7 +486,12 @@ template <typename T>
 static void PrintMatrix3(const T *mat_d, int num, std::string name) {
 
   std::vector<T> tmp(num);
+#ifdef PADDLE_WITH_HIP
+  hipMemcpy(tmp.data(), mat_d, sizeof(T) * num, hipMemcpyDeviceToHost);
+#else
   cudaMemcpy(tmp.data(), mat_d, sizeof(T) * num, cudaMemcpyDeviceToHost);
+#endif
+
 
   std::ofstream outfile;
   outfile.open(name + ".txt", std::ios::out);
@@ -495,6 +508,7 @@ static void PrintMatrix3(const T *mat_d, int num, std::string name) {
   outfile.close();
 }
 
+#ifndef PADDLE_WITH_HIP
 __forceinline__ __device__ uint32_t ld_flag_acquire(uint32_t *flag_addr,
                                                     int mode = 0) {
   uint32_t flag;
@@ -534,6 +548,7 @@ inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
       cudaDevAttrMaxSharedMemoryPerBlockOptin, device);
   return max_shared_mem_per_block_opt_in;
 }
+#endif
 
 inline int GetSMVersion() {
   static int sm_version = phi::backends::gpu::GetGPUComputeCapability(
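The helper.h changes all follow one pattern: add a hip_bfloat16 overload next to the existing __nv_bfloat16 one, route the device-to-host copy through hipMemcpy when PADDLE_WITH_HIP is defined, and fence CUDA-only helpers (ld_flag_acquire through get_cuda_max_shared_memory_per_block_opt_in) behind #ifndef PADDLE_WITH_HIP. Below is a minimal, self-contained sketch of that guard style; it is not FastDeploy code, and the DEMO_* macros, the fill kernel, and print_first are illustrative stand-ins.

```cpp
// Sketch only: one source file that builds with either hipcc (HIP/DCU path)
// or nvcc (CUDA path), using the same PADDLE_WITH_HIP switch the hunk relies on.
#include <cstdio>
#include <vector>

#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#define DEMO_MALLOC(ptr, bytes) hipMalloc(ptr, bytes)
#define DEMO_FREE(ptr) hipFree(ptr)
#define DEMO_MEMCPY_D2H(dst, src, bytes) \
  hipMemcpy(dst, src, bytes, hipMemcpyDeviceToHost)
#else
#include <cuda_runtime.h>
#define DEMO_MALLOC(ptr, bytes) cudaMalloc(ptr, bytes)
#define DEMO_FREE(ptr) cudaFree(ptr)
#define DEMO_MEMCPY_D2H(dst, src, bytes) \
  cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost)
#endif

// Small kernel so there is something on the device to copy back.
__global__ void fill(float* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = static_cast<float>(i);
}

// Same shape as PrintMatrix3: copy `num` device elements to the host,
// then inspect them on the CPU.
void print_first(const float* mat_d, int num) {
  std::vector<float> tmp(num);
  DEMO_MEMCPY_D2H(tmp.data(), mat_d, sizeof(float) * num);
  std::printf("first=%f last=%f\n", tmp.front(), tmp.back());
}

int main() {
  const int n = 256;
  float* d = nullptr;
  DEMO_MALLOC(reinterpret_cast<void**>(&d), n * sizeof(float));
  fill<<<(n + 127) / 128, 128>>>(d, n);
  print_first(d, n);
  DEMO_FREE(d);
  return 0;
}
```

Compiling with hipcc and -DPADDLE_WITH_HIP (a macro the Paddle ROCm toolchain is expected to define for the real ops) takes the HIP branch; plain nvcc takes the CUDA branch.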

custom_ops/gpu_ops/set_data_ipc.cu

Lines changed: 5 additions & 0 deletions
@@ -91,7 +91,12 @@ void set_data_ipc(const paddle::Tensor& tmp_input,
   memset((void *)shm, 0, sizeof(*shm));
 
   void *data_ptr_now = reinterpret_cast<void*>(const_cast<data_t*>(tmp_input.data<data_t>()));
+#ifdef PADDLE_WITH_HIP
+  checkCudaErrors(hipIpcGetMemHandle((hipIpcMemHandle_t *)&shm->memHandle, data_ptr_now));
+#else
   checkCudaErrors(cudaIpcGetMemHandle((cudaIpcMemHandle_t *)&shm->memHandle, data_ptr_now));
+#endif
+
 
 }
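set_data_ipc keeps its existing flow, zeroing a shared-memory struct and taking the tensor's device pointer, and only swaps cudaIpcGetMemHandle for hipIpcGetMemHandle on HIP. The sketch below shows that export step with the stock HIP runtime API; it is not the FastDeploy op (DemoShm and CHECK_HIP are stand-ins for shmStruct and checkCudaErrors), but hipIpcGetMemHandle itself is the documented call.

```cpp
// Export side of the IPC handshake, as a standalone sketch. In the real op
// the handle is written into a POSIX shared-memory segment for other
// processes to pick up; here it just lives in a local struct.
#include <cstdio>
#include <hip/hip_runtime.h>

struct DemoShm {
  hipIpcMemHandle_t memHandle;  // stands in for shm->memHandle
};

#define CHECK_HIP(cmd)                                                 \
  do {                                                                 \
    hipError_t e = (cmd);                                              \
    if (e != hipSuccess) {                                             \
      std::printf("HIP error: %s (%s:%d)\n", hipGetErrorString(e),     \
                  __FILE__, __LINE__);                                 \
      return 1;                                                        \
    }                                                                  \
  } while (0)

int main() {
  float* d_ptr = nullptr;
  CHECK_HIP(hipMalloc(reinterpret_cast<void**>(&d_ptr), 1024 * sizeof(float)));

  DemoShm shm{};
  // Same call the hunk adds: publish an IPC handle for an existing device
  // allocation so another process can map it.
  CHECK_HIP(hipIpcGetMemHandle(&shm.memHandle, d_ptr));

  std::printf("IPC handle exported for device pointer %p\n",
              static_cast<void*>(d_ptr));
  CHECK_HIP(hipFree(d_ptr));
  return 0;
}
```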

custom_ops/gpu_ops/share_external_data.cu

Lines changed: 8 additions & 0 deletions
@@ -37,10 +37,18 @@ std::vector<paddle::Tensor> ShareExternalData(paddle::Tensor& input,
   }
   shm = (volatile shmStruct *)info.addr;
   void *ptr = nullptr;
+#ifdef PADDLE_WITH_HIP
+  checkCudaErrors(
+      hipIpcOpenMemHandle(&ptr,
+                          *(hipIpcMemHandle_t *)&shm->memHandle,  // NOLINT
+                          hipIpcMemLazyEnablePeerAccess));
+#else
   checkCudaErrors(
       cudaIpcOpenMemHandle(&ptr,
                            *(cudaIpcMemHandle_t *)&shm->memHandle,  // NOLINT
                            cudaIpcMemLazyEnablePeerAccess));
+#endif
+
   paddle::Tensor tmp_tensor = paddle::from_blob(
       ptr,
       shape,
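share_external_data is the consuming side of the same IPC handshake: it opens the handle published by set_data_ipc with hipIpcOpenMemHandle and then wraps the mapped pointer in a paddle::Tensor via paddle::from_blob. A hedged sketch of just the open/read/close half in plain HIP (no Paddle) follows; the open_and_peek function and the assumption that the handle arrives via shared memory are illustrative.

```cpp
// Import side only, as a sketch. The handle is assumed to have been filled
// by hipIpcGetMemHandle in the exporting process and transported here
// (FastDeploy uses a shared-memory segment for this).
#include <cstdio>
#include <hip/hip_runtime.h>

int open_and_peek(const hipIpcMemHandle_t& handle) {
  void* ptr = nullptr;
  hipError_t e =
      hipIpcOpenMemHandle(&ptr, handle, hipIpcMemLazyEnablePeerAccess);
  if (e != hipSuccess) {
    std::printf("hipIpcOpenMemHandle failed: %s\n", hipGetErrorString(e));
    return 1;
  }

  // Read one element through the mapped pointer to confirm access.
  float first = 0.f;
  hipMemcpy(&first, ptr, sizeof(float), hipMemcpyDeviceToHost);
  std::printf("first element of shared buffer: %f\n", first);

  // Unmap when done; the exporting process still owns the allocation.
  hipIpcCloseMemHandle(ptr);
  return 0;
}
```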

custom_ops/setup_ops.py

Lines changed: 39 additions & 30 deletions
@@ -187,39 +187,45 @@ def find_end_files(directory, end_str):
 if paddle.is_compiled_with_rocm():
     # NOTE(@duanyanhui): paddle.is_compiled_with_cuda() returns True when paddle compiled with rocm.
     # so we need to check if paddle compiled with rocm at first.
+    json_dir = "third_party/nlohmann_json"
+    if not os.path.exists(json_dir) or not os.listdir(json_dir):
+        if not os.path.exists(json_dir):
+            os.makedirs(json_dir)
+        clone_git_repo("v3.11.3", "https://bgithub.xyz/nlohmann/json.git", json_dir)
+        if not os.listdir(json_dir):
+            raise ValueError("Git clone nlohmann_json failed!")
+    sources=[
+        "gpu_ops/set_value_by_flags.cu",
+        "gpu_ops/token_penalty_multi_scores.cu",
+        "gpu_ops/stop_generation.cu",
+        "gpu_ops/stop_generation_multi_ends.cu",
+        "gpu_ops/get_padding_offset.cu",
+        "gpu_ops/update_inputs.cu",
+        "gpu_ops/rebuild_padding.cu",
+        "gpu_ops/step.cu",
+        "gpu_ops/set_data_ipc.cu",
+        "gpu_ops/moe/tritonmoe_preprocess.cu",
+        "gpu_ops/step_system_cache.cu",
+        "gpu_ops/get_output_ep.cc",
+        "gpu_ops/speculate_decoding/speculate_get_padding_offset.cu",
+        "gpu_ops/speculate_decoding/speculate_get_output.cc",
+        "gpu_ops/share_external_data.cu",
+        "gpu_ops/speculate_decoding/speculate_clear_accept_nums.cu",
+        "gpu_ops/speculate_decoding/speculate_get_output_padding_offset.cu",
+        "gpu_ops/speculate_decoding/speculate_get_seq_lens_output.cu",
+        "gpu_ops/speculate_decoding/speculate_save_output.cc",
+        "gpu_ops/speculate_decoding/speculate_set_value_by_flags.cu",
+        "gpu_ops/speculate_decoding/speculate_step.cu",
+        "gpu_ops/speculate_decoding/speculate_step_system_cache.cu",
+        "gpu_ops/speculate_decoding/speculate_update_v3.cu",
+        "gpu_ops/get_position_ids_and_mask_encoder_batch.cu",
+        "gpu_ops/fused_rotary_position_encoding.cu",
+        "gpu_ops/step_reschedule.cu",
+    ]
     setup(
         name="fastdeploy_ops",
         ext_modules=CUDAExtension(
-            sources=[
-                "gpu_ops/save_with_output.cc",
-                "gpu_ops/set_mask_value.cu",
-                "gpu_ops/set_value_by_flags.cu",
-                "gpu_ops/ngram_mask.cu",
-                "gpu_ops/gather_idx.cu",
-                "gpu_ops/token_penalty_multi_scores.cu",
-                "gpu_ops/token_penalty_only_once.cu",
-                "gpu_ops/stop_generation.cu",
-                "gpu_ops/stop_generation_multi_ends.cu",
-                "gpu_ops/stop_generation_multi_stop_seqs.cu",
-                "gpu_ops/set_flags.cu",
-                "gpu_ops/fused_get_rope.cu",
-                "gpu_ops/transfer_output.cc",
-                "gpu_ops/get_padding_offset.cu",
-                "gpu_ops/update_inputs.cu",
-                "gpu_ops/update_inputs_beam.cu",
-                "gpu_ops/beam_search_softmax.cu",
-                "gpu_ops/rebuild_padding.cu",
-                "gpu_ops/save_with_output_msg.cc",
-                "gpu_ops/get_output.cc",
-                "gpu_ops/get_output_msg_with_topk.cc",
-                "gpu_ops/step.cu",
-                "gpu_ops/step_reschedule.cu",
-                "gpu_ops/set_data_ipc.cu",
-                "gpu_ops/read_data_ipc.cu",
-                "gpu_ops/dequant_int8.cu",
-                "gpu_ops/enforce_generation.cu",
-                "gpu_ops/tune_cublaslt_gemm.cu",
-            ],
+            sources=sources,
             extra_compile_args={
                 "cxx": ["-O3"],
                 "hipcc": [
@@ -231,6 +237,9 @@ def find_end_files(directory, end_str):
                     "-U__HIP_NO_BFLOAT16_CONVERSIONS__",
                     "-U__HIP_NO_BFLOAT162_OPERATORS__",
                     "-U__HIP_NO_BFLOAT162_CONVERSIONS__",
+                    "-DPADDLE_DEV",
+                    "-Ithird_party/nlohmann_json/include",
+                    "-Igpu_ops",
                 ],
             },
         ),

docs/get_started/installation/README.md

Lines changed: 1 addition & 0 deletions
@@ -6,3 +6,4 @@ FastDeploy currently supports installation on the following hardware platforms:
 - [Kunlun XPU Installation](kunlunxin_xpu.md)
 - [Enflame S60 GCU Installation](Enflame_gcu.md)
 - [Iluvatar GPU Installation](iluvatar_gpu.md)
+- [Hygon DCU Installation](hygon_dcu.md)

docs/get_started/installation/hygon_dcu.md

Lines changed: 81 additions & 0 deletions (new file)

# Run the ERNIE-4.5-300B-A47B & ERNIE-4.5-21B-A3B models on a Hygon machine

The current release is a demonstration of the Hygon K100AI running large models with the FastDeploy inference framework. Issues may still occur with the latest ERNIE 4.5 models; fixes and performance optimization will follow, and later releases will provide a more stable version.

## Requirements

First, prepare a machine with the following configuration:
- OS: Linux
- Python: 3.10
- Memory: 2 TB
- Disk: 4 TB
- DCU model: K100AI
- DCU driver version: ≥ 6.3.8-V1.9.2

## 1. Set up using Docker (Recommended)

```bash
mkdir Work
cd Work
docker pull image.sourcefind.cn:5000/dcu/admin/base/custom:fastdeploy2.0.0-kylinv10-dtk25.04-py3.10

docker run -it \
    --network=host \
    --name=ernie45t \
    --privileged \
    --device=/dev/kfd \
    --device=/dev/dri \
    --ipc=host \
    --shm-size=16G \
    --group-add video \
    --cap-add=SYS_PTRACE \
    --security-opt seccomp=unconfined \
    -u root \
    --ulimit stack=-1:-1 \
    --ulimit memlock=-1:-1 \
    -v `pwd`:/home \
    -v /opt/hyhal:/opt/hyhal:ro \
    image.sourcefind.cn:5000/dcu/admin/base/custom:fastdeploy2.0.0-kylinv10-dtk25.04-py3.10 /bin/bash
```

## 2. Start the service

```bash
export FD_ATTENTION_BACKEND="BLOCK_ATTN"
python -m fastdeploy.entrypoints.openai.api_server \
    --model "/models/ERNIE-45-Turbo/ERNIE-4.5-300B-A47B-Paddle/" \
    --port 8188 \
    --tensor-parallel-size 8 \
    --quantization=wint8 \
    --gpu-memory-utilization=0.8
```

#### Send requests

Send requests using either curl or Python:

```bash
curl -X POST "http://0.0.0.0:8188/v1/chat/completions" \
-H "Content-Type: application/json" \
-d '{
  "messages": [
    {"role": "user", "content": "Where is the capital of China?"}
  ]
}'
```

```python
import openai

ip = "0.0.0.0"
service_http_port = "8188"
client = openai.Client(base_url=f"http://{ip}:{service_http_port}/v1", api_key="EMPTY_API_KEY")

response = client.chat.completions.create(
    model="default",
    messages=[
        {"role": "user", "content": "Eliza's rate per hour for the first 40 hours she works each week is $10. She also receives an overtime pay of 1.2 times her regular hourly rate. If Eliza worked for 45 hours this week, how much are her earnings for this week?"},
    ],
    temperature=1,
    max_tokens=1024,
    stream=False,
)
print(response)
```

docs/zh/get_started/installation/README.md

Lines changed: 1 addition & 0 deletions
@@ -6,3 +6,4 @@ FastDeploy currently supports installation on the following hardware platforms:
 - [Kunlunxin XPU Installation](kunlunxin_xpu.md)
 - [Enflame S60 GCU Installation](Enflame_gcu.md)
 - [Iluvatar GPU Installation](iluvatar_gpu.md)
+- [Hygon DCU Installation](hygon_dcu.md)

docs/zh/get_started/installation/hygon_dcu.md

Lines changed: 81 additions & 0 deletions (new file)

# Run ERNIE-4.5-300B-A47B & ERNIE-4.5-21B-A3B on the Hygon K100AI with FastDeploy

The current release is a demonstration of large-model inference with FastDeploy on the K100AI. Issues may still occur with the latest ERNIE 4.5 models; fixes and performance optimization will follow, and a more stable version will be provided later.

## Prepare a machine

First, prepare a machine with the following configuration:
- OS: Linux
- Python: 3.10
- Memory: 2 TB
- Disk: 4 TB
- DCU model: K100AI
- DCU driver version: ≥ 6.3.8-V1.9.2

## 1. Set up using Docker (Recommended)

```bash
mkdir Work
cd Work
docker pull image.sourcefind.cn:5000/dcu/admin/base/custom:fastdeploy2.0.0-kylinv10-dtk25.04-py3.10

docker run -it \
    --network=host \
    --name=ernie45t \
    --privileged \
    --device=/dev/kfd \
    --device=/dev/dri \
    --ipc=host \
    --shm-size=16G \
    --group-add video \
    --cap-add=SYS_PTRACE \
    --security-opt seccomp=unconfined \
    -u root \
    --ulimit stack=-1:-1 \
    --ulimit memlock=-1:-1 \
    -v `pwd`:/home \
    -v /opt/hyhal:/opt/hyhal:ro \
    image.sourcefind.cn:5000/dcu/admin/base/custom:fastdeploy2.0.0-kylinv10-dtk25.04-py3.10 /bin/bash
```

## 2. Start the service

```bash
export FD_ATTENTION_BACKEND="BLOCK_ATTN"
python -m fastdeploy.entrypoints.openai.api_server \
    --model "/models/ERNIE-45-Turbo/ERNIE-4.5-300B-A47B-Paddle/" \
    --port 8188 \
    --tensor-parallel-size 8 \
    --quantization=wint8 \
    --gpu-memory-utilization=0.8
```

#### Send requests

You can send requests to the service over the OpenAI protocol, using either curl or Python:

```bash
curl -X POST "http://0.0.0.0:8188/v1/chat/completions" \
-H "Content-Type: application/json" \
-d '{
  "messages": [
    {"role": "user", "content": "Where is the capital of China?"}
  ]
}'
```

```python
import openai

ip = "0.0.0.0"
service_http_port = "8188"
client = openai.Client(base_url=f"http://{ip}:{service_http_port}/v1", api_key="EMPTY_API_KEY")

response = client.chat.completions.create(
    model="default",
    messages=[
        {"role": "user", "content": "Eliza's rate per hour for the first 40 hours she works each week is $10. She also receives an overtime pay of 1.2 times her regular hourly rate. If Eliza worked for 45 hours this week, how much are her earnings for this week?"},
    ],
    temperature=1,
    max_tokens=1024,
    stream=False,
)
print(response)
```

fastdeploy/model_executor/layers/attention/__init__.py

Lines changed: 3 additions & 1 deletion
@@ -20,9 +20,11 @@
 from .native_paddle_backend import PaddleNativeAttnBackend
 from .xpu_attn_backend import XPUAttentionBackend
 from .iluvatar_attn_backend import IluvatarAttnBackend
+from .block_multihead_attn_backend import BlockAttentionBackend
 
 __all__ = [
     "AttentionBackend", "PaddleNativeAttnBackend",
     "get_attention_backend", "AppendAttentionBackend", "XPUAttentionBackend",
-    "MLAAttentionBackend", "FlashAttentionBackend", "IluvatarAttnBackend"
+    "MLAAttentionBackend", "FlashAttentionBackend", "IluvatarAttnBackend",
+    "BlockAttentionBackend"
 ]
