
Commit c53497b

Merge branch 'PaddlePaddle:develop' into develop

2 parents a56d64e + 57b086d

File tree

108 files changed: 9107 additions, 700 deletions

.github/workflows/ci_xpu.yml

Lines changed: 83 additions & 0 deletions

@@ -0,0 +1,83 @@
+name: CI_XPU
+
+on:
+  pull_request:
+    branches: [ develop ]
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.event.pull_request.number }}-xpu-ci
+  cancel-in-progress: true
+
+jobs:
+  CI_XPU:
+    runs-on: [self-hosted, XPU-P800-8Card]
+    steps:
+      - name: Print current runner name
+        run: |
+          echo "Current runner name: ${{ runner.name }}"
+      # The runner's git is older than 2.23, so actions/checkout cannot be used.
+      # - name: Checkout code
+      #   uses: actions/checkout@v4
+
+      - name: Code Checkout
+        env:
+          docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.0.0
+        run: |
+          REPO="https://github.com/${{ github.repository }}.git"
+          FULL_REPO="${{ github.repository }}"
+          REPO_NAME="${FULL_REPO##*/}"
+          # Clean the repository directory before starting
+          docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
+            -e "REPO_NAME=${REPO_NAME}" \
+            ${docker_image} /bin/bash -c '
+            if [ -d ${REPO_NAME} ]; then
+              echo "Directory ${REPO_NAME} exists, removing it..."
+              rm -rf ${REPO_NAME}
+            fi
+          '
+          git config --global user.name "FastDeployCI"
+          git config --global user.email "fastdeploy_ci@example.com"
+          git clone ${REPO} ${REPO_NAME}
+          cd ${REPO_NAME}
+          if [ "${{ github.event_name }}" = "pull_request" ]; then
+            git fetch origin pull/${{ github.event.pull_request.number }}/head:pr/${{ github.event.pull_request.number }}
+            git merge pr/${{ github.event.pull_request.number }}
+            git log -n 3 --oneline
+          else
+            git checkout ${{ github.sha }}
+            git log -n 3 --oneline
+          fi
+
+      - name: Run CI unittest
+        env:
+          docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.0.0
+        run: |
+          runner_name="${{ runner.name }}"
+          last_char="${runner_name: -1}"
+
+          if [[ "$last_char" =~ [0-3] ]]; then
+            gpu_id="$last_char"
+          else
+            gpu_id="0"
+          fi
+          FD_API_PORT=$((9180 + gpu_id * 100))
+          FD_ENGINE_QUEUE_PORT=$((9150 + gpu_id * 100))
+          FD_METRICS_PORT=$((9170 + gpu_id * 100))
+
+          PARENT_DIR=$(dirname "$WORKSPACE")
+          echo "PARENT_DIR:$PARENT_DIR"
+          docker run --rm --net=host --cap-add=SYS_PTRACE --privileged --shm-size=64G \
+            -v $(pwd):/workspace -w /workspace \
+            -v "/ssd3:/ssd3" \
+            -e "MODEL_PATH=/ssd3/model" \
+            -e "http_proxy=$(git config --global --get http.proxy)" \
+            -e "https_proxy=$(git config --global --get https.proxy)" \
+            -e "FD_API_PORT=${FD_API_PORT}" \
+            -e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
+            -e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
+            ${docker_image} /bin/bash -c "
+            git config --global --add safe.directory /workspace/FastDeploy
+            cd FastDeploy
+            bash scripts/run_ci_xpu.sh
+            "

build.sh

Lines changed: 19 additions & 13 deletions

@@ -104,6 +104,23 @@ function copy_ops(){
     return
   fi

+  if_corex=`$python -c "import paddle; print(paddle.is_compiled_with_custom_device(\"iluvatar_gpu\"))"`
+  if [ "$if_corex" = "True" ]; then
+    DEVICE_TYPE="iluvatar-gpu"
+    cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
+    cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/iluvatar
+    echo -e "BASE and Iluvatar ops have been copied to fastdeploy"
+    return
+  fi
+
+  is_gcu=`$python -c "import paddle; print(paddle.is_compiled_with_custom_device('gcu'))"`
+  if [ "$is_gcu" = "True" ]; then
+    DEVICE_TYPE="gcu"
+    cp -r ${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/gcu
+    echo -e "gcu ops have been copied to fastdeploy"
+    return
+  fi
+
   DEVICE_TYPE="cpu"
   cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
   cd ../../../../
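The new branches extend copy_ops()'s existing probe pattern: ask the installed Paddle which device it was compiled for, then copy the matching ops directory. The same cascade written directly in Python (a sketch; only the device strings and the Paddle call come from the diff above):

import paddle

if paddle.is_compiled_with_custom_device("iluvatar_gpu"):
    device_type = "iluvatar-gpu"  # copy base + iluvatar ops
elif paddle.is_compiled_with_custom_device("gcu"):
    device_type = "gcu"           # copy gcu ops only
else:
    device_type = "cpu"           # fall through to the original branch
print(device_type)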
@@ -163,24 +180,13 @@ function build_and_install() {
     exit 1
   fi
   echo -e "${BLUE}[build]${NONE} ${GREEN}build fastdeploy wheel success${NONE}\n"
-
-  echo -e "${BLUE}[install]${NONE} installing fastdeploy..."
-  cd $DIST_DIR
-  find . -name "fastdeploy*.whl" | xargs ${python} -m pip install --force-reinstall --no-cache-dir
-  if [ $? -ne 0 ]; then
-    cd ..
-    echo -e "${RED}[FAIL]${NONE} install fastdeploy wheel failed"
-    exit 1
-  fi
-  echo -e "${BLUE}[install]${NONE} ${GREEN}fastdeploy install success${NONE}\n"
-  cd ..
 }

 function version_info() {
   output_file="fastdeploy/version.txt"
   fastdeploy_git_commit_id=$(git rev-parse HEAD)
   paddle_version=$(${python} -c "import paddle; print(paddle.__version__)")
-  paddle_git_commit_id=$(${python} -c "import paddle; print(paddle.version.show())" | grep -Po "(?<=commit: )[\da-f]+")
+  paddle_git_commit_id=$(${python} -c "import paddle; print(paddle.__git_commit__)")
   cuda_version=$(nvcc -V | grep -Po "(?<=release )[\d.]+(?=, V)")
   cxx_version=$(g++ --version | head -n 1 | grep -Po "(?<=\) )[\d.]+")
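The version_info() change swaps a fragile stdout scrape for a direct attribute read. Both approaches side by side in Python (illustrative only; it assumes a Paddle build exposing __git_commit__, which is exactly what the new line relies on):

import io
import re
from contextlib import redirect_stdout

import paddle

# New approach in this commit: a direct attribute.
commit_new = paddle.__git_commit__

# Old approach: paddle.version.show() prints its info, so the shell piped
# stdout through grep; the Python equivalent must capture stdout.
buf = io.StringIO()
with redirect_stdout(buf):
    paddle.version.show()
match = re.search(r"commit: ([0-9a-f]+)", buf.getvalue())
commit_old = match.group(1) if match else None

assert commit_new == commit_old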

@@ -246,7 +252,7 @@ if [ "$BUILD_WHEEL" -eq 1 ]; then
   echo -e "${GREEN}wheel saved under${NONE} ${RED}${BOLD}./dist${NONE}"

   # install wheel
-  ${python} -m pip install ./dist/fastdeploy*.whl
+  ${python} -m pip install ./dist/fastdeploy*.whl --force-reinstall --no-cache-dir
   echo -e "${GREEN}wheel install success${NONE}\n"

   trap : 0

custom_ops/gpu_ops/cpp_extensions.cc

Lines changed: 48 additions & 26 deletions

@@ -468,6 +468,30 @@ std::vector<paddle::Tensor> NoauxTc(
     int topk,
     float routed_scaling_factor);

+#ifdef ENABLE_FP8
+paddle::Tensor cutlass_fp8_fp8_half_gemm_func(
+    const paddle::Tensor& x,
+    const paddle::Tensor& y,
+    const paddle::optional<paddle::Tensor>& bias,
+    bool trans_x,
+    bool trans_y,
+    float scale,  // only support per-tensor quantization
+    std::string output_dtype,
+    std::string activation_type);
+
+paddle::Tensor MoeFusedHadamardQuantFp8Func(
+    const paddle::Tensor &input,
+    const paddle::Tensor &scale,
+    const paddle::Tensor &topk_ids,
+    const int top_k,
+    const int intermediate_size,
+    const bool tiled);
+
+paddle::Tensor FusedHadamardQuantFp8Func(
+    const paddle::Tensor &input,
+    const float scale);
+#endif
+
 PYBIND11_MODULE(fastdeploy_ops, m) {

   m.def("get_expert_token_num", &GetExpertTokenNum, py::arg("topk_ids"),
@@ -697,38 +721,21 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
         "text_image_gather_scatter function");

   m.def("count_tokens_per_expert_func", &count_tokens_per_expert_func);
+
   m.def("tritonmoe_preprocess_func", &tritonmoe_preprocess_kernel);

   m.def("MoeWna16MarlinGemmApi", &MoeWna16MarlinGemmApi,
-        py::arg("a"),
-        py::arg("c_or_none"),
-        py::arg("b_q_weight"),
-        py::arg("b_scales"),
-        py::arg("global_scale_or_none"),
-        py::arg("b_zeros_or_none"),
-        py::arg("g_idx_or_none"),
-        py::arg("perm_or_none"),
-        py::arg("workspace"),
-        py::arg("sorted_token_ids"),
-        py::arg("expert_ids"),
-        py::arg("num_tokens_post_padded"),
-        py::arg("topk_weights"),
-        py::arg("moe_block_size"),
-        py::arg("top_k"),
-        py::arg("mul_topk_weights"),
-        py::arg("is_ep"),
-        py::arg("b_q_type_str"),
-        py::arg("size_m"),
-        py::arg("size_n"),
-        py::arg("size_k"),
-        py::arg("is_k_full"),
-        py::arg("use_atomic_add"),
-        py::arg("use_fp32_reduce"),
-        py::arg("is_zp_float"));
+        py::arg("a"), py::arg("c_or_none"), py::arg("b_q_weight"),
+        py::arg("b_scales"), py::arg("global_scale_or_none"), py::arg("b_zeros_or_none"),
+        py::arg("g_idx_or_none"), py::arg("perm_or_none"), py::arg("workspace"), py::arg("sorted_token_ids"),
+        py::arg("expert_ids"), py::arg("num_tokens_post_padded"), py::arg("topk_weights"), py::arg("moe_block_size"),
+        py::arg("top_k"), py::arg("mul_topk_weights"), py::arg("is_ep"), py::arg("b_q_type_str"),
+        py::arg("size_m"), py::arg("size_n"), py::arg("size_k"), py::arg("is_k_full"), py::arg("use_atomic_add"),
+        py::arg("use_fp32_reduce"), py::arg("is_zp_float"));
+
   m.def("get_position_ids_and_mask_encoder_batch", &GetPositionIdsAndMaskEncoderBatch,
         "get_position_ids_and_mask_encoder_batch function");

-
   /**
    * cutlass_scaled_mm.cu
    * cutlass_scaled_mm
* cutlass_scaled_mm
@@ -753,6 +760,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
753760
m.def("dynamic_per_token_scaled_fp8_quant", &DynamicPerTokenScaledFp8Quant,
754761
"dynamic_per_token_scaled_fp8_quant function",
755762
py::arg("out"), py::arg("input"), py::arg("scales"), py::arg("scale_ub"));
763+
756764
m.def("decode_mla_write_cache", &DecodeMLAWriteCacheKernel, "decode_mla_write_cache function");
757765

758766
m.def("prefill_mla_write_cache", &PrefillMLAWriteCacheKernel, "prefill_mla_write_cache function");
@@ -762,4 +770,18 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
   m.def("multi_head_latent_attention", &MultiHeadLatentAttention, "multi_head_latent_attention function");

   m.def("noaux_tc", &NoauxTc, "noaux_tc for Deepseekv3 MoE compute");
+
+#ifdef ENABLE_FP8
+  m.def("cutlass_fp8_fp8_half_gemm_fused", &cutlass_fp8_fp8_half_gemm_func,
+        py::arg("x"), py::arg("y"), py::arg("bias"), py::arg("transpose_x"),
+        py::arg("transpose_y"), py::arg("scale"), py::arg("output_dtype"),
+        py::arg("activation_type"), "cutlass_fp8_fp8_half_gemm_fused function");
+
+  m.def("moe_fused_hadamard_quant_fp8", &MoeFusedHadamardQuantFp8Func,
+        py::arg("input"), py::arg("scale"), py::arg("topk_ids"),
+        py::arg("top_k"), py::arg("intermediate_size"), py::arg("tiled"), "moe_fused_hadamard_quant_fp8 function");
+
+  m.def("fused_hadamard_quant_fp8", &FusedHadamardQuantFp8Func,
+        py::arg("input"), py::arg("scale"), "fused_hadamard_quant_fp8 function");
+#endif
 }
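For orientation, the py::arg lists above define the Python-side keyword names of the new FP8 bindings. A hypothetical call into the module (the import path, shapes, and dtypes are my assumptions, not taken from this commit; the ops exist only in an ENABLE_FP8 build):

import paddle
from fastdeploy.model_executor.ops import gpu as ops  # assumed import path

act = paddle.randn([16, 128]).astype("bfloat16")      # placeholder activations
scale = paddle.ones([1], dtype="float32")             # per-tensor scale tensor
topk_ids = paddle.zeros([16, 8], dtype="int64")       # placeholder routing ids

# fused_hadamard_quant_fp8(input, scale): scale is a plain float here.
q = ops.fused_hadamard_quant_fp8(input=act, scale=0.5)

# moe_fused_hadamard_quant_fp8 takes the routing ids plus MoE geometry.
mq = ops.moe_fused_hadamard_quant_fp8(
    input=act, scale=scale, topk_ids=topk_ids,
    top_k=8, intermediate_size=128, tiled=True)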

custom_ops/gpu_ops/fp8_gemm_with_cutlass/fp8_fp8_half_gemm.cu

Lines changed: 17 additions & 3 deletions

@@ -19,7 +19,7 @@
 #include "fp8_fp8_half_cuda_core_gemm.h"


-std::vector<paddle::Tensor> cutlass_fp8_fp8_half_gemm(
+paddle::Tensor cutlass_fp8_fp8_half_gemm_func(
     const paddle::Tensor& x,
     const paddle::Tensor& y,
     const paddle::optional<paddle::Tensor>& bias,
@@ -142,7 +142,7 @@ std::vector<paddle::Tensor> cutlass_fp8_fp8_half_gemm(
     {
       if(output_dtype == "bfloat16") {
         cuda_core_gemm_launcher<__nv_fp8_e4m3, __nv_bfloat16>(params);
-
+
       } else {
         cuda_core_gemm_launcher<__nv_fp8_e4m3, half>(params);
       }
@@ -174,7 +174,21 @@ std::vector<paddle::Tensor> cutlass_fp8_fp8_half_gemm(
       fuse_gemm_config};
     fp8_fp8_gemm_scale_bias_act(params);
   }
-  return {out};
+  return out;
+}
+
+std::vector<paddle::Tensor> cutlass_fp8_fp8_half_gemm(
+    const paddle::Tensor& x,
+    const paddle::Tensor& y,
+    const paddle::optional<paddle::Tensor>& bias,
+    bool trans_x,
+    bool trans_y,
+    float scale,  // only support per-tensor quantization
+    std::string output_dtype,
+    std::string activation_type) {
+  return {cutlass_fp8_fp8_half_gemm_func(
+      x, y, bias, trans_x, trans_y, scale,
+      output_dtype, activation_type)};
 }

 std::vector<std::vector<int64_t>> CutlassFp8Fp8HalfGemmFusedInferShape(
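The refactor splits the op into a tensor-returning core (cutlass_fp8_fp8_half_gemm_func, callable from other C++ code and bound directly to Python above) and a thin wrapper that keeps the std::vector return the Paddle op registry expects. A hypothetical Python call against the new binding, reusing the assumed ops module from the earlier sketch (keyword names come from the py::arg list; x_fp8 and w_fp8 stand for FP8 e4m3 tensors, and the activation value is a placeholder since valid strings are not shown in this diff):

out = ops.cutlass_fp8_fp8_half_gemm_fused(
    x=x_fp8, y=w_fp8, bias=None,
    transpose_x=False, transpose_y=True,
    scale=1.0,                    # per-tensor quantization scale
    output_dtype="bfloat16",
    activation_type="identity")   # placeholder value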
