
Commit c53497b

Merge branch 'PaddlePaddle:develop' into develop

2 parents a56d64e + 57b086d

File tree

108 files changed: 9107 additions, 700 deletions

.github/workflows/ci_xpu.yml

Lines changed: 83 additions & 0 deletions

@@ -0,0 +1,83 @@
+name: CI_XPU
+
+on:
+  pull_request:
+    branches: [ develop ]
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.event.pull_request.number }}-xpu-ci
+  cancel-in-progress: true
+
+jobs:
+  CI_XPU:
+    runs-on: [self-hosted, XPU-P800-8Card]
+    steps:
+      - name: Print current runner name
+        run: |
+          echo "Current runner name: ${{ runner.name }}"
+      # The runner's git is older than 2.23, so actions/checkout cannot be used.
+      # - name: Checkout code
+      #   uses: actions/checkout@v4
+
+      - name: Code Checkout
+        env:
+          docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.0.0
+        run: |
+          REPO="https://github.com/${{ github.repository }}.git"
+          FULL_REPO="${{ github.repository }}"
+          REPO_NAME="${FULL_REPO##*/}"
+          # Clean the repository directory before starting
+          docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
+            -e "REPO_NAME=${REPO_NAME}" \
+            ${docker_image} /bin/bash -c '
+            if [ -d ${REPO_NAME} ]; then
+              echo "Directory ${REPO_NAME} exists, removing it..."
+              rm -rf ${REPO_NAME}
+            fi
+          '
+          git config --global user.name "FastDeployCI"
+          git config --global user.email "fastdeploy_ci@example.com"
+          git clone ${REPO} ${REPO_NAME}
+          cd ${REPO_NAME}
+          if [ "${{ github.event_name }}" = "pull_request" ]; then
+            git fetch origin pull/${{ github.event.pull_request.number }}/head:pr/${{ github.event.pull_request.number }}
+            git merge pr/${{ github.event.pull_request.number }}
+            git log -n 3 --oneline
+          else
+            git checkout ${{ github.sha }}
+            git log -n 3 --oneline
+          fi
+
+      - name: Run CI unittest
+        env:
+          docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.0.0
+        run: |
+          runner_name="${{ runner.name }}"
+          last_char="${runner_name: -1}"
+
+          if [[ "$last_char" =~ [0-3] ]]; then
+            gpu_id="$last_char"
+          else
+            gpu_id="0"
+          fi
+          FD_API_PORT=$((9180 + gpu_id * 100))
+          FD_ENGINE_QUEUE_PORT=$((9150 + gpu_id * 100))
+          FD_METRICS_PORT=$((9170 + gpu_id * 100))
+
+          PARENT_DIR=$(dirname "$WORKSPACE")
+          echo "PARENT_DIR:$PARENT_DIR"
+          docker run --rm --net=host --cap-add=SYS_PTRACE --privileged --shm-size=64G \
+            -v $(pwd):/workspace -w /workspace \
+            -v "/ssd3:/ssd3" \
+            -e "MODEL_PATH=/ssd3/model" \
+            -e "http_proxy=$(git config --global --get http.proxy)" \
+            -e "https_proxy=$(git config --global --get https.proxy)" \
+            -e "FD_API_PORT=${FD_API_PORT}" \
+            -e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
+            -e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
+            ${docker_image} /bin/bash -c "
+            git config --global --add safe.directory /workspace/FastDeploy
+            cd FastDeploy
+            bash scripts/run_ci_xpu.sh
+            "

build.sh

Lines changed: 19 additions & 13 deletions

@@ -104,6 +104,23 @@ function copy_ops(){
     return
   fi

+  if_corex=`$python -c "import paddle; print(paddle.is_compiled_with_custom_device(\"iluvatar_gpu\"))"`
+  if [ "$if_corex" = "True" ]; then
+    DEVICE_TYPE="iluvatar-gpu"
+    cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
+    cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/iluvatar
+    echo -e "BASE and Iluvatar ops have been copied to fastdeploy"
+    return
+  fi
+
+  is_gcu=`$python -c "import paddle; print(paddle.is_compiled_with_custom_device('gcu'))"`
+  if [ "$is_gcu" = "True" ]; then
+    DEVICE_TYPE="gcu"
+    cp -r ${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/gcu
+    echo -e "gcu ops have been copied to fastdeploy"
+    return
+  fi
+
   DEVICE_TYPE="cpu"
   cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
   cd ../../../../
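The new branches extend copy_ops()'s existing probe pattern: ask the installed Paddle which device it was compiled for, then copy the matching ops directory. The same cascade written directly in Python (a sketch; only the device strings and the Paddle call come from the diff above):

import paddle

if paddle.is_compiled_with_custom_device("iluvatar_gpu"):
    device_type = "iluvatar-gpu"  # copy base + iluvatar ops
elif paddle.is_compiled_with_custom_device("gcu"):
    device_type = "gcu"           # copy gcu ops only
else:
    device_type = "cpu"           # fall through to the original branch
print(device_type)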
@@ -163,24 +180,13 @@ function build_and_install() {
     exit 1
   fi
   echo -e "${BLUE}[build]${NONE} ${GREEN}build fastdeploy wheel success${NONE}\n"
-
-  echo -e "${BLUE}[install]${NONE} installing fastdeploy..."
-  cd $DIST_DIR
-  find . -name "fastdeploy*.whl" | xargs ${python} -m pip install --force-reinstall --no-cache-dir
-  if [ $? -ne 0 ]; then
-    cd ..
-    echo -e "${RED}[FAIL]${NONE} install fastdeploy wheel failed"
-    exit 1
-  fi
-  echo -e "${BLUE}[install]${NONE} ${GREEN}fastdeploy install success${NONE}\n"
-  cd ..
 }

 function version_info() {
   output_file="fastdeploy/version.txt"
   fastdeploy_git_commit_id=$(git rev-parse HEAD)
   paddle_version=$(${python} -c "import paddle; print(paddle.__version__)")
-  paddle_git_commit_id=$(${python} -c "import paddle; print(paddle.version.show())" | grep -Po "(?<=commit: )[\da-f]+")
+  paddle_git_commit_id=$(${python} -c "import paddle; print(paddle.__git_commit__)")
   cuda_version=$(nvcc -V | grep -Po "(?<=release )[\d.]+(?=, V)")
   cxx_version=$(g++ --version | head -n 1 | grep -Po "(?<=\) )[\d.]+")
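The version_info() change swaps a fragile stdout scrape for a direct attribute read. Both approaches side by side in Python (illustrative only; it assumes a Paddle build exposing __git_commit__, which is exactly what the new line relies on):

import io
import re
from contextlib import redirect_stdout

import paddle

# New approach in this commit: a direct attribute.
commit_new = paddle.__git_commit__

# Old approach: paddle.version.show() prints its info, so the shell piped
# stdout through grep; the Python equivalent must capture stdout.
buf = io.StringIO()
with redirect_stdout(buf):
    paddle.version.show()
match = re.search(r"commit: ([0-9a-f]+)", buf.getvalue())
commit_old = match.group(1) if match else None

assert commit_new == commit_old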

@@ -246,7 +252,7 @@ if [ "$BUILD_WHEEL" -eq 1 ]; then
   echo -e "${GREEN}wheel saved under${NONE} ${RED}${BOLD}./dist${NONE}"

   # install wheel
-  ${python} -m pip install ./dist/fastdeploy*.whl
+  ${python} -m pip install ./dist/fastdeploy*.whl --force-reinstall --no-cache-dir
   echo -e "${GREEN}wheel install success${NONE}\n"

   trap : 0

custom_ops/gpu_ops/cpp_extensions.cc

Lines changed: 48 additions & 26 deletions

@@ -468,6 +468,30 @@ std::vector<paddle::Tensor> NoauxTc(
     int topk,
     float routed_scaling_factor);

+#ifdef ENABLE_FP8
+paddle::Tensor cutlass_fp8_fp8_half_gemm_func(
+    const paddle::Tensor& x,
+    const paddle::Tensor& y,
+    const paddle::optional<paddle::Tensor>& bias,
+    bool trans_x,
+    bool trans_y,
+    float scale,  // only support per-tensor quantization
+    std::string output_dtype,
+    std::string activation_type);
+
+paddle::Tensor MoeFusedHadamardQuantFp8Func(
+    const paddle::Tensor &input,
+    const paddle::Tensor &scale,
+    const paddle::Tensor &topk_ids,
+    const int top_k,
+    const int intermediate_size,
+    const bool tiled);
+
+paddle::Tensor FusedHadamardQuantFp8Func(
+    const paddle::Tensor &input,
+    const float scale);
+#endif
+
 PYBIND11_MODULE(fastdeploy_ops, m) {

   m.def("get_expert_token_num", &GetExpertTokenNum, py::arg("topk_ids"),
@@ -697,38 +721,21 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
         "text_image_gather_scatter function");

   m.def("count_tokens_per_expert_func", &count_tokens_per_expert_func);
+
   m.def("tritonmoe_preprocess_func", &tritonmoe_preprocess_kernel);

   m.def("MoeWna16MarlinGemmApi", &MoeWna16MarlinGemmApi,
-        py::arg("a"),
-        py::arg("c_or_none"),
-        py::arg("b_q_weight"),
-        py::arg("b_scales"),
-        py::arg("global_scale_or_none"),
-        py::arg("b_zeros_or_none"),
-        py::arg("g_idx_or_none"),
-        py::arg("perm_or_none"),
-        py::arg("workspace"),
-        py::arg("sorted_token_ids"),
-        py::arg("expert_ids"),
-        py::arg("num_tokens_post_padded"),
-        py::arg("topk_weights"),
-        py::arg("moe_block_size"),
-        py::arg("top_k"),
-        py::arg("mul_topk_weights"),
-        py::arg("is_ep"),
-        py::arg("b_q_type_str"),
-        py::arg("size_m"),
-        py::arg("size_n"),
-        py::arg("size_k"),
-        py::arg("is_k_full"),
-        py::arg("use_atomic_add"),
-        py::arg("use_fp32_reduce"),
-        py::arg("is_zp_float"));
+        py::arg("a"), py::arg("c_or_none"), py::arg("b_q_weight"),
+        py::arg("b_scales"), py::arg("global_scale_or_none"), py::arg("b_zeros_or_none"),
+        py::arg("g_idx_or_none"), py::arg("perm_or_none"), py::arg("workspace"), py::arg("sorted_token_ids"),
+        py::arg("expert_ids"), py::arg("num_tokens_post_padded"), py::arg("topk_weights"), py::arg("moe_block_size"),
+        py::arg("top_k"), py::arg("mul_topk_weights"), py::arg("is_ep"), py::arg("b_q_type_str"),
+        py::arg("size_m"), py::arg("size_n"), py::arg("size_k"), py::arg("is_k_full"), py::arg("use_atomic_add"),
+        py::arg("use_fp32_reduce"), py::arg("is_zp_float"));
+
   m.def("get_position_ids_and_mask_encoder_batch", &GetPositionIdsAndMaskEncoderBatch,
         "get_position_ids_and_mask_encoder_batch function");

-
   /**
    * cutlass_scaled_mm.cu
    * cutlass_scaled_mm
* cutlass_scaled_mm
@@ -753,6 +760,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
753760
m.def("dynamic_per_token_scaled_fp8_quant", &DynamicPerTokenScaledFp8Quant,
754761
"dynamic_per_token_scaled_fp8_quant function",
755762
py::arg("out"), py::arg("input"), py::arg("scales"), py::arg("scale_ub"));
763+
756764
m.def("decode_mla_write_cache", &DecodeMLAWriteCacheKernel, "decode_mla_write_cache function");
757765

758766
m.def("prefill_mla_write_cache", &PrefillMLAWriteCacheKernel, "prefill_mla_write_cache function");
@@ -762,4 +770,18 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
   m.def("multi_head_latent_attention", &MultiHeadLatentAttention, "multi_head_latent_attention function");

   m.def("noaux_tc", &NoauxTc, "noaux_tc for Deepseekv3 MoE compute");
+
+#ifdef ENABLE_FP8
+  m.def("cutlass_fp8_fp8_half_gemm_fused", &cutlass_fp8_fp8_half_gemm_func,
+        py::arg("x"), py::arg("y"), py::arg("bias"), py::arg("transpose_x"),
+        py::arg("transpose_y"), py::arg("scale"), py::arg("output_dtype"),
+        py::arg("activation_type"), "cutlass_fp8_fp8_half_gemm_fused function");
+
+  m.def("moe_fused_hadamard_quant_fp8", &MoeFusedHadamardQuantFp8Func,
+        py::arg("input"), py::arg("scale"), py::arg("topk_ids"),
+        py::arg("top_k"), py::arg("intermediate_size"), py::arg("tiled"), "moe_fused_hadamard_quant_fp8 function");
+
+  m.def("fused_hadamard_quant_fp8", &FusedHadamardQuantFp8Func,
+        py::arg("input"), py::arg("scale"), "fused_hadamard_quant_fp8 function");
+#endif
 }
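For orientation, the py::arg lists above define the Python-side keyword names of the new FP8 bindings. A hypothetical call into the module (the import path, shapes, and dtypes are my assumptions, not taken from this commit; the ops exist only in an ENABLE_FP8 build):

import paddle
from fastdeploy.model_executor.ops import gpu as ops  # assumed import path

act = paddle.randn([16, 128]).astype("bfloat16")      # placeholder activations
scale = paddle.ones([1], dtype="float32")             # per-tensor scale tensor
topk_ids = paddle.zeros([16, 8], dtype="int64")       # placeholder routing ids

# fused_hadamard_quant_fp8(input, scale): scale is a plain float here.
q = ops.fused_hadamard_quant_fp8(input=act, scale=0.5)

# moe_fused_hadamard_quant_fp8 takes the routing ids plus MoE geometry.
mq = ops.moe_fused_hadamard_quant_fp8(
    input=act, scale=scale, topk_ids=topk_ids,
    top_k=8, intermediate_size=128, tiled=True)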

custom_ops/gpu_ops/fp8_gemm_with_cutlass/fp8_fp8_half_gemm.cu

Lines changed: 17 additions & 3 deletions

@@ -19,7 +19,7 @@
 #include "fp8_fp8_half_cuda_core_gemm.h"


-std::vector<paddle::Tensor> cutlass_fp8_fp8_half_gemm(
+paddle::Tensor cutlass_fp8_fp8_half_gemm_func(
     const paddle::Tensor& x,
     const paddle::Tensor& y,
     const paddle::optional<paddle::Tensor>& bias,
@@ -142,7 +142,7 @@ std::vector<paddle::Tensor> cutlass_fp8_fp8_half_gemm(
     {
       if(output_dtype == "bfloat16") {
         cuda_core_gemm_launcher<__nv_fp8_e4m3, __nv_bfloat16>(params);
-
+
       } else {
         cuda_core_gemm_launcher<__nv_fp8_e4m3, half>(params);
       }
@@ -174,7 +174,21 @@ std::vector<paddle::Tensor> cutlass_fp8_fp8_half_gemm(
       fuse_gemm_config};
     fp8_fp8_gemm_scale_bias_act(params);
   }
-  return {out};
+  return out;
+}
+
+std::vector<paddle::Tensor> cutlass_fp8_fp8_half_gemm(
+    const paddle::Tensor& x,
+    const paddle::Tensor& y,
+    const paddle::optional<paddle::Tensor>& bias,
+    bool trans_x,
+    bool trans_y,
+    float scale,  // only support per-tensor quantization
+    std::string output_dtype,
+    std::string activation_type) {
+  return {cutlass_fp8_fp8_half_gemm_func(
+      x, y, bias, trans_x, trans_y, scale,
+      output_dtype, activation_type)};
 }

 std::vector<std::vector<int64_t>> CutlassFp8Fp8HalfGemmFusedInferShape(
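The refactor splits the op into a tensor-returning core (cutlass_fp8_fp8_half_gemm_func, callable from other C++ code and bound directly to Python above) and a thin wrapper that keeps the std::vector return the Paddle op registry expects. A hypothetical Python call against the new binding, reusing the assumed ops module from the earlier sketch (keyword names come from the py::arg list; x_fp8 and w_fp8 stand for FP8 e4m3 tensors, and the activation value is a placeholder since valid strings are not shown in this diff):

out = ops.cutlass_fp8_fp8_half_gemm_fused(
    x=x_fp8, y=w_fp8, bias=None,
    transpose_x=False, transpose_y=True,
    scale=1.0,                    # per-tensor quantization scale
    output_dtype="bfloat16",
    activation_type="identity")   # placeholder value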
