PaddlePaddle
diff --git a/‎benchmarks/README.md
Lines changed: 27 additions & 0 deletions b/‎benchmarks/README.md
Lines changed: 27 additions & 0 deletions
diff --git a/‎benchmarks/benchmark_mtp.py
Lines changed: 191 additions & 0 deletions b/‎benchmarks/benchmark_mtp.py
Lines changed: 191 additions & 0 deletions
diff --git a/‎build.sh
Lines changed: 28 additions & 14 deletions b/‎build.sh
Lines changed: 28 additions & 14 deletions
@@ -105,3 +105,30 @@ python benchmark_serving.py \
   --save-result > infer_log.txt 2>&1 &
 ```
 
+### 投机解码性能测试工具
+
+#### 使用方式：
+
+```bash
+python benchmarks/benchmark_mtp.py \
+  --host 127.0.0.1 --port 8000 \
+  --max-concurrency 16 32 64 96 --num-prompts 256 \
+  --acceptance-rate 0.8 --draft-token-steps 1 2 3 \
+  --s_itl-base-model 15.88 22.84 16.47 16.93 \
+  --dataset-name EBChat \
+  --dataset-path ./filtered_sharedgpt_2000_input_1136_output_200_fd.json
+```
+
+#### 参数说明
+
+```text
+--host：服务ip地址，用于组url
+--port：服务HTTP端口，用于组url
+--max-concurrency：测试并发数
+--num-prompts：总计发送多少条请求
+--acceptance-rate：投机解码的模拟接受率
+--draft-token-steps：投机解码的步数
+--s_itl-base-model：主模型的解码延迟（单位：ms），可由上述的性能压测工具获得，需与--max-concurrency的取值一一对应（数量相同、顺序一致）
+--dataset-name：指定数据集类，指定为"EBChat"可读取转存的FD格式数据集
+--dataset-path：测试数据集路径
+```
@@ -0,0 +1,191 @@
+"""
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+import argparse
+import asyncio
+import contextlib
+import os
+import signal
+import socket
+import subprocess
+import time
+from typing import Union
+
+import openai
+import yaml
+from benchmark_dataset import EBChatDataset, EBDataset, SampleRequest
+from benchmark_serving import benchmark
+
+
+def prepare_input_requests(
+    num_prompts: int, dataset_name: str, dataset_path: str
+) -> Union[EBDataset, EBChatDataset]:
+    """Load the named dataset and sample ``num_prompts`` requests from it.
+
+    Args:
+        num_prompts: Number of requests to sample (forwarded as
+            ``num_requests`` to the dataset's ``sample`` method).
+        dataset_name: Dataset flavour, either ``"EB"`` or ``"EBChat"``.
+        dataset_path: Path handed to the dataset constructor.
+
+    Returns:
+        Whatever the selected dataset's ``sample`` method returns.
+        NOTE(review): the annotation names the dataset classes, but the value
+        actually comes from ``.sample(...)`` -- confirm the real return type.
+
+    Raises:
+        ValueError: If ``dataset_name`` is not one of the supported names.
+    """
+    # Builders are lazy so only the requested dataset is ever constructed.
+    dataset_builders = {
+        "EB": lambda: EBDataset(dataset_path=dataset_path).sample(
+            num_requests=num_prompts
+        ),
+        "EBChat": lambda: EBChatDataset(dataset_path=dataset_path).sample(
+            num_requests=num_prompts
+        ),
+    }
+
+    # Keep the try body to the lookup alone: a KeyError raised while loading or
+    # sampling the dataset must not be misreported as an unknown dataset name.
+    try:
+        build_requests = dataset_builders[dataset_name]
+    except KeyError as err:
+        raise ValueError(f"Unknown dataset: {dataset_name}") from err
+
+    return build_requests()
+
+
+class FakeTokenizer:
+    """Placeholder tokenizer whose ``encode`` never produces any tokens."""
+
+    def encode(self, text: str, add_special_tokens: bool = False):
+        """Ignore both arguments and return a new empty list."""
+        return list()
+
+
+def send_one_batch(base_url, max_concurrency, input_requests, disable_tqdm):
+    """Run one ``benchmark`` pass and return its mean ``s_itl`` in milliseconds.
+
+    Args:
+        base_url: Service root URL; ``/v1/chat/completions`` is appended to
+            form the ``api_url`` passed to ``benchmark``.
+        max_concurrency: Concurrency limit forwarded to ``benchmark``.
+        input_requests: Requests forwarded to ``benchmark``.
+        disable_tqdm: Forwarded to ``benchmark`` unchanged.
+
+    Returns:
+        A dict with the single key ``"mean_s_itl_ms"`` copied from the
+        benchmark results.
+    """
+    benchmark_kwargs = dict(
+        backend="openai-chat",
+        api_url=f"{base_url}/v1/chat/completions",
+        base_url=base_url,
+        model_id="default",
+        model_name="default",
+        input_requests=input_requests,
+        hyper_parameters={},
+        logprobs=None,
+        # An infinite request rate leaves max_concurrency as the only throttle
+        # set here.
+        request_rate=float("inf"),
+        burstiness=1.0,
+        disable_tqdm=disable_tqdm,
+        profile=False,
+        # Only the s_itl metric is requested, with no percentiles.
+        selected_percentile_metrics=["s_itl"],
+        selected_percentiles=[],
+        ignore_eos=False,
+        goodput_config_dict=None,
+        max_concurrency=max_concurrency,
+        lora_modules=None,
+        extra_body=None,
+    )
+    outcome = asyncio.run(benchmark(**benchmark_kwargs))
+
+    return {"mean_s_itl_ms": outcome["mean_s_itl_ms"]}
+
+
+def calculate_speedup(acceptance_rate, draft_token_step, t_ori, t_mtp):
+    """Estimate the speed-up for a given acceptance rate and draft length.
+
+    With ``s = sum(acceptance_rate ** k for k in 1..draft_token_step)`` and
+    ``r_ac = s / (1 + s)``, the result is ``t_ori / ((1 - r_ac) * t_mtp)``.
+
+    Args:
+        acceptance_rate: Simulated per-step acceptance rate.
+        draft_token_step: Number of draft steps; ``0`` gives ``t_ori / t_mtp``.
+        t_ori: Reference latency (the caller passes the base-model s_itl).
+        t_mtp: Measured latency (the caller passes the benchmarked mean s_itl).
+
+    Returns:
+        The estimated speed-up ratio.
+    """
+    accepted_sum = 0.0
+    # Accumulate in ascending exponent order to keep float results unchanged.
+    for exponent in range(1, draft_token_step + 1):
+        accepted_sum += acceptance_rate**exponent
+
+    accepted_ratio = accepted_sum / (1 + accepted_sum)
+    effective_latency = (1 - accepted_ratio) * t_mtp
+
+    return t_ori / effective_latency
+
+
+def main(args):
+    """Benchmark each concurrency level and print the estimated speed-ups.
+
+    For every ``(max_concurrency, s_itl)`` pair, a silent warmup batch is
+    sent, then the full request set is benchmarked, and one speed-up line is
+    printed per entry in ``args.draft_token_steps``.
+
+    Args:
+        args: Parsed CLI namespace providing ``host``, ``port``,
+            ``num_prompts``, ``dataset_name``, ``dataset_path``,
+            ``max_concurrency``, ``s_itl_base_model``, ``acceptance_rate``
+            and ``draft_token_steps``.
+
+    Raises:
+        ValueError: If ``--s_itl-base-model`` is missing or its length differs
+            from ``--max-concurrency``.
+    """
+    # Validate cheap CLI invariants first, before loading the dataset or
+    # touching the network.
+    if args.s_itl_base_model is None:
+        raise ValueError(
+            "--s_itl-base-model is required: pass one base-model latency per "
+            "--max-concurrency value"
+        )
+    if len(args.max_concurrency) != len(args.s_itl_base_model):
+        raise ValueError(
+            "--max-concurrency should be same length as --s_itl-base-model"
+        )
+
+    base_url = f"http://{args.host}:{args.port}"
+
+    input_requests = prepare_input_requests(
+        args.num_prompts, args.dataset_name, args.dataset_path
+    )
+
+    for max_concurrency, s_itl in zip(args.max_concurrency, args.s_itl_base_model):
+        # Warmup: send one request per concurrency slot and discard whatever
+        # the benchmark prints to stdout.
+        print("Starting warmup...")
+        with open(os.devnull, "w") as devnull, contextlib.redirect_stdout(devnull):
+            send_one_batch(
+                base_url, max_concurrency, input_requests[0:max_concurrency], True
+            )
+
+        # Benchmark: the measured run over the full request set.
+        record = send_one_batch(base_url, max_concurrency, input_requests, False)
+
+        print("{s:{c}^{n}}".format(s="Speed up", n=50, c="-"))
+        for draft_token_step in args.draft_token_steps:
+            speedup = calculate_speedup(
+                args.acceptance_rate,
+                draft_token_step,
+                s_itl,
+                record["mean_s_itl_ms"],
+            )
+            print(
+                "{:<40} {:<10.2f}".format(
+                    f"Speed up on {draft_token_step} steps draft", speedup
+                )
+            )
+        print("=" * 50)
+
+
+if __name__ == "__main__":
+    # Command-line entry point: parse the benchmark options and run main().
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--host",
+        type=str,
+        default="127.0.0.1",
+        help="Service IP address, used to build the request URL.",
+    )
+    parser.add_argument(
+        "--port",
+        type=str,
+        default="8000",
+        help="Service HTTP port, used to build the request URL.",
+    )
+    parser.add_argument(
+        "--max-concurrency",
+        type=int,
+        nargs="+",
+        default=(1, 2, 4, 8, 16, 32),
+        help="Concurrency levels to benchmark, one run per value.",
+    )
+    parser.add_argument(
+        "--num-prompts",
+        type=int,
+        default=128,
+        help="Total number of requests to send per benchmark run.",
+    )
+    parser.add_argument(
+        "--acceptance-rate",
+        type=float,
+        default=0.8,
+        help="Simulated acceptance rate of speculative decoding.",
+    )
+    parser.add_argument(
+        "--draft-token-steps",
+        type=int,
+        nargs="+",
+        default=(1, 2),
+        help="Draft step counts to report a speed-up for.",
+    )
+    # Required: main() pairs these values one-to-one with --max-concurrency,
+    # so omitting them would otherwise only fail later with a TypeError.
+    parser.add_argument(
+        "--s_itl-base-model",
+        type=float,
+        nargs="+",
+        required=True,
+        help=(
+            "Base-model decoding latency in ms, one value per "
+            "--max-concurrency entry, in the same order."
+        ),
+    )
+    parser.add_argument(
+        "--dataset-name",
+        type=str,
+        default="EBChat",
+        help='Dataset class to use: "EB" or "EBChat".',
+    )
+    parser.add_argument(
+        "--dataset-path",
+        type=str,
+        help="Path to the benchmark dataset file.",
+    )
+    args = parser.parse_args()
+
+    main(args)
@@ -18,6 +18,9 @@ BUILD_WHEEL=${1:-1}
 PYTHON_VERSION=${2:-"python"}
 export python=$PYTHON_VERSION
 FD_CPU_USE_BF16=${3:-"false"}
+# FD_BUILDING_ARCS: Specify target CUDA architectures for custom ops, e.g., "[80, 90, 100]".
+# For SM90 (Hopper), use 90. For SM100 (Blackwell), use 100.
+# These will be translated to 90a / 100a in setup_ops.py for specific features.
 FD_BUILDING_ARCS=${4:-""}
 
 
@@ -74,8 +77,10 @@ function copy_ops(){
     is_rocm=`$python -c "import paddle; print(paddle.is_compiled_with_rocm())"`
     if [ "$is_rocm" = "True" ]; then
       DEVICE_TYPE="rocm"
+      mkdir -p ../fastdeploy/model_executor/ops/base
+      cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
       cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/gpu
-      echo -e "ROCM ops have been copy to fastdeploy"
+      echo -e "BASE and ROCM ops have been copy to fastdeploy"
       return
     fi
     mkdir -p ../fastdeploy/model_executor/ops/base
@@ -104,6 +109,23 @@ function copy_ops(){
       return
     fi
 
+    if_corex=`$python -c "import paddle; print(paddle.is_compiled_with_custom_device(\"iluvatar_gpu\"))"`
+    if [ "$if_corex" = "True" ]; then
+      DEVICE_TYPE="iluvatar-gpu"
+      cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
+      cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/iluvatar
+      echo -e "BASE and Iluvatar ops have been copy to fastdeploy"
+      return
+    fi
+
+    is_gcu=`$python -c "import paddle; print(paddle.is_compiled_with_custom_device('gcu'))"`
+    if [ "$is_gcu" = "True" ]; then
+      DEVICE_TYPE="gcu"
+      cp -r ${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/gcu
+      echo -e "gcu ops have been copy to fastdeploy"
+      return
+    fi
+
     DEVICE_TYPE="cpu"
     cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
     cd ../../../../
@@ -163,25 +185,17 @@ function build_and_install() {
     exit 1
   fi
   echo -e "${BLUE}[build]${NONE} ${GREEN}build fastdeploy wheel success${NONE}\n"
-
-  echo -e "${BLUE}[install]${NONE} installing fastdeploy..."
-  cd $DIST_DIR
-  find . -name "fastdeploy*.whl" | xargs ${python} -m pip install --force-reinstall --no-cache-dir
-  if [ $? -ne 0 ]; then
-    cd ..
-    echo -e "${RED}[FAIL]${NONE} install fastdeploy wheel failed"
-    exit 1
-  fi
-  echo -e "${BLUE}[install]${NONE} ${GREEN}fastdeploy install success${NONE}\n"
-  cd ..
 }
 
 function version_info() {
   output_file="fastdeploy/version.txt"
   fastdeploy_git_commit_id=$(git rev-parse HEAD)
   paddle_version=$(${python} -c "import paddle; print(paddle.__version__)")
   paddle_git_commit_id=$(${python} -c "import paddle; print(paddle.__git_commit__)")
-  cuda_version=$(nvcc -V | grep -Po "(?<=release )[\d.]+(?=, V)")
+  cuda_version="nvcc-not-installed"
+  if command -v nvcc &> /dev/null; then
+    cuda_version=$(nvcc -V | grep -Po "(?<=release )[\d.]+(?=, V)")
+  fi
   cxx_version=$(g++ --version | head -n 1 | grep -Po "(?<=\) )[\d.]+")
 
   echo "fastdeploy GIT COMMIT ID: $fastdeploy_git_commit_id" > $output_file
@@ -246,7 +260,7 @@ if [ "$BUILD_WHEEL" -eq 1 ]; then
   echo -e "${GREEN}wheel saved under${NONE} ${RED}${BOLD}./dist${NONE}"
 
   # install wheel
-  ${python} -m pip install ./dist/fastdeploy*.whl
+  ${python} -m pip install ./dist/fastdeploy*.whl --force-reinstall --no-cache-dir
   echo -e "${GREEN}wheel install success${NONE}\n"
 
   trap : 0