
Commit 2e8f219
Merge branch 'develop' into mm_structred_output
2 parents: 75f5150 + 4e92d04

8 files changed: 50 additions & 72 deletions


fastdeploy/engine/engine.py

Lines changed: 2 additions & 2 deletions

@@ -946,8 +946,8 @@ def _exit_sub_services(self):

     def _setting_environ_variables(self):
         """
-        Configure environment variables
-        """
+        Configure environment variables
+        """
         variables = {
             "PADDLE_TRAINER_ID": 0,
             "PADDLE_TRAINERS_NUM": 1,

fastdeploy/import_ops.py

Lines changed: 17 additions & 16 deletions

@@ -15,7 +15,6 @@
 import functools
 import importlib
 import inspect
-import os

 import paddle

@@ -77,7 +76,13 @@ def wrap_unified_op(original_cpp_ext_op, original_custom_op):
     @functools.wraps(original_custom_op)
     def unified_op(*args, **kwargs):
         if paddle.in_dynamic_mode():
-            return original_cpp_ext_op(*args, **kwargs)
+            res = original_cpp_ext_op(*args, **kwargs)
+            if res is None:
+                return None
+            # TODO(DrRyanHuang): Remove this if when we align the implementation of custom op and C++ extension
+            if isinstance(res, list) and len(res) == 1:
+                return res[0]
+            return res
         return original_custom_op(*args, **kwargs)

     return unified_op

@@ -93,17 +98,13 @@ def preprocess_static_op(global_ns):
     """
     static_op_prefix = "static_op_"
     static_op_names = [k for k in global_ns if k.startswith(static_op_prefix)]
-    enforce_eager = int(os.getenv("FD_ENFORCE_EAGER", "0")) == 1
-
-    for static_op in static_op_names:
-        op_name = static_op[len(static_op_prefix):]
-        has_dynamic_op = op_name in global_ns
-
-        if has_dynamic_op:
-            if not enforce_eager:
-                original_cpp_ext_op = global_ns[op_name]
-                original_custom_op = global_ns[static_op]
-                global_ns[op_name] = wrap_unified_op(original_cpp_ext_op,
-                                                     original_custom_op)
-        else:
-            global_ns[op_name] = global_ns[static_op]
+
+    for static_op_name in static_op_names:
+        op_name = static_op_name.removeprefix(static_op_prefix)
+        if op_name not in global_ns:
+            global_ns[op_name] = global_ns[static_op_name]
+            continue
+
+        original_cpp_ext_op = global_ns[op_name]
+        original_custom_op = global_ns[static_op_name]
+        global_ns[op_name] = wrap_unified_op(original_cpp_ext_op, original_custom_op)
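The rewritten preprocess_static_op is simpler: if only the static op exists it is aliased under the plain name, otherwise both implementations are fused by wrap_unified_op, which now also unwraps the single-element lists that C++ extension ops return in dynamic mode. A minimal, runnable sketch of the pattern; the toy ops cpp_mul and static_op_cpp_mul are hypothetical stand-ins for real FastDeploy kernels, and the wrapper logic mirrors the diff:

import functools

import paddle


def wrap_unified_op(cpp_ext_op, custom_op):
    """Dispatch to the C++ extension op in dynamic mode, else to the custom op."""

    @functools.wraps(custom_op)
    def unified_op(*args, **kwargs):
        if paddle.in_dynamic_mode():
            res = cpp_ext_op(*args, **kwargs)
            if res is None:
                return None
            # Extension ops may wrap a single tensor in a list; unwrap it
            # so both backends present the same result shape to callers.
            if isinstance(res, list) and len(res) == 1:
                return res[0]
            return res
        return custom_op(*args, **kwargs)

    return unified_op


# Hypothetical stand-ins for real kernels:
def cpp_mul(x, y):
    return [x * y]  # extension style: single result wrapped in a list


def static_op_cpp_mul(x, y):
    return x * y  # custom-op style used for static graphs


ns = {"cpp_mul": cpp_mul, "static_op_cpp_mul": static_op_cpp_mul}
prefix = "static_op_"
for name in [k for k in ns if k.startswith(prefix)]:
    op_name = name.removeprefix(prefix)  # str.removeprefix needs Python 3.9+
    if op_name not in ns:
        ns[op_name] = ns[name]
        continue
    ns[op_name] = wrap_unified_op(ns[op_name], ns[name])

print(ns["cpp_mul"](3, 4))  # paddle defaults to dynamic mode, so this prints 12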

fastdeploy/model_executor/layers/attention/append_attn_backend.py

Lines changed: 5 additions & 5 deletions

@@ -91,7 +91,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
         self.use_speculate: bool = self.speculative_method is not None
         self.speculate_max_draft_token_num: int = fd_config.speculative_config.num_speculative_tokens
         self.keep_pd_step_flag: bool = fd_config.speculative_config.model_type == "mtp"
-        self.rank: int = fd_config.parallel_config.tensor_parallel_rank
+        self.rank: int = fd_config.parallel_config.expert_parallel_rank * fd_config.parallel_config.tensor_parallel_degree + fd_config.parallel_config.tensor_parallel_rank

         self.kv_num_heads: int = kv_num_heads
         self.num_heads: int = num_heads
@@ -108,12 +108,12 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,

         if fd_config.parallel_config.expert_parallel_rank is None:
             fd_config.parallel_config.expert_parallel_rank = 0
-        device_id = self.rank + fd_config.parallel_config.tensor_parallel_degree * \
-            fd_config.parallel_config.expert_parallel_rank
         if self.device_id is None:
-            self.device_id = device_id
+            self.device_id = self.rank
         else:
-            self.device_id = self.device_id.split(",")[device_id]
+            device_ids = self.device_id.split(",")
+            rank_index = self.rank % len(device_ids)
+            self.device_id = device_ids[rank_index]

     def init_attention_metadata(self, forward_meta: ForwardMeta):
         """Initialize attention metadata hence all layers in the forward pass can reuse it."""

fastdeploy/model_executor/layers/attention/flash_attn_backend.py

Lines changed: 6 additions & 5 deletions

@@ -100,7 +100,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
         self.use_speculate = self.speculative_method is not None
         self.speculate_max_draft_token_num = fd_config.speculative_config.num_speculative_tokens
         self.keep_pd_step_flag: bool = fd_config.speculative_config.model_type == "mtp"
-        self.rank: int = fd_config.parallel_config.tensor_parallel_rank
+        self.rank: int = fd_config.parallel_config.expert_parallel_rank * fd_config.parallel_config.tensor_parallel_degree + fd_config.parallel_config.tensor_parallel_rank

         # pd_disaggregation
         self.use_pd_disaggregation: int = int(
@@ -110,12 +110,13 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,

         if fd_config.parallel_config.expert_parallel_rank is None:
             fd_config.parallel_config.expert_parallel_rank = 0
-        device_id = self.rank + fd_config.parallel_config.tensor_parallel_degree * \
-            fd_config.parallel_config.expert_parallel_rank
+
         if self.device_id is None:
-            self.device_id = device_id
+            self.device_id = self.rank
         else:
-            self.device_id = self.device_id.split(",")[device_id]
+            device_ids = self.device_id.split(",")
+            rank_index = self.rank % len(device_ids)
+            self.device_id = device_ids[rank_index]

     def get_attntion_meta(self):
         """get_attntion_meta"""

fastdeploy/model_executor/layers/attention/mla_attention_backend.py

Lines changed: 5 additions & 2 deletions

@@ -109,7 +109,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
         self.use_speculate: bool = self.speculative_method is not None
         self.speculate_max_draft_token_num: int = fd_config.speculative_config.num_speculative_tokens
         self.keep_pd_step_flag: bool = fd_config.speculative_config.model_type == "mtp"
-        self.rank: int = fd_config.parallel_config.tensor_parallel_rank
+        self.rank: int = fd_config.parallel_config.expert_parallel_rank * fd_config.parallel_config.tensor_parallel_degree + fd_config.parallel_config.tensor_parallel_rank

         self.kv_num_heads: int = kv_num_heads
         self.num_heads: int = num_heads
@@ -135,10 +135,13 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
             os.getenv("FLAGS_use_pd_disaggregation", 0))
         self.start_layer_index: int = fd_config.model_config.start_layer_index
         self.device_id: int = os.getenv("CUDA_VISIBLE_DEVICES", None)
+
         if self.device_id is None:
             self.device_id = self.rank
         else:
-            self.device_id = self.device_id.split(",")[self.rank]
+            device_ids = self.device_id.split(",")
+            rank_index = self.rank % len(device_ids)
+            self.device_id = device_ids[rank_index]

     def init_attention_metadata(self, forward_meta: ForwardMeta):
         """Initialize attention metadata hence all layers in the forward pass can reuse it."""

fastdeploy/model_executor/layers/attention/xpu_attn_backend.py

Lines changed: 1 addition & 1 deletion

@@ -91,7 +91,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
         # self.use_speculate = self.speculate_method is not None
         # self.speculate_max_draft_token_num = fd_config.parallel_config.speculate_max_draft_tokens
         self.keep_pd_step_flag: bool = fd_config.speculative_config.model_type == "mtp"
-        self.rank: int = fd_config.parallel_config.tensor_parallel_rank
+        self.rank: int = fd_config.parallel_config.expert_parallel_rank * fd_config.parallel_config.tensor_parallel_degree + fd_config.parallel_config.tensor_parallel_rank

         self.kv_num_heads: int = kv_num_heads
         self.num_heads: int = num_heads

fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py

Lines changed: 1 addition & 1 deletion

@@ -445,7 +445,7 @@ def forward(
             forward_meta.seq_lens_this_time,
             forward_meta.cu_seqlens_q,
             score_text,
-        )[0].cast(self._dtype)
+        ).cast(self._dtype)
         # -----------------------

         out = self.norm(hidden_states)
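This one-line change follows from the unified_op rework above: in dynamic mode the wrapped op now unwraps its single-element list result, so the call site no longer indexes [0]. Schematically, with placeholder names (some_custom_op, x, and dtype are illustrative, not the real identifiers at this call site):

# Before: the op returned [tensor], so the caller indexed into the list.
hidden_states = some_custom_op(x, score_text)[0].cast(dtype)
# After: unified_op returns the tensor itself, so the index is dropped.
hidden_states = some_custom_op(x, score_text).cast(dtype)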

test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py

Lines changed: 13 additions & 40 deletions

@@ -176,58 +176,31 @@ def consistent_payload():
         "seed": 13  # fixed random seed
     }

-# ==========================
-# Helper function to calculate difference rate between two texts
-# ==========================
-def calculate_diff_rate(text1, text2):
-    """
-    Calculate the difference rate between two strings
-    based on the normalized Levenshtein edit distance.
-    Returns a float in [0,1], where 0 means identical.
-    """
-    if text1 == text2:
-        return 0.0
-
-    len1, len2 = len(text1), len(text2)
-    dp = [[0] * (len2 + 1) for _ in range(len1 + 1)]
-
-    for i in range(len1 + 1):
-        for j in range(len2 + 1):
-            if i == 0 or j == 0:
-                dp[i][j] = i + j
-            elif text1[i - 1] == text2[j - 1]:
-                dp[i][j] = dp[i - 1][j - 1]
-            else:
-                dp[i][j] = 1 + min(dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1])
-
-    edit_distance = dp[len1][len2]
-    max_len = max(len1, len2)
-    return edit_distance / max_len if max_len > 0 else 0.0

 # ==========================
 # Consistency test for repeated runs with fixed payload
 # ==========================
 def test_consistency_between_runs(api_url, headers, consistent_payload):
     """
-    Test that two runs with the same fixed input produce similar outputs.
+    Test that the result is the same as the base result.
     """
-    # First request
+    # request
     resp1 = requests.post(api_url, headers=headers, json=consistent_payload)
     assert resp1.status_code == 200
     result1 = resp1.json()
     content1 = result1["choices"][0]["message"]["content"]

-    # Second request
-    resp2 = requests.post(api_url, headers=headers, json=consistent_payload)
-    assert resp2.status_code == 200
-    result2 = resp2.json()
-    content2 = result2["choices"][0]["message"]["content"]
-
-    # Calculate difference rate
-    diff_rate = calculate_diff_rate(content1, content2)
+    # base result
+    base_path = os.getenv("MODEL_PATH")
+    if base_path:
+        base_file = os.path.join(base_path, "ernie-4_5-vl-base")
+    else:
+        base_file = "ernie-4_5-vl-base"
+    with open(base_file, "r") as f:
+        content2 = f.read()

-    # Verify that the difference rate is below the threshold
-    assert diff_rate < 0.05, "Output difference too large ({:.4%})".format(diff_rate)
+    # Verify that the result is the same as the base result
+    assert content1 == content2

 # ==========================
 # OpenAI Client Chat Completion Test
@@ -322,4 +295,4 @@ def test_streaming_chat(openai_client, capsys):
     for chunk in response:
         if hasattr(chunk.choices[0], 'delta') and hasattr(chunk.choices[0].delta, 'content'):
             output.append(chunk.choices[0].delta.content)
-    assert len(output) > 2
+    assert len(output) > 2
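The consistency test now checks an exact match against a golden transcript named ernie-4_5-vl-base, resolved relative to MODEL_PATH when that variable is set. A hedged sketch of how such a baseline could be regenerated; the endpoint URL and payload fields here are assumptions, since the real values come from the test's api_url and consistent_payload fixtures:

import os

import requests

api_url = "http://localhost:8188/v1/chat/completions"  # assumed serving endpoint
payload = {
    "messages": [{"role": "user", "content": "Describe the image."}],
    "temperature": 0,
    "seed": 13,  # fixed seed, mirroring consistent_payload
}

resp = requests.post(api_url, headers={"Content-Type": "application/json"}, json=payload)
resp.raise_for_status()
content = resp.json()["choices"][0]["message"]["content"]

base_path = os.getenv("MODEL_PATH")
base_file = os.path.join(base_path, "ernie-4_5-vl-base") if base_path else "ernie-4_5-vl-base"
with open(base_file, "w") as f:
    f.write(content)  # overwrite the golden baseline with the new output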
