Skip to content

Commit 03a7499

Browse files
authored
Clear dead code and add supplementary notes (#2757)
* 1. Add supplementary notes; 2. Delete dead code * Fix a bug in forward meta * Apply a global modification of forward meta * Fix a VL model_runner bug
1 parent b89180f commit 03a7499

File tree

12 files changed

+239
-454
lines changed

12 files changed

+239
-454
lines changed

fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py

Lines changed: 19 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -46,13 +46,9 @@ class ConcreteSizeEntry:
4646
# Output buffer of cudagraph
4747
output_buffer: Optional[paddle.Tensor] = None
4848

49-
# for cudagraph debugging, track the input addresses
50-
# during capture, and check if they are the same during replay
51-
input_addresses: Optional[list[int]] = None
52-
5349

5450
class CudaGraphPiecewiseBackend:
55-
""" """
51+
""" Manage the capture and replay of CUDA graphs at the subgraph level. """
5652

5753
def __init__(
5854
self,
@@ -65,33 +61,31 @@ def __init__(
6561
self.warm_up_size = fd_config.graph_opt_config.cudagraph_num_of_warmups
6662
self.batch_size_to_captured_size = fd_config.graph_opt_config.batch_size_to_captured_size
6763

68-
# runtime_bs -> ConcreteSizeEntry
64+
# Runtime batch size -> ConcreteSizeEntry
6965
self.concrete_size_entries: Dict[int, ConcreteSizeEntry] = {}
7066

7167
for shape in self.cudagraph_capture_sizes:
7268
self.concrete_size_entries[shape] = ConcreteSizeEntry(
7369
runtime_bs=shape)
7470

75-
print("[CUDA GRAPH] Created all batch size entry ")
71+
logger.debug("[CUDA GRAPH] Created all batch size entry ")
7672

7773
def __call__(self, **kwargs):
7874
# Get batch size
7975
ids_remove_padding: paddle.Tensor = kwargs["ids_remove_padding"]
8076
batch_size = ids_remove_padding.shape[0]
81-
8277
padding_batch_size = self.batch_size_to_captured_size[batch_size]
83-
# print(
84-
# f"[CUDA GRAPH] The actual batch size obtained by CUDAGraph is :{batch_size}, ",
85-
# f"The padded batch size is :{padding_batch_size}"
86-
# )
78+
logger.debug(
79+
f"[CUDA GRAPH] The actual batch size obtained by CUDAGraph is :{batch_size}, ",
80+
f"The padded batch size is :{padding_batch_size}")
8781

8882
entry = self.concrete_size_entries.get(padding_batch_size)
8983
assert entry is not None, f"Batch size:{padding_batch_size} is not in cuda graph capture list."
9084
if entry.runnable is None:
9185
entry.runnable = self.runnable
92-
# print(
93-
# f"[CUDA GRAPH] New entry lazy initialize with batch size {padding_batch_size}"
94-
# )
86+
logger.debug(
87+
f"[CUDA GRAPH] New entry lazy initialize with batch size {padding_batch_size}"
88+
)
9589

9690
if not entry.use_cudagraph:
9791
return entry.runnable(**kwargs)
@@ -102,10 +96,10 @@ def __call__(self, **kwargs):
10296
for n in range(entry.num_finished_warmup, self.warm_up_size):
10397
entry.num_finished_warmup += 1
10498
entry.runnable(**kwargs)
105-
# print(
106-
# "[CUDA GRAPH] Warm up for batch size ",
107-
# f"{padding_batch_size}, finished ({n+1}/{entry.num_finished_warmup}) times"
108-
# )
99+
logger.debug(
100+
"[CUDA GRAPH] Warm up for batch size ",
101+
f"{padding_batch_size}, finished ({n+1}/{entry.num_finished_warmup}) times"
102+
)
109103

110104
# Store input addresses for debug
111105
input_addresses = [
@@ -129,11 +123,13 @@ def __call__(self, **kwargs):
129123
output._clear
130124

131125
paddle.device.synchronize()
132-
# print(
133-
# f"[CUDA GRAPH] CUDAGraph captured for batch size {padding_batch_size}"
134-
# )
126+
logger.debug(
127+
f"[CUDA GRAPH] CUDAGraph captured for batch size {padding_batch_size}"
128+
)
135129

136130
# Replay
137131
entry.cuda_graph.replay()
138-
# print(f"[CUDA GRAPH] CUDAGraph replayed for batch size {padding_batch_size}")
132+
logger.debug(
133+
f"[CUDA GRAPH] CUDAGraph replayed for batch size {padding_batch_size}"
134+
)
139135
return entry.output_buffer

fastdeploy/model_executor/graph_optimization/decorator.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@
2828

2929
def support_graph_optimization(cls: Optional[_T] = None) -> _T:
3030
"""
31-
A decorator for wrapping models or layers with CUDA graph support.
31+
A decorator for wrapping models or layers with static graph and CUDAGraph support.
3232
This enables efficient kernel launch sequencing for improved GPU performance.
3333
3434
Example usage:
@@ -74,7 +74,7 @@ def __call__(self, **kwargs):
7474

7575

7676
class GraphOptWrapper:
77-
""" """
77+
""" The wrapper for GraphOptBackend """
7878

7979
def __init__(
8080
self,
@@ -87,7 +87,7 @@ def __init__(
8787

8888
@abstractmethod
8989
def forward(self, **kwargs):
90-
""" """
90+
""" Abstract methods for implementing model.forward() """
9191
pass
9292

9393
def __call__(self, **kwargs):

fastdeploy/model_executor/graph_optimization/graph_optimization_backend.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,10 @@
2424

2525

2626
class GraphOptBackend:
27-
""" """
27+
"""
28+
Integrated various graph optimization functions, including dynamic graph to static graph conversion,
29+
CINN compilation optimization, CudaGraph, and so on.
30+
"""
2831

2932
fd_config: FDConfig
3033
cudagraph_piecewise_backend: Optional[CudaGraphPiecewiseBackend] = None

fastdeploy/spec_decode/mtp.py

Lines changed: 18 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -436,8 +436,24 @@ def _initialize_forward_meta(self):
436436
Initialize forward meta and attention meta data
437437
"""
438438
# Initialize forward meta
439-
self.forward_meta = ForwardMeta.init_forward_meta(
440-
self.model_inputs, self.attn_backends[0])
439+
self.forward_meta = ForwardMeta(
440+
input_ids=self.model_inputs["input_ids"],
441+
ids_remove_padding=self.model_inputs["ids_remove_padding"],
442+
rotary_embs=self.model_inputs["rope_emb"],
443+
attn_backend=self.attn_backends[0],
444+
decoder_batch_ids=self.model_inputs["decoder_batch_ids"],
445+
decoder_tile_ids_per_batch=self.model_inputs["decoder_tile_ids_per_batch"],
446+
seq_lens_encoder=self.model_inputs["seq_lens_encoder"],
447+
seq_lens_decoder=self.model_inputs["seq_lens_decoder"],
448+
seq_lens_this_time=self.model_inputs["seq_lens_this_time"],
449+
cum_offsets=self.model_inputs["cum_offsets"],
450+
padding_offset=self.model_inputs["padding_offset"],
451+
cu_seqlens_q=self.model_inputs["cu_seqlens_q"],
452+
cu_seqlens_k=self.model_inputs["cu_seqlens_k"],
453+
block_tables=self.model_inputs["block_tables"],
454+
caches=self.model_inputs["caches"]
455+
)
456+
441457

442458
# Initialzie attention meta data
443459
for attn_backend in self.attn_backends:

0 commit comments

Comments (0)