Skip to content

Commit 03a7499

Browse files
authored
Clear dead code and add supplementary notes (#2757)
* 1. Add supplementary notes; 2. Delete dead code * Fix a bug in forward meta * Apply a global modification of forward meta * Fix a VL model_runner bug
1 parent b89180f commit 03a7499

File tree

12 files changed

+239
-454
lines changed

12 files changed

+239
-454
lines changed

fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py

Lines changed: 19 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -46,13 +46,9 @@ class ConcreteSizeEntry:
4646
# Output buffer of cudagraph
4747
output_buffer: Optional[paddle.Tensor] = None
4848

49-
# for cudagraph debugging, track the input addresses
50-
# during capture, and check if they are the same during replay
51-
input_addresses: Optional[list[int]] = None
52-
5349

5450
class CudaGraphPiecewiseBackend:
55-
""" """
51+
""" Manage the capture and replay of CUDA graphs at the subgraph level. """
5652

5753
def __init__(
5854
self,
@@ -65,33 +61,31 @@ def __init__(
6561
self.warm_up_size = fd_config.graph_opt_config.cudagraph_num_of_warmups
6662
self.batch_size_to_captured_size = fd_config.graph_opt_config.batch_size_to_captured_size
6763

68-
# runtime_bs -> ConcreteSizeEntry
64+
# Runtime batch size -> ConcreteSizeEntry
6965
self.concrete_size_entries: Dict[int, ConcreteSizeEntry] = {}
7066

7167
for shape in self.cudagraph_capture_sizes:
7268
self.concrete_size_entries[shape] = ConcreteSizeEntry(
7369
runtime_bs=shape)
7470

75-
print("[CUDA GRAPH] Created all batch size entry ")
71+
logger.debug("[CUDA GRAPH] Created all batch size entry ")
7672

7773
def __call__(self, **kwargs):
7874
# Get batch size
7975
ids_remove_padding: paddle.Tensor = kwargs["ids_remove_padding"]
8076
batch_size = ids_remove_padding.shape[0]
81-
8277
padding_batch_size = self.batch_size_to_captured_size[batch_size]
83-
# print(
84-
# f"[CUDA GRAPH] The actual batch size obtained by CUDAGraph is :{batch_size}, ",
85-
# f"The padded batch size is :{padding_batch_size}"
86-
# )
78+
logger.debug(
79+
f"[CUDA GRAPH] The actual batch size obtained by CUDAGraph is :{batch_size}, ",
80+
f"The padded batch size is :{padding_batch_size}")
8781

8882
entry = self.concrete_size_entries.get(padding_batch_size)
8983
assert entry is not None, f"Batch size:{padding_batch_size} is not in cuda graph capture list."
9084
if entry.runnable is None:
9185
entry.runnable = self.runnable
92-
# print(
93-
# f"[CUDA GRAPH] New entry lazy initialize with batch size {padding_batch_size}"
94-
# )
86+
logger.debug(
87+
f"[CUDA GRAPH] New entry lazy initialize with batch size {padding_batch_size}"
88+
)
9589

9690
if not entry.use_cudagraph:
9791
return entry.runnable(**kwargs)
@@ -102,10 +96,10 @@ def __call__(self, **kwargs):
10296
for n in range(entry.num_finished_warmup, self.warm_up_size):
10397
entry.num_finished_warmup += 1
10498
entry.runnable(**kwargs)
105-
# print(
106-
# "[CUDA GRAPH] Warm up for batch size ",
107-
# f"{padding_batch_size}, finished ({n+1}/{entry.num_finished_warmup}) times"
108-
# )
99+
logger.debug(
100+
"[CUDA GRAPH] Warm up for batch size ",
101+
f"{padding_batch_size}, finished ({n+1}/{entry.num_finished_warmup}) times"
102+
)
109103

110104
# Store input addresses for debug
111105
input_addresses = [
@@ -129,11 +123,13 @@ def __call__(self, **kwargs):
129123
output._clear
130124

131125
paddle.device.synchronize()
132-
# print(
133-
# f"[CUDA GRAPH] CUDAGraph captured for batch size {padding_batch_size}"
134-
# )
126+
logger.debug(
127+
f"[CUDA GRAPH] CUDAGraph captured for batch size {padding_batch_size}"
128+
)
135129

136130
# Replay
137131
entry.cuda_graph.replay()
138-
# print(f"[CUDA GRAPH] CUDAGraph replayed for batch size {padding_batch_size}")
132+
logger.debug(
133+
f"[CUDA GRAPH] CUDAGraph replayed for batch size {padding_batch_size}"
134+
)
139135
return entry.output_buffer

fastdeploy/model_executor/graph_optimization/decorator.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@
2828

2929
def support_graph_optimization(cls: Optional[_T] = None) -> _T:
3030
"""
31-
A decorator for wrapping models or layers with CUDA graph support.
31+
A decorator for wrapping models or layers with static graph and CUDAGraph support.
3232
This enables efficient kernel launch sequencing for improved GPU performance.
3333
3434
Example usage:
@@ -74,7 +74,7 @@ def __call__(self, **kwargs):
7474

7575

7676
class GraphOptWrapper:
77-
""" """
77+
""" The wrapper for GraphOptBackend """
7878

7979
def __init__(
8080
self,
@@ -87,7 +87,7 @@ def __init__(
8787

8888
@abstractmethod
8989
def forward(self, **kwargs):
90-
""" """
90+
""" Abstract methods for implementing model.forward() """
9191
pass
9292

9393
def __call__(self, **kwargs):

fastdeploy/model_executor/graph_optimization/graph_optimization_backend.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,10 @@
2424

2525

2626
class GraphOptBackend:
27-
""" """
27+
"""
28+
Integrated various graph optimization functions, including dynamic graph to static graph conversion,
29+
CINN compilation optimization, CudaGraph, and so on.
30+
"""
2831

2932
fd_config: FDConfig
3033
cudagraph_piecewise_backend: Optional[CudaGraphPiecewiseBackend] = None

fastdeploy/spec_decode/mtp.py

Lines changed: 18 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -436,8 +436,24 @@ def _initialize_forward_meta(self):
436436
Initialize forward meta and attention meta data
437437
"""
438438
# Initialize forward meta
439-
self.forward_meta = ForwardMeta.init_forward_meta(
440-
self.model_inputs, self.attn_backends[0])
439+
self.forward_meta = ForwardMeta(
440+
input_ids=self.model_inputs["input_ids"],
441+
ids_remove_padding=self.model_inputs["ids_remove_padding"],
442+
rotary_embs=self.model_inputs["rope_emb"],
443+
attn_backend=self.attn_backends[0],
444+
decoder_batch_ids=self.model_inputs["decoder_batch_ids"],
445+
decoder_tile_ids_per_batch=self.model_inputs["decoder_tile_ids_per_batch"],
446+
seq_lens_encoder=self.model_inputs["seq_lens_encoder"],
447+
seq_lens_decoder=self.model_inputs["seq_lens_decoder"],
448+
seq_lens_this_time=self.model_inputs["seq_lens_this_time"],
449+
cum_offsets=self.model_inputs["cum_offsets"],
450+
padding_offset=self.model_inputs["padding_offset"],
451+
cu_seqlens_q=self.model_inputs["cu_seqlens_q"],
452+
cu_seqlens_k=self.model_inputs["cu_seqlens_k"],
453+
block_tables=self.model_inputs["block_tables"],
454+
caches=self.model_inputs["caches"]
455+
)
456+
441457

442458
# Initialzie attention meta data
443459
for attn_backend in self.attn_backends:

0 commit comments

Comments (0)