@@ -1111,17 +1111,30 @@ def sync_and_slice_intermediate_tensors(
             for k, v in self.intermediate_tensors.items()
         })
 
-    def get_dp_padding(self, num_tokens: int):
+    def get_dp_padding(self,
+                       num_tokens: int) -> tuple[int, Optional[torch.Tensor]]:
         dp_size = self.vllm_config.parallel_config.data_parallel_size
         dp_rank = self.vllm_config.parallel_config.data_parallel_rank
-        if dp_size == 1:
+
+        # For DP: Don't pad when setting enforce_eager.
+        # This lets us set enforce_eager on the prefiller in a P/D setup and
+        # still use CUDA graphs (enabled by this padding) on the decoder.
+        #
+        # TODO(tms): There are many cases where padding is enabled for
+        # prefills, causing unnecessary and excessive padding of activations.
+
+        if dp_size == 1 or self.vllm_config.model_config.enforce_eager:
             # Early exit.
-            return 0
+            return 0, None
 
         num_tokens_across_dp = DPMetadata.num_tokens_across_dp(
             num_tokens, dp_size, dp_rank)
         max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp).item()
-        return max_tokens_across_dp_cpu - num_tokens
+        num_tokens_after_padding = torch.tensor([max_tokens_across_dp_cpu] *
+                                                dp_size,
+                                                device="cpu",
+                                                dtype=torch.int32)
+        return max_tokens_across_dp_cpu - num_tokens, num_tokens_after_padding
 
     @torch.inference_mode()
     def execute_model(
@@ -1161,7 +1174,8 @@ def execute_model(
             num_input_tokens = num_scheduled_tokens
 
         # Padding for DP
-        num_input_tokens += self.get_dp_padding(num_input_tokens)
+        num_pad, num_tokens_across_dp = self.get_dp_padding(num_input_tokens)
+        num_input_tokens += num_pad
 
         # _prepare_inputs may reorder the batch, so we must gather multi
         # modal outputs after that to ensure the correct order
@@ -1208,7 +1222,8 @@ def execute_model(
         # Use persistent buffers for CUDA graphs.
         with set_forward_context(attn_metadata,
                                  self.vllm_config,
-                                 num_tokens=num_input_tokens):
+                                 num_tokens=num_input_tokens,
+                                 num_tokens_across_dp=num_tokens_across_dp):
             self.maybe_setup_kv_connector(scheduler_output)
 
             model_output = self.model(
@@ -1681,7 +1696,8 @@ def _dummy_run(
     ) -> torch.Tensor:
 
         # Padding for DP
-        num_tokens += self.get_dp_padding(num_tokens)
+        num_pad, num_tokens_across_dp = self.get_dp_padding(num_tokens)
+        num_tokens += num_pad
 
         # Set num_scheduled_tokens based on num_tokens and max_num_seqs
         # for dummy run with LoRA so that the num_reqs collectively
@@ -1747,9 +1763,11 @@ def _dummy_run(
             intermediate_tensors = self.sync_and_slice_intermediate_tensors(
                 num_tokens, None, False)
 
-            with set_forward_context(attn_metadata,
-                                     self.vllm_config,
-                                     num_tokens=num_tokens):
+            with set_forward_context(
+                    attn_metadata,
+                    self.vllm_config,
+                    num_tokens=num_tokens,
+                    num_tokens_across_dp=num_tokens_across_dp):
                 outputs = model(
                     input_ids=input_ids,
                     positions=positions,
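Taken together, the hunks above make get_dp_padding return both the per-rank padding amount and a CPU tensor of the padded token counts across data-parallel ranks, which execute_model and _dummy_run then thread into set_forward_context. The standalone sketch below is not vLLM code; the function name and arguments are illustrative assumptions that only reproduce the padding arithmetic so it can be checked in isolation.

# Standalone sketch (assumed names, not vLLM code): mirrors the padding
# arithmetic of the updated get_dp_padding. Each DP rank pads its token
# count up to the maximum across ranks so all ranks run the same
# (CUDA-graph-compatible) batch shape, and the per-rank padded sizes are
# reported as a CPU int32 tensor for the forward context.
from typing import Optional

import torch


def dp_padding_sketch(
        num_tokens: int,
        num_tokens_across_dp: torch.Tensor,
        enforce_eager: bool = False) -> tuple[int, Optional[torch.Tensor]]:
    dp_size = num_tokens_across_dp.numel()
    if dp_size == 1 or enforce_eager:
        # Mirrors the early exit in the diff: eager mode (e.g. a prefill
        # worker in a P/D setup) skips DP padding entirely.
        return 0, None
    max_tokens = int(num_tokens_across_dp.max().item())
    num_tokens_after_padding = torch.full((dp_size, ),
                                          max_tokens,
                                          device="cpu",
                                          dtype=torch.int32)
    return max_tokens - num_tokens, num_tokens_after_padding


# Example: this rank has 7 tokens, the other DP rank has 12 -> pad by 5.
pad, padded = dp_padding_sketch(7, torch.tensor([7, 12], dtype=torch.int32))
assert pad == 5
assert padded.tolist() == [12, 12]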