1 parent 7721ef1 commit e34d130
vllm/v1/attention/backends/pallas.py
@@ -86,6 +86,12 @@ def get_max_num_seqs(model_len: int, page_size: int) -> int:
     # spill less likely. Meanwhile we make sure the page size is in [16, 256].
     @staticmethod
     def get_page_size(vllm_config: VllmConfig) -> int:
+        # TODO: This is a temporary fix for vmem OOM.
+        # For long model length, we use 16 page-size to avoid too much
+        # VMEM spill. A more robust solution should be implemented to
+        # handle VREG spills.
+        if vllm_config.model_config.max_model_len > 8192:
+            return 16
         page_size = next_power_of_2(
             vllm_config.model_config.max_model_len) // 16
         if page_size <= 16:
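For reference, the heuristic in this hunk can be read as a small standalone function. The sketch below is not the vLLM implementation: the hunk cuts off after the `if page_size <= 16:` check, so the clamping tail and the exact `next_power_of_2` helper are assumptions inferred from the surrounding comment that the page size stays in [16, 256].

# Minimal sketch of the page-size heuristic, assuming a [16, 256] clamp
# after the shown lines; the real code is in
# vllm/v1/attention/backends/pallas.py.

def next_power_of_2(n: int) -> int:
    # Smallest power of two >= n (returns 1 for n <= 1); assumed helper.
    return 1 if n <= 1 else 1 << (n - 1).bit_length()

def get_page_size_sketch(max_model_len: int) -> int:
    # Temporary long-context fix from the diff: force a page size of 16
    # to limit VMEM spill when the model length exceeds 8192 tokens.
    if max_model_len > 8192:
        return 16
    page_size = next_power_of_2(max_model_len) // 16
    # Assumed tail: keep the page size within [16, 256] per the comment.
    return max(16, min(page_size, 256))

# Example: an 8k-context model yields 8192 // 16 = 512, clamped to 256;
# anything longer than 8192 takes the early-return path and gets 16.
print(get_page_size_sketch(8192))   # 256
print(get_page_size_sketch(32768))  # 16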