@@ -27,7 +27,8 @@ class MoveDirectionality(Enum):
 # (index, params, prompt_tok_ids, output_tok_ids) tuples for new
 # requests added to the batch.
-AddedRequest = tuple[int, Union[SamplingParams, PoolingParams], list[int], list[int]]
+AddedRequest = tuple[int, Union[SamplingParams, PoolingParams], list[int],
+                     list[int]]

 # (index 1, index 2, directionality) tuples representing
 # one-way moves or two-way swaps of requests in batch
 MovedRequest = tuple[int, int, MoveDirectionality]
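For context, an AddedRequest under the reflowed alias is still the same four-element tuple. A hypothetical entry (field values invented for illustration, and assuming SamplingParams accepts max_think_tokens as a keyword argument, as the field read later in this PR suggests) might look like:

# (batch index, per-request params, prompt token IDs, output token IDs)
added: AddedRequest = (
    0,
    SamplingParams(max_think_tokens=256),
    [101, 2023, 2003, 102],  # prompt token IDs (arbitrary values)
    [],                      # a newly added request has no output tokens yet
)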
@@ -497,13 +498,14 @@ def apply(self, logits: torch.Tensor) -> torch.Tensor:
 class MaxThinkTokensLogitsProcessor(LogitsProcessor):
     """A logits processor that limits the maximum number of thinking tokens."""

-    def __init__(self, reasoning_config: ReasoningConfig, pin_memory: bool, device: torch.device):
+    def __init__(self, reasoning_config: ReasoningConfig, pin_memory: bool,
+                 device: torch.device):
         """
         Args:
-            think_start_token_id (int): Token ID for the start of thinking section.
-            think_end_token_id (int): Token ID for the end of thinking section.
-            pin_memory (bool): Whether to use pinned memory for tensors.
-            device (torch.device): Device to use for tensor operations.
+            reasoning_config: Configuration for reasoning, which includes
+                the token IDs for thinking start and end.
+            pin_memory (bool): Whether to use pinned memory for tensors.
+            device (torch.device): Device to use for tensor operations.
         """
         super().__init__()
         self.think_start_token_id = reasoning_config.think_start_token_id
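A rough sketch of how the constructor described above might be called; ReasoningConfig is assumed to expose think_start_token_id and think_end_token_id (the assignments right after the docstring read those attributes), but its exact constructor is not shown in this diff:

import torch

# Hypothetical construction; the keyword names mirror the attributes
# that __init__ reads from the config.
reasoning_config = ReasoningConfig(think_start_token_id=151648,
                                   think_end_token_id=151649)

processor = MaxThinkTokensLogitsProcessor(
    reasoning_config=reasoning_config,
    pin_memory=torch.cuda.is_available(),  # pinned host memory only pays off with a GPU
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)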
@@ -519,19 +521,25 @@ def _find_last_token_index(self, tokens: list[int], token_id: int) -> int:
         return -1

     def is_argmax_invariant(self) -> bool:
-        """This logits processor can change the outcome of greedy sampling
-        by forcing that the thinking section ends after a certain number of tokens."""
+        """This logits processor can change the outcome of
+        greedy sampling by forcing that the thinking section
+        ends after a certain number of tokens."""
         return False

     def update_state(self, batch_update: Optional[BatchUpdate]):
         if batch_update:
-            for index, params, prompt_tok_ids, output_tok_ids in batch_update.added:
-                max_think_tokens = params.max_think_tokens if isinstance(params, SamplingParams) else None
+            for (index, params, prompt_tok_ids,
+                 output_tok_ids) in batch_update.added:
+                max_think_tokens = (params.max_think_tokens if isinstance(
+                    params, SamplingParams) else None)
                 if max_think_tokens is not None:
-                    last_start = self._find_last_token_index(prompt_tok_ids, self.think_start_token_id)
-                    last_end = self._find_last_token_index(prompt_tok_ids, self.think_end_token_id)
+                    last_start = self._find_last_token_index(
+                        prompt_tok_ids, self.think_start_token_id)
+                    last_end = self._find_last_token_index(
+                        prompt_tok_ids, self.think_end_token_id)
                     in_think = last_start > last_end
-                    count = len(prompt_tok_ids) - (last_start + 1) if in_think else 0
+                    count = len(prompt_tok_ids) - (last_start +
+                                                   1) if in_think else 0

                     self._state[index] = {
                         "in_think": in_think,
@@ -542,13 +550,14 @@ def update_state(self, batch_update: Optional[BatchUpdate]):
                     }

             for index in batch_update.removed:
-                self._state.pop(index, None)
+                self._state.pop(index, {})

             for i1, i2, direction in batch_update.moved:
                 if direction == MoveDirectionality.SWAP:
-                    self._state[i1], self._state[i2] = self._state[i2], self._state[i1]
+                    self._state[i1], self._state[i2] = self._state[
+                        i2], self._state[i1]
                 else:
-                    self._state[i2] = self._state.pop(i1, None)
+                    self._state[i2] = self._state.pop(i1, {})

         # Update in_think and count for all active requests
         for state in self._state.values():
@@ -579,7 +588,8 @@ def apply(self, logits: torch.Tensor) -> torch.Tensor:
             if not state:
                 continue

-            if state["in_think"] and state["count"] >= state["max_think_tokens"]:
+            if state["in_think"] and state["count"] >= state[
+                    "max_think_tokens"]:
                 mask[index] = True

         if mask.any():
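The hunk above only builds the per-row boolean mask; what happens once mask.any() is true falls outside this diff. One common way to end the thinking section at that point, sketched here as an assumption rather than the PR's actual code, is to leave only the think-end token as a viable choice on the masked rows:

import torch

# mask: bool tensor of shape [num_reqs]; logits: [num_reqs, vocab_size]
forced = torch.full_like(logits, float("-inf"))
forced[:, think_end_token_id] = 0.0
logits = torch.where(mask.unsqueeze(1), forced, logits)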
@@ -589,8 +599,9 @@ def apply(self, logits: torch.Tensor) -> torch.Tensor:
         return logits


-def init_builtin_logitsprocs(pin_memory_available: bool, max_num_reqs: int,
-                             device: torch.device, reasoning_config: ReasoningConfig) -> LogitsProcessorManager:
+def init_builtin_logitsprocs(
+        pin_memory_available: bool, max_num_reqs: int, device: torch.device,
+        reasoning_config: ReasoningConfig) -> LogitsProcessorManager:
     """Construct 'builtin' vLLM logitsprocs which the engine
     loads by default.

@@ -619,8 +630,7 @@ def init_builtin_logitsprocs(pin_memory_available: bool, max_num_reqs: int,
     )
     return LogitsProcessorManager(
         non_argmax_invariant=[
-            min_tokens_logitproc,
-            logit_bias_logitproc,
+            min_tokens_logitproc, logit_bias_logitproc,
             max_think_tokens_logitproc
         ],
         argmax_invariant=[min_p_logitproc],
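Taken together, the new init_builtin_logitsprocs signature and the max_think_tokens read in update_state suggest usage roughly like the following; only the signature and the attribute name come from this diff, the surrounding values are placeholders:

logitsprocs = init_builtin_logitsprocs(
    pin_memory_available=True,
    max_num_reqs=256,
    device=torch.device("cuda"),
    reasoning_config=reasoning_config,  # e.g. the config built earlier
)

# Per request: cap the thinking section at 512 tokens (placeholder value).
params = SamplingParams(max_think_tokens=512)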