Commit 1de59d5

add docstrings
1 parent e627f0a commit 1de59d5

4 files changed: +43 additions, -25 deletions


requirements/test.txt

Lines changed: 3 additions & 21 deletions
@@ -31,10 +31,6 @@ argcomplete==3.5.1
     # via datamodel-code-generator
 arrow==1.3.0
     # via isoduration
-async-timeout==5.0.1
-    # via
-    #   aiohttp
-    #   redis
 attrs==24.2.0
     # via
     #   aiohttp
@@ -145,11 +141,6 @@ eval-type-backport==0.2.2
     # via mteb
 evaluate==0.4.3
     # via lm-eval
-exceptiongroup==1.3.0
-    # via
-    #   anyio
-    #   hypothesis
-    #   pytest
 fastparquet==2024.11.0
     # via genai-perf
 fastrlock==0.8.2
@@ -699,6 +690,7 @@ setuptools==77.0.3
     # via
     #   mamba-ssm
     #   pytablewriter
+    #   torch
     #   triton
 shellingham==1.5.4
     # via typer
@@ -761,13 +753,8 @@ tokenizers==0.21.1
     # via
     #   -r requirements/test.in
     #   transformers
-toml==0.10.2
-    # via datamodel-code-generator
 tomli==2.2.1
-    # via
-    #   black
-    #   pytest
-    #   schemathesis
+    # via schemathesis
 tomli-w==1.2.0
     # via schemathesis
 torch==2.7.0+cu128
@@ -841,18 +828,13 @@ types-python-dateutil==2.9.0.20241206
     # via arrow
 typing-extensions==4.12.2
     # via
-    #   anyio
-    #   black
-    #   exceptiongroup
     #   huggingface-hub
     #   librosa
     #   mistral-common
     #   mteb
-    #   multidict
     #   pqdm
     #   pydantic
     #   pydantic-core
-    #   rich
     #   torch
     #   typer
     #   typing-inspection
@@ -892,4 +874,4 @@ yarl==1.17.1
     #   aiohttp
     #   schemathesis
 zstandard==0.23.0
-    # via lm-eval
+    # via lm-eval

vllm/v1/spec_decode/eagle.py

Lines changed: 28 additions & 2 deletions
@@ -77,6 +77,7 @@ def __init__(
                                          device=device,
                                          dtype=torch.int32)

+        # Used to store precomputed values from load_model() so they can be used in propose()
         self.last_token_indices = torch.zeros(self.max_num_tokens,
                                               dtype=torch.int32,
                                               device=device)
@@ -224,6 +225,18 @@ def advance_speculative_state(self, draft_token_ids: torch.Tensor,
                                   hidden_states: torch.Tensor,
                                   attn_metadata: FlashAttentionMetadata,
                                   batch_size: int):
+        """
+        Advances the speculative decoding state and metadata by one step
+
+        Parameters:
+        ----------
+        draft_token_ids (torch.Tensor): Token IDs generated by the draft model
+        positions (torch.Tensor): Position indices for the draft tokens
+        hidden_states (torch.Tensor): Corresponding hidden states for the tokens
+        attn_metadata (FlashAttentionMetadata): Metadata required for FlashAttention (e.g., sequence lengths, block table).
+        batch_size (int): Number of sequences to update.
+        """
+
         # Calculate number of thread blocks
         grid = lambda meta: (triton.cdiv(batch_size, meta['BLOCK_SIZE']), )
         attn_metadata.slot_mapping = torch.empty_like(positions)
@@ -305,8 +318,21 @@ def load_inputs(self, target_token_ids: torch.Tensor,
                     target_hidden_states: torch.Tensor,
                     next_token_ids_gpu: torch.Tensor,
                     cu_num_tokens: torch.Tensor, num_scheduled_tokens: int):
-        # Loads token ids, positions, etc. into the eagle model
-        # Moved from EagleProposer.propose() to here
+        """
+        Loads token ids, positions, etc. into the eagle model
+
+        Logic moved from EagleProposer.propose() to here
+
+        Parameters:
+        ----------
+        target_token_ids (torch.Tensor): Draft-step token IDs
+        target_positions (torch.Tensor): Position indices for the tokens
+        target_hidden_states (torch.Tensor): Corresponding hidden states for the tokens
+        next_token_ids_gpu (torch.Tensor): Sampled next token IDs to overwrite final token
+        cu_num_tokens (torch.Tensor): Cumulative number of tokens from prepare_inputs()
+        num_scheduled_tokens (int): Total number of tokens scheduled
+        """
+
         self.last_token_indices = cu_num_tokens[1:] - 1

         # FA requires seq_len to have dtype int32.
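
For context on the "number of thread blocks" computed in advance_speculative_state: the grid lambda launches one Triton program per BLOCK_SIZE sequences, rounded up. A minimal, self-contained sketch of that launch pattern follows; the kernel here is a hypothetical stand-in, not the real advance_state_kernel.

import torch
import triton
import triton.language as tl

@triton.jit
def _toy_kernel(src_ptr, dst_ptr, n, BLOCK_SIZE: tl.constexpr):
    # Each program handles BLOCK_SIZE consecutive sequences.
    pid = tl.program_id(axis=0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n
    vals = tl.load(src_ptr + offsets, mask=mask)
    tl.store(dst_ptr + offsets, vals, mask=mask)

def launch(draft_token_ids: torch.Tensor, out: torch.Tensor):
    batch_size = draft_token_ids.numel()
    # ceil(batch_size / BLOCK_SIZE) thread blocks, as in the diff above.
    grid = lambda meta: (triton.cdiv(batch_size, meta['BLOCK_SIZE']), )
    _toy_kernel[grid](draft_token_ids, out, batch_size, BLOCK_SIZE=256)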

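The cu_num_tokens argument documented for load_inputs() is the cumulative per-request token count from prepare_inputs(), so cu_num_tokens[1:] - 1 yields the flattened index of each request's final token. A tiny worked example with made-up values:

import torch

# Hypothetical batch: three requests scheduled 3, 1 and 4 tokens respectively.
cu_num_tokens = torch.tensor([0, 3, 4, 8], dtype=torch.int32)

last_token_indices = cu_num_tokens[1:] - 1
print(last_token_indices)  # tensor([2, 3, 7], dtype=torch.int32)
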
vllm/v1/spec_decode/utils.py

Lines changed: 2 additions & 0 deletions
@@ -70,6 +70,8 @@ def advance_state_kernel(
     BLOCK_SIZE: tl.constexpr,
     PADDING_SLOT_ID: tl.constexpr,
 ):
+    # Triton kernel to perform draft model state advancement.
+
     pid = tl.program_id(axis=0)
     block_start = pid * BLOCK_SIZE
     offsets = block_start + tl.arange(0, BLOCK_SIZE)
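
The added comment is terse; as a rough illustration only (not the actual vLLM advance_state_kernel), a kernel with this signature shape could bump each sequence's position and route out-of-range sequences to PADDING_SLOT_ID, like so:

import triton
import triton.language as tl

@triton.jit
def toy_advance_state_kernel(
    positions_ptr,      # int32 current position per sequence
    seq_lens_ptr,       # int32 allowed length per sequence
    slot_out_ptr,       # int32 slot id for the next token
    n_seqs,
    BLOCK_SIZE: tl.constexpr,
    PADDING_SLOT_ID: tl.constexpr,
):
    pid = tl.program_id(axis=0)
    block_start = pid * BLOCK_SIZE
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_seqs

    pos = tl.load(positions_ptr + offsets, mask=mask, other=0)
    max_len = tl.load(seq_lens_ptr + offsets, mask=mask, other=0)

    new_pos = pos + 1
    # Sequences that have run out of room write to the padding slot instead of a real one.
    slot = tl.where(new_pos < max_len, new_pos, PADDING_SLOT_ID)

    tl.store(positions_ptr + offsets, new_pos, mask=mask)
    tl.store(slot_out_ptr + offsets, slot, mask=mask)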

vllm/v1/worker/gpu_model_runner.py

Lines changed: 10 additions & 2 deletions
@@ -1697,8 +1697,16 @@ def execute_model(
     def get_valid_sampled_token_ids(
             self, max_gen_len: int, sampled_token_ids: torch.Tensor,
             discard_sampled_tokens_req_indices: np.ndarray) -> list[list[int]]:
-        # Returns valid sampled tokens in a list of lists based on
-        # max gen length and discard indices
+        """
+        Returns valid sampled tokens in a list of lists based on max gen length and discard indices
+
+        Parameters:
+        ----------
+        - max_gen_len: Maximum length of the generated tokens
+        - sampled_token_ids: Tensor of sampled token IDs
+        - discard_sampled_tokens_req_indices: Indices of requests that should not be sampled
+        """
+
         if max_gen_len == 1:
             # No spec decode tokens.
             valid_sampled_token_ids = sampled_token_ids.tolist()
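
A rough sketch of the behavior the new docstring describes, not the exact gpu_model_runner implementation: trim each request's sampled tokens to the valid entries and drop tokens for discarded requests. The -1 sentinel below is an assumption made purely for illustration.

import numpy as np
import torch

def get_valid_sampled_token_ids_sketch(
        max_gen_len: int, sampled_token_ids: torch.Tensor,
        discard_sampled_tokens_req_indices: np.ndarray) -> list[list[int]]:
    if max_gen_len == 1:
        # No spec decode tokens: one sampled token per request.
        valid = sampled_token_ids.tolist()
    else:
        # With spec decode, rows are padded up to max_gen_len; here rejected or
        # padded positions are assumed to hold -1 and are stripped.
        valid = [[t for t in row if t != -1] for row in sampled_token_ids.tolist()]
    for idx in discard_sampled_tokens_req_indices:
        valid[int(idx)] = []  # requests whose samples should be discarded keep nothing
    return valid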
