
Commit 75b07d2

[worker] fix grad norm (#423)
1 parent a4a4128 commit 75b07d2

File tree (5 files changed: +83 / -45 lines)

verl/protocol.py
verl/utils/seqlen_balancing.py
verl/workers/actor/dp_actor.py
verl/workers/critic/dp_critic.py
verl/workers/fsdp_workers.py

verl/protocol.py

Lines changed: 7 additions & 6 deletions
@@ -312,7 +312,7 @@ def from_dict(
             current_batch = tensor.shape[:num_batch_dims]
             assert batch_size == current_batch, (
                 f"Not all the tensor in tensors have the same batch size with batch_dims={num_batch_dims}. "
-                f"Got {pivot_key} has {batch_size}, {key} has {current_batch}"
+                f"Got {pivot_key} has {batch_size}, {key} has {current_batch}."
             )
 
         for key, value in non_tensors.items():
@@ -322,18 +322,19 @@ def from_dict(
         tensor_dict = TensorDict(source=tensors, batch_size=batch_size) if tensors else None
         return cls(batch=tensor_dict, non_tensor_batch=non_tensors, meta_info=meta_info)
 
-    def to(self, device: torch.device) -> "DataProto":
-        """move the batch to device
+    def to(self, device: torch.device, non_blocking: bool = True) -> "DataProto":
+        """Move the batch to device
 
         Args:
-            device (torch.device, str): torch device
+            device (torch.device): the device to move to.
+            non_blocking (bool, optional): whether to use non-blocking mode. Defaults to True.
 
         Returns:
-            DataProto: the current DataProto
+            DataProto: the current DataProto.
 
         """
         if self.batch is not None:
-            self.batch = self.batch.to(device)
+            self.batch = self.batch.to(device, non_blocking=non_blocking)
 
         return self
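
Note on the new `non_blocking` default: `DataProto.to` simply forwards the flag to the underlying `TensorDict.to`, so host-to-device copies can overlap with host-side work when the source tensors sit in pinned memory. A minimal sketch of the semantics with plain torch tensors (not code from this commit; variable names are illustrative):

import torch

# Pinned CPU memory is what makes a non_blocking copy genuinely asynchronous.
cpu_batch = torch.randn(4, 8).pin_memory() if torch.cuda.is_available() else torch.randn(4, 8)

if torch.cuda.is_available():
    gpu_batch = cpu_batch.to(torch.device("cuda"), non_blocking=True)  # returns immediately
    # ... other host-side work can run here while the copy is in flight ...
    torch.cuda.synchronize()  # make sure the copy has finished before relying on gpu_batch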

verl/utils/seqlen_balancing.py

Lines changed: 56 additions & 14 deletions
@@ -15,7 +15,7 @@
 import copy
 import heapq
 from itertools import chain
-from typing import List, Tuple
+from typing import Dict, List, Optional, Tuple
 
 import torch
 from tensordict import TensorDict
@@ -150,7 +150,7 @@ def greedy_partition(seqlen_list: List[int], k_partitions: int, equal_size: bool
     return partitions
 
 
-def get_seqlen_balanced_partitions(seqlen_list: List[int], k_partitions: int, equal_size: bool):
+def get_seqlen_balanced_partitions(seqlen_list: List[int], k_partitions: int, equal_size: bool) -> List[List[int]]:
     """Get order of seq lengths to make partitions balanced, this is
     used in balacing sum of seqlength across dp ranks and microbatches.
@@ -161,8 +161,7 @@ def get_seqlen_balanced_partitions(seqlen_list: List[int], k_partitions: int, eq
             resulting number of partitions
         equal_size (bool):
             if True, number of items in each partitions must be equal.
-            if False, only consider balancing the sum, each partition can have
-            variable number of items
+            if False, only consider balancing the sum, each partition can have variable number of items
 
     Returns:
         partitions (List[List[int]]):
@@ -186,14 +185,28 @@ def _check_and_sort_partitions(partitions):
     return _check_and_sort_partitions(partitions)
 
 
-def log_seqlen_unbalance(seqlen_list: List[int], partitions: List[List[int]], prefix):
-    # add some metrics of seqlen sum on dp ranks
+def log_seqlen_unbalance(seqlen_list: List[int], partitions: List[List[int]], prefix: str) -> Dict[str, float]:
+    """
+    Calculate and log metrics related to sequence length imbalance before and after partitioning.
+
+    Args:
+        seqlen_list (List[int]): A list of sequence lengths for each item.
+        partitions (List[List[int]]): A list of partitions, where each inner list contains indices
+            from seqlen_list assigned to that partition.
+        prefix (str): A prefix to be added to each metric key in the returned dictionary.
+
+    Returns:
+        dict: A dictionary containing metrics related to sequence length imbalance.
+    """
+    # Get the number of partitions
     k_partition = len(partitions)
     # assert len(seqlen_list) % k_partition == 0
     batch_size = len(seqlen_list) // k_partition
     min_sum_seqlen = None
     max_sum_seqlen = None
     total_sum_seqlen = 0
+
+    # Iterate over each batch of sequence lengths
     for offset in range(0, len(seqlen_list), batch_size):
         cur_sum_seqlen = sum(seqlen_list[offset : offset + batch_size])
         if min_sum_seqlen is None or cur_sum_seqlen < min_sum_seqlen:
@@ -206,7 +219,7 @@ def log_seqlen_unbalance(seqlen_list: List[int], partitions: List[List[int]], pr
     for partition in partitions:
         cur_sum_seqlen_balanced = sum([seqlen_list[i] for i in partition])
         balanced_sum_seqlen_list.append(cur_sum_seqlen_balanced)
-    # print("balanced_sum_seqlen_list: ", balanced_sum_seqlen_list)
+
     min_sum_seqlen_balanced = min(balanced_sum_seqlen_list)
     max_sum_seqlen_balanced = max(balanced_sum_seqlen_list)
 
@@ -220,11 +233,13 @@ def log_seqlen_unbalance(seqlen_list: List[int], partitions: List[List[int]], pr
     }
 
 
-def ceildiv(a, b):
+def ceildiv(a: float, b: float) -> float:
     return -(a // -b)
 
 
-def rearrange_micro_batches(batch: TensorDict, max_token_len, dp_group=None):
+def rearrange_micro_batches(
+    batch: TensorDict, max_token_len: int, dp_group: Optional[dist.ProcessGroup] = None
+) -> Tuple[List[TensorDict], List[List[int]]]:
     """Split the batch into a list of micro_batches, where the max_token_len is smaller than max_token_len
     and the number of valid tokens in each micro batch is well balanced.
     """
@@ -253,7 +268,16 @@ def rearrange_micro_batches(batch: TensorDict, max_token_len, dp_group=None):
     return micro_batches, micro_bsz_idx
 
 
-def get_reverse_idx(idx_map):
+def get_reverse_idx(idx_map: List[int]) -> List[int]:
+    """
+    Build the inverse of an index mapping.
+
+    Args:
+        idx_map (Sequence[int]): Sequence where idx_map[i] = j.
+
+    Returns:
+        List[int]: Inverse mapping list such that output[j] = i for each i.
+    """
     reverse_idx_map = copy.deepcopy(idx_map)
 
     for i, idx in enumerate(idx_map):
@@ -263,20 +287,38 @@ def get_reverse_idx(idx_map):
 
 
 def prepare_dynamic_batch(data: DataProto, max_token_len: int) -> tuple[list[DataProto], list[list[int]]]:
+    """
+    Prepare a batch for dynamic batching.
+
+    Args:
+        data (DataProto): The input data.
+        max_token_len (int): The maximum token length for dynamic batching.
+
+    Returns:
+        Tuple[List[DataProto], List[List[int]]]: A tuple containing a list of DataProto objects
+            and a list of index lists.
+    """
     batch, batch_idx_list = rearrange_micro_batches(data.batch, max_token_len=max_token_len)
     micro_batches = []
     for i, batch_idx in enumerate(batch_idx_list):
         tensors = dict(batch[i])
-        non_tensors = {}
-        for key in data.non_tensor_batch.keys():
-            non_tensors[key] = [data.non_tensor_batch[key][idx] for idx in batch_idx]
-
+        non_tensors = {key: value[batch_idx] for key, value in data.non_tensor_batch.items()}
         micro_batches.append(DataProto.from_dict(tensors, non_tensors))
 
     return micro_batches, batch_idx_list
 
 
 def restore_dynamic_batch(data: torch.Tensor, batch_idx_list: List[List[int]]) -> torch.Tensor:
+    """
+    Restore a batch from dynamic batching.
+
+    Args:
+        data (torch.Tensor): The input data.
+        batch_idx_list (List[List[int]]): The list of index lists.
+
+    Returns:
+        torch.Tensor: The restored data.
+    """
     indices = list(chain.from_iterable(batch_idx_list))
     revert_indices = torch.tensor(get_reverse_idx(indices), dtype=torch.long)
     return data[revert_indices]
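
The docstrings added above describe an index round trip: `rearrange_micro_batches` permutes samples so that token counts are balanced across micro-batches, and `restore_dynamic_batch` undoes that permutation via `get_reverse_idx`. A small self-contained sketch of the inverse-index logic (it mirrors the functions above rather than importing them; the numbers are made up):

from itertools import chain

import torch


def reverse_idx(idx_map):
    # Inverse permutation: if idx_map[i] == j, then out[j] == i.
    out = [0] * len(idx_map)
    for i, j in enumerate(idx_map):
        out[j] = i
    return out


# Two micro-batches drawn from a batch of 4 samples (indices chosen for balance).
batch_idx_list = [[2, 0], [1, 3]]
outputs = torch.tensor([12, 10, 11, 13])  # per-sample results, in micro-batch order

indices = list(chain.from_iterable(batch_idx_list))            # [2, 0, 1, 3]
revert = torch.tensor(reverse_idx(indices), dtype=torch.long)  # [1, 2, 0, 3]
print(outputs[revert])                                         # tensor([10, 11, 12, 13]) -> original order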

verl/workers/actor/dp_actor.py

Lines changed: 5 additions & 8 deletions
@@ -195,7 +195,7 @@ def compute_log_prob(self, data: DataProto) -> torch.Tensor:
         self.actor_module.eval()
 
         temperature = data.meta_info["temperature"]
-        select_keys = ["responses", "input_ids", "attention_mask", "position_ids"]
+        select_keys = ["input_ids", "attention_mask", "position_ids", "responses"]
         non_tensor_select_keys = ["multi_modal_inputs"]
 
         data = data.select(select_keys, non_tensor_select_keys)
@@ -225,7 +225,7 @@ def update_policy(self, data: DataProto) -> Dict[str, Any]:
         self.actor_module.train()
 
         temperature = data.meta_info["temperature"]  # temperature must be in the data.meta_info to avoid slient error
-        select_keys = ["responses", "input_ids", "attention_mask", "position_ids"]
+        select_keys = ["input_ids", "attention_mask", "position_ids", "responses", "response_mask"]
         select_keys.extend(["old_log_probs", "ref_log_probs", "advantages"])
         non_tensor_select_keys = ["multi_modal_inputs"]
 
@@ -239,10 +239,8 @@ def update_policy(self, data: DataProto) -> Dict[str, Any]:
             mini_batches = tqdm(mini_batches, desc="Train mini-batches", position=1)
 
         for mini_batch in mini_batches:
-            response_length = mini_batch.batch["responses"].size(-1)
-            response_mask = mini_batch.batch["attention_mask"][:, -response_length:]
-            total_response_tokens = torch.sum(response_mask)
-            dist.all_reduce(torch.sum(response_mask), op=dist.ReduceOp.SUM)
+            total_response_tokens = torch.sum(mini_batch.batch["response_mask"])
+            dist.all_reduce(total_response_tokens, op=dist.ReduceOp.SUM)
 
             if self.config.dynamic_batching:
                 max_input_len = mini_batch.batch["input_ids"].size(-1)
@@ -256,8 +254,7 @@ def update_policy(self, data: DataProto) -> Dict[str, Any]:
 
             for micro_batch in micro_batches:
                 model_inputs = {**micro_batch.batch, **micro_batch.non_tensor_batch}
-                response_length = model_inputs["responses"].size(-1)
-                response_mask = model_inputs["attention_mask"][:, -response_length:]
+                response_mask = model_inputs["response_mask"]
                 old_log_probs = model_inputs["old_log_probs"]
                 advantages = model_inputs["advantages"]
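
This is the grad-norm fix the commit title refers to: the deleted code all-reduced a freshly created `torch.sum(response_mask)` and discarded the result, so `total_response_tokens` stayed rank-local and the loss normalization (and hence the reported grad norm) differed across data-parallel ranks. `dist.all_reduce` reduces its argument in place, so the replacement reduces the very tensor that is used afterwards. A minimal sketch of the corrected pattern (the helper name is mine; assumes an initialized process group when run distributed):

import torch
import torch.distributed as dist


def global_response_token_count(response_mask: torch.Tensor) -> torch.Tensor:
    total = torch.sum(response_mask)  # local count on this rank
    if dist.is_available() and dist.is_initialized():
        dist.all_reduce(total, op=dist.ReduceOp.SUM)  # in-place: total now holds the global count
    return total

# Buggy shape of the deleted lines: the reduced tensor is thrown away,
# so the value actually used stays local-only on every rank.
#   total = torch.sum(response_mask)
#   dist.all_reduce(torch.sum(response_mask), op=dist.ReduceOp.SUM)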

verl/workers/critic/dp_critic.py

Lines changed: 7 additions & 10 deletions
@@ -148,7 +148,7 @@ def _optimizer_step(self) -> torch.Tensor:
     def compute_values(self, data: DataProto) -> torch.Tensor:
         self.critic_module.eval()
 
-        select_keys = ["responses", "input_ids", "attention_mask", "position_ids"]
+        select_keys = ["input_ids", "attention_mask", "position_ids", "responses", "response_mask"]
         non_tensor_select_keys = ["multi_modal_inputs"]
 
         data = data.select(select_keys, non_tensor_select_keys)
@@ -172,14 +172,14 @@ def compute_values(self, data: DataProto) -> torch.Tensor:
         if self.config.dynamic_batching:
            values = restore_dynamic_batch(values, batch_idx_list)
 
-        response_length = data.batch["responses"].size(1)
-        values = values * data.batch["attention_mask"][:, -response_length:]  # only action tokens have values
+        values = values * data.batch["response_mask"]  # only action tokens have values
         return values
 
     def update_critic(self, data: DataProto) -> Dict[str, Any]:
         self.critic_module.train()
 
-        select_keys = ["input_ids", "responses", "attention_mask", "position_ids", "values", "returns"]
+        select_keys = ["input_ids", "attention_mask", "position_ids", "responses", "response_mask"]
+        select_keys.extend(["values", "returns"])
         non_tensor_select_keys = ["multi_modal_inputs"]
 
         # Split to make minibatch iterator for updating the actor
@@ -192,10 +192,8 @@ def update_critic(self, data: DataProto) -> Dict[str, Any]:
             mini_batches = tqdm(mini_batches, desc="Train mini-batches", position=1)
 
         for mini_batch in mini_batches:
-            response_length = mini_batch.batch["responses"].size(-1)
-            response_mask = mini_batch.batch["attention_mask"][:, -response_length:]
-            total_response_tokens = torch.sum(response_mask)
-            dist.all_reduce(torch.sum(response_mask), op=dist.ReduceOp.SUM)
+            total_response_tokens = torch.sum(mini_batch.batch["response_mask"])
+            dist.all_reduce(total_response_tokens, op=dist.ReduceOp.SUM)
 
             if self.config.dynamic_batching:
                 max_input_len = mini_batch.batch["input_ids"].size(-1)
@@ -209,8 +207,7 @@ def update_critic(self, data: DataProto) -> Dict[str, Any]:
 
             for micro_batch in micro_batches:
                 model_inputs = {**micro_batch.batch, **micro_batch.non_tensor_batch}
-                response_length = model_inputs["responses"].size(-1)
-                response_mask = model_inputs["attention_mask"][:, -response_length:]
+                response_mask = model_inputs["response_mask"]
                 values = model_inputs["values"]
                 returns = model_inputs["returns"]
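
The critic follows the same pattern as the actor: a precomputed "response_mask" entry is selected from the batch instead of being re-sliced out of attention_mask in every loop. The two are equivalent as long as the response tokens occupy the last `response_length` positions, which is exactly what the deleted slicing assumed; a tiny illustrative check (tensors made up for the example):

import torch

# Left-padded prompt (one pad + two prompt tokens) followed by a 4-token response slot with one trailing pad.
attention_mask = torch.tensor([[0, 1, 1, 1, 1, 1, 0]])
responses = torch.zeros(1, 4, dtype=torch.long)  # only its length matters here
response_mask = torch.tensor([[1, 1, 1, 0]])     # mask carried in the batch

response_length = responses.size(-1)
derived = attention_mask[:, -response_length:]   # what the deleted code computed each time
assert torch.equal(derived, response_mask)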

verl/workers/fsdp_workers.py

Lines changed: 8 additions & 7 deletions
@@ -453,7 +453,7 @@ def _process_multi_modal_inputs(self, data: DataProto):
         max_pixels = data.meta_info["max_pixels"]
         video_fps = data.meta_info["video_fps"]
         batch_multi_modal_inputs = []
-        for multi_modal_data in data.non_tensor_batch["multi_modal_data"]:
+        for multi_modal_data in data.non_tensor_batch["multi_modal_data"]:  # process multi modal data per sample
             images, videos = [], []
             if "images" in multi_modal_data:
                 for image in multi_modal_data["images"]:
@@ -468,16 +468,17 @@
                 # otherwise the batch features will be converted to dict keys
                 # see https://github.com/hiyouga/EasyR1/pull/339
                 multi_modal_inputs = dict(self.processor.image_processor(images=images, return_tensors="pt"))
-                multi_modal_inputs = {k: v.to(torch.cuda.current_device()) for k, v in multi_modal_inputs.items()}
-                batch_multi_modal_inputs.append(multi_modal_inputs)
             elif len(videos) != 0:
                 multi_modal_inputs = dict(
                     self.processor.image_processor(images=None, videos=videos, return_tensors="pt")
                 )
-                multi_modal_inputs = {k: v.to(torch.cuda.current_device()) for k, v in multi_modal_inputs.items()}
-                batch_multi_modal_inputs.append(multi_modal_inputs)
-            else:  # text-only data
-                batch_multi_modal_inputs.append({})
+            else:
+                multi_modal_inputs = {}
+
+            multi_modal_inputs = {
+                k: v.to(torch.cuda.current_device(), non_blocking=True) for k, v in multi_modal_inputs.items()
+            }
+            batch_multi_modal_inputs.append(multi_modal_inputs)
 
         self._cache["uid"] = data.non_tensor_batch["uid"]
         self._cache["multi_modal_inputs"] = np.array(batch_multi_modal_inputs, dtype=object)

0 commit comments