 
 from torch.distributions.utils import lazy_property, logits_to_probs, probs_to_logits
 
-__all__ = ["OneHotCategorical", "MaskedCategorical", "Ordinal", "OneHotOrdinal"]
+__all__ = ["OneHotCategorical", "MaskedCategorical", "Ordinal", "OneHotOrdinal", "LLMMaskedCategorical"]
 
 
 def _treat_categorical_params(
@@ -51,8 +51,8 @@ def wrapped(_self, *args, **kwargs):
 
 
 class ReparamGradientStrategy(Enum):
-    PassThrough: Any = 1
-    RelaxedOneHot: Any = 2
+    PassThrough = 1
+    RelaxedOneHot = 2
 
 
 class OneHotCategorical(D.Categorical):
@@ -105,7 +105,11 @@ def __init__(
         probs = _treat_categorical_params(probs)
         self.grad_method = grad_method
         super().__init__(probs=probs, logits=logits, **kwargs)
-        self.num_samples = self._param.shape[-1]
+        # Get num_samples from logits or probs shape
+        if logits is not None:
+            self.num_samples = logits.shape[-1]
+        else:
+            self.num_samples = probs.shape[-1]
 
     def log_prob(self, value: torch.Tensor) -> torch.Tensor:
         return super().log_prob(value.argmax(dim=-1))
@@ -381,7 +385,9 @@ def _mask_logits(
             return logits
 
         if not sparse_mask:
-            return logits.masked_fill(~expand_as_right(mask, logits), neg_inf)
+            # Use a large negative value instead of -inf to avoid numerical issues
+            large_neg = torch.finfo(logits.dtype).min
+            return logits.masked_fill(~expand_as_right(mask, logits), large_neg)
 
         if padding_value is not None:
             padding_mask = mask == padding_value
@@ -390,7 +396,8 @@ def _mask_logits(
                 mask = mask.masked_fill(padding_mask, 0)
         logits = logits.gather(dim=-1, index=mask)
         if padding_value is not None:
-            logits.masked_fill_(padding_mask, neg_inf)
+            large_neg = torch.finfo(logits.dtype).min
+            logits.masked_fill_(padding_mask, large_neg)
         return logits
 
     @property
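A quick aside on the change above (a sketch, not taken from the diff): one numerical issue that the dtype-minimum fill avoids is a row with no valid entries, which softmaxes to NaN under a -inf fill but stays finite under torch.finfo(dtype).min:

import torch

logits = torch.randn(4)
fully_masked = torch.zeros(4, dtype=torch.bool)  # no valid entry in this row

inf_fill = logits.masked_fill(~fully_masked, float("-inf"))
print(torch.softmax(inf_fill, dim=-1))     # tensor([nan, nan, nan, nan])

finite_fill = logits.masked_fill(~fully_masked, torch.finfo(logits.dtype).min)
print(torch.softmax(finite_fill, dim=-1))  # tensor([0.2500, 0.2500, 0.2500, 0.2500])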
@@ -658,3 +665,147 @@ def _generate_ordinal_logits(scores: torch.Tensor) -> torch.Tensor:
     )
 
     return larger_than_log_probs + smaller_than_log_probs
+
+
+class LLMMaskedCategorical(D.Categorical):
+    """LLM-optimized masked categorical distribution.
+
+    This class provides a more memory-efficient approach for LLM training by:
+    1. Using ignore_index=-100 for log_prob computation (no masking overhead)
+    2. Using traditional masking for sampling operations
+
+    This is particularly beneficial for large vocabulary sizes, where masking
+    all logits can be memory-intensive.
+
+    Args:
+        logits (torch.Tensor): event log probabilities (unnormalized).
+        mask (torch.Tensor): boolean mask indicating valid positions.
+        ignore_index (int, optional): index to ignore in log_prob computation.
+            Defaults to -100.
+
+    Examples:
+        >>> logits = torch.randn(2, 10, 50000)  # batch=2, seq_len=10, vocab=50000
+        >>> mask = torch.ones(2, 10, dtype=torch.bool)
+        >>> mask[0, :5] = False  # mask first 5 tokens of first sequence
+        >>> dist = LLMMaskedCategorical(logits=logits, mask=mask)
+        >>>
+        >>> # Efficient log_prob computation (no masking overhead)
+        >>> tokens = torch.randint(0, 50000, (2, 10))
+        >>> tokens[0, :5] = -100  # set masked positions to ignore_index
+        >>> log_probs = dist.log_prob(tokens)
+        >>>
+        >>> # Sampling still uses masking for correctness
+        >>> samples = dist.sample()
+    """
+
+    def __init__(
+        self,
+        logits: torch.Tensor,
+        mask: torch.Tensor,
+        ignore_index: int = -100,
+        **kwargs,
+    ) -> None:
+        self._original_logits = logits
+        self._mask = mask
+        self.ignore_index = ignore_index
+
+        # Create masked logits for sampling (only when needed)
+        self._masked_logits = None
+        self._masked_dist = None
+
+        # Initialize parent with original logits
+        super().__init__(logits=logits, **kwargs)
+
+    @property
+    def _sampling_logits(self):
+        """Get masked logits for sampling operations."""
+        if self._masked_logits is None:
+            # Only create masked logits when needed for sampling
+            large_neg = torch.finfo(self._original_logits.dtype).min
+            self._masked_logits = self._original_logits.masked_fill(
+                ~expand_as_right(self._mask, self._original_logits), large_neg
+            )
+        return self._masked_logits
+
+    @property
+    def _sampling_dist(self):
+        """Get masked distribution for sampling operations."""
+        if self._masked_dist is None:
+            self._masked_dist = D.Categorical(logits=self._sampling_logits)
+        return self._masked_dist
+
+    def log_prob(self, value: torch.Tensor) -> torch.Tensor:
+        """Compute log probabilities using the ignore_index approach.
+
+        This is memory-efficient as it doesn't require masking the logits.
+        The value tensor should use ignore_index for masked positions.
+        """
+        # Use cross_entropy with ignore_index for efficiency
+        if value.ndim > 1:
+            # Reshape for cross_entropy: (batch, seq_len, vocab) -> (batch*seq_len, vocab)
+            logits_flat = self._original_logits.view(-1, self._original_logits.size(-1))
+            value_flat = value.view(-1)
+
+            # Per-token cross_entropy; ignore_index positions contribute zero
+            log_probs_flat = -F.cross_entropy(
+                logits_flat,
+                value_flat,
+                reduction="none",
+                ignore_index=self.ignore_index,
+            )
+
+            # Reshape back
+            log_probs = log_probs_flat.view_as(value)
+        else:
+            log_probs = -F.cross_entropy(
+                self._original_logits,
+                value,
+                reduction="none",
+                ignore_index=self.ignore_index,
+            )
+
+        return log_probs
+
+    def sample(self, sample_shape: torch.Size | Sequence[int] | None = None) -> torch.Tensor:
+        """Sample from the distribution using masked logits."""
+        if sample_shape is None:
+            sample_shape = torch.Size()
+        return self._sampling_dist.sample(sample_shape)
+
+    def rsample(self, sample_shape: torch.Size | Sequence | None = None) -> torch.Tensor:
+        """Reparameterized sampling using masked logits."""
+        # This would need to be implemented based on the specific reparameterization strategy.
+        # For now, fall back to regular sampling.
+        return self.sample(sample_shape)
+
+    @property
+    def mode(self) -> torch.Tensor:
+        """Get the mode using masked logits."""
+        masked_logits = self._sampling_logits
+        return (masked_logits == masked_logits.max(-1, True)[0]).to(torch.long)
+
+    def entropy(self) -> torch.Tensor:
+        """Compute entropy using masked logits."""
+        return self._sampling_dist.entropy()
+
+    def clear_cache(self):
+        """Clear cached masked tensors to free memory."""
+        self._masked_logits = None
+        self._masked_dist = None
+
+    @property
+    def mask(self) -> torch.Tensor:
+        """Get the mask."""
+        return self._mask
+
+    @property
+    def logits(self) -> torch.Tensor:
+        """Get the original (unnormalized) logits."""
+        return self._original_logits
+
+    @logits.setter
+    def logits(self, value: torch.Tensor) -> None:
+        # D.Categorical.__init__ assigns normalized logits to ``self.logits``; accept
+        # that write here so the getter above keeps exposing the logits passed at
+        # construction time.
+        self._normalized_logits = value
+
+    @property
+    def masked_logits(self) -> torch.Tensor:
+        """Get the masked logits for sampling operations."""
+        return self._sampling_logits
+
+    @property
+    def masked_dist(self) -> D.Categorical:
+        """Get the masked distribution for sampling operations."""
+        return self._sampling_dist
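A usage sketch of the class added above (not part of the diff; shapes, variable names, and the loss reduction are illustrative assumptions):

import torch

vocab, batch, seq = 128, 2, 6
logits = torch.randn(batch, seq, vocab, requires_grad=True)
valid = torch.ones(batch, seq, dtype=torch.bool)
valid[0, :3] = False  # e.g. prompt/padding positions excluded from the loss

dist = LLMMaskedCategorical(logits=logits, mask=valid)

# log_prob path: no masked copy of the (batch, seq, vocab) logits is materialized;
# excluded positions carry ignore_index (-100 by default) in the targets
targets = torch.randint(0, vocab, (batch, seq)).masked_fill(~valid, dist.ignore_index)
log_probs = dist.log_prob(targets)            # zeros at ignored positions
loss = -log_probs.sum() / valid.sum()
loss.backward()

# sampling / entropy path: masked logits are built lazily, then can be dropped
samples = dist.sample()
ent = dist.entropy()
dist.clear_cache()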