Commit 39772aa

amend

1 parent 44e70e7 commit 39772aa

2 files changed: +6, -1 lines changed

torchrl/envs/llm/transforms/kl.py

Lines changed: 2 additions & 1 deletion

@@ -1177,14 +1177,15 @@ def _step(
                     r - self.coeff * k.unsqueeze(-1)
                     for r, k in _zip_strict(reward, kl)
                 ]
+                next_tensordict.set("reward", torch.nested.as_nested_tensor(reward, layout=torch.strided))
             else:
                 if reward.ndim != kl.ndim + 1:
                     raise ValueError(
                         f"The rewards have shape {reward.shape} but the kl has shape {kl.shape}. "
                         f"The rewards should have one more dimension than the KL."
                     )
                 reward = reward - self.coeff * kl.unsqueeze(-1)
-            next_tensordict.set("reward", reward)
+                next_tensordict.set("reward", reward)
 
         return next_tensordict
 
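
The added line re-packs the per-sample, KL-shaped rewards into a single nested tensor before writing them back under the "reward" key. Below is a minimal sketch of that packing step, not the transform itself: the coefficient and shapes are made up for illustration, and plain zip stands in for torchrl's internal _zip_strict.

import torch

# Illustrative ragged batch: two samples with different sequence lengths.
coeff = 0.1  # stand-in for self.coeff
reward = [torch.randn(5, 1), torch.randn(3, 1)]  # per-sample (T_i, 1) rewards
kl = [torch.randn(5), torch.randn(3)]            # per-sample (T_i,) KL estimates

# Subtract the scaled KL from each sample's reward, as in the list branch above.
shaped = [r - coeff * k.unsqueeze(-1) for r, k in zip(reward, kl)]

# Re-pack the ragged list into one nested tensor (strided layout, as in the diff)
# so it can be stored under a single "reward" entry.
nested_reward = torch.nested.as_nested_tensor(shaped, layout=torch.strided)
print(nested_reward.is_nested)  # True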

torchrl/modules/distributions/discrete.py

Lines changed: 4 additions & 0 deletions

@@ -369,6 +369,10 @@ def log_prob(self, value: torch.Tensor) -> torch.Tensor:
         if logits.ndim > 2:
             # Bring channels in 2nd dim
             logits = logits.transpose(-1, 1)
+            if logits.ndim <= idx.ndim:
+                logits = logits.expand(idx.shape + logits.shape)
+            print(f"logits: {logits.shape}")
+            print(f"idx: {idx.shape}")
             ret = -torch.nn.functional.cross_entropy(logits, idx, reduce=False)
         else:
             ret = super().log_prob(idx)
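
For context, torch.nn.functional.cross_entropy expects the class dimension in position 1, which is why the logits are transposed (and, in the new branch, broadcast to match the index shape) before the call. Below is a minimal sketch with illustrative shapes only, using the non-deprecated reduction="none" where the diff uses the older reduce=False.

import torch
import torch.nn.functional as F

# Illustrative shapes: (batch, time, classes) logits and (batch, time) class indices.
batch, time, num_classes = 4, 6, 10
logits = torch.randn(batch, time, num_classes)
idx = torch.randint(num_classes, (batch, time))

# Bring channels into the 2nd dim, as in the diff: (B, T, C) -> (B, C, T).
logits_ct = logits.transpose(-1, 1)

# Per-element log-probability of each index (negative cross entropy, no reduction).
log_prob = -F.cross_entropy(logits_ct, idx, reduction="none")
print(log_prob.shape)  # torch.Size([4, 6])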
