pytorch
diff --git a/examples/agents/ppo-chess.py b/examples/agents/ppo-chess.py
Lines changed: 118 additions & 0 deletions
diff --git a/test/mocking_classes.py b/test/mocking_classes.py
Lines changed: 4 additions & 1 deletion
diff --git a/test/test_specs.py b/test/test_specs.py
Lines changed: 21 additions & 8 deletions
@@ -0,0 +1,118 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import tensordict.nn
+import torch
+import tqdm
+from tensordict.nn import TensorDictSequential as TDSeq, TensorDictModule as TDMod, \
+    ProbabilisticTensorDictModule as TDProb, ProbabilisticTensorDictSequential as TDProbSeq
+from torch import nn
+from torch.nn.utils import clip_grad_norm_
+from torch.optim import Adam
+
+from torchrl.collectors import SyncDataCollector
+
+from torchrl.envs import ChessEnv, Tokenizer
+from torchrl.modules import MLP
+from torchrl.modules.distributions import MaskedCategorical
+from torchrl.objectives import ClipPPOLoss
+from torchrl.objectives.value import GAE
+from torchrl.data import ReplayBuffer, LazyTensorStorage, SamplerWithoutReplacement
+
+# Ask tensordict to keep per-entry log-probabilities rather than aggregating
+# them. Instantiating the setter alone does not change the global flag; the
+# explicit `.set()` call is what applies it.
+tensordict.nn.set_composite_lp_aggregate(False).set()
+
+# Training hyper-parameters.
+num_epochs = 10  # optimisation passes over each collected batch
+batch_size = 256  # mini-batch size sampled from each replay buffer
+frames_per_batch = 2048  # frames gathered by the collector per iteration
+
+env = ChessEnv(include_legal_moves=True, include_fen=True)
+
+# tokenize the fen - assume max 70 elements
+transform = Tokenizer(in_keys=["fen"], out_keys=["fen_tokenized"], max_length=70)
+
+env = env.append_transform(transform)
+# Size of the discrete action space; reused below for the move embedding
+# and the legal-move mask.
+n = env.action_spec.n
+# Smoke test: print a rollout of at most 10000 steps taken without a policy.
+print(env.rollout(10000))
+
+# Lookup table for legal-move indices; it holds ``n + 1`` rows, i.e. one more
+# row than there are actions.
+embedding_moves = nn.Embedding(
+    num_embeddings=n + 1,
+    embedding_dim=64,
+)
+
+# Lookup table for the tokenized FEN, sized by the tokenizer vocabulary.
+embedding_fen = nn.Embedding(
+    num_embeddings=transform.tokenizer.vocab_size,
+    embedding_dim=64,
+)
+
+# Shared trunk: ``num_cells`` lists eight 512-unit layers, ReLU activations,
+# 512 output features.
+backbone = MLP(
+    out_features=512,
+    num_cells=[512 for _ in range(8)],
+    activation_class=nn.ReLU,
+)
+
+# Policy head: one logit per action, bias initialised to zero.
+actor_head = nn.Linear(512, env.action_spec.n)
+nn.init.zeros_(actor_head.bias)
+
+# Value head: a single scalar output, bias initialised to zero.
+critic_head = nn.Linear(512, 1)
+nn.init.zeros_(critic_head.bias)
+
+# Builds a MaskedCategorical from "logits" and "mask", writes the sampled
+# "action" and also returns its log-probability.
+prob = TDProb(
+    in_keys=["logits", "mask"],
+    out_keys=["action"],
+    distribution_class=MaskedCategorical,
+    return_log_prob=True,
+)
+
+def make_mask(idx):
+    """Turn a tensor of move indices into a boolean mask over the ``n`` actions.
+
+    Args:
+        idx: integer index tensor; its last dimension enumerates indices in
+            ``[0, n]``.
+
+    Returns:
+        A boolean tensor shaped ``(*idx.shape[:-1], n)`` that is ``True`` at
+        every position listed in ``idx``.
+    """
+    # Allocate one extra trailing slot so that index ``n`` can be scattered
+    # safely; presumably ``n`` is the padding value of "legal_moves" -- TODO confirm.
+    padded_shape = (*idx.shape[:-1], n + 1)
+    mask = idx.new_zeros(padded_shape, dtype=torch.bool)
+    flags = torch.ones_like(idx, dtype=torch.bool)
+    mask.scatter_(-1, idx, flags)
+    # Drop the extra slot so the mask lines up with the action logits.
+    return mask[..., :-1]
+
+def _flatten_and_concat(*embeddings):
+    """Merge the last two dims of every input, then concatenate on the last dim."""
+    flattened = [emb.view(*emb.shape[:-2], -1) for emb in embeddings]
+    return torch.cat(flattened, dim=-1)
+
+
+# Policy pipeline: legal-move mask -> embeddings -> flat features -> trunk
+# -> logits -> masked categorical sampling.
+actor = TDProbSeq(
+    TDMod(make_mask, in_keys=["legal_moves"], out_keys=["mask"]),
+    TDMod(
+        embedding_moves,
+        in_keys=["legal_moves"],
+        out_keys=["embedded_legal_moves"],
+    ),
+    TDMod(
+        embedding_fen,
+        in_keys=["fen_tokenized"],
+        out_keys=["embedded_fen"],
+    ),
+    TDMod(
+        _flatten_and_concat,
+        in_keys=["embedded_legal_moves", "embedded_fen"],
+        out_keys=["features"],
+    ),
+    TDMod(backbone, in_keys=["features"], out_keys=["hidden"]),
+    TDMod(actor_head, in_keys=["hidden"], out_keys=["logits"]),
+    prob,
+)
+# The critic only maps the trunk output "hidden" to "state_value"; it relies
+# on "hidden" having been written beforehand.
+critic = TDSeq(
+    TDMod(critic_head, in_keys=["hidden"], out_keys=["state_value"]),
+)
+
+
+# Run the policy for three steps and print the result. Presumably this also
+# materialises any lazily-initialised layers before the optimizer collects
+# the parameters -- TODO confirm.
+print(env.rollout(3, actor))
+# loss
+loss = ClipPPOLoss(actor, critic)
+
+optim = Adam(loss.parameters())
+
+# The value network reuses the actor modules up to the trunk (everything
+# except the policy head and the sampling module) followed by the critic.
+gae = GAE(
+    value_network=TDSeq(*actor[:-2], critic),
+    gamma=0.99,
+    lmbda=0.95,
+    shifted=True,
+)
+
+# Create a data collector
+collector = SyncDataCollector(
+    create_env_fn=env,
+    policy=actor,
+    frames_per_batch=frames_per_batch,
+    total_frames=1_000_000,
+)
+
+# One buffer per player, each sized for half of a collected batch and
+# sampled without replacement.
+replay_buffer0 = ReplayBuffer(
+    storage=LazyTensorStorage(max_size=collector.frames_per_batch // 2),
+    batch_size=batch_size,
+    sampler=SamplerWithoutReplacement(),
+)
+replay_buffer1 = ReplayBuffer(
+    storage=LazyTensorStorage(max_size=collector.frames_per_batch // 2),
+    batch_size=batch_size,
+    sampler=SamplerWithoutReplacement(),
+)
+
+# Number of mini-batches per epoch: each player's buffer holds half of a
+# collected batch. Loop-invariant, so it is computed once up front.
+n_iter = collector.frames_per_batch // (2 * batch_size)
+
+for data in tqdm.tqdm(collector):
+    data = data.filter_non_tensor_data()
+    print('data', data[0::2])
+    for i in range(num_epochs):
+        replay_buffer0.empty()
+        replay_buffer1.empty()
+        # Advantage estimation and buffer filling need no gradients.
+        with torch.no_grad():
+            # Even-indexed frames are treated as player 0, odd-indexed frames
+            # as player 1.
+            data0 = gae(data[0::2])
+            data1 = gae(data[1::2])
+            if i == 0:
+                # Both ratios share the same denominator: the number of done
+                # flags in the full batch, clamped to avoid dividing by zero.
+                n_done = data["next", "done"].sum().clamp_min(1e-6)
+                print('win rate for 0', data0["next", "reward"].sum() / n_done)
+                print('win rate for 1', data1["next", "reward"].sum() / n_done)
+
+            replay_buffer0.extend(data0)
+            replay_buffer1.extend(data1)
+
+        # strict=True raises if the two buffers yield a different number of
+        # mini-batches.
+        for (d0, d1) in tqdm.tqdm(zip(replay_buffer0, replay_buffer1, strict=True), total=n_iter):
+            # Average the two players' losses, then reduce all entries to a
+            # single scalar for the backward pass.
+            loss_vals = (loss(d0) + loss(d1)) / 2
+            loss_vals.sum(reduce=True).backward()
+            # Clip in place; the returned total norm is not needed here.
+            clip_grad_norm_(loss.parameters(), 100.0)
+            optim.step()
+            optim.zero_grad()
@@ -1070,17 +1070,20 @@ def _step(
 
 class CountingEnvWithString(CountingEnv):
     def __init__(self, *args, **kwargs):
+        # Pop the string-length bounds (defaults 30 and 4) before forwarding
+        # the remaining kwargs to the parent constructor; get_random_string
+        # reads them, so they must exist before it is called below.
+        self.max_size = kwargs.pop("max_size", 30)
+        self.min_size = kwargs.pop("min_size", 4)
         super().__init__(*args, **kwargs)
         self.observation_spec.set(
             "string",
             NonTensor(
                 shape=self.batch_size,
                 device=self.device,
+                # A freshly generated random string is passed as example data.
+                example_data=self.get_random_string(),
             ),
         )
 
     def get_random_string(self):
-        size = random.randint(4, 30)
+        # The length is drawn from [min_size, max_size], both ends inclusive
+        # (random.randint semantics); characters are lowercase ASCII letters.
+        size = random.randint(self.min_size, self.max_size)
         return "".join(random.choice(string.ascii_lowercase) for _ in range(size))
 
     def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase:
 
@@ -1402,12 +1402,13 @@ def test_multionehot(self, shape1, shape2):
         assert spec2.zero().shape == spec2.shape
 
     def test_non_tensor(self):
-        spec = NonTensor((3, 4), device="cpu")
+        # expand() must give the same spec for varargs and tuple shapes, and
+        # example_data takes part in the equality check.
+        spec = NonTensor((3, 4), device="cpu", example_data="example_data")
         assert (
             spec.expand(2, 3, 4)
             == spec.expand((2, 3, 4))
-            == NonTensor((2, 3, 4), device="cpu")
+            == NonTensor((2, 3, 4), device="cpu", example_data="example_data")
         )
+        # The expanded spec carries the original example_data.
+        assert spec.expand(2, 3, 4).example_data == "example_data"
 
     @pytest.mark.parametrize("shape1", [None, (), (5,)])
     @pytest.mark.parametrize("shape2", [(), (10,)])
@@ -1607,9 +1608,10 @@ def test_multionehot(
         assert spec is not spec.clone()
 
     def test_non_tensor(self):
-        spec = NonTensor(shape=(3, 4), device="cpu")
+        # clone() must return an equal but distinct spec.
+        spec = NonTensor(shape=(3, 4), device="cpu", example_data="example_data")
         assert spec.clone() == spec
         assert spec.clone() is not spec
+        # The clone carries the original example_data.
+        assert spec.clone().example_data == "example_data"
 
     @pytest.mark.parametrize("shape1", [None, (), (5,)])
     def test_onehot(
@@ -1840,9 +1842,10 @@ def test_multionehot(
             spec.unbind(-1)
 
     def test_non_tensor(self):
-        spec = NonTensor(shape=(3, 4), device="cpu")
+        # unbind() along dim 1 must match indexing that dim, as a new object.
+        spec = NonTensor(shape=(3, 4), device="cpu", example_data="example_data")
         assert spec.unbind(1)[0] == spec[:, 0]
         assert spec.unbind(1)[0] is not spec[:, 0]
+        # Each unbound spec carries the original example_data.
+        assert spec.unbind(1)[0].example_data == "example_data"
 
     @pytest.mark.parametrize("shape1", [(5,), (5, 6)])
     def test_onehot(
@@ -2001,8 +2004,9 @@ def test_multionehot(self, shape1, device):
         assert spec.to(device).device == device
 
     def test_non_tensor(self, device):
-        spec = NonTensor(shape=(3, 4), device="cpu")
+        # Moving the spec to another device must update its device.
+        spec = NonTensor(shape=(3, 4), device="cpu", example_data="example_data")
         assert spec.to(device).device == device
+        # The moved spec carries the original example_data.
+        assert spec.to(device).example_data == "example_data"
 
     @pytest.mark.parametrize("shape1", [(5,), (5, 6)])
     def test_onehot(self, shape1, device):
@@ -2262,13 +2266,14 @@ def test_stack_multionehot_zero(self, shape, stack_dim):
         assert r.shape == c.shape
 
     def test_stack_non_tensor(self, shape, stack_dim):
-        spec0 = NonTensor(shape=shape, device="cpu")
-        spec1 = NonTensor(shape=shape, device="cpu")
+        # Stack two specs that share the same example_data.
+        spec0 = NonTensor(shape=shape, device="cpu", example_data="example_data")
+        spec1 = NonTensor(shape=shape, device="cpu", example_data="example_data")
         new_spec = torch.stack([spec0, spec1], stack_dim)
         shape_insert = list(shape)
         shape_insert.insert(stack_dim, 2)
         assert new_spec.shape == torch.Size(shape_insert)
         assert new_spec.device == torch.device("cpu")
+        # The stacked spec exposes the shared example_data.
+        assert new_spec.example_data == "example_data"
 
     def test_stack_onehot(self, shape, stack_dim):
         n = 5
@@ -3642,10 +3647,18 @@ def test_expand(self):
 
 class TestNonTensorSpec:
     def test_sample(self):
-        nts = NonTensor(shape=(3, 4))
+        # one/rand/zero prepend the requested batch shape to the spec shape.
+        nts = NonTensor(shape=(3, 4), example_data="example_data")
         assert nts.one((2,)).shape == (2, 3, 4)
         assert nts.rand((2,)).shape == (2, 3, 4)
         assert nts.zero((2,)).shape == (2, 3, 4)
+        # All three generators return the spec's example_data as their data.
+        assert nts.one((2,)).data == "example_data"
+        assert nts.rand((2,)).data == "example_data"
+        assert nts.zero((2,)).data == "example_data"
+
+    def test_example_data_ineq(self):
+        """Specs that differ only in ``example_data`` must not compare equal."""
+        first = NonTensor(shape=(3, 4), example_data="example_data")
+        second = NonTensor(shape=(3, 4), example_data="example_data 2")
+        assert first != second
 
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="not cuda device")