pytorch
diff --git a/‎torchrl/envs/custom/chess.py
Lines changed: 147 additions & 43 deletions b/‎torchrl/envs/custom/chess.py
Lines changed: 147 additions & 43 deletions
@@ -6,12 +6,13 @@
 
 import importlib.util
 import io
+import pathlib
 from typing import Dict, Optional
 
 import torch
 from PIL import Image
 from tensordict import TensorDict, TensorDictBase
-from torchrl.data import Categorical, Composite, NonTensor, Unbounded
+from torchrl.data import Bounded, Categorical, Composite, NonTensor, Unbounded
 
 from torchrl.envs import EnvBase
 from torchrl.envs.common import _EnvPostInit
@@ -43,39 +44,65 @@ def __call__(cls, *args, **kwargs):
 class ChessEnv(EnvBase, metaclass=_HashMeta):
     """A chess environment that follows the TorchRL API.
 
+    This environment simulates a chess game using the `chess` library. It supports various state representations
+    and can be configured to include different types of observations such as SAN, FEN, PGN, and legal moves.
+
     Requires: the `chess` library. More info `here <https://python-chess.readthedocs.io/en/latest/>`__.
 
     Args:
         stateful (bool): Whether to keep track of the internal state of the board.
             If False, the state will be stored in the observation and passed back
             to the environment on each call. Default: ``True``.
+        include_san (bool): Whether to include SAN (Standard Algebraic Notation) in the observations. Default: ``False``.
+        include_fen (bool): Whether to include FEN (Forsyth-Edwards Notation) in the observations. Default: ``False``.
+        include_pgn (bool): Whether to include PGN (Portable Game Notation) in the observations. Default: ``False``.
+        include_legal_moves (bool): Whether to include legal moves in the observations. Default: ``False``.
+        include_hash (bool): Whether to include hash transformations in the environment. Default: ``False``.
+        pixels (bool): Whether to include pixel-based observations of the board. Default: ``False``.
 
-    .. note:: the action spec is a :class:`~torchrl.data.Categorical` spec with a ``-1`` shape.
-        Unless :meth:`~torchrl.data.Categorical.set_provisional_n` is called with the cardinality of the legal moves,
-        valid random actions cannot be taken. :meth:`~torchrl.envs.EnvBase.rand_action` has been adapted to account for
-        this behavior.
+    .. note:: The action spec is a :class:`~torchrl.data.Categorical` with a number of actions equal to the number of possible SAN moves.
+        The action space is structured as a categorical distribution over all possible SAN moves, with the legal moves
+        being a subset of this space. The environment uses a mask to ensure only legal moves are selected.
 
     Examples:
-        >>> env = ChessEnv()
+        >>> env = ChessEnv(include_fen=True, include_san=True, include_pgn=True, include_legal_moves=True)
         >>> r = env.reset()
         >>> env.rand_step(r)
         TensorDict(
             fields={
                 action: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.int64, is_shared=False),
                 done: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False),
                 fen: NonTensorData(data=rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1, batch_size=torch.Size([]), device=None),
-                hashing: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.int64, is_shared=False),
+                legal_moves: Tensor(shape=torch.Size([219]), device=cpu, dtype=torch.int64, is_shared=False),
                 next: TensorDict(
                     fields={
                         done: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False),
-                        fen: NonTensorData(data=rnbqkbnr/pppppppp/8/8/8/2N5/PPPPPPPP/R1BQKBNR b KQkq - 1 1, batch_size=torch.Size([]), device=None),
-                        hashing: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.int64, is_shared=False),
-                        reward: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.int32, is_shared=False),
+                        fen: NonTensorData(data=rnbqkbnr/pppppppp/8/8/8/1P6/P1PPPPPP/RNBQKBNR b KQkq - 0 1, batch_size=torch.Size([]), device=None),
+                        legal_moves: Tensor(shape=torch.Size([219]), device=cpu, dtype=torch.int64, is_shared=False),
+                        pgn: NonTensorData(data=[Event "?"]
+                        [Site "?"]
+                        [Date "????.??.??"]
+                        [Round "?"]
+                        [White "?"]
+                        [Black "?"]
+                        [Result "*"]
+                        1. b3 *, batch_size=torch.Size([]), device=None),
+                        reward: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, is_shared=False),
+                        san: NonTensorData(data=b3, batch_size=torch.Size([]), device=None),
                         terminated: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False),
                         turn: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.bool, is_shared=False)},
                     batch_size=torch.Size([]),
                     device=None,
                     is_shared=False),
+                pgn: NonTensorData(data=[Event "?"]
+                [Site "?"]
+                [Date "????.??.??"]
+                [Round "?"]
+                [White "?"]
+                [Black "?"]
+                [Result "*"]
+                *, batch_size=torch.Size([]), device=None),
+                san: NonTensorData(data=[SAN][START], batch_size=torch.Size([]), device=None),
                 terminated: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False),
                 turn: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.bool, is_shared=False)},
             batch_size=torch.Size([]),
@@ -84,30 +111,46 @@ class ChessEnv(EnvBase, metaclass=_HashMeta):
         >>> env.rollout(1000)
         TensorDict(
             fields={
-                action: Tensor(shape=torch.Size([322]), device=cpu, dtype=torch.int64, is_shared=False),
-                done: Tensor(shape=torch.Size([322, 1]), device=cpu, dtype=torch.bool, is_shared=False),
+                action: Tensor(shape=torch.Size([352]), device=cpu, dtype=torch.int64, is_shared=False),
+                done: Tensor(shape=torch.Size([352, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                 fen: NonTensorStack(
                     ['rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQ...,
-                    batch_size=torch.Size([322]),
+                    batch_size=torch.Size([352]),
                     device=None),
-                hashing: Tensor(shape=torch.Size([322]), device=cpu, dtype=torch.int64, is_shared=False),
+                legal_moves: Tensor(shape=torch.Size([352, 219]), device=cpu, dtype=torch.int64, is_shared=False),
                 next: TensorDict(
                     fields={
-                        done: Tensor(shape=torch.Size([322, 1]), device=cpu, dtype=torch.bool, is_shared=False),
+                        done: Tensor(shape=torch.Size([352, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                         fen: NonTensorStack(
-                            ['rnbqkbnr/pppppppp/8/8/2P5/8/PP1PPPPP/RNBQKBNR b ...,
-                            batch_size=torch.Size([322]),
+                            ['rnbqkbnr/pppppppp/8/8/8/N7/PPPPPPPP/R1BQKBNR b K...,
+                            batch_size=torch.Size([352]),
+                            device=None),
+                        legal_moves: Tensor(shape=torch.Size([352, 219]), device=cpu, dtype=torch.int64, is_shared=False),
+                        pgn: NonTensorStack(
+                            ['[Event "?"]\n[Site "?"]\n[Date "????.??.??"]\n[R...,
+                            batch_size=torch.Size([352]),
                             device=None),
-                        hashing: Tensor(shape=torch.Size([322]), device=cpu, dtype=torch.int64, is_shared=False),
-                        reward: Tensor(shape=torch.Size([322, 1]), device=cpu, dtype=torch.int32, is_shared=False),
-                        terminated: Tensor(shape=torch.Size([322, 1]), device=cpu, dtype=torch.bool, is_shared=False),
-                        turn: Tensor(shape=torch.Size([322]), device=cpu, dtype=torch.bool, is_shared=False)},
-                    batch_size=torch.Size([322]),
+                        reward: Tensor(shape=torch.Size([352, 1]), device=cpu, dtype=torch.float32, is_shared=False),
+                        san: NonTensorStack(
+                            ['Na3', 'a5', 'Nb1', 'Nc6', 'a3', 'g6', 'd4', 'd6'...,
+                            batch_size=torch.Size([352]),
+                            device=None),
+                        terminated: Tensor(shape=torch.Size([352, 1]), device=cpu, dtype=torch.bool, is_shared=False),
+                        turn: Tensor(shape=torch.Size([352]), device=cpu, dtype=torch.bool, is_shared=False)},
+                    batch_size=torch.Size([352]),
                     device=None,
                     is_shared=False),
-                terminated: Tensor(shape=torch.Size([322, 1]), device=cpu, dtype=torch.bool, is_shared=False),
-                turn: Tensor(shape=torch.Size([322]), device=cpu, dtype=torch.bool, is_shared=False)},
-            batch_size=torch.Size([322]),
+                pgn: NonTensorStack(
+                    ['[Event "?"]\n[Site "?"]\n[Date "????.??.??"]\n[R...,
+                    batch_size=torch.Size([352]),
+                    device=None),
+                san: NonTensorStack(
+                    ['[SAN][START]', 'Na3', 'a5', 'Nb1', 'Nc6', 'a3', ...,
+                    batch_size=torch.Size([352]),
+                    device=None),
+                terminated: Tensor(shape=torch.Size([352, 1]), device=cpu, dtype=torch.bool, is_shared=False),
+                turn: Tensor(shape=torch.Size([352]), device=cpu, dtype=torch.bool, is_shared=False)},
+            batch_size=torch.Size([352]),
             device=None,
             is_shared=False)
 
@@ -136,13 +179,50 @@ def lib(cls):
             )
         return chess
 
+    # Class-level cache of the SAN vocabulary; filled in place on first access
+    # to ``san_moves`` and shared through the class afterwards.
+    _san_moves = []
+
+    @_classproperty
+    def san_moves(cls):
+        """Return the list of SAN move strings read from ``san_moves.txt``.
+
+        The file sits next to this module and is read only while the cache is
+        empty; later accesses return the cached list. Entries are the
+        newline-separated chunks of the file, and their position in the list
+        is what the environment uses as the action index.
+        """
+        if not cls._san_moves:
+            # Read-only text mode: "r+" would also request write access and
+            # fail when the package directory is not writable.
+            path = pathlib.Path(__file__).parent / "san_moves.txt"
+            with open(path, encoding="utf-8") as f:
+                cls._san_moves.extend(f.read().split("\n"))
+        return cls._san_moves
+
+    def _legal_moves_to_index(
+        self,
+        tensordict: TensorDictBase | None = None,
+        board: "chess.Board" | None = None,  # noqa: F821
+        return_mask: bool = False,
+        pad: bool = False,
+    ) -> torch.Tensor:
+        """Encode the legal moves of a position as indices into ``san_moves``.
+
+        Args:
+            tensordict: When the env is not stateful and this is given, the FEN
+                obtained through ``self._get_fen(tensordict)`` is loaded into
+                ``self.board`` and that board is used.
+            board: Board to read legal moves from. Ignored in the branch
+                above; falls back to ``self.board`` when ``None``.
+            return_mask: If ``True``, return a boolean tensor of length
+                ``len(self.san_moves)`` that is ``True`` at every legal move
+                index. Takes precedence over ``pad``.
+            pad: If ``True``, right-pad the index tensor with the value
+                ``len(self.san_moves)`` up to a total length of 219.
+
+        Returns:
+            An ``int64`` tensor of move indices (padded if requested), or a
+            boolean mask when ``return_mask`` is set.
+        """
+        if not self.stateful and tensordict is not None:
+            fen = self._get_fen(tensordict).data
+            self.board.set_fen(fen)
+            board = self.board
+        elif board is None:
+            board = self.board
+        # Go through the ``san_moves`` property rather than the raw
+        # ``_san_moves`` cache so the vocabulary is loaded even if nothing has
+        # touched it yet (an empty cache would make ``index`` raise).
+        san_moves = self.san_moves
+        indices = torch.tensor(
+            [san_moves.index(board.san(m)) for m in board.legal_moves],
+            dtype=torch.int64,
+        )
+        if return_mask:
+            return torch.zeros(len(san_moves), dtype=torch.bool).index_fill_(
+                0, indices, True
+            )
+        if pad:
+            # NOTE(review): this pads to 218 + 1 = 219 entries, while the
+            # ``legal_moves`` spec in ``__init__`` declares shape (218,) --
+            # confirm which length is intended.
+            indices = torch.nn.functional.pad(
+                indices, [0, 218 - indices.numel() + 1], value=len(san_moves)
+            )
+        return indices
+
     def __init__(
         self,
         *,
         stateful: bool = True,
         include_san: bool = False,
         include_fen: bool = False,
         include_pgn: bool = False,
+        include_legal_moves: bool = False,
         include_hash: bool = False,
         pixels: bool = False,
     ):
@@ -154,6 +234,14 @@ def __init__(
         self.include_san = include_san
         self.include_fen = include_fen
         self.include_pgn = include_pgn
+        self.include_legal_moves = include_legal_moves
+        if include_legal_moves:
+            # 218 max possible legal moves per chess board position
+            # https://www.stmintz.com/ccc/index.php?id=424966
+            # len(self.san_moves) is the padding value (one past the last valid SAN index)
+            self.full_observation_spec["legal_moves"] = Bounded(
+                0, 1 + len(self.san_moves), shape=(218,), dtype=torch.int64
+            )
         if include_san:
             self.full_observation_spec["san"] = NonTensor(shape=(), example_data="Nc6")
         if include_pgn:
@@ -186,18 +274,19 @@ def __init__(
             self.full_observation_spec["pixels"] = Unbounded(shape=())
 
         self.full_action_spec = Composite(
-            action=Categorical(n=-1, shape=(), dtype=torch.int64)
+            action=Categorical(n=len(self.san_moves), shape=(), dtype=torch.int64)
         )
         self.full_reward_spec = Composite(
-            reward=Unbounded(shape=(1,), dtype=torch.int32)
+            reward=Unbounded(shape=(1,), dtype=torch.float32)
         )
         # done spec generated automatically
         self.board = chess.Board()
         if self.stateful:
             self.action_spec.set_provisional_n(len(list(self.board.legal_moves)))
 
     def rand_action(self, tensordict: Optional[TensorDictBase] = None):
-        self._set_action_space(tensordict)
+        # Build a boolean mask over the SAN vocabulary marking the legal moves
+        # of the position resolved by ``_legal_moves_to_index`` and install it
+        # on the action spec before delegating to the parent implementation.
+        mask = self._legal_moves_to_index(tensordict, return_mask=True)
+        self.action_spec.update_mask(mask)
         return super().rand_action(tensordict)
 
     def _is_done(self, board):
@@ -208,10 +297,14 @@ def _reset(self, tensordict=None):
         pgn = None
         if tensordict is not None:
             if self.include_fen:
-                fen = self._get_fen(tensordict).data
+                fen = self._get_fen(tensordict)
+                if fen is not None:
+                    fen = fen.data
                 dest = tensordict.empty()
             if self.include_pgn:
-                fen = self._get_pgn(tensordict).data
+                pgn = self._get_pgn(tensordict)
+                if pgn is not None:
+                    pgn = pgn.data
                 dest = tensordict.empty()
         else:
             dest = TensorDict()
@@ -245,6 +338,9 @@ def _reset(self, tensordict=None):
                 pgn = self._board_to_pgn(self.board)
             dest.set("pgn", pgn)
         dest.set("turn", turn)
+        if self.include_legal_moves:
+            moves_idx = self._legal_moves_to_index(board=self.board, pad=True)
+            dest.set("legal_moves", moves_idx)
         if self.pixels:
             dest.set("pixels", self._get_tensor_image(board=self.board))
         return dest
@@ -296,8 +392,8 @@ def _set_action_space(self, tensordict: TensorDict | None = None):
 
     @classmethod
     def _pgn_to_board(
-        cls, pgn_string: str, board: "chess.Board" | None = None
-    ) -> "chess.Board":
+        cls, pgn_string: str, board: "chess.Board" | None = None  # noqa: F821
+    ) -> "chess.Board":  # noqa: F821
         pgn_io = io.StringIO(pgn_string)
         game = cls.lib.pgn.read_game(pgn_io)
         if board is None:
@@ -309,7 +405,7 @@ def _pgn_to_board(
         return board
 
     @classmethod
-    def _board_to_pgn(cls, board: "chess.Board") -> str:
+    def _board_to_pgn(cls, board: "chess.Board") -> str:  # noqa: F821
         # Create a new Game object
         game = cls.lib.pgn.Game()
 
@@ -376,11 +472,8 @@ def _step(self, tensordict):
                     "Not enough information to deduce the board. If stateful=False, include_pgn or include_fen must be True."
                 )
 
-        action = list(board.legal_moves)[action]
-        san = None
-        if self.include_san:
-            san = board.san(action)
-        board.push(action)
+        san = self.san_moves[action]
+        board.push_san(san)
 
         self._set_action_space()
 
@@ -398,22 +491,33 @@ def _step(self, tensordict):
         if san is not None:
             dest.set("san", san)
 
+        if self.include_legal_moves:
+            moves_idx = self._legal_moves_to_index(board=board, pad=True)
+            dest.set("legal_moves", moves_idx)
+
         turn = torch.tensor(board.turn)
+        done = self._is_done(board)
         if board.is_checkmate():
             # turn flips after every move, even if the game is over
-            winner = not turn
-            reward_val = 1 if winner == self.lib.WHITE else -1
+            # winner = not turn
+            reward_val = 1  # if winner == self.lib.WHITE else 0
+        elif done:
+            reward_val = 0.5
         else:
-            reward_val = 0
+            reward_val = 0.0
 
-        reward = torch.tensor([reward_val], dtype=torch.int32)
-        done = self._is_done(board)
+        reward = torch.tensor([reward_val], dtype=torch.float32)
         dest.set("reward", reward)
         dest.set("turn", turn)
         dest.set("done", [done])
         dest.set("terminated", [done])
         if self.pixels:
             dest.set("pixels", self._get_tensor_image(board=self.board))
+
+        if self.stateful:
+            # Make sure that rand_action will work next iteration
+            self._set_action_space()
+
         return dest
 
     def _set_seed(self, *args, **kwargs):