[Environment] Fix lib CI failures

Vincent Moens · Vincent Moens · commit 4e310ddff4e4 · 2025-04-29T10:53:46.000+01:00
ghstack-source-id: c046f5d Pull-Request-resolved: #2923
diff --git a/.github/scripts/td_script.sh b/.github/scripts/td_script.sh
@@ -8,15 +8,31 @@ ARCH=${ARCH:-}  # This sets ARCH to an empty string if it's not defined
 
 if pip list | grep -q torch; then
     echo "Torch is installed."
+
     ${CONDA_RUN} pip install "pybind11[global]"
-    ${CONDA_RUN} conda install anaconda::cmake -y
+
+    if conda list cmake | grep -q 'cmake'; then
+        echo "CMake is already installed."
+    else
+        echo "CMake is not installed. Installing now..."
+        ${CONDA_RUN} conda install anaconda::cmake -y --no-update-deps
+    fi
+
     ${CONDA_RUN} pip install git+https://github.com/pytorch/tensordict.git -U --no-deps
 elif [[ -n "${SMOKE_TEST_SCRIPT:-}" ]]; then
     ${CONDA_RUN} ${PIP_INSTALL_TORCH}
     #    TODO: revert when nightlies of tensordict are fixed
     #    if [[ "$ARCH" == "aarch64" ]]; then
+
     ${CONDA_RUN} pip install "pybind11[global]"
-    ${CONDA_RUN} conda install anaconda::cmake -y
+
+    if conda list cmake | grep -q 'cmake'; then
+        echo "CMake is already installed."
+    else
+        echo "CMake is not installed. Installing now..."
+        ${CONDA_RUN} conda install anaconda::cmake -y --no-update-deps
+    fi
+
     ${CONDA_RUN} pip install git+https://github.com/pytorch/tensordict.git -U --no-deps
 else
     echo "Torch is not installed - tensordict will be installed later."
diff --git a/.github/unittest/linux_libs/scripts_gym/batch_scripts.sh b/.github/unittest/linux_libs/scripts_gym/batch_scripts.sh
@@ -15,7 +15,7 @@ conda activate ./env
 $DIR/install.sh
 
 # Extracted from run_test.sh to run once.
-apt-get update && apt-get install -y git wget libglew-dev libx11-dev x11proto-dev g++ cmake
+apt-get update && apt-get install -y git wget libglew-dev libx11-dev x11proto-dev g++
 
 # solves "'extras_require' must be a dictionary"
 pip install setuptools==65.3.0
diff --git a/.github/unittest/linux_libs/scripts_gym/install.sh b/.github/unittest/linux_libs/scripts_gym/install.sh
@@ -4,8 +4,7 @@ unset PYTORCH_VERSION
 # For unittest, nightly PyTorch is used as the following section,
 # so no need to set PYTORCH_VERSION.
 # In fact, keeping PYTORCH_VERSION forces us to hardcode PyTorch version in config.
-apt-get update && apt-get install -y git wget gcc g++ cmake
-
+apt-get update && apt-get install -y git wget gcc g++
 set -e
 set -v
 
@@ -48,6 +47,8 @@ pip install -U charset-normalizer
 
 # install tensordict
 if [[ "$RELEASE" == 0 ]]; then
+  conda install "anaconda::cmake>=3.22" -y
+  pip3 install "pybind11[global]"
   pip3 install git+https://github.com/pytorch/tensordict.git
 else
   pip3 install tensordict
diff --git a/.github/unittest/linux_libs/scripts_gym/run_test.sh b/.github/unittest/linux_libs/scripts_gym/run_test.sh
@@ -21,6 +21,7 @@ export MKL_THREADING_LAYER=GNU
 python .github/unittest/helpers/coverage_run_parallel.py -m pytest test/smoke_test.py -v --durations 200
 python .github/unittest/helpers/coverage_run_parallel.py -m pytest test/smoke_test_deps.py -v --durations 200 -k 'test_gym'
 
+unset LD_PRELOAD
 export DISPLAY=:99
 Xvfb :99 -screen 0 1400x900x24 > /dev/null 2>&1 &
 python .github/unittest/helpers/coverage_run_parallel.py -m pytest test/test_libs.py --instafail -v --durations 200 -k "gym and not isaac" --error-for-skips --mp_fork
diff --git a/.github/unittest/linux_libs/scripts_gym/setup_env.sh b/.github/unittest/linux_libs/scripts_gym/setup_env.sh
@@ -9,7 +9,11 @@ set -e
 
 this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 # Avoid error: "fatal: unsafe repository"
-apt-get update && apt-get install -y git wget gcc g++ cmake
+apt-get update && apt-get install -y git wget gcc g++
+
+apt-get install -y libglfw3 libgl1-mesa-glx libosmesa6 libglew-dev libsdl2-dev libsdl2-2.0-0
+apt-get install -y libegl-dev libegl
+apt-get install -y libglvnd0 libgl1 libglx0 libegl1 libgles2 xvfb
 
 git config --global --add safe.directory '*'
 root_dir="$(git rev-parse --show-toplevel)"
@@ -69,6 +73,7 @@ printf "* Installing dependencies (except PyTorch)\n"
 echo "  - python=${PYTHON_VERSION}" >> "${this_dir}/environment.yml"
 cat "${this_dir}/environment.yml"
 
+
 export MUJOCO_GL=egl
 conda env config vars set \
   MAX_IDLE_COUNT=1000 \
diff --git a/.github/workflows/test-linux-libs.yml b/.github/workflows/test-linux-libs.yml
@@ -206,7 +206,7 @@ jobs:
       runner: "linux.g5.4xlarge.nvidia.gpu"
       # gpu-arch-type: "cuda"
       # gpu-arch-version: "11.6"
-      docker-image: "nvidia/cudagl:11.4.0-base"
+      docker-image: "nvidia/cuda:12.4.0-devel-ubuntu22.04"
       timeout: 120
       script: |
         if [[ "${{ github.ref }}" =~ release/* ]]; then
diff --git a/sota-implementations/a2c/utils_atari.py b/sota-implementations/a2c/utils_atari.py
@@ -74,7 +74,6 @@ def make_parallel_env(env_name, num_envs, device, gym_backend, is_test=False):
             lambda: make_base_env(env_name, gym_backend=gym_backend, is_test=is_test),
         ),
         serial_for_single=True,
-        gym_backend=gym_backend,
         device=device,
     )
     env = TransformedEnv(env)
diff --git a/torchrl/data/datasets/atari_dqn.py b/torchrl/data/datasets/atari_dqn.py
@@ -508,6 +508,12 @@ def _download_and_preproc(self):
             if not os.listdir(tempdir):
                 os.makedirs(tempdir, exist_ok=True)
                 # get the list of runs
+                try:
+                    subprocess.run(
+                        ["gsutil", "version"], check=True, capture_output=True
+                    )
+                except (subprocess.CalledProcessError, FileNotFoundError):
+                    raise RuntimeError("gsutil is not installed or not found in PATH.")
                 command = f"gsutil -m ls -R gs://atari-replay-datasets/dqn/{self.dataset_id}/replay_logs"
                 output = subprocess.run(
                     command, shell=True, capture_output=True
@@ -520,9 +526,7 @@ def _download_and_preproc(self):
                 self.remote_gz_files = self._list_runs(None, files)
                 remote_gz_files = list(self.remote_gz_files)
                 if not len(remote_gz_files):
-                    raise RuntimeError(
-                        "Could not load the file list. Did you install gsutil?"
-                    )
+                    raise RuntimeError("No files in file list.")
 
                 total_runs = remote_gz_files[-1]
                 if self.num_procs == 0:
diff --git a/torchrl/envs/libs/smacv2.py b/torchrl/envs/libs/smacv2.py
@@ -228,7 +228,7 @@ def _make_specs(self, env: "smacv2.env.StarCraft2Env") -> None:  # noqa: F821
             dtype=torch.bool,
             device=self.device,
         )
-        self.action_spec = self._make_action_spec()
+        self.full_action_spec = self._make_action_spec()
         self.observation_spec = self._make_observation_spec()
 
     def _init_env(self) -> None:
@@ -356,7 +356,7 @@ def _reset(
     def _step(self, tensordict: TensorDictBase) -> TensorDictBase:
         # perform actions
         action = tensordict.get(("agents", "action"))
-        action_np = self.action_spec.to_numpy(action)
+        action_np = self.full_action_spec[self.action_key].to_numpy(action)
 
         # Actions are validated by the environment.
         try:
diff --git a/torchrl/envs/transforms/transforms.py b/torchrl/envs/transforms/transforms.py
@@ -8798,29 +8798,43 @@ def __init__(
     def forward(self, tensordict: TensorDictBase) -> TensorDictBase:
         raise RuntimeError(FORWARD_NOT_IMPLEMENTED.format(type(self)))
 
+    @property
+    def action_spec(self):
+        """The container's single action spec, read from ``full_action_spec``.
+
+        Raises:
+            ValueError: if the container does not have exactly one action key,
+                or if the spec found is not an instance of ``ACCEPTED_SPECS``.
+        """
+        action_spec = self.container.full_action_spec
+        keys = self.container.action_keys
+        if len(keys) == 1:
+            action_spec = action_spec[keys[0]]
+        else:
+            raise ValueError(
+                f"Too many action keys for {self.__class__.__name__}: {keys=}"
+            )
+        if not isinstance(action_spec, self.ACCEPTED_SPECS):
+            raise ValueError(
+                self.SPEC_TYPE_ERROR.format(self.ACCEPTED_SPECS, type(action_spec))
+            )
+        return action_spec
+
     def _call(self, next_tensordict: TensorDictBase) -> TensorDictBase:
         parent = self.parent
         if parent is None:
             raise RuntimeError(
                 f"{type(self)}.parent cannot be None: make sure this transform is executed within an environment."
             )
         mask = next_tensordict.get(self.in_keys[1])
-        action_spec = self.container.action_spec
-        if not isinstance(action_spec, self.ACCEPTED_SPECS):
-            raise ValueError(
-                self.SPEC_TYPE_ERROR.format(self.ACCEPTED_SPECS, type(action_spec))
-            )
+        action_spec = self.action_spec
         action_spec.update_mask(mask.to(action_spec.device))
         return next_tensordict
 
     def _reset(
         self, tensordict: TensorDictBase, tensordict_reset: TensorDictBase
     ) -> TensorDictBase:
-        action_spec = self.container.action_spec
-        if not isinstance(action_spec, self.ACCEPTED_SPECS):
-            raise ValueError(
-                self.SPEC_TYPE_ERROR.format(self.ACCEPTED_SPECS, type(action_spec))
-            )
+        action_spec = self.action_spec
         mask = tensordict.get(self.in_keys[1], None)
         if mask is not None:
             mask = mask.to(action_spec.device)

Original file line number	Diff line number	Diff line change
`@@ -74,7 +74,6 @@ def make_parallel_env(env_name, num_envs, device, gym_backend, is_test=False):`
`74`	`74`	`lambda: make_base_env(env_name, gym_backend=gym_backend, is_test=is_test),`
`75`	`75`	`),`
`76`	`76`	`serial_for_single=True,`
`77`		`- gym_backend=gym_backend,`
`78`	`77`	`device=device,`
`79`	`78`	`)`
`80`	`79`	`env = TransformedEnv(env)`