Can't adapt torch_ant_ppo.py to use LSTM #324

ivanpanshin · 2025-06-04T15:43:06Z

ivanpanshin
Jun 4, 2025

Hi!

I'm trying to figure out how to train an agent with LSTM in order to account for observation history. Based on previous issues, I found out that standalone scripts is a good starting point.

I trained an Ant using torch_ant_ppo.py and am now trying to adapt it to use ppo_rnn.py

In short, I adapted LSTM model, but keep getting mismatch error

rnn_input = states.view(-1, self.sequence_length, states.shape[-1])  # (N, L, Hin): N=batch_size, L=sequence_length
RuntimeError: shape '[-1, 10, 60]' is invalid for input of size 983040

Which heavily hints that model awaits a sequence of states, but instead is presented with a single state, which is exactly the difference between training a regular model and a sequential one.

My question is simple - what's wrong with my script adaptation? Attaching it here along with full logs.

Script code (click to expand)

import torch
import torch.nn as nn

# import the skrl components to build the RL system
from skrl.agents.torch.ppo import PPO_RNN, PPO_DEFAULT_CONFIG
from skrl.envs.loaders.torch import load_isaaclab_env
from skrl.envs.wrappers.torch import wrap_env
from skrl.memories.torch import RandomMemory
from skrl.models.torch import DeterministicMixin, GaussianMixin, Model
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.trainers.torch import SequentialTrainer
from skrl.utils import set_seed


# seed for reproducibility
set_seed()  # e.g. `set_seed(42)` for fixed seed


# define shared model (stochastic and deterministic models) using mixins
class Shared(GaussianMixin, DeterministicMixin, Model):
    def __init__(
            self, 
            observation_space, 
            action_space, 
            device, 
            clip_actions=False,
            clip_log_std=True, 
            min_log_std=-20, 
            max_log_std=2, 
            reduction="sum",
            num_envs=1,
            num_layers=1,
            hidden_size=64,
            sequence_length=10,
        ):
        Model.__init__(self, observation_space, action_space, device)
        GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
        DeterministicMixin.__init__(self, clip_actions)

        self.num_envs = num_envs
        self.num_layers = num_layers
        self.hidden_size = hidden_size  # Hcell (Hout is Hcell because proj_size = 0)
        self.sequence_length = sequence_length

        self.lstm = nn.LSTM(input_size=self.num_observations,
                            hidden_size=self.hidden_size,
                            num_layers=self.num_layers,
                            batch_first=True)  # batch_first -> (batch, sequence, features)
        
        # self.net = nn.Sequential(nn.Linear(self.num_observations, 256),
        #                          nn.ELU(),
        #                          nn.Linear(256, 128),
        #                          nn.ELU(),
        #                          nn.Linear(128, 64),
        #                          nn.ELU())

        self.net = nn.Sequential(nn.Linear(self.hidden_size, 64),
                                 nn.ReLU(),
                                 nn.Linear(64, 32),
                                 nn.ReLU(),
                                #  nn.Linear(32, self.num_actions),
                                #  nn.Tanh()
                                 )

        self.mean_layer = nn.Linear(32, self.num_actions)
        self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))

        self.value_layer = nn.Linear(32, 1)

    def get_specification(self):
        # batch size (N) is the number of envs during rollout
        return {"rnn": {"sequence_length": self.sequence_length,
                        "sizes": [(self.num_layers, self.num_envs, self.hidden_size),    # hidden states (D ∗ num_layers, N, Hout)
                                  (self.num_layers, self.num_envs, self.hidden_size)]}}  # cell states   (D ∗ num_layers, N, Hcell)


    def act(self, inputs, role):
        if role == "policy":
            return GaussianMixin.act(self, inputs, role)
        elif role == "value":
            return DeterministicMixin.act(self, inputs, role)


    def run_rnn(
        self, 
        states,
        hidden_states,
        cell_states,
        terminated,
    ):
        if self.training:
            rnn_input = states.view(-1, self.sequence_length, states.shape[-1])  # (N, L, Hin): N=batch_size, L=sequence_length
            hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1])  # (D * num_layers, N, L, Hout)
            cell_states = cell_states.view(self.num_layers, -1, self.sequence_length, cell_states.shape[-1])  # (D * num_layers, N, L, Hcell)
            # get the hidden/cell states corresponding to the initial sequence
            hidden_states = hidden_states[:,:,0,:].contiguous()  # (D * num_layers, N, Hout)
            cell_states = cell_states[:,:,0,:].contiguous()  # (D * num_layers, N, Hcell)

            # reset the RNN state in the middle of a sequence
            if terminated is not None and torch.any(terminated):
                rnn_outputs = []
                terminated = terminated.view(-1, self.sequence_length)
                indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length]

                for i in range(len(indexes) - 1):
                    i0, i1 = indexes[i], indexes[i + 1]
                    rnn_output, (hidden_states, cell_states) = self.lstm(rnn_input[:,i0:i1,:], (hidden_states, cell_states))
                    hidden_states[:, (terminated[:,i1-1]), :] = 0
                    cell_states[:, (terminated[:,i1-1]), :] = 0
                    rnn_outputs.append(rnn_output)

                rnn_states = (hidden_states, cell_states)
                rnn_output = torch.cat(rnn_outputs, dim=1)
            # no need to reset the RNN state in the sequence
            else:
                rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states))
        # rollout
        else:
            rnn_input = states.view(-1, 1, states.shape[-1])  # (N, L, Hin): N=num_envs, L=1
            rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states))

        # flatten the RNN output
        rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1)  # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout)
        return {"rnn_output": rnn_output, "rnn_states": rnn_states}

    def compute(self, inputs, role):
        states = inputs["states"]
        terminated = inputs.get("terminated", None)
        hidden_states, cell_states = inputs["rnn"][0], inputs["rnn"][1]

        rnn_results = self.run_rnn(
            states=states,
            hidden_states=hidden_states,
            cell_states=cell_states,
            terminated=terminated
        )
  
        if role == "policy":
            self._shared_output = self.net(rnn_results['rnn_output'])
            return self.mean_layer(self._shared_output), self.log_std_parameter, {"rnn": [rnn_results['rnn_states'][0], rnn_results['rnn_states'][1]]}

        elif role == "value":
            shared_output = self.net(rnn_results['rnn_output']) if self._shared_output is None else self._shared_output
            self._shared_output = None
            return self.value_layer(shared_output), {}


# load and wrap the Isaac Lab environment
env = load_isaaclab_env(task_name="Isaac-Ant-v0")
env = wrap_env(env)

device = env.device


# instantiate a memory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device)


# instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/api/agents/ppo.html#models
models = {}
models["policy"] = Shared(
    observation_space=env.observation_space,
    action_space=env.action_space,
    device=device,
    num_envs=env.num_envs,
    num_layers=1,
    hidden_size=64,
    sequence_length=10
)

models["value"] = models["policy"]  # same instance: shared model


# configure and instantiate the agent (visit its documentation to see all the options)
# https://skrl.readthedocs.io/en/latest/api/agents/ppo.html#configuration-and-hyperparameters
cfg = PPO_DEFAULT_CONFIG.copy()
cfg["rollouts"] = 16  # memory_size
cfg["learning_epochs"] = 8
cfg["mini_batches"] = 4  # 16 * 1024 / 4096
cfg["discount_factor"] = 0.99
cfg["lambda"] = 0.95
cfg["learning_rate"] = 3e-4
cfg["learning_rate_scheduler"] = KLAdaptiveRL
cfg["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008}
cfg["random_timesteps"] = 0
cfg["learning_starts"] = 0
cfg["grad_norm_clip"] = 1.0
cfg["ratio_clip"] = 0.2
cfg["value_clip"] = 0.2
cfg["clip_predicted_values"] = True
cfg["entropy_loss_scale"] = 0.0
cfg["value_loss_scale"] = 1.0
cfg["kl_threshold"] = 0
cfg["rewards_shaper"] = lambda rewards, *args, **kwargs: rewards * 0.1
cfg["time_limit_bootstrap"] = True
cfg["state_preprocessor"] = RunningStandardScaler
cfg["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg["value_preprocessor"] = RunningStandardScaler
cfg["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# logging to TensorBoard and write checkpoints (in timesteps)
cfg["experiment"]["write_interval"] = 40
cfg["experiment"]["checkpoint_interval"] = 400
cfg["experiment"]["directory"] = "runs/torch/Isaac-Ant-v0"

print(f'{env.observation_space=}')

agent = PPO_RNN(models=models,
            memory=memory,
            cfg=cfg,
            observation_space=env.observation_space,
            action_space=env.action_space,
            device=device)



# configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 8000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)

# start training
trainer.train()

Full traceback (click to expand)

./isaaclab.sh -p torch_ant_ppo_rnn.py --headless
[INFO] Using python from: /home/ivan/isaaclab_temp/venv/bin/python                                                                                                                       
[skrl:INFO] Seed: 1805829142
[INFO][AppLauncher]: Using device: cuda:0
[INFO][AppLauncher]: Loading experience file: /home/ivan/isaaclab_temp/apps/isaaclab.python.headless.kit
[Warning] [simulation_app.simulation_app] Modules: ['omni.kit_app'] were loaded before SimulationApp was started and might not be loaded correctly.
[Warning] [simulation_app.simulation_app] Please check to make sure no extra omniverse or pxr modules are imported before the call to SimulationApp(...)
Loading user config located at: '/home/ivan/isaaclab_temp/venv/lib/python3.10/site-packages/omni/data/Kit/Isaac-Sim/4.5/user.config.json'
[Info] [carb] Logging to file: /home/ivan/isaaclab_temp/venv/lib/python3.10/site-packages/omni/logs/Kit/Isaac-Sim/4.5/kit_20250604_183829.log
2025-06-04 15:38:29 [0ms] [Warning] [omni.kit.app.plugin] No crash reporter present, dumps uploading isn't available.
2025-06-04 15:38:29 [2ms] [Warning] [omni.ext.plugin] [ext: rendering_modes] Extensions config 'extension.toml' doesn't exist '/home/ivan/isaaclab_temp/apps/rendering_modes' or '/home/ivan/isaaclab_temp/apps/rendering_modes/config'
2025-06-04 15:38:29 [122ms] [Warning] [omni.usd_config.extension] Enable omni.materialx.libs extension to use MaterialX
2025-06-04 15:38:29 [210ms] [Warning] [omni.platforminfo.plugin] failed to open the default display.  Can't verify X Server version.
2025-06-04 15:38:29 [411ms] [Warning] [omni.datastore] OmniHub is inaccessible
2025-06-04 15:38:29 [438ms] [Warning] [omni.isaac.dynamic_control] omni.isaac.dynamic_control is deprecated as of Isaac Sim 4.5. No action is needed from end-users.
2025-06-04 15:38:30 [1,064ms] [Warning] [pxr.Semantics] pxr.Semantics is deprecated - please use Semantics instead
2025-06-04 15:38:32 [3,481ms] [Warning] [carb.cudainterop.plugin] CUDA_VISIBLE_DEVICES environment variable is set.
2025-06-04 15:38:32 [3,481ms] [Warning] [carb.cudainterop.plugin] Note CUDA device enumeration and Omniverse device enumeration are different.
2025-06-04 15:38:32 [3,481ms] [Warning] [carb.cudainterop.plugin] Setting CUDA_VISIBLE_DEVICES can lead to undesired behavior or crashes.
2025-06-04 15:38:32 [3,482ms] [Warning] [gpu.foundation.plugin] Skipping NVIDIA GPU due CUDA being in bad state: NVIDIA RTX 6000 Ada Generation
2025-06-04 15:38:32 [3,482ms] [Warning] [gpu.foundation.plugin] Please restart your system if CUDA is known to work in your system.
2025-06-04 15:38:32 [3,482ms] [Warning] [gpu.foundation.plugin] Skipping NVIDIA GPU due CUDA being in bad state: NVIDIA RTX 6000 Ada Generation
2025-06-04 15:38:32 [3,482ms] [Warning] [gpu.foundation.plugin] Please restart your system if CUDA is known to work in your system.

|---------------------------------------------------------------------------------------------|
| Driver Version: 535.230.02    | Graphics API: Vulkan
|=============================================================================================|
| GPU | Name                             | Active | LDA | GPU Memory | Vendor-ID | LUID       |
|     |                                  |        |     |            | Device-ID | UUID       |
|     |                                  |        |     |            | Bus-ID    |            |
|---------------------------------------------------------------------------------------------|
| 0   | NVIDIA RTX 6000 Ada Generation   |        |     | 49386   MB | 10de      | 0          |
|     |                                  |        |     |            | 26b1      | 2b0a2be5.. |
|     |                                  |        |     |            | 24        |            |
|---------------------------------------------------------------------------------------------|
| 1   | NVIDIA RTX 6000 Ada Generation   | Yes: 0 |     | 49386   MB | 10de      | 0          |
|     |                                  |        |     |            | 26b1      | 0b1fbc17.. |
|     |                                  |        |     |            | 2d        |            |
|=============================================================================================|
| OS: 24.04.2 LTS (Noble Numbat) ubuntu, Version: 24.04.2, Kernel: 6.11.0-25-generic
| Processor: AMD Ryzen 9 5950X 16-Core Processor
| Cores: 16 | Logical Cores: 32
|---------------------------------------------------------------------------------------------|
| Total Memory (MB): 128731 | Free Memory: 101380
| Total Page/Swap (MB): 3906 | Free Page/Swap: 3756
|---------------------------------------------------------------------------------------------|
2025-06-04 15:38:32 [3,662ms] [Warning] [gpu.foundation.plugin] IOMMU is enabled.
2025-06-04 15:38:32 [3,662ms] [Warning] [gpu.foundation.plugin] Detected IOMMU is enabled. Running CUDA peer-to-peer bandwidth and latency validation.
Unidirectional P2P=Enabled Bandwidth (P2P Writes) Matrix (GB/s)
   D\D     0 
     0 796.03 
P2P=Enabled Latency (P2P Writes) Matrix (us)
   GPU     0 
     0   1.51 

   CPU     0 
     0   2.01 
2025-06-04 15:38:32 [3,701ms] [Warning] [omni.kvdb.plugin] Disabling key-value database because another kit process is locking it
[INFO]: Parsing configuration from: isaaclab_tasks.manager_based.classic.ant.ant_env_cfg:AntEnvCfg

Isaac Lab environment (Isaac-Ant-v0)
2025-06-04 15:38:33 [3,855ms] [Warning] [isaaclab.envs.manager_based_env] Seed not set for the environment. The environment creation may not be deterministic.
[INFO]: Base environment:
    Environment device    : cuda:0
    Environment seed      : None
    Physics step-size     : 0.008333333333333333
    Rendering step-size   : 0.016666666666666666
    Environment step-size : 0.016666666666666666
[INFO]: Time taken for scene creation : 2.147873 seconds
[INFO]: Scene manager:  <class InteractiveScene>
    Number of environments: 4096
    Environment spacing   : 5.0
    Source prim name      : /World/envs/env_0
    Global prim paths     : ['/World/ground']
    Replicate physics     : True
[INFO]: Starting the simulation. This may take a few seconds. Please wait...
[INFO]: Time taken for simulation start : 1.732636 seconds
[INFO] Command Manager:  <CommandManager> contains 0 active terms.
+------------------------+
|  Active Command Terms  |
+--------+-------+-------+
| Index  | Name  |  Type |
+--------+-------+-------+
+--------+-------+-------+

[INFO] Event Manager:  <EventManager> contains 1 active terms.
+-------------------------------------+
| Active Event Terms in Mode: 'reset' |
+---------+---------------------------+
|  Index  | Name                      |
+---------+---------------------------+
|    0    | reset_base                |
|    1    | reset_robot_joints        |
+---------+---------------------------+

[INFO] Recorder Manager:  <RecorderManager> contains 0 active terms.
+---------------------+
| Active Recorder Terms |
+-----------+---------+
|   Index   | Name    |
+-----------+---------+
+-----------+---------+

[INFO] Action Manager:  <ActionManager> contains 1 active terms.
+----------------------------------+
|  Active Action Terms (shape: 8)  |
+-------+--------------+-----------+
| Index | Name         | Dimension |
+-------+--------------+-----------+
|   0   | joint_effort |         8 |
+-------+--------------+-----------+

[INFO] Observation Manager: <ObservationManager> contains 1 groups.
+-----------------------------------------------------------+
| Active Observation Terms in Group: 'policy' (shape: (60,)) |
+-----------+-----------------------------------+-----------+
|   Index   | Name                              |   Shape   |
+-----------+-----------------------------------+-----------+
|     0     | base_height                       |    (1,)   |
|     1     | base_lin_vel                      |    (3,)   |
|     2     | base_ang_vel                      |    (3,)   |
|     3     | base_yaw_roll                     |    (2,)   |
|     4     | base_angle_to_target              |    (1,)   |
|     5     | base_up_proj                      |    (1,)   |
|     6     | base_heading_proj                 |    (1,)   |
|     7     | joint_pos_norm                    |    (8,)   |
|     8     | joint_vel_rel                     |    (8,)   |
|     9     | feet_body_forces                  |   (24,)   |
|     10    | actions                           |    (8,)   |
+-----------+-----------------------------------+-----------+

[INFO] Termination Manager:  <TerminationManager> contains 2 active terms.
+---------------------------------+
|     Active Termination Terms    |
+-------+--------------+----------+
| Index | Name         | Time Out |
+-------+--------------+----------+
|   0   | time_out     |   True   |
|   1   | torso_height |  False   |
+-------+--------------+----------+

[INFO] Reward Manager:  <RewardManager> contains 7 active terms.
+-----------------------------------+
|        Active Reward Terms        |
+-------+------------------+--------+
| Index | Name             | Weight |
+-------+------------------+--------+
|   0   | progress         |    1.0 |
|   1   | alive            |    0.5 |
|   2   | upright          |    0.1 |
|   3   | move_to_target   |    0.5 |
|   4   | action_l2        | -0.005 |
|   5   | energy           |  -0.05 |
|   6   | joint_pos_limits |   -0.1 |
+-------+------------------+--------+

[INFO] Curriculum Manager:  <CurriculumManager> contains 0 active terms.
+----------------------+
| Active Curriculum Terms |
+-----------+----------+
|   Index   | Name     |
+-----------+----------+
+-----------+----------+

[INFO]: Completed setting up the environment...
[skrl:INFO] Environment wrapper: 'auto' (class: gymnasium.core.Env, gymnasium.core.Wrapper, gymnasium.utils.record_constructor.RecordConstructorArgs, isaaclab.envs.manager_based_env.ManagerBasedEnv)
[skrl:INFO] Environment wrapper: Isaac Lab (single-agent)
env.observation_space=Box(-inf, inf, (60,), float32)
  0%|▎                                                                                                                                                  | 15/8000 [00:00<02:07, 62.81it/s]
Traceback (most recent call last):
  File "/home/ivan/isaaclab_temp/torch_ant_ppo_rnn.py", line 224, in <module>
    trainer.train()
  File "/home/ivan/isaaclab_temp/venv/lib/python3.10/site-packages/skrl/trainers/torch/sequential.py", line 86, in train
    self.single_agent_train()
  File "/home/ivan/isaaclab_temp/venv/lib/python3.10/site-packages/skrl/trainers/torch/base.py", line 222, in single_agent_train
    self.agents.post_interaction(timestep=timestep, timesteps=self.timesteps)
  File "/home/ivan/isaaclab_temp/venv/lib/python3.10/site-packages/skrl/agents/torch/ppo/ppo_rnn.py", line 429, in post_interaction
    self._update(timestep, timesteps)
  File "/home/ivan/isaaclab_temp/venv/lib/python3.10/site-packages/skrl/agents/torch/ppo/ppo_rnn.py", line 577, in _update
    _, next_log_prob, _ = self.policy.act(
  File "/home/ivan/isaaclab_temp/torch_ant_ppo_rnn.py", line 80, in act
    return GaussianMixin.act(self, inputs, role)
  File "/home/ivan/isaaclab_temp/venv/lib/python3.10/site-packages/skrl/models/torch/gaussian.py", line 129, in act
    mean_actions, log_std, outputs = self.compute(inputs, role)
  File "/home/ivan/isaaclab_temp/torch_ant_ppo_rnn.py", line 132, in compute
    rnn_results = self.run_rnn(
  File "/home/ivan/isaaclab_temp/torch_ant_ppo_rnn.py", line 93, in run_rnn
    rnn_input = states.view(-1, self.sequence_length, states.shape[-1])  # (N, L, Hin): N=batch_size, L=sequence_length
RuntimeError: shape '[-1, 10, 60]' is invalid for input of size 983040
[skrl:INFO] Closing environment
[skrl:INFO] Environment closed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Can't adapt torch_ant_ppo.py to use LSTM #324

Uh oh!

{{title}}

Uh oh!

Replies: 0 comments

Select a reply

Uh oh!

Can't adapt torch_ant_ppo.py to use LSTM #324

Uh oh!

ivanpanshin Jun 4, 2025

Replies: 0 comments

ivanpanshin
Jun 4, 2025