Commit a75858c

chore: update submodules (#175)
Co-authored-by: vfdev-5 <vfdev-5@users.noreply.github.com>
1 parent 2f327d6 commit a75858c

2 files changed, +50 -60 lines changed

src/tutorials/intermediate/03-reinforcement-learning.md

Lines changed: 49 additions & 59 deletions
@@ -42,7 +42,7 @@ The problem is considered solved when the average reward is greater than `reward_threshold`


 ```python
-!pip install gym pytorch-ignite
+!pip install gymnasium pytorch-ignite
 ```

 ### On Colab
@@ -53,12 +53,14 @@ We need additional dependencies to render the environment on Google Colab.
 ```python
 !apt-get install -y xvfb python-opengl
 !pip install pyvirtualdisplay
+!pip install --upgrade pygame moviepy
 ```

 ## Imports


 ```python
+from collections import deque
 import numpy as np
 import torch
 import torch.nn as nn
@@ -68,8 +70,8 @@ from torch.distributions import Categorical

 from ignite.engine import Engine, Events

-import gym
-from gym.wrappers import Monitor
+import gymnasium as gym
+from gymnasium.wrappers import RecordVideo

 import glob
 import io
@@ -98,7 +100,7 @@ Let's load our environment first.


 ```python
-env = gym.make("CartPole-v0")
+env = gym.make("CartPole-v0", render_mode="rgb_array")
 ```

 ### On Colab
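For reference, a minimal sketch (not part of this diff) of the Gymnasium API the updated code relies on: `reset()` accepts a `seed` and returns an `(observation, info)` pair, and `step()` returns five values instead of four. This assumes a recent `gymnasium` release.

```python
# Minimal sketch of the Gymnasium API used after this change (not part of the diff).
import gymnasium as gym

env = gym.make("CartPole-v0", render_mode="rgb_array")
observation, info = env.reset(seed=42)  # reset() returns (observation, info)
action = env.action_space.sample()
observation, reward, terminated, truncated, info = env.step(action)  # step() returns 5 values
env.close()
```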
@@ -123,27 +125,12 @@ Below we have a utility function to enable video recording of the gym environment.

 ```python
 def wrap_env(env):
-    env = Monitor(env, './video', force=True)
+    env = RecordVideo(env, './video', disable_logger=True)
     return env

 env = wrap_env(env)
 ```

-### Set the seed
-
-
-```python
-env.seed(seed_val)
-torch.manual_seed(seed_val)
-```
-
-
-
-
-    <torch._C.Generator at 0x7f76fa684730>
-
-
-
 ## Model

 We are going to use the REINFORCE algorithm, in which our agent samples entire episodes, from start state to goal state, directly from the environment. Our model has two linear layers, with 4 in-features and 2 out-features for the 4 state variables and 2 actions respectively. We also define an action buffer, `saved_log_probs`, and a rewards buffer. The outputs of the first layer are passed through an intermediate ReLU layer to produce a score for each action. Finally, we return a list of probabilities for each of these actions.
@@ -156,13 +143,16 @@ class Policy(nn.Module):
     def __init__(self):
         super(Policy, self).__init__()
         self.affine1 = nn.Linear(4, 128)
+        self.dropout = nn.Dropout(p=0.6)
         self.affine2 = nn.Linear(128, 2)

         self.saved_log_probs = []
         self.rewards = []

     def forward(self, x):
-        x = F.relu(self.affine1(x))
+        x = self.affine1(x)
+        x = self.dropout(x)
+        x = F.relu(x)
         action_scores = self.affine2(x)
         return F.softmax(action_scores, dim=1)
 ```
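As a quick sanity check (not part of the diff), the post-change `Policy` can be exercised standalone: a forward pass on a dummy 4-dimensional observation should yield a `(1, 2)` tensor of action probabilities summing to 1. The snippet below simply restates the class from the hunk above.

```python
# Self-contained sketch of the updated Policy, to sanity-check output shapes.
import torch
import torch.nn as nn
import torch.nn.functional as F


class Policy(nn.Module):
    def __init__(self):
        super(Policy, self).__init__()
        self.affine1 = nn.Linear(4, 128)
        self.dropout = nn.Dropout(p=0.6)
        self.affine2 = nn.Linear(128, 2)
        self.saved_log_probs = []
        self.rewards = []

    def forward(self, x):
        x = self.affine1(x)
        x = self.dropout(x)
        x = F.relu(x)
        action_scores = self.affine2(x)
        return F.softmax(action_scores, dim=1)


policy = Policy()
dummy_obs = torch.zeros(1, 4)   # CartPole observation: 4 state variables
probs = policy(dummy_obs)
print(probs.shape)              # torch.Size([1, 2]) -- one probability per action
print(float(probs.sum()))       # ~1.0, since softmax normalizes over the 2 actions
```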
@@ -172,10 +162,10 @@ And then we initialize our model, optimizer, epsilon and timesteps.


 ```python
-model = Policy()
-optimizer = optim.Adam(model.parameters(), lr=1e-2)
+policy = Policy()
+optimizer = optim.Adam(policy.parameters(), lr=1e-2)
 eps = np.finfo(np.float32).eps.item()
-timesteps = list(range(10000))
+timesteps = range(10000)
 ```

 ## Create Trainer
@@ -188,12 +178,13 @@ Ignite's [`Engine`](https://pytorch.org/ignite/concepts.html#engine) allows user
 ```python
 def run_single_timestep(engine, timestep):
     observation = engine.state.observation
-    action = select_action(model, observation)
-    engine.state.observation, reward, done, _ = env.step(action)
+    action = select_action(policy, observation)
+    engine.state.observation, reward, done, _, _ = env.step(action)
     if render:
         env.render()

-    model.rewards.append(reward)
+    policy.rewards.append(reward)
+    engine.state.ep_reward += reward

     if done:
         engine.terminate_epoch()
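Note that the updated handler unpacks Gymnasium's five-tuple as `reward, done, _, _`, so `done` reflects only the `terminated` flag. For comparison, a standalone sketch (not part of the diff) of a random-policy episode that treats both `terminated` and `truncated` as episode-ending:

```python
# Standalone sketch (not part of the diff): one random-policy episode under Gymnasium.
import gymnasium as gym

env = gym.make("CartPole-v0")
observation, info = env.reset(seed=0)
total_reward = 0.0
while True:
    observation, reward, terminated, truncated, info = env.step(env.action_space.sample())
    total_reward += reward
    if terminated or truncated:  # either flag ends the episode
        break
env.close()
print(total_reward)
```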
@@ -206,40 +197,40 @@ Next we need to select an action to take. After we get a list of probabilities,


 ```python
-def select_action(model, observation):
+def select_action(policy, observation):
     state = torch.from_numpy(observation).float().unsqueeze(0)
-    probs = model(state)
+    probs = policy(state)
     m = Categorical(probs)
     action = m.sample()
-    model.saved_log_probs.append(m.log_prob(action))
+    policy.saved_log_probs.append(m.log_prob(action))
     return action.item()
 ```
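A toy illustration (not part of the diff) of what `Categorical` does here: it samples an action index from the probability vector and records its log-probability, which later weights the policy loss.

```python
# Toy sketch (not part of the diff): Categorical sampling and log-probabilities.
import torch
from torch.distributions import Categorical

probs = torch.tensor([[0.7, 0.3]])   # e.g. output of policy(state) for CartPole's 2 actions
m = Categorical(probs)
action = m.sample()                  # tensor([0]) with probability 0.7, tensor([1]) with 0.3
log_prob = m.log_prob(action)        # log(0.7) ≈ -0.357 or log(0.3) ≈ -1.204
print(action.item(), log_prob.item())
```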

 We initialize a list for the policy loss and compute the true (discounted) returns from the rewards returned by the environment. Then we calculate the policy losses from the advantage (`-log_prob * reward`). Finally, we reset the gradients, backpropagate through the policy loss, and clear the rewards and actions buffers.


 ```python
-def finish_episode(model, optimizer, gamma, eps):
+def finish_episode(policy, optimizer, gamma):
     R = 0
     policy_loss = []
-    rewards = []
-    for r in model.rewards[::-1]:
+    returns = deque()
+    for r in policy.rewards[::-1]:
         R = r + gamma * R
-        rewards.insert(0, R)
-
-    rewards = torch.tensor(rewards)
-    rewards = (rewards - rewards.mean()) / (rewards.std() + eps)
+        returns.appendleft(R)

-    for log_prob, reward in zip(model.saved_log_probs, rewards):
-        policy_loss.append(-log_prob * reward)
+    returns = torch.tensor(returns)
+    returns = (returns - returns.mean()) / (returns.std() + eps)
+
+    for log_prob, R in zip(policy.saved_log_probs, returns):
+        policy_loss.append(-log_prob * R)

     optimizer.zero_grad()
     policy_loss = torch.cat(policy_loss).sum()
     policy_loss.backward()
     optimizer.step()

-    del model.rewards[:]
-    del model.saved_log_probs[:]
+    del policy.rewards[:]
+    del policy.saved_log_probs[:]
 ```
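A worked example (not part of the diff) of the discounted-return computation in `finish_episode`: walking the rewards backwards with `R = r + gamma * R` and pushing onto a `deque` yields the return from each timestep onward, which is then normalized.

```python
# Worked sketch (not part of the diff): discounted returns for rewards [1, 1, 1] with gamma = 0.99.
from collections import deque

import torch

gamma = 0.99
rewards = [1.0, 1.0, 1.0]

R = 0.0
returns = deque()
for r in reversed(rewards):          # walk the episode backwards
    R = r + gamma * R
    returns.appendleft(R)

print(list(returns))                 # approximately [2.9701, 1.99, 1.0]

eps = torch.finfo(torch.float32).eps
returns = torch.tensor(list(returns))
normalized = (returns - returns.mean()) / (returns.std() + eps)
print(normalized)                    # roughly zero mean, unit std
```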

 ## Attach handlers to run on specific events
@@ -256,41 +247,40 @@ Before training begins, we initialize the reward in `trainer`'s state.


 ```python
-@trainer.on(Events.STARTED)
-def initialize(engine):
-    engine.state.running_reward = 10
+trainer.state.running_reward = 10
 ```

 When an episode begins, we have to reset the environment's state.


 ```python
 @trainer.on(EPISODE_STARTED)
-def reset_environment_state(engine):
-    engine.state.observation = env.reset()
+def reset_environment_state():
+    torch.manual_seed(seed_val + trainer.state.epoch)
+    trainer.state.observation, _ = env.reset(seed=seed_val + trainer.state.epoch)
+    trainer.state.ep_reward = 0
 ```

 When an episode finishes, we update the running reward and perform backpropagation by calling `finish_episode()`.


 ```python
 @trainer.on(EPISODE_COMPLETED)
-def update_model(engine):
-    t = engine.state.timestep
-    engine.state.running_reward = engine.state.running_reward * 0.99 + t * 0.01
-    finish_episode(model, optimizer, gamma, eps)
+def update_model():
+    trainer.state.running_reward = 0.05 * trainer.state.ep_reward + (1 - 0.05) * trainer.state.running_reward
+    finish_episode(policy, optimizer, gamma)
 ```

 After that, every 100 (`log_interval`) episodes, we log the results.


 ```python
 @trainer.on(EPISODE_COMPLETED(every=log_interval))
-def log_episode(engine):
-    i_episode = engine.state.epoch
+def log_episode():
+    i_episode = trainer.state.epoch
     print(
-        f"Episode {i_episode}\tLast length: {engine.state.timestep:5d}"
-        f"\tAverage length: {engine.state.running_reward:.2f}"
+        f"Episode {i_episode}\tLast reward: {trainer.state.ep_reward:.2f}"
+        f"\tAverage length: {trainer.state.running_reward:.2f}"
     )
 ```
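The new `update_model` tracks an exponential moving average of the per-episode reward rather than the raw timestep count. A small sketch (not part of the diff, with hypothetical episode rewards) of how that smoothing behaves:

```python
# Sketch (not part of the diff): exponential moving average used for running_reward.
alpha = 0.05
running_reward = 10.0                  # initial value set on the trainer state
for ep_reward in [20.0, 30.0, 40.0]:   # hypothetical episode rewards
    running_reward = alpha * ep_reward + (1 - alpha) * running_reward
    print(round(running_reward, 3))    # 10.5, 11.475, 12.901
```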

@@ -299,14 +289,14 @@ And finally, we check if our running reward has crossed the threshold so that we

 ```python
 @trainer.on(EPISODE_COMPLETED)
-def should_finish_training(engine):
-    running_reward = engine.state.running_reward
+def should_finish_training():
+    running_reward = trainer.state.running_reward
     if running_reward > env.spec.reward_threshold:
         print(
             f"Solved! Running reward is now {running_reward} and "
-            f"the last episode runs to {engine.state.timestep} time steps!"
+            f"the last episode runs to {trainer.state.timestep} time steps!"
         )
-        engine.should_terminate = True
+        trainer.should_terminate = True
 ```

 ## Run Trainer
@@ -385,7 +375,7 @@ Finally, we can view our saved video.
 mp4list = glob.glob('video/*.mp4')

 if len(mp4list) > 0:
-    mp4 = mp4list[0]
+    mp4 = mp4list[-1]  # pick the last video
     video = io.open(mp4, 'r+b').read()
     encoded = base64.b64encode(video)
     ipythondisplay.display(HTML(data='''<video alt="test" autoplay
