Commit a65a8f9

Fixed Transformer layers/model; some minor fix

1 parent 8fe1955 commit a65a8f9
File tree

4 files changed: +66 -43 lines

tsl/datasets/prototypes/tabular_dataset.py

Lines changed: 1 addition & 1 deletion

@@ -19,7 +19,7 @@
 
 
 class TabularDataset(Dataset, TabularParsingMixin):
-    r"""Base :class:`~tsl.datasets.Dataset` class for tabular data.
+    r"""Base :class:`~tsl.datasets.prototypes.Dataset` class for tabular data.
 
     Tabular data are assumed to be 3-dimensional arrays where the dimensions
     represent time, nodes and features, respectively. They can be either

tsl/experiment/experiment.py

Lines changed: 7 additions & 6 deletions

@@ -1,5 +1,6 @@
 import inspect
 import os
+import os.path as osp
 import sys
 from functools import wraps
 from typing import Optional, Callable, List, Union
@@ -43,9 +44,9 @@ def _pre_experiment_routine(cfg: DictConfig):
                     # name=hconf.job.name,
                     dir=hconf.runtime.output_dir)
     if hconf.get('output_subdir') is not None:
-        run_args['tsl_subdir'] = os.path.join(cfg.run.dir, hconf.output_subdir)
+        run_args['tsl_subdir'] = osp.join(cfg.run.dir, hconf.output_subdir)
         # remove hydra conf from logging
-        os.unlink(os.path.join(run_args['tsl_subdir'], 'hydra.yaml'))
+        os.unlink(osp.join(run_args['tsl_subdir'], 'hydra.yaml'))
     # set run name
     run_args['name'] = "${now:%Y-%m-%d_%H-%M-%S}_${run.seed}"
     with flag_override(cfg, 'struct', False):
@@ -112,9 +113,9 @@ def __init__(self, run_fn: Callable,
         # config_path={config_path} same as --config-path {config_path}
         override_config_path = get_hydra_cli_arg('config_path', delete=True)
         config_path = override_config_path or config_path
-        if not os.path.isabs(config_path):
-            root_path = os.path.dirname(inspect.getfile(run_fn))
-            config_path = os.path.join(root_path, config_path)
+        if not osp.isabs(config_path):
+            root_path = osp.dirname(inspect.getfile(run_fn))
+            config_path = osp.abspath(osp.join(root_path, config_path))
         self.config_path = config_path
         # store config_dir in tsl config
         config.config_dir = self.config_path
@@ -164,7 +165,7 @@ def decorated_run_fn(cfg: DictConfig):
     def log_config(self) -> None:
         """Save config as ``.yaml`` file in
         :meth:`~tsl.experiment.Experiment.run_dir`."""
-        with open(os.path.join(self.run_dir, 'config.yaml'), 'w') as fp:
+        with open(osp.join(self.run_dir, 'config.yaml'), 'w') as fp:
             fp.write(OmegaConf.to_yaml(self.cfg, resolve=True))
 
     def __repr__(self):
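Besides the os.path -> osp aliasing, the __init__ hunk above now normalizes a relative config_path to an absolute one. A minimal stand-alone sketch of that resolution logic, using only the stdlib calls shown in the diff (the helper name resolve_config_path is illustrative, not part of tsl):

    import inspect
    import os.path as osp

    def resolve_config_path(run_fn, config_path: str) -> str:
        # A relative path is interpreted w.r.t. the directory of the file
        # that defines `run_fn`, then normalized to an absolute path.
        if not osp.isabs(config_path):
            root_path = osp.dirname(inspect.getfile(run_fn))
            config_path = osp.abspath(osp.join(root_path, config_path))
        return config_path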

tsl/nn/blocks/encoders/transformer.py

Lines changed: 52 additions & 33 deletions

@@ -1,33 +1,41 @@
-from torch import nn
-
-from tsl.nn.base.attention import MultiHeadAttention
-from tsl.nn.layers.norm import LayerNorm
-from tsl.nn import utils
 from functools import partial
+from typing import Optional
 
 import torch.nn.functional as F
+from torch import nn, Tensor
+
+from tsl.nn import utils
+from tsl.nn.base.attention import MultiHeadAttention
+from tsl.nn.layers.norm import LayerNorm
 
 
 class TransformerLayer(nn.Module):
-    r"""
-    A TransformerLayer which can be instantiated to attent the temporal or spatial dimension.
+    r"""A Transformer layer from the paper `"Attention Is All You Need"
+    <https://arxiv.org/abs/1706.03762>`_ (Vaswani et al., NeurIPS 2017).
+
+    This layer can be instantiated to attend the temporal or spatial dimension.
 
     Args:
         input_size (int): Input size.
         hidden_size (int): Dimension of the learned representations.
         ff_size (int): Units in the MLP after self attention.
         n_heads (int, optional): Number of parallel attention heads.
-        axis (str, optional): Dimension on which to apply attention to update the representations.
-        causal (bool, optional): Whether to causally mask the attention scores (can be `True` only if `axis` is `steps`).
+        axis (str, optional): Dimension on which to apply attention to update
+            the representations. Can be either, 'time' or 'nodes'.
+            (default: :obj:`'time'`)
+        causal (bool, optional): If :obj:`True`, then causally mask attention
+            scores in temporal attention (has an effect only if :attr:`axis` is
+            :obj:`'time'`). (default: :obj:`True`)
         activation (str, optional): Activation function.
         dropout (float, optional): Dropout probability.
     """
+
     def __init__(self,
                  input_size,
                  hidden_size,
                  ff_size=None,
                  n_heads=1,
-                 axis='steps',
+                 axis='time',
                  causal=True,
                  activation='elu',
                  dropout=0.):
@@ -60,27 +68,32 @@ def __init__(self,
 
         self.activation = utils.get_functional_activation(activation)
 
-    def forward(self, x, mask=None):
+    def forward(self, x: Tensor, mask: Optional[Tensor] = None):
         """"""
         # x: [batch, steps, nodes, features]
-        x = self.skip_conn(x) + self.dropout(self.att(self.norm1(x), attn_mask=mask)[0])
+        x = self.skip_conn(x) + self.dropout(
+            self.att(self.norm1(x), attn_mask=mask)[0])
         x = x + self.mlp(x)
         return x
 
 
 class SpatioTemporalTransformerLayer(nn.Module):
-    r"""
-    A TransformerLayer which attend both the spatial and temporal dimensions by stacking two `MultiHeadAttention` layers.
+    r"""A :class:`~tsl.nn.blocks.encoders.TransformerLayer` which attend both
+    the spatial and temporal dimensions by stacking two
+    :class:`~tsl.nn.base.MultiHeadAttention` layers.
 
     Args:
         input_size (int): Input size.
         hidden_size (int): Dimension of the learned representations.
         ff_size (int): Units in the MLP after self attention.
         n_heads (int, optional): Number of parallel attention heads.
-        causal (bool, optional): Whether to causally mask the attention scores (can be `True` only if `axis` is `steps`).
+        causal (bool, optional): If :obj:`True`, then causally mask attention
+            scores in temporal attention.
+            (default: :obj:`True`)
         activation (str, optional): Activation function.
         dropout (float, optional): Dropout probability.
     """
+
     def __init__(self,
                  input_size,
                  hidden_size,
@@ -95,7 +108,7 @@ def __init__(self,
                                                kdim=input_size,
                                                vdim=input_size,
                                                heads=n_heads,
-                                               axis='steps',
+                                               axis='time',
                                                causal=causal)
 
         self.spatial_att = MultiHeadAttention(embed_dim=hidden_size,
@@ -122,19 +135,18 @@ def __init__(self,
 
         self.dropout = nn.Dropout(dropout)
 
-    def forward(self, x, mask=None):
+    def forward(self, x: Tensor, mask: Optional[Tensor] = None):
         """"""
         # x: [batch, steps, nodes, features]
-
-        x = self.skip_conn(x) + self.dropout(self.temporal_att(self.norm1(x), attn_mask=mask)[0])
+        x = self.skip_conn(x) + self.dropout(
+            self.temporal_att(self.norm1(x), attn_mask=mask)[0])
         x = x + self.dropout(self.spatial_att(self.norm2(x), attn_mask=mask)[0])
         x = x + self.mlp(x)
         return x
 
 
 class Transformer(nn.Module):
-    r"""
-    A stack of Transformer layers.
+    r"""A stack of Transformer layers.
 
     Args:
         input_size (int): Input size.
@@ -143,19 +155,25 @@ class Transformer(nn.Module):
         output_size (int, optional): Size of an optional linear readout.
         n_layers (int, optional): Number of Transformer layers.
         n_heads (int, optional): Number of parallel attention heads.
-        axis (str, optional): Dimension on which to apply attention to update the representations.
-        causal (bool, optional): Whether to causally mask the attention scores (can be `True` only if `axis` is `steps`).
+        axis (str, optional): Dimension on which to apply attention to update
+            the representations. Can be either, 'time', 'nodes', or 'both'.
+            (default: :obj:`'time'`)
+        causal (bool, optional): If :obj:`True`, then causally mask attention
+            scores in temporal attention (has an effect only if :attr:`axis` is
+            :obj:`'time'` or :obj:`'both'`).
+            (default: :obj:`True`)
         activation (str, optional): Activation function.
         dropout (float, optional): Dropout probability.
     """
+
     def __init__(self,
                  input_size,
                  hidden_size,
                  ff_size=None,
                  output_size=None,
                  n_layers=1,
                  n_heads=1,
-                 axis='steps',
+                 axis='time',
                  causal=True,
                  activation='elu',
                  dropout=0.):
@@ -165,7 +183,7 @@ def __init__(self,
         if ff_size is None:
             ff_size = hidden_size
 
-        if axis in ['steps', 'nodes']:
+        if axis in ['time', 'nodes']:
             transformer_layer = partial(TransformerLayer, axis=axis)
         elif axis == 'both':
             transformer_layer = SpatioTemporalTransformerLayer
@@ -174,13 +192,14 @@ def __init__(self,
 
         layers = []
         for i in range(n_layers):
-            layers.append(transformer_layer(input_size=input_size if i == 0 else hidden_size,
-                                            hidden_size=hidden_size,
-                                            ff_size=ff_size,
-                                            n_heads=n_heads,
-                                            causal=causal,
-                                            activation=activation,
-                                            dropout=dropout))
+            layers.append(transformer_layer(
+                input_size=input_size if i == 0 else hidden_size,
+                hidden_size=hidden_size,
+                ff_size=ff_size,
+                n_heads=n_heads,
+                causal=causal,
+                activation=activation,
+                dropout=dropout))
 
         self.net = nn.Sequential(*layers)
 
@@ -189,7 +208,7 @@ def __init__(self,
         else:
             self.register_parameter('readout', None)
 
-    def forward(self, x):
+    def forward(self, x: Tensor):
         """"""
         x = self.net(x)
         if self.readout is not None:
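A minimal usage sketch of the renamed axis argument, assuming only the constructor signature and the [batch, steps, nodes, features] layout shown in this diff (shapes and hyperparameters below are illustrative, not from the commit):

    import torch
    from tsl.nn.blocks.encoders.transformer import Transformer

    # Illustrative input: 8 samples, 12 steps, 20 nodes, 16 features.
    x = torch.randn(8, 12, 20, 16)

    # Temporal attention only ('time' replaces the old 'steps' value).
    encoder = Transformer(input_size=16, hidden_size=32, n_layers=2, axis='time')
    out = encoder(x)  # expected last dim: hidden_size (no readout configured)

    # axis='both' selects SpatioTemporalTransformerLayer, which stacks
    # temporal and spatial attention inside each layer.
    st_encoder = Transformer(input_size=16, hidden_size=32, axis='both')
    out = st_encoder(x)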

tsl/nn/models/temporal/transformer_model.py

Lines changed: 6 additions & 3 deletions

@@ -7,13 +7,15 @@
 from tsl.nn.blocks.encoders import ConditionalBlock
 from tsl.nn.blocks.encoders.mlp import MLP
 from tsl.nn.blocks.encoders.transformer import Transformer
+from tsl.nn.layers.ops import Select
 from tsl.nn.layers.positional_encoding import PositionalEncoding
 from tsl.nn.models.base_model import BaseModel
-from tsl.nn.layers.ops import Select
 
 
 class TransformerModel(BaseModel):
-    r"""Simple Transformer for multistep time series forecasting.
+    r"""A Transformer from the paper `"Attention Is All You Need"
+    <https://arxiv.org/abs/1706.03762>`_ (Vaswani et al., NeurIPS 2017) for
+    multistep time series forecasting.
 
     Args:
         input_size (int): Input size.
@@ -26,7 +28,8 @@ class TransformerModel(BaseModel):
         n_layers (int, optional): Number of layers.
         dropout (float, optional): Dropout probability.
         axis (str, optional): Dimension on which to apply attention to update
-            the representations.
+            the representations. Can be either, 'time', 'nodes', or 'both'.
+            (default: :obj:`'time'`)
         activation (str, optional): Activation function.
     """
 
