-from torch import nn
-
-from tsl.nn.base.attention import MultiHeadAttention
-from tsl.nn.layers.norm import LayerNorm
-from tsl.nn import utils
from functools import partial
+from typing import Optional

import torch.nn.functional as F
+from torch import nn, Tensor
+
+from tsl.nn import utils
+from tsl.nn.base.attention import MultiHeadAttention
+from tsl.nn.layers.norm import LayerNorm


class TransformerLayer(nn.Module):
-    r"""
-    A TransformerLayer which can be instantiated to attent the temporal or spatial dimension.
+    r"""A Transformer layer from the paper `"Attention Is All You Need"
+    <https://arxiv.org/abs/1706.03762>`_ (Vaswani et al., NeurIPS 2017).
+
+    This layer can be instantiated to attend the temporal or spatial dimension.

    Args:
        input_size (int): Input size.
        hidden_size (int): Dimension of the learned representations.
        ff_size (int): Units in the MLP after self attention.
        n_heads (int, optional): Number of parallel attention heads.
-        axis (str, optional): Dimension on which to apply attention to update the representations.
-        causal (bool, optional): Whether to causally mask the attention scores (can be `True` only if `axis` is `steps`).
+        axis (str, optional): Dimension on which to apply attention to update
+            the representations. Can be either 'time' or 'nodes'.
+            (default: :obj:`'time'`)
+        causal (bool, optional): If :obj:`True`, then causally mask attention
+            scores in temporal attention (has an effect only if :attr:`axis` is
+            :obj:`'time'`). (default: :obj:`True`)
        activation (str, optional): Activation function.
        dropout (float, optional): Dropout probability.
    """
+
    def __init__(self,
                 input_size,
                 hidden_size,
                 ff_size=None,
                 n_heads=1,
-                 axis='steps',
+                 axis='time',
                 causal=True,
                 activation='elu',
                 dropout=0.):
@@ -60,27 +68,32 @@ def __init__(self,

        self.activation = utils.get_functional_activation(activation)

-    def forward(self, x, mask=None):
+    def forward(self, x: Tensor, mask: Optional[Tensor] = None):
        """"""
        # x: [batch, steps, nodes, features]
-        x = self.skip_conn(x) + self.dropout(self.att(self.norm1(x), attn_mask=mask)[0])
+        x = self.skip_conn(x) + self.dropout(
+            self.att(self.norm1(x), attn_mask=mask)[0])
        x = x + self.mlp(x)
        return x


class SpatioTemporalTransformerLayer(nn.Module):
-    r"""
-    A TransformerLayer which attend both the spatial and temporal dimensions by stacking two `MultiHeadAttention` layers.
+    r"""A :class:`~tsl.nn.blocks.encoders.TransformerLayer` which attends both
+    the spatial and temporal dimensions by stacking two
+    :class:`~tsl.nn.base.MultiHeadAttention` layers.

    Args:
        input_size (int): Input size.
        hidden_size (int): Dimension of the learned representations.
        ff_size (int): Units in the MLP after self attention.
        n_heads (int, optional): Number of parallel attention heads.
-        causal (bool, optional): Whether to causally mask the attention scores (can be `True` only if `axis` is `steps`).
+        causal (bool, optional): If :obj:`True`, then causally mask attention
+            scores in temporal attention.
+            (default: :obj:`True`)
        activation (str, optional): Activation function.
        dropout (float, optional): Dropout probability.
    """
+
    def __init__(self,
                 input_size,
                 hidden_size,
@@ -95,7 +108,7 @@ def __init__(self,
                                               kdim=input_size,
                                               vdim=input_size,
                                               heads=n_heads,
-                                               axis='steps',
+                                               axis='time',
                                               causal=causal)

        self.spatial_att = MultiHeadAttention(embed_dim=hidden_size,
@@ -122,19 +135,18 @@ def __init__(self,

        self.dropout = nn.Dropout(dropout)

-    def forward(self, x, mask=None):
+    def forward(self, x: Tensor, mask: Optional[Tensor] = None):
        """"""
        # x: [batch, steps, nodes, features]
-
-        x = self.skip_conn(x) + self.dropout(self.temporal_att(self.norm1(x), attn_mask=mask)[0])
+        x = self.skip_conn(x) + self.dropout(
+            self.temporal_att(self.norm1(x), attn_mask=mask)[0])
        x = x + self.dropout(self.spatial_att(self.norm2(x), attn_mask=mask)[0])
        x = x + self.mlp(x)
        return x


class Transformer(nn.Module):
-    r"""
-    A stack of Transformer layers.
+    r"""A stack of Transformer layers.

    Args:
        input_size (int): Input size.
@@ -143,19 +155,25 @@ class Transformer(nn.Module):
        output_size (int, optional): Size of an optional linear readout.
        n_layers (int, optional): Number of Transformer layers.
        n_heads (int, optional): Number of parallel attention heads.
-        axis (str, optional): Dimension on which to apply attention to update the representations.
-        causal (bool, optional): Whether to causally mask the attention scores (can be `True` only if `axis` is `steps`).
+        axis (str, optional): Dimension on which to apply attention to update
+            the representations. Can be either 'time', 'nodes', or 'both'.
+            (default: :obj:`'time'`)
+        causal (bool, optional): If :obj:`True`, then causally mask attention
+            scores in temporal attention (has an effect only if :attr:`axis` is
+            :obj:`'time'` or :obj:`'both'`).
+            (default: :obj:`True`)
        activation (str, optional): Activation function.
        dropout (float, optional): Dropout probability.
    """
+
    def __init__(self,
                 input_size,
                 hidden_size,
                 ff_size=None,
                 output_size=None,
                 n_layers=1,
                 n_heads=1,
-                 axis='steps',
+                 axis='time',
                 causal=True,
                 activation='elu',
                 dropout=0.):
@@ -165,7 +183,7 @@ def __init__(self,
        if ff_size is None:
            ff_size = hidden_size

-        if axis in ['steps', 'nodes']:
+        if axis in ['time', 'nodes']:
            transformer_layer = partial(TransformerLayer, axis=axis)
        elif axis == 'both':
            transformer_layer = SpatioTemporalTransformerLayer
@@ -174,13 +192,14 @@ def __init__(self,

        layers = []
        for i in range(n_layers):
-            layers.append(transformer_layer(input_size=input_size if i == 0 else hidden_size,
-                                            hidden_size=hidden_size,
-                                            ff_size=ff_size,
-                                            n_heads=n_heads,
-                                            causal=causal,
-                                            activation=activation,
-                                            dropout=dropout))
+            layers.append(transformer_layer(
+                input_size=input_size if i == 0 else hidden_size,
+                hidden_size=hidden_size,
+                ff_size=ff_size,
+                n_heads=n_heads,
+                causal=causal,
+                activation=activation,
+                dropout=dropout))

        self.net = nn.Sequential(*layers)

@@ -189,7 +208,7 @@ def __init__(self,
        else:
            self.register_parameter('readout', None)

-    def forward(self, x):
+    def forward(self, x: Tensor):
        """"""
        x = self.net(x)
        if self.readout is not None:
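For reference, below is a minimal usage sketch of the API as changed in this diff. It assumes the classes are importable from tsl.nn.blocks.encoders.transformer (the module edited here), follows the [batch, steps, nodes, features] shape convention noted in the forward() comments, and uses illustrative sizes only.

import torch

from tsl.nn.blocks.encoders.transformer import Transformer, TransformerLayer

# Toy input: 8 samples, 12 time steps, 20 nodes, 16 features per node.
x = torch.randn(8, 12, 20, 16)

# Attention over the temporal axis only ('time' replaces the old 'steps').
temporal = TransformerLayer(input_size=16, hidden_size=32, axis='time')
h = temporal(x)  # expected shape: [8, 12, 20, 32]

# A stack attending both axes (axis='both' selects
# SpatioTemporalTransformerLayer) with a linear readout to 4 features.
model = Transformer(input_size=16,
                    hidden_size=32,
                    output_size=4,
                    n_layers=2,
                    axis='both')
y = model(x)  # expected shape: [8, 12, 20, 4]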