diff --git a/nbs/049_models.TST.ipynb b/nbs/049_models.TST.ipynb
index c8b87750..21842f86 100644
--- a/nbs/049_models.TST.ipynb
+++ b/nbs/049_models.TST.ipynb
@@ -71,8 +71,8 @@
     "* max_seq_len: useful to control the temporal resolution in long time series to avoid memory issues. Default. None.\n",
     "* d_model: total dimension of the model (number of features created by the model). Usual values: 128-1024. Default: 128.\n",
     "* n_heads: parallel attention heads. Usual values: 8-16. Default: 16.\n",
-    "* d_k: size of the learned linear projection of queries and keys in the MHA. Usual values: 16-512. Default: None -> (d_model/n_heads) = 32.\n",
-    "* d_v: size of the learned linear projection of values in the MHA. Usual values: 16-512. Default: None -> (d_model/n_heads) = 32.\n",
+    "* d_k: size of the learned linear projection of queries and keys in the MHA. Usual values: 8-64. Default: None -> (d_model/n_heads) = 8.\n",
+    "* d_v: size of the learned linear projection of values in the MHA. Usual values: 8-64. Default: None -> (d_model/n_heads) = 8.\n",
     "* d_ff: the dimension of the feedforward network model. Usual values: 256-4096. Default: 256.\n",
     "* dropout: amount of residual dropout applied in the encoder. Usual values: 0.-0.3. Default: 0.1.\n",
     "* activation: the activation function of intermediate layer, relu or gelu. Default: 'gelu'.\n",
@@ -218,7 +218,6 @@
     "    def __init__(self, q_len:int, d_model:int, n_heads:int, d_k:Optional[int]=None, d_v:Optional[int]=None, d_ff:int=256, dropout:float=0.1, \n",
     "                 activation:str=\"gelu\"):\n",
     "\n",
-    "        assert d_model // n_heads, f\"d_model ({d_model}) must be divisible by n_heads ({n_heads})\"\n",
     "        d_k = ifnone(d_k, d_model // n_heads)\n",
     "        d_v = ifnone(d_v, d_model // n_heads)\n",
     "\n",
@@ -320,8 +319,8 @@
     "        max_seq_len: useful to control the temporal resolution in long time series to avoid memory issues.\n",
     "        d_model: total dimension of the model (number of features created by the model)\n",
     "        n_heads: parallel attention heads.\n",
-    "        d_k: size of the learned linear projection of queries and keys in the MHA. Usual values: 16-512. Default: None -> (d_model/n_heads) = 32.\n",
-    "        d_v: size of the learned linear projection of values in the MHA. Usual values: 16-512. Default: None -> (d_model/n_heads) = 32.\n",
+    "        d_k: size of the learned linear projection of queries and keys in the MHA. Usual values: 8-64. Default: None -> (d_model/n_heads) = 8.\n",
+    "        d_v: size of the learned linear projection of values in the MHA. Usual values: 8-64. Default: None -> (d_model/n_heads) = 8.\n",
     "        d_ff: the dimension of the feedforward network model.\n",
     "        dropout: amount of residual dropout applied in the encoder.\n",
     "        act: the activation function of intermediate layer, relu or gelu.\n",
diff --git a/tsai/models/TST.py b/tsai/models/TST.py
index c8f51f36..abb03769 100644
--- a/tsai/models/TST.py
+++ b/tsai/models/TST.py
@@ -76,7 +76,6 @@ class _TSTEncoderLayer(Module):
     def __init__(self, q_len:int, d_model:int, n_heads:int, d_k:Optional[int]=None, d_v:Optional[int]=None, d_ff:int=256, dropout:float=0.1,
                  activation:str="gelu"):
 
-        assert d_model // n_heads, f"d_model ({d_model}) must be divisible by n_heads ({n_heads})"
         d_k = ifnone(d_k, d_model // n_heads)
         d_v = ifnone(d_v, d_model // n_heads)
 
@@ -142,8 +141,8 @@ def __init__(self, c_in:int, c_out:int, seq_len:int, max_seq_len:Optional[int]=N
         max_seq_len: useful to control the temporal resolution in long time series to avoid memory issues.
         d_model: total dimension of the model (number of features created by the model)
         n_heads: parallel attention heads.
-        d_k: size of the learned linear projection of queries and keys in the MHA. Usual values: 16-512. Default: None -> (d_model/n_heads) = 32.
-        d_v: size of the learned linear projection of values in the MHA. Usual values: 16-512. Default: None -> (d_model/n_heads) = 32.
+        d_k: size of the learned linear projection of queries and keys in the MHA. Usual values: 8-64. Default: None -> (d_model/n_heads) = 8.
+        d_v: size of the learned linear projection of values in the MHA. Usual values: 8-64. Default: None -> (d_model/n_heads) = 8.
         d_ff: the dimension of the feedforward network model.
         dropout: amount of residual dropout applied in the encoder.
         act: the activation function of intermediate layer, relu or gelu.
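
Sanity check for the corrected defaults: with the documented defaults d_model=128 and n_heads=16, the per-head projection size resolves to 128 // 16 = 8, so the old "= 32" claim would only hold for a larger configuration such as d_model=512 with n_heads=16. A minimal sketch, assuming the public TST class exported by tsai.models.TST with the defaults shown in this diff:

# Minimal sketch, assuming tsai's TST with defaults d_model=128, n_heads=16.
import torch
from tsai.models.TST import TST

model = TST(c_in=3, c_out=2, seq_len=96)  # d_k=d_v=None -> d_model // n_heads = 8
xb = torch.randn(8, 3, 96)                # (batch, variables, seq_len)
print(model(xb).shape)                    # expected: torch.Size([8, 2])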
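
On the deleted assert: d_model // n_heads is truthy for any d_model >= n_heads, so the check never actually enforced divisibility, and removing it drops a no-op rather than a real guard. A short standalone illustration (not part of the diff; the enforcing variant in the final comment is hypothetical):

# Illustrates why the removed assert was a no-op for a non-divisible pair.
d_model, n_heads = 130, 16
print(bool(d_model // n_heads))  # True  -> the removed assert would have passed
print(d_model % n_heads == 0)    # False -> 130 is not divisible by 16
# An assert that actually enforced divisibility would use modulo instead:
# assert d_model % n_heads == 0, f"d_model ({d_model}) must be divisible by n_heads ({n_heads})"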