diff --git a/nbs/049_models.TST.ipynb b/nbs/049_models.TST.ipynb
index c8b87750..21842f86 100644
--- a/nbs/049_models.TST.ipynb
+++ b/nbs/049_models.TST.ipynb
@@ -71,8 +71,8 @@
     "* max_seq_len: useful to control the temporal resolution in long time series to avoid memory issues. Default. None.\n",
     "* d_model: total dimension of the model (number of features created by the model). Usual values: 128-1024. Default: 128.\n",
     "* n_heads: parallel attention heads. Usual values: 8-16. Default: 16.\n",
-    "* d_k: size of the learned linear projection of queries and keys in the MHA. Usual values: 16-512. Default: None -> (d_model/n_heads) = 32.\n",
-    "* d_v: size of the learned linear projection of values in the MHA. Usual values: 16-512. Default: None -> (d_model/n_heads) = 32.\n",
+    "* d_k: size of the learned linear projection of queries and keys in the MHA. Usual values: 8-64. Default: None -> (d_model/n_heads) = 8.\n",
+    "* d_v: size of the learned linear projection of values in the MHA. Usual values: 8-64. Default: None -> (d_model/n_heads) = 8.\n",
     "* d_ff: the dimension of the feedforward network model. Usual values: 256-4096. Default: 256.\n",
     "* dropout: amount of residual dropout applied in the encoder. Usual values: 0.-0.3. Default: 0.1.\n",
     "* activation: the activation function of intermediate layer, relu or gelu. Default: 'gelu'.\n",
@@ -218,7 +218,6 @@
     "    def __init__(self, q_len:int, d_model:int, n_heads:int, d_k:Optional[int]=None, d_v:Optional[int]=None, d_ff:int=256, dropout:float=0.1, \n",
     "                 activation:str=\"gelu\"):\n",
     "\n",
-    "        assert d_model // n_heads, f\"d_model ({d_model}) must be divisible by n_heads ({n_heads})\"\n",
     "        d_k = ifnone(d_k, d_model // n_heads)\n",
     "        d_v = ifnone(d_v, d_model // n_heads)\n",
     "\n",
@@ -320,8 +319,8 @@
     "        max_seq_len: useful to control the temporal resolution in long time series to avoid memory issues.\n",
     "        d_model: total dimension of the model (number of features created by the model)\n",
     "        n_heads: parallel attention heads.\n",
-    "        d_k: size of the learned linear projection of queries and keys in the MHA. Usual values: 16-512. Default: None -> (d_model/n_heads) = 32.\n",
-    "        d_v: size of the learned linear projection of values in the MHA. Usual values: 16-512. Default: None -> (d_model/n_heads) = 32.\n",
+    "        d_k: size of the learned linear projection of queries and keys in the MHA. Usual values: 8-64. Default: None -> (d_model/n_heads) = 8.\n",
+    "        d_v: size of the learned linear projection of values in the MHA. Usual values: 8-64. Default: None -> (d_model/n_heads) = 8.\n",
     "        d_ff: the dimension of the feedforward network model.\n",
     "        dropout: amount of residual dropout applied in the encoder.\n",
     "        act: the activation function of intermediate layer, relu or gelu.\n",
diff --git a/tsai/models/TST.py b/tsai/models/TST.py
index c8f51f36..abb03769 100644
--- a/tsai/models/TST.py
+++ b/tsai/models/TST.py
@@ -76,7 +76,6 @@ class _TSTEncoderLayer(Module):
     def __init__(self, q_len:int, d_model:int, n_heads:int, d_k:Optional[int]=None, d_v:Optional[int]=None, d_ff:int=256, dropout:float=0.1,
                  activation:str="gelu"):
 
-        assert d_model // n_heads, f"d_model ({d_model}) must be divisible by n_heads ({n_heads})"
         d_k = ifnone(d_k, d_model // n_heads)
         d_v = ifnone(d_v, d_model // n_heads)
 
@@ -142,8 +141,8 @@ def __init__(self, c_in:int, c_out:int, seq_len:int, max_seq_len:Optional[int]=N
         max_seq_len: useful to control the temporal resolution in long time series to avoid memory issues.
         d_model: total dimension of the model (number of features created by the model)
         n_heads: parallel attention heads.
-        d_k: size of the learned linear projection of queries and keys in the MHA. Usual values: 16-512. Default: None -> (d_model/n_heads) = 32.
-        d_v: size of the learned linear projection of values in the MHA. Usual values: 16-512. Default: None -> (d_model/n_heads) = 32.
+        d_k: size of the learned linear projection of queries and keys in the MHA. Usual values: 8-64. Default: None -> (d_model/n_heads) = 8.
+        d_v: size of the learned linear projection of values in the MHA. Usual values: 8-64. Default: None -> (d_model/n_heads) = 8.
         d_ff: the dimension of the feedforward network model.
         dropout: amount of residual dropout applied in the encoder.
         act: the activation function of intermediate layer, relu or gelu.
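
Sanity check for the corrected defaults: with the documented defaults d_model=128 and n_heads=16, the per-head projection size resolves to 128 // 16 = 8, so the old "= 32" claim would only hold for a larger configuration such as d_model=512 with n_heads=16. A minimal sketch, assuming the public TST class exported by tsai.models.TST with the defaults shown in this diff:

# Minimal sketch, assuming tsai's TST with defaults d_model=128, n_heads=16.
import torch
from tsai.models.TST import TST

model = TST(c_in=3, c_out=2, seq_len=96)  # d_k=d_v=None -> d_model // n_heads = 8
xb = torch.randn(8, 3, 96)                # (batch, variables, seq_len)
print(model(xb).shape)                    # expected: torch.Size([8, 2])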
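
On the deleted assert: d_model // n_heads is truthy for any d_model >= n_heads, so the check never actually enforced divisibility, and removing it drops a no-op rather than a real guard. A short standalone illustration (not part of the diff; the enforcing variant in the final comment is hypothetical):

# Illustrates why the removed assert was a no-op for a non-divisible pair.
d_model, n_heads = 130, 16
print(bool(d_model // n_heads))  # True  -> the removed assert would have passed
print(d_model % n_heads == 0)    # False -> 130 is not divisible by 16
# An assert that actually enforced divisibility would use modulo instead:
# assert d_model % n_heads == 0, f"d_model ({d_model}) must be divisible by n_heads ({n_heads})"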