
Commit bc83804

Peter Szemraj (pszemraj) authored

Improve training config (#11)
This PR aims to improve the training args used for pretraining MPNet from several angles:

1. Improved default values for the training args[^1], with updates to some so they more closely follow the hyperparameters in the MPNet paper.
2. Clearer, more succinct descriptions of what the core args are/do and how to use them.
3. Addition of A) new options for some existing training args[^2] and B) new CLI args exposing/integrating some previously hardcoded parameters[^3] so they can be adjusted by the user.

[^1]: e.g. gradient clipping, which has become standard during pretraining since the original repo came out.
[^2]: added support for the new activation fns "silu" and "relu2".
[^3]: the relative attention hyperparameters `relative_attention_num_buckets` and `max_distance` were hardcoded to values for a 512 context length; they should be settable by the user, with reasonable defaults.

---------

Signed-off-by: peter szemraj <peterszemraj@gmail.com>
Signed-off-by: Peter Szemraj <peterszemraj+dev@gmail.com>
Co-authored-by: Peter Szemraj <peterszemraj+dev@gmail.com>
1 parent cec50e3 commit bc83804
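For orientation, here is a minimal sketch of how the options described above might be exposed on the command line. The flag names, defaults, and help strings are assumptions inferred from the attribute names used in the diffs below (`args.activation_fn`, `args.relative_attention_num_buckets`, `args.relative_attention_max_distance`), not the repository's actual argument parser.

```python
# Hypothetical CLI wiring for the new options; flag names and defaults are
# assumptions for illustration, not the repo's real pretraining script.
import argparse

parser = argparse.ArgumentParser(description="MPNet pretraining args (sketch)")
parser.add_argument(
    "--activation-fn",
    default="gelu",
    choices=["relu", "gelu", "silu", "relu2"],
    help="activation function used in the encoder layers",
)
parser.add_argument(
    "--relative-attention-num-buckets",
    type=int,
    default=None,
    help="number of relative attention buckets; unset = scale with context length",
)
parser.add_argument(
    "--relative-attention-max-distance",
    type=int,
    default=None,
    help="max relative distance in tokens; unset = scale with context length",
)
args = parser.parse_args()
print(args.activation_fn, args.relative_attention_num_buckets, args.relative_attention_max_distance)
```

Leaving the relative-attention flags unset lets the encoder derive them from the context length, as shown in the sentence_encoder.py diff below.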

File tree

8 files changed: +437 −82 lines changed


annotated_mpnet/data/mpnet_data.py

Lines changed: 4 additions & 3 deletions
@@ -3,8 +3,9 @@
 as the data collator
 """
 
-import os
 import logging
+import os
+import random
 from typing import Dict, Iterator, Sized
 
 from rich.logging import RichHandler
@@ -15,12 +16,12 @@
 )
 LOGGER = logging.getLogger(__name__)
 
+
 import numpy as np
 import torch
+from datasets import load_dataset
 from torch.utils.data import Sampler
 from transformers import PreTrainedTokenizer
-from datasets import load_dataset
-import random
 
 from annotated_mpnet.utils import utils
 from annotated_mpnet.utils.perm_utils_fast import make_span_perm

annotated_mpnet/modeling/mpnet_for_pretraining.py

Lines changed: 6 additions & 0 deletions
@@ -72,6 +72,8 @@ def __init__(self, args, tokenizer) -> None:
             encoder_normalize_before=True,
             activation_fn=args.activation_fn,
             normalize_before=args.normalize_before,
+            relative_attention_num_buckets=args.relative_attention_num_buckets,
+            relative_attention_max_distance=args.relative_attention_max_distance,
         )
 
         # Add the language modeling head
@@ -534,6 +536,10 @@ def make_query_and_content_mask(
        [ 0 0 0 0 1 1 1 0 0 0 ]
        [ 0 0 0 0 1 1 1 0 0 0 ]
        [ 0 0 0 0 1 1 1 0 0 0 ]
+
+        Note: This function is designed to scale automatically with sequence length as it's
+        matrix-based and constructs masks based on the provided seq_len and pred_size.
+        There's no need to modify this function when changing context length.
        """
 
        # Define helper function to keep things organized

annotated_mpnet/transformer_modules/sentence_encoder.py

Lines changed: 58 additions & 8 deletions
@@ -71,7 +71,8 @@ def __init__(
         embed_scale: float = None,
         freeze_embeddings: bool = False,
         n_trans_layers_to_freeze: int = 0,
-        relative_attention_num_buckets: int = 32,
+        relative_attention_num_buckets: int = None,
+        relative_attention_max_distance: int = None,
         normalize_before: bool = False,
         export: bool = False,
     ) -> None:
@@ -115,6 +116,8 @@ def __init__(
                 This is probably only useful for finetuning
             relative_attention_num_buckets: the number of buckets to add to the relative atttention
                 portion of the attention mechanism
+            relative_attention_max_distance: the maximum distance (in tokens) to consider in the relative
+                attention mechanism
             normalize_before: boolean dictating if a layer norm should be applied before the encoder
                 layers
             export: boolean dictating ONNX exporting, which I think we won't be using
@@ -160,7 +163,27 @@ def __init__(
         )
 
         # Set up relative attention bias for the attention mechanism
-        self.relative_attention_num_buckets = relative_attention_num_buckets
+        # and compute params for relative attention if they are not specified
+        base_context = 512
+        base_buckets = 32  # Default buckets for 512 context length is 32
+        base_max_distance = 128  # Default max distance for 512 context length is 128
+
+        if relative_attention_num_buckets is None:
+            # linear scaling of num buckets based on seq len (round up to nearest 8)
+            scaled_buckets = max(32, int(base_buckets * max_seq_len / base_context))
+            self.relative_attention_num_buckets = (scaled_buckets + 7) // 8 * 8
+        else:
+            self.relative_attention_num_buckets = relative_attention_num_buckets
+
+        if relative_attention_max_distance is None:
+            # linear scaling of max distance based on seq len (round up to nearest 8)
+            scaled_max_distance = max(
+                128, int(base_max_distance * max_seq_len / base_context)
+            )
+            self.relative_attention_max_distance = (scaled_max_distance + 7) // 8 * 8
+        else:
+            self.relative_attention_max_distance = relative_attention_max_distance
+
         self.relative_attention_bias = nn.Embedding(
             self.relative_attention_num_buckets, num_attention_heads, padding_idx=None
         )
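To make the new scaling behavior concrete, the following standalone sketch reproduces the formula added above and prints the values it yields for a few context lengths; at 512 it recovers the original MPNet defaults of 32 buckets and a max distance of 128.

```python
# Standalone reproduction of the scaling logic added above, for illustration only.
def scaled_relative_attention_params(max_seq_len: int,
                                     base_context: int = 512,
                                     base_buckets: int = 32,
                                     base_max_distance: int = 128):
    """Linearly scale buckets/max distance with context length, rounded up to a multiple of 8."""
    scaled_buckets = max(base_buckets, int(base_buckets * max_seq_len / base_context))
    num_buckets = (scaled_buckets + 7) // 8 * 8
    scaled_max_distance = max(base_max_distance, int(base_max_distance * max_seq_len / base_context))
    max_distance = (scaled_max_distance + 7) // 8 * 8
    return num_buckets, max_distance


for ctx in (512, 1024, 2048, 4096):
    print(f"{ctx} -> {scaled_relative_attention_params(ctx)}")
# 512 -> (32, 128)
# 1024 -> (64, 256)
# 2048 -> (128, 512)
# 4096 -> (256, 1024)
```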
@@ -259,7 +282,7 @@ def forward(
 
         # Compute the relative attention bias
         positions_bias = self.compute_position_bias(
-            x, self.relative_attention_num_buckets
+            x, self.relative_attention_num_buckets, self.relative_attention_max_distance
         )
 
         # If the user wants ALL hidden states, we keep track of it here
@@ -293,10 +316,18 @@ def forward(
 
         return inner_states, sentence_rep
 
-    # Helper function below
-    def compute_position_bias(self, x, num_buckets):
+    def compute_position_bias(self, x, num_buckets, max_distance):
         """
-        Helper function that computes the position bias based on the number of buckets provided
+        Computes the relative position bias for self-attention.
+
+        Args:
+            x: Input tensor with shape (seq_len, batch_size, embed_dim).
+            num_buckets: Number of buckets to use for relative position encoding.
+            max_distance: The maximum distance to consider for relative positions.
+
+        Returns:
+            A tensor representing the relative position bias, with shape
+            (batch_size * num_heads, qlen, klen).
         """
 
         # Get the batch size, q and k len
@@ -307,7 +338,9 @@ def compute_position_bias(self, x, num_buckets):
         relative_position = memory_position - context_position
 
         rp_bucket = self.relative_position_bucket(
-            relative_position, num_buckets=num_buckets
+            relative_position,
+            num_buckets=num_buckets,
+            max_distance=max_distance,
         )
         rp_bucket = rp_bucket.to(x.device)
         values = self.relative_attention_bias(rp_bucket)
@@ -317,7 +350,24 @@ def compute_position_bias(self, x, num_buckets):
         return values
 
     @staticmethod
-    def relative_position_bucket(relative_position, num_buckets=32, max_distance=128):
+    def relative_position_bucket(
+        relative_position, num_buckets: int = 32, max_distance: int = 128
+    ):
+        """
+        Computes the relative position bias for a given tensor of relative positions.
+        Defaults are for original MPNet @ context length 512.
+
+        Args:
+            relative_position: Tensor of shape (bsz, qlen, klen) containing the relative
+                positions between the queries and keys.
+            num_buckets: The number of buckets to use for the relative position bias.
+                Defaults to 32.
+            max_distance: The maximum distance to consider when computing the relative
+                position bias. Defaults to 128.
+
+        Returns:
+            A tensor of shape (bsz, qlen, klen) containing the relative position biases.
+        """
         ret = 0
         n = -relative_position
 
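The diff shows only the first lines of `relative_position_bucket` (`ret = 0`, `n = -relative_position`). For reference, below is a self-contained sketch of the standard T5-style bidirectional bucketing that MPNet's relative attention follows; the repository's exact body may differ slightly.

```python
# Reference sketch of T5-style bidirectional relative-position bucketing,
# consistent with the first two lines visible in the diff above. Not claimed
# to be byte-for-byte identical to this repo's implementation.
import math
import torch


def relative_position_bucket(relative_position, num_buckets: int = 32, max_distance: int = 128):
    ret = 0
    n = -relative_position
    # Half the buckets encode direction (key before vs. after the query)
    num_buckets //= 2
    ret += (n < 0).to(torch.long) * num_buckets
    n = torch.abs(n)
    # Small offsets each get their own bucket; larger ones are binned logarithmically
    max_exact = num_buckets // 2
    is_small = n < max_exact
    val_if_large = max_exact + (
        torch.log(n.float() / max_exact)
        / math.log(max_distance / max_exact)
        * (num_buckets - max_exact)
    ).to(torch.long)
    val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
    ret += torch.where(is_small, n, val_if_large)
    return ret


# Tiny demo: bucket indices for relative positions -4..4
rel = torch.arange(-4, 5).unsqueeze(0)
print(relative_position_bucket(rel))
```

Because half the buckets encode direction and the rest encode log-scaled distance up to `max_distance`, scaling `num_buckets` and `max_distance` with the context length keeps distant tokens distinguishable at longer contexts.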
annotated_mpnet/transformer_modules/sentence_encoder_layer.py

Lines changed: 1 addition & 3 deletions
@@ -56,9 +56,7 @@ def __init__(
                 forward pass
             attention_dropout: similar to above, but is the dropout prob within the self-attention
                 mechanism
-            activation_fn: the activation function you will be using in this network. Although ReLU
-                is the default, more and more evidence points towards GELU being better for large
-                NLP-based transformers
+            activation_fn: the activation function you will be using in this network.
             add_bias_kv: boolean that dictates whether or not to add a bias parameter to the K, V
                 matrices in the self-attention mechanism
             add_zero_attn: boolean that dictate whether or not to add zero attention to the
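Since the commit message adds "silu" and "relu2" as selectable activation functions, here is a hypothetical sketch of the kind of lookup such an option maps to, assuming "relu2" denotes squared ReLU; the repository's actual helper may be named and organized differently.

```python
# Illustrative activation lookup covering the newly mentioned "silu" and "relu2"
# options. Assumes "relu2" means squared ReLU; this is a sketch, not the repo's helper.
import torch
import torch.nn.functional as F


def get_activation_fn(name: str):
    if name == "relu":
        return F.relu
    if name == "gelu":
        return F.gelu
    if name == "silu":
        return F.silu  # a.k.a. swish: x * sigmoid(x)
    if name == "relu2":
        return lambda x: torch.square(F.relu(x))  # squared ReLU
    raise ValueError(f"unknown activation: {name}")


x = torch.randn(4)
print(get_activation_fn("relu2")(x))
```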
