Add a common rope impl (#107)

tengyifei · web-flow · commit ed8655d55f8e · 2025-02-12T15:50:48.000-08:00
This RoPE implementation supports the original RoPE frequencies and the frequency scaling used in Llama 3.1.
diff --git a/torchprime/rope/__init__.py b/torchprime/rope/__init__.py
diff --git a/torchprime/rope/rope.py b/torchprime/rope/rope.py
@@ -0,0 +1,74 @@
+"""
+Rotary Positional Embeddings (RoPE) implementation.
+Reference: https://github.com/adalkiran/llama-nuts-and-bolts/blob/main/docs/10-ROPE-ROTARY-POSITIONAL-EMBEDDINGS.md
+"""
+
+import math
+from dataclasses import dataclass
+
+import torch
+
+
+@dataclass
+class RopeScaling:
+  """
+  RoPE scaling parameters. The defaults are what was selected in Llama 3.1.
+
+  These values are consumed by `llama3_rope_frequencies`, which keeps
+  short-wavelength frequencies unchanged, divides long-wavelength frequencies
+  by `factor`, and interpolates between the two for wavelengths in between.
+  """
+
+  # Divisor applied to frequencies whose wavelength is longer than
+  # `original_context_len / low_freq_factor`.
+  factor: float = 8.0
+  # `original_context_len / low_freq_factor` is the wavelength above which a
+  # frequency is fully divided by `factor`.
+  low_freq_factor: float = 1.0
+  # `original_context_len / high_freq_factor` is the wavelength below which a
+  # frequency is left unchanged.
+  high_freq_factor: float = 4.0
+  # Context length used as the numerator of both wavelength thresholds.
+  original_context_len: int = 8192
+
+
+def default_rope_frequencies(
+  head_dim: int,
+  theta: float = 10000.0,
+) -> torch.Tensor:
+  """
+  Returns the unscaled RoPE frequencies, as used by e.g. Llama 2.
+
+  Entry `i` of the result is `1 / theta ** (2 * i / head_dim)`.
+
+  Args:
+      head_dim: the size of a single attention head.
+      theta: the base of the progression; a larger value yields smaller
+        frequencies, i.e. embeddings that rotate more slowly.
+  Returns:
+      A 1-D float32 tensor holding one frequency per even index in
+      `range(0, head_dim, 2)`.
+  """
+  # Even indices 0, 2, 4, ... are built as integers and then cast to float.
+  even_indices = torch.arange(0, head_dim, 2, dtype=torch.int64)
+  exponents = even_indices.float() / head_dim
+  return 1.0 / (theta**exponents)
+
+
+def llama3_rope_frequencies(
+  head_dim: int,
+  theta: float = 10000.0,
+  scaling: RopeScaling | None = None,
+) -> torch.Tensor:
+  """
+  Computes Llama 3 and 3.1 RoPE frequencies. In Llama 3.1, RoPE frequencies
+  may be scaled and interpolated as we move beyond the original context length.
+
+  Each frequency is handled according to its wavelength `2 * pi / freq`:
+    - shorter than `original_context_len / high_freq_factor`: kept unchanged;
+    - longer than `original_context_len / low_freq_factor`: divided by `factor`;
+    - otherwise: blended between the scaled and the unscaled value.
+
+  Args:
+      head_dim: the size of a single attention head.
+      theta: a hyperparameter controlling how fast the embeddings rotate.
+      scaling: the scaling parameters. If None, the unscaled frequencies from
+        `default_rope_frequencies` are returned as is.
+  Returns:
+      The (possibly scaled) frequencies for the RoPE embeddings.
+  Raises:
+      AssertionError: if the low-frequency wavelength threshold is not greater
+        than the high-frequency wavelength threshold.
+  """
+  base_freqs = default_rope_frequencies(head_dim=head_dim, theta=theta)
+  if scaling is None:
+    return base_freqs
+
+  low_freq_wavelen = scaling.original_context_len / scaling.low_freq_factor
+  high_freq_wavelen = scaling.original_context_len / scaling.high_freq_factor
+
+  # Raised explicitly rather than through an `assert` statement so that the
+  # check still runs under `python -O`; otherwise misordered thresholds would
+  # be silently accepted. The exception type and message are kept so existing
+  # callers observe the same failure.
+  if not (low_freq_wavelen > high_freq_wavelen):
+    raise AssertionError(
+      f"low_freq_wavelen {low_freq_wavelen} must be greater than "
+      f"high_freq_wavelen {high_freq_wavelen}"
+    )
+
+  wavelen = 2 * math.pi / base_freqs
+  is_high_freq = wavelen < high_freq_wavelen
+  is_low_freq = wavelen > low_freq_wavelen
+
+  # High frequencies are left alone; low frequencies are divided by `factor`.
+  scaled_freqs = torch.where(is_low_freq, base_freqs / scaling.factor, base_freqs)
+
+  # Everything in between is interpolated. `smooth_factor` is 0 at the
+  # low-frequency threshold (fully scaled) and 1 at the high-frequency
+  # threshold (unscaled).
+  smooth_factor = (scaling.original_context_len / wavelen - scaling.low_freq_factor) / (
+    scaling.high_freq_factor - scaling.low_freq_factor
+  )
+  scaled_part = (1 - smooth_factor) * base_freqs / scaling.factor
+  smoothed_freqs = scaled_part + smooth_factor * base_freqs
+
+  # Logical AND of the two negated masks selects the in-between band,
+  # including both threshold wavelengths themselves.
+  is_medium_freq = ~is_high_freq & ~is_low_freq
+  return torch.where(is_medium_freq, smoothed_freqs, scaled_freqs)
diff --git a/torchprime/tests/test_rope.py b/torchprime/tests/test_rope.py
@@ -0,0 +1,108 @@
+import math
+
+import pytest
+import torch
+from transformers import PretrainedConfig
+from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
+
+from torchprime.rope import rope
+
+# Scaling parameters passed to our implementation by the llama3 tests below.
+# The values mirror the `rope_scaling` dict given to the HF implementation and
+# the constants hard-coded in `_llama3_reference_apply_scaling`.
+LLAMA3_SCALING = rope.RopeScaling(
+  factor=8,
+  low_freq_factor=1,
+  high_freq_factor=4,
+  original_context_len=8192,
+)
+
+
+@pytest.mark.parametrize(
+  "hidden_size, num_attention_heads, theta",
+  [(4096, 32, 500000.0), (16384, 128, 500000.0), (65536, 128, 500000.0)],
+)
+class TestRope:
+  """Compares our RoPE frequencies with HF transformers and a reference loop."""
+
+  def test_default_rope(self, hidden_size, num_attention_heads, theta):
+    """The unscaled frequencies match the HF "default" RoPE init function."""
+    config = PretrainedConfig.from_dict(
+      {
+        "hidden_size": hidden_size,
+        "num_attention_heads": num_attention_heads,
+        "rope_theta": theta,
+      }
+    )
+    expected, attention_scale = ROPE_INIT_FUNCTIONS["default"](config)
+
+    actual = rope.default_rope_frequencies(
+      head_dim=hidden_size // num_attention_heads,
+      theta=theta,
+    )
+
+    assert attention_scale == 1
+    torch.testing.assert_close(actual, expected)
+
+  def test_llama3_rope_against_hf(self, hidden_size, num_attention_heads, theta):
+    """The scaled frequencies match the HF "llama3" RoPE init function."""
+    config = PretrainedConfig.from_dict(
+      {
+        "hidden_size": hidden_size,
+        "num_attention_heads": num_attention_heads,
+        "rope_theta": theta,
+        "rope_scaling": {
+          "factor": 8,
+          "low_freq_factor": 1,
+          "high_freq_factor": 4,
+          "original_max_position_embeddings": 8192,
+        },
+      }
+    )
+    expected, attention_scale = ROPE_INIT_FUNCTIONS["llama3"](config, device="cpu")
+
+    actual = rope.llama3_rope_frequencies(
+      head_dim=hidden_size // num_attention_heads,
+      theta=theta,
+      scaling=LLAMA3_SCALING,
+    )
+
+    assert attention_scale == 1
+    torch.testing.assert_close(actual, expected)
+
+  def test_llama3_rope_against_reference(self, hidden_size, num_attention_heads, theta):
+    """The scaled frequencies match the per-frequency reference loop."""
+    head_dim = hidden_size // num_attention_heads
+    unscaled = rope.default_rope_frequencies(head_dim=head_dim, theta=theta)
+    expected = _llama3_reference_apply_scaling(unscaled)
+
+    actual = rope.llama3_rope_frequencies(
+      head_dim=head_dim,
+      theta=theta,
+      scaling=LLAMA3_SCALING,
+    )
+
+    torch.testing.assert_close(actual, expected)
+
+
+def _llama3_reference_apply_scaling(freqs: torch.Tensor):
+  """
+  Per-frequency reference for the Llama 3 RoPE scaling, following
+  https://github.com/karpathy/llm.c/blob/7ecd8906afe6ed7a2b2cdb731c042f26d525b820/train_llama3.py#L80
+
+  Args:
+      freqs: the unscaled frequencies, iterated one element at a time.
+  Returns:
+      A new tensor of scaled frequencies with the dtype and device of `freqs`.
+  """
+  # Values obtained from grid search
+  scale_factor = 8
+  low_freq_factor = 1
+  high_freq_factor = 4
+  old_context_len = 8192  # original llama3 length
+
+  longest_unscaled_wavelen = old_context_len / high_freq_factor
+  shortest_scaled_wavelen = old_context_len / low_freq_factor
+
+  def rescale(freq):
+    """Scales a single frequency according to its wavelength."""
+    wavelen = 2 * math.pi / freq
+    if wavelen < longest_unscaled_wavelen:
+      return freq
+    if wavelen > shortest_scaled_wavelen:
+      return freq / scale_factor
+    assert shortest_scaled_wavelen != longest_unscaled_wavelen
+    smooth = (old_context_len / wavelen - low_freq_factor) / (
+      high_freq_factor - low_freq_factor
+    )
+    return (1 - smooth) * freq / scale_factor + smooth * freq
+
+  return torch.tensor(
+    [rescale(freq) for freq in freqs], dtype=freqs.dtype, device=freqs.device
+  )