
Commit b685bc7

Merge pull request #4 from tillahoffmann/sampling
Use uniform sampled softmax and add `<START>` token.
2 parents 6b1676d + d94836c

13 files changed: +230 -886 lines

.gitignore (2 additions, 1 deletion)

@@ -6,8 +6,9 @@ __pycache__/
 .vscode
 *.egg-info
 *.ipynb
-*.ipynb-checkpoint
+*.ipynb_checkpoints
 /data/
 ~*
 htmlcov/
 playground/
+workspace/

Makefile (13 additions, 1 deletion)

@@ -1,4 +1,4 @@
-.PHONY : tests
+.PHONY : experiments tests
 
 tests :
 	pytest tests --cov=trecs --cov-report=term-missing -v
@@ -21,3 +21,15 @@ data/spotify_million_playlist_dataset/md5sums.check : data/spotify_million_playl
 data/mpd.db : data/spotify_million_playlist_dataset/md5sums.check
 	# Build the database.
 	python -m trecs.scripts.build_db data/mpd.db data/spotify_million_playlist_dataset/data/mpd.slice.*.json
+
+# Training.
+
+WORKDIR ?= workspace
+MPD_PATH ?= data/mpd.db
+EXPERIMENT_SETUPS = $(filter-out $(wildcard src/trecs/experiments/*/_*.py),$(wildcard src/trecs/experiments/*/*.py))
+EXPERIMENT_OUTPUTS = $(addprefix ${WORKDIR}/,${EXPERIMENT_SETUPS:src/trecs/experiments/%.py=%})
+
+experiments : ${EXPERIMENT_OUTPUTS}
+
+${EXPERIMENT_OUTPUTS} : ${WORKDIR}/% : src/trecs/experiments/%.py data/mpd.db
+	MPD=${MPD_PATH} python -m trecs.scripts.train $@ $<
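With these rules, each setup module `src/trecs/experiments/<group>/<name>.py` (underscore-prefixed files such as the `_base.py` below are filtered out) maps to one output directory `${WORKDIR}/<group>/<name>`, and `make experiments` runs `trecs.scripts.train` once per setup with the database path passed via the `MPD` environment variable; `WORKDIR` and `MPD_PATH` can be overridden on the command line.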

README.md (0 additions, 1 deletion)

@@ -48,7 +48,6 @@ We directly sample from the next-token distribution with top-$k$ sampling with u
 
 ## 🚀 Next Steps
 
-* We should prepend a special `<start>` token to all playlists because the first token is never used as a label.
 * The sampled softmax cross-entropy is biased due to the non-linearity in the denominator. We should be able to apply a low-order bias correction to get a better estimate, although it remains to be established if the bias affects the gradients. The bias leads to an *optimistic* estimate of the perplexity (see appendix for details). First-order bias correction is definitely feasible for the loss.
 * The model can be readily extended to include album, artist, and stylistic coherence as well as conditioning on a representation of user taste or expressed user preference for a particular session—future work.
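The bias in the first remaining bullet follows from Jensen's inequality; a sketch in ad-hoc notation (the symbols $z_c$, $C$, $S$ are ours, not the appendix's): with logits $z_1, \dots, z_C$ and $S$ uniformly sampled indices $i_s$, the estimator

$$\hat{Z} = \frac{C}{S} \sum_{s=1}^{S} \exp z_{i_s}$$

is unbiased for the normalizer $Z = \sum_{c=1}^{C} \exp z_c$, but the loss uses $\log \hat{Z}$, and concavity of the logarithm gives $\mathbb{E}[\log \hat{Z}] \le \log \mathbb{E}[\hat{Z}] = \log Z$. The sampled loss $-z_y + \log \hat{Z}$ therefore underestimates the true cross-entropy on average, which is why the reported perplexity is optimistic.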

pyproject.toml (2 additions, 1 deletion)

@@ -17,7 +17,6 @@ dependencies = [
     "orbax-checkpoint>=0.11.20",
     "pandas>=2.3.1",
     "pydantic>=2.11.7",
-    "pydantic-ai>=0.4.11",
     "python-dotenv>=1.1.1",
     "tensorboard>=2.20.0",
     "tensorboardx>=2.6.4",
@@ -29,6 +28,8 @@ dependencies = [
 dev = [
     "black>=25.1.0",
     "jupyter>=1.1.1",
+    "jupytext>=1.17.2",
+    "localscope>=0.2.5",
     "matplotlib>=3.10.5",
     "pyright>=1.1.403",
     "pytest>=8.4.1",
src/trecs/experiments/decoder_only/__init__.py (new file, 6 additions)

@@ -0,0 +1,6 @@
+from ._base import DecoderOnlyExperiment
+
+
+__all__ = [
+    "DecoderOnlyExperiment",
+]

src/trecs/experiments/decoder_only.py renamed to src/trecs/experiments/decoder_only/_base.py (50 additions, 10 deletions)

@@ -10,10 +10,10 @@
 from jax import numpy as jnp
 import pydantic
 import sqlite3
-from typing import cast
-from .util import Experiment
-from ..models import PlaylistDecoder
-from ..data import (
+from typing import cast, Literal
+from ..util import Experiment
+from ...models import PlaylistDecoder
+from ...data import (
     Sqlite3Dataset,
     SELECT_DISTINCT_TRACK_IDS_BY_SPLIT,
     SELECT_PLAYLISTS_BY_SPLIT,
@@ -24,7 +24,11 @@
     BatchTransform,
     Encoder,
 )
-from ..util import sampled_dot_cross_entropy_with_integer_labels, evaluate_eop_loss_mask
+from ...util import (
+    sampled_dot_cross_entropy_with_integer_labels_and_label_in_denominator,
+    sampled_dot_cross_entropy_with_integer_labels_uniform,
+    evaluate_eop_loss_mask,
+)
 
 
 class DecoderOnlyExperiment(Experiment):
@@ -33,11 +37,18 @@ class DecoderOnlyExperiment(Experiment):
     num_heads: int
     num_features: int
     num_hidden: int
+    loss_function: Literal[
+        "label_in_denominator",
+        "uniform",
+    ]
     dropout: float = pydantic.Field(ge=0, le=1)
     num_tracks: int | None
     unk_proba: float = pydantic.Field(ge=0, le=1)
     weight_decay: float = pydantic.Field(ge=0)
+
+    start_token: int | None = None
     eop_token: int | None = None
+    unk_token: int | None = None
     track_encoder: Encoder | None = None
 
     # Because the `Encoder` is not a standard class.
@@ -78,7 +89,7 @@ def create_data_source(self, split: str) -> RandomAccessDataSource:
             ON ptm.track_id = tracks.id
             WHERE ptm.playlist_id = :id
             ORDER BY ptm.pos
-            LIMIT :context_length + 1
+            LIMIT :context_length
             """,
             {"split": split},
             {"context_length": self.context_length},
@@ -90,7 +101,14 @@ def create_data_loader(
     ) -> DataLoader:
         assert self.track_encoder, "Create track encoder first."
         operations = [
-            # {START}: {"pos": [0, 1, ...], "track_id": [43, 7, ...]}
+            # {INPUT}: {"pos": [0, 1, ...], "track_id": [43, 7, ...]}
+            # Inject a start token.
+            LambdaMap[dict, dict](
+                lambda x: {
+                    "track_id": ["<START>", *x["track_id"]],
+                    "pos": list(range(len(x["pos"]) + 1)),
+                }
+            ),
             # Encode tracks and truncate to the maximum context length:
             # {"pos": [0, 1, ...], "track_id": [0, 1, ...]}
             LambdaMap[dict, dict](
@@ -108,7 +126,11 @@
             # Batch records: [{"track_id": [0, 1], ...}, {"track_id": [4, 5], ...}, ...]
             BatchTransform(self.batch_size, on_short="drop"),
             # Pad values to the same length.
-            LambdaMap(pad_batch, fill_value={"track_id": self.eop_token, "pos": 0}),
+            LambdaMap(
+                pad_batch,
+                fill_value={"track_id": self.eop_token, "pos": 0},
+                length=self.context_length + 1,
+            ),
             # Transpose to get a dictionary keyed by `track_id`, `pos`, etc. Then
             # convert to jax arrays.
             LambdaMap[dict, dict](
@@ -166,7 +188,16 @@ def evaluate_loss(
         flat_labels = labels.reshape((batch_size * num_tokens,))
 
         # Evaluate the loss.
-        sampled_loss = sampled_dot_cross_entropy_with_integer_labels(
+        if self.loss_function == "label_in_denominator":
+            func = (
+                sampled_dot_cross_entropy_with_integer_labels_and_label_in_denominator
+            )
+        elif self.loss_function == "uniform":
+            func = sampled_dot_cross_entropy_with_integer_labels_uniform
+        else:
+            raise ValueError(self.loss_function)
+
+        sampled_loss = func(
             prng_key,
             flat_embeddings,
             model.track_embedding.embedding.value,
@@ -214,14 +245,23 @@ def setup_output(self, output: Path) -> None:
                 SELECT_DISTINCT_TRACK_IDS_BY_SPLIT, {"split": "train"}
             )
             self.track_encoder = Encoder(
-                ["<UNK>", "<EOP>", *(track_id for (track_id,) in cursor)],
+                [
+                    "<START>",
+                    "<EOP>",
+                    "<UNK>",
+                    *(track_id for (track_id,) in cursor),
+                ],
                 on_unknown="default",
                 default="<UNK>",
            )
             self.track_encoder.to_pickle(encoder_path)
             print(f"Built new track encoder with {len(self.track_encoder):,} tokens.")
 
+        # Get named special tokens.
+        self.start_token = self.track_encoder("<START>")
         self.eop_token = self.track_encoder("<EOP>")
+        self.unk_token = self.track_encoder("<UNK>")
+
         num_tracks = len(self.track_encoder)
         if self.num_tracks is None:
             self.num_tracks = num_tracks
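To make the injection step concrete, a hypothetical example (not part of the commit) of what the start-token `LambdaMap` does to one record before encoding:

# One raw record as produced by the playlist query.
record = {"pos": [0, 1, 2], "track_id": [43, 7, 19]}

# Prepend "<START>" and re-derive positions, mirroring the lambda above.
injected = {
    "track_id": ["<START>", *record["track_id"]],
    "pos": list(range(len(record["pos"]) + 1)),
}
assert injected == {"track_id": ["<START>", 43, 7, 19], "pos": [0, 1, 2, 3]}

Every real track can now serve as a label, including the first one.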
New file (26 additions, 0 deletions)

@@ -0,0 +1,26 @@
+from trecs.experiments.decoder_only import DecoderOnlyExperiment
+
+
+def setup() -> DecoderOnlyExperiment:
+    train_size = 800_000
+    batch_size = 16
+    num_steps = train_size // batch_size
+    return DecoderOnlyExperiment(
+        seed=42,
+        num_steps=num_steps,
+        batch_size=batch_size,
+        learning_rate=0.0005,
+        weight_decay=0.01,
+        context_length=50,
+        num_layers=6,
+        num_features=128,
+        num_hidden=256,
+        num_heads=8,
+        dropout=0.1,
+        eval_every=100,
+        checkpoint_every=1000,
+        loss_function="uniform",
+        unk_proba=0.01,
+        # This will be determined by the encoder.
+        num_tracks=None,
+    )
New file (26 additions, 0 deletions)

@@ -0,0 +1,26 @@
+from trecs.experiments.decoder_only import DecoderOnlyExperiment
+
+
+def setup() -> DecoderOnlyExperiment:
+    train_size = 800_000
+    batch_size = 16
+    num_steps = train_size // batch_size
+    return DecoderOnlyExperiment(
+        seed=42,
+        num_steps=num_steps,
+        batch_size=batch_size,
+        learning_rate=0.0005,
+        weight_decay=0.01,
+        context_length=50,
+        num_layers=6,
+        num_features=128,
+        num_hidden=256,
+        num_heads=8,
+        dropout=0.1,
+        eval_every=100,
+        checkpoint_every=1000,
+        loss_function="label_in_denominator",
+        unk_proba=0.01,
+        # This will be determined by the encoder.
+        num_tracks=None,
+    )

src/trecs/util.py (44 additions, 1 deletion)

@@ -1,6 +1,7 @@
 import contextlib
 from jax import numpy as jnp
 from jax import random
+from jax.scipy.special import logsumexp
 from pathlib import Path
 from typing import Generator, IO
 import importlib.util
@@ -45,7 +46,48 @@ def safe_write(
     tmp_path.rename(path)
 
 
-def sampled_dot_cross_entropy_with_integer_labels(
+def sampled_dot_cross_entropy_with_integer_labels_uniform(
+    key: jnp.ndarray,
+    query: jnp.ndarray,
+    embedding: jnp.ndarray,
+    labels: jnp.ndarray,
+    num_samples: int = 20,
+):
+    """Evaluate the sampled cross entropy based on logits obtained through a dot
+    product `query @ embedding.T`. This function never evaluates the full dot product
+    but only considers a sampled subset of the embedding matrix.
+
+    Args:
+        key: Random number generator key.
+        query: Context to contract with the output embedding with shape
+            `(batch_size, num_features)`.
+        embedding: Output embedding with shape `(num_classes, num_features)`.
+        labels: Target labels with shape `(batch_size,)`.
+        num_samples: Number of samples for the sampled softmax cross-entropy.
+
+    Returns:
+        Sampled cross-entropy with shape `(batch_size,)`.
+    """
+    batch_size, query_num_features = query.shape
+    num_classes, embedding_num_features = embedding.shape
+    assert query_num_features == embedding_num_features
+
+    # Logits for the labels we are after.
+    label_logits = jnp.vecdot(query, embedding[labels])
+
+    # Sample indices uniformly at random with replacement. This introduces extra
+    # variance because we can double-sample certain indices, but this effect is small
+    # when num_samples << num_classes.
+    idx = random.randint(key, (num_samples, batch_size), 0, num_classes)
+    sampled_logits = jnp.vecdot(query, embedding[idx]).T
+    return (
+        -label_logits
+        + logsumexp(sampled_logits, axis=1)
+        + jnp.log(num_classes / num_samples)
+    )
+
+
+def sampled_dot_cross_entropy_with_integer_labels_and_label_in_denominator(
     key: jnp.ndarray,
     query: jnp.ndarray,
     embedding: jnp.ndarray,
@@ -62,6 +104,7 @@ def sampled_dot_cross_entropy_with_integer_labels(
             `(batch_size, num_features)`.
         embedding: Output embedding with shape `(num_classes, num_features)`.
         labels: Target labels with shape `(batch_size,)`.
+        num_samples: Number of samples for the sampled softmax cross-entropy.
 
     Returns:
         Sampled cross-entropy with shape `(batch_size,)`.
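As a sanity check, a minimal sketch (not part of the commit) of how the new uniform estimator might be exercised; the import path assumes an installed `trecs`, and the shapes, seeds, and sample counts are arbitrary:

import jax
from jax import numpy as jnp
from jax import random
from jax.scipy.special import logsumexp
from trecs.util import sampled_dot_cross_entropy_with_integer_labels_uniform

query_key, embedding_key, loss_key = random.split(random.key(0), 3)
batch_size, num_features, num_classes = 4, 8, 1_000
query = random.normal(query_key, (batch_size, num_features))
embedding = random.normal(embedding_key, (num_classes, num_features))
labels = jnp.array([3, 1, 4, 1])

# Exact cross-entropy for reference; this materializes the full
# `query @ embedding.T` matrix that the sampled version avoids.
logits = query @ embedding.T
label_logits = jnp.take_along_axis(logits, labels[:, None], axis=1).squeeze(1)
exact = -label_logits + logsumexp(logits, axis=1)

# Averaging the sampled estimate over many keys settles slightly below the
# exact value because of the Jensen bias discussed in the README.
sampled = jax.vmap(
    lambda k: sampled_dot_cross_entropy_with_integer_labels_uniform(
        k, query, embedding, labels, num_samples=50
    )
)(random.split(loss_key, 512)).mean(axis=0)
print(exact, sampled)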

tests/assets/decoder_only_mini_experiment.py (1 addition, 0 deletions)

@@ -18,4 +18,5 @@ def setup():
         unk_proba=0.05,
         weight_decay=0.01,
         num_tracks=None,
+        loss_function="uniform",
     )
