tests: Add distill tests and CI #42

Merged (21 commits) on Sep 27, 2024
58 changes: 58 additions & 0 deletions .github/workflows/ci.yaml
@@ -0,0 +1,58 @@
name: Run tests and upload coverage

on:
  push:

jobs:
  test:
    name: Run tests with pytest
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: ["ubuntu-latest", "windows-latest", "macos-latest"]
        python-version: ["3.10"]
      fail-fast: false

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }} on ${{ matrix.os }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          allow-prereleases: true

      # Step for Windows: create and activate a virtual environment
      - name: Create and activate a virtual environment (Windows)
        if: ${{ runner.os == 'Windows' }}
        run: |
          irm https://astral.sh/uv/install.ps1 | iex
          uv venv .venv
          "VIRTUAL_ENV=.venv" | Out-File -FilePath $env:GITHUB_ENV -Append
          "$PWD/.venv/Scripts" | Out-File -FilePath $env:GITHUB_PATH -Append

      # Step for Unix: create and activate a virtual environment
      - name: Create and activate a virtual environment (Unix)
        if: ${{ runner.os != 'Windows' }}
        run: |
          curl -LsSf https://astral.sh/uv/install.sh | sh
          uv venv .venv
          echo "VIRTUAL_ENV=.venv" >> $GITHUB_ENV
          echo "$PWD/.venv/bin" >> $GITHUB_PATH

      # Install dependencies using uv pip
      - name: Install dependencies
        run: make install
        # run: uv pip install -e ".[pytest]"

      # Run tests with coverage
      - name: Run tests under coverage
        run: |
          coverage run -m pytest
          coverage report

      # Upload results to Codecov
      - name: Upload results to Codecov
        uses: codecov/codecov-action@v4
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
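The `strategy.matrix` above expands to one job per combination of `os` and `python-version`, and `fail-fast: false` lets the remaining jobs keep running if one fails. A quick sketch of that expansion, in plain Python and for illustration only:

```python
# Illustration only: how GitHub Actions expands the job matrix above
# into one job per combination of matrix values.
from itertools import product

matrix = {
    "os": ["ubuntu-latest", "windows-latest", "macos-latest"],
    "python-version": ["3.10"],
}

# 3 OS values x 1 Python version = 3 jobs
jobs = [dict(zip(matrix.keys(), combo)) for combo in product(*matrix.values())]
print(len(jobs))  # 3
```

Adding a second Python version to the list would double the job count without any other change, which is the main reason `python-version` is kept as a matrix axis even while it holds a single value.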
2 changes: 2 additions & 0 deletions .pre-commit-config.yaml
@@ -15,6 +15,8 @@ repos:
        description: Prevent giant files from being committed.
      - id: check-case-conflict
        description: Check for files with names that would conflict on case-insensitive filesystems like MacOS/Windows.
      - id: check-yaml
        description: Check yaml files for syntax errors.
  - repo: https://github.com/jsh9/pydoclint
    rev: 0.5.3
    hooks:
5 changes: 2 additions & 3 deletions README.md
@@ -22,12 +22,11 @@
<a href="https://pypi.org/project/model2vec/"><img src="https://img.shields.io/pypi/pyversions/model2vec" alt="Supported Python versions"></a>
<a href="https://pepy.tech/project/model2vec">
<img src="https://static.pepy.tech/badge/model2vec" alt="Downloads">
</a>
<a href="https://github.com/MinishLab/model2vec/blob/main/LICENSE"><img src="https://img.shields.io/badge/license-MIT-green" alt="License - MIT"></a>
</h2>
</div>


<p align="center">
<img src="assets/images/model2vec_model_diagram.png" alt="Model2Vec">
</p>
@@ -66,7 +65,7 @@ embeddings = model.encode(["It's dangerous to go alone!", "It's a secret to ever

And that's it. You can use the model to classify texts, to cluster, or to build a RAG system.

Instead of using one of our models, you can distill your own Model2Vec model from a Sentence Transformer model. The following code snippet shows how to distill a model:
```python
from model2vec.distill import distill

```
41 changes: 41 additions & 0 deletions tests/conftest.py
@@ -1,8 +1,15 @@
from __future__ import annotations

from pathlib import Path
from typing import Any

import numpy as np
import pytest
import torch
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from transformers import AutoModel, BertTokenizerFast


@pytest.fixture
@@ -18,6 +25,40 @@ def mock_tokenizer() -> Tokenizer:
    return tokenizer


@pytest.fixture
def mock_berttokenizer() -> BertTokenizerFast:
    """Load the real BertTokenizerFast from the provided tokenizer.json file."""
    tokenizer_path = Path("tests/data/test_tokenizer/tokenizer.json")
    return BertTokenizerFast(tokenizer_file=str(tokenizer_path))


@pytest.fixture
def mock_transformer() -> AutoModel:
    """Create a mock transformer model."""

    class MockPreTrainedModel:
        def __init__(self) -> None:
            self.device = "cpu"

        def to(self, device: str) -> MockPreTrainedModel:
            self.device = device
            return self

        def __call__(self, *args: Any, **kwargs: Any) -> Any:
            # Simulate a last_hidden_state output for a transformer model
            batch_size, seq_length = kwargs["input_ids"].shape
            # Return a tensor of shape (batch_size, seq_length, 768)
            return type(
                "BaseModelOutputWithPoolingAndCrossAttentions",
                (object,),
                {
                    "last_hidden_state": torch.rand(batch_size, seq_length, 768)  # Simulate 768 hidden units
                },
            )

    return MockPreTrainedModel()

Member (review comment on `__call__`): One cool addition could be to either make this configurable, or make the seed of the rand depend on the input data. This makes tests a lot more reproducible and predictable. Anyway, just a small thing. Something like an MD5 hash of the input_ids as a tuple, or maybe just the sum?

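The reviewer's seeding suggestion could look something like the sketch below: a hypothetical helper (not part of this PR) that hashes the input_ids into a deterministic seed, so the mock's random hidden states become reproducible for a given input:

```python
# Hypothetical helper sketching the reviewer's suggestion: derive a
# deterministic RNG seed from the input_ids via an MD5 hash.
import hashlib
import struct


def seed_from_input_ids(input_ids: tuple[int, ...]) -> int:
    """Map a tuple of token ids to a stable 64-bit seed."""
    digest = hashlib.md5(repr(input_ids).encode("utf-8")).digest()
    return struct.unpack("<Q", digest[:8])[0]


# Same input ids -> same seed; inside the mock's __call__ one could then
# call torch.manual_seed(seed) before torch.rand(...) for reproducibility.
assert seed_from_input_ids((101, 7592, 102)) == seed_from_input_ids((101, 7592, 102))
assert seed_from_input_ids((101, 7592, 102)) != seed_from_input_ids((102, 7592, 101))
```

The tuple-based hash distinguishes permutations of the same ids, which a plain sum (the reviewer's simpler alternative) would not.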

@pytest.fixture
def mock_vectors() -> np.ndarray:
    """Create mock vectors."""
7 changes: 7 additions & 0 deletions tests/data/test_tokenizer/special_tokens_map.json
@@ -0,0 +1,7 @@
{
  "cls_token": "[CLS]",
  "mask_token": "[MASK]",
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "unk_token": "[UNK]"
}
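As a sketch of how a test might sanity-check this file, the map can be loaded with the standard `json` module and checked for the expected BERT special tokens (a hypothetical check, not part of this PR; the file contents are inlined so the snippet is self-contained):

```python
# Hypothetical sanity check for the special tokens map above
# (contents reproduced inline rather than read from tests/data/...).
import json

raw = """
{
  "cls_token": "[CLS]",
  "mask_token": "[MASK]",
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "unk_token": "[UNK]"
}
"""

special_tokens = json.loads(raw)
# All five standard BERT special tokens should be present...
assert set(special_tokens) == {"cls_token", "mask_token", "pad_token", "sep_token", "unk_token"}
# ...and each value should use the bracketed [TOKEN] convention.
assert all(v.startswith("[") and v.endswith("]") for v in special_tokens.values())
```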