feat: improve late chunking and optimize pgvector settings (#51)

lsorber · web-flow · commit 2680b7415e93 · 2024-12-04T17:28:33.000+01:00
diff --git a/README.md b/README.md
@@ -47,8 +47,8 @@ pip install https://github.com/explosion/spacy-models/releases/download/xx_sent_
 Next, it is optional but recommended to install [an accelerated llama-cpp-python precompiled binary](https://github.com/abetlen/llama-cpp-python?tab=readme-ov-file#supported-backends) with:
 
 ```sh
-# Configure which llama-cpp-python precompiled binary to install (⚠️ only v0.2.88 is supported right now):
-LLAMA_CPP_PYTHON_VERSION=0.2.88
+# Configure which llama-cpp-python precompiled binary to install (⚠️ only v0.3.2 is supported right now):
+LLAMA_CPP_PYTHON_VERSION=0.3.2
 PYTHON_VERSION=310
 ACCELERATOR=metal|cu121|cu122|cu123|cu124
 PLATFORM=macosx_11_0_arm64|linux_x86_64|win_amd64
@@ -116,7 +116,7 @@ my_config = RAGLiteConfig(
 my_config = RAGLiteConfig(
     db_url="sqlite:///raglite.sqlite",
     llm="llama-cpp-python/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF/*Q4_K_M.gguf@8192",
-    embedder="llama-cpp-python/lm-kit/bge-m3-gguf/*F16.gguf",
+    embedder="llama-cpp-python/lm-kit/bge-m3-gguf/*F16.gguf@1024",  # A context size of 1024 tokens is the sweet spot for bge-m3.
 )
 ```
 
@@ -281,7 +281,7 @@ You can specify the database URL, LLM, and embedder directly in the Chainlit fro
 raglite chainlit \
     --db_url sqlite:///raglite.sqlite \
     --llm llama-cpp-python/bartowski/Llama-3.2-3B-Instruct-GGUF/*Q4_K_M.gguf@4096 \
-    --embedder llama-cpp-python/lm-kit/bge-m3-gguf/*F16.gguf
+    --embedder llama-cpp-python/lm-kit/bge-m3-gguf/*F16.gguf@1024
 ```
 
 To use an API-based LLM, make sure to include your credentials in a `.env` file or supply them inline:
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -43,7 +43,7 @@ services:
       - dev
 
   postgres:
-    image: pgvector/pgvector:pg16
+    image: pgvector/pgvector:pg17
     environment:
       POSTGRES_USER: raglite_user
       POSTGRES_PASSWORD: raglite_password
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -33,7 +33,7 @@ spacy = ">=3.7.0,<3.8.0"
 # Large Language Models:
 huggingface-hub = ">=0.22.0"
 litellm = ">=1.47.1"
-llama-cpp-python = ">=0.2.88"
+llama-cpp-python = ">=0.3.2"
 pydantic = ">=2.7.0"
 # Approximate Nearest Neighbors:
 pynndescent = ">=0.5.12"
diff --git a/src/raglite/_config.py b/src/raglite/_config.py
@@ -33,9 +33,9 @@ class RAGLiteConfig:
     # Embedder config used for indexing.
     embedder: str = field(
         default_factory=lambda: (  # Nomic-embed may be better if only English is used.
-            "llama-cpp-python/lm-kit/bge-m3-gguf/*F16.gguf"
+            "llama-cpp-python/lm-kit/bge-m3-gguf/*F16.gguf@1024"
             if llama_supports_gpu_offload() or (os.cpu_count() or 1) >= 4  # noqa: PLR2004
-            else "llama-cpp-python/lm-kit/bge-m3-gguf/*Q4_K_M.gguf"
+            else "llama-cpp-python/lm-kit/bge-m3-gguf/*Q4_K_M.gguf@1024"
         )
     )
     embedder_normalize: bool = True
diff --git a/src/raglite/_database.py b/src/raglite/_database.py
@@ -331,22 +331,20 @@ def create_database_engine(config: RAGLiteConfig | None = None) -> Engine:
         with Session(engine) as session:
             metrics = {"cosine": "cosine", "dot": "ip", "euclidean": "l2", "l1": "l1", "l2": "l2"}
             session.execute(
-                text(
-                    """
+                text("""
                 CREATE INDEX IF NOT EXISTS keyword_search_chunk_index ON chunk USING GIN (to_tsvector('simple', body));
-                """
-                )
+                """)
             )
             session.execute(
-                text(
-                    f"""
+                text(f"""
                 CREATE INDEX IF NOT EXISTS vector_search_chunk_index ON chunk_embedding
                 USING hnsw (
-                     (embedding::halfvec({embedding_dim}))
-                     halfvec_{metrics[config.vector_search_index_metric]}_ops
+                    (embedding::halfvec({embedding_dim}))
+                    halfvec_{metrics[config.vector_search_index_metric]}_ops
                 );
-                """
-                )
+                SET hnsw.ef_search = {20 * 4 * 8};
+                SET hnsw.iterative_scan = {'relaxed_order' if config.reranker else 'strict_order'};
+                """)
             )
             session.commit()
     elif db_backend == "sqlite":
@@ -355,39 +353,31 @@ def create_database_engine(config: RAGLiteConfig | None = None) -> Engine:
         # [1] https://www.sqlite.org/fts5.html#external_content_tables
         with Session(engine) as session:
             session.execute(
-                text(
-                    """
+                text("""
                 CREATE VIRTUAL TABLE IF NOT EXISTS keyword_search_chunk_index USING fts5(body, content='chunk', content_rowid='rowid');
-                """
-                )
+                """)
             )
             session.execute(
-                text(
-                    """
+                text("""
                 CREATE TRIGGER IF NOT EXISTS keyword_search_chunk_index_auto_insert AFTER INSERT ON chunk BEGIN
                     INSERT INTO keyword_search_chunk_index(rowid, body) VALUES (new.rowid, new.body);
                 END;
-                """
-                )
+                """)
             )
             session.execute(
-                text(
-                    """
+                text("""
                 CREATE TRIGGER IF NOT EXISTS keyword_search_chunk_index_auto_delete AFTER DELETE ON chunk BEGIN
                     INSERT INTO keyword_search_chunk_index(keyword_search_chunk_index, rowid, body) VALUES('delete', old.rowid, old.body);
                 END;
-                """
-                )
+                """)
             )
             session.execute(
-                text(
-                    """
+                text("""
                 CREATE TRIGGER IF NOT EXISTS keyword_search_chunk_index_auto_update AFTER UPDATE ON chunk BEGIN
                     INSERT INTO keyword_search_chunk_index(keyword_search_chunk_index, rowid, body) VALUES('delete', old.rowid, old.body);
                     INSERT INTO keyword_search_chunk_index(rowid, body) VALUES (new.rowid, new.body);
                 END;
-                """
-                )
+                """)
             )
             session.commit()
     return engine
diff --git a/src/raglite/_litellm.py b/src/raglite/_litellm.py
@@ -1,10 +1,13 @@
 """Add support for llama-cpp-python models to LiteLLM."""
 
 import asyncio
+import contextlib
 import logging
+import os
 import warnings
 from collections.abc import AsyncIterator, Callable, Iterator
 from functools import cache
+from io import StringIO
 from typing import Any, ClassVar, cast
 
 import httpx
@@ -28,7 +31,8 @@
 from raglite._config import RAGLiteConfig
 
 # Reduce the logging level for LiteLLM and flashrank.
-logging.getLogger("litellm").setLevel(logging.WARNING)
+os.environ["LITELLM_LOG"] = "WARNING"
+logging.getLogger("LiteLLM").setLevel(logging.WARNING)
 logging.getLogger("flashrank").setLevel(logging.WARNING)
 
 
@@ -96,14 +100,21 @@ def llm(model: str, **kwargs: Any) -> Llama:
             filename, n_ctx_str = filename_n_ctx
             n_ctx = int(n_ctx_str)
         # Load the LLM.
-        with warnings.catch_warnings():  # Filter huggingface_hub warning about HF_TOKEN.
+        with (
+            contextlib.redirect_stderr(StringIO()),  # Filter spurious llama.cpp output.
+            warnings.catch_warnings(),  # Filter huggingface_hub warning about HF_TOKEN.
+        ):
             warnings.filterwarnings("ignore", category=UserWarning)
             llm = Llama.from_pretrained(
                 repo_id=repo_id,
                 filename=filename,
                 n_ctx=n_ctx,
                 n_gpu_layers=-1,
                 verbose=False,
+                # Workaround to enable long-context embedding models [1].
+                # [1] https://github.com/abetlen/llama-cpp-python/issues/1762
+                n_batch=n_ctx if n_ctx > 0 else 1024,
+                n_ubatch=n_ctx if n_ctx > 0 else 1024,
                 **kwargs,
             )
         # Enable caching.
diff --git a/src/raglite/_search.py b/src/raglite/_search.py
@@ -26,7 +26,11 @@
 
 
 def vector_search(
-    query: str | FloatMatrix, *, num_results: int = 3, config: RAGLiteConfig | None = None
+    query: str | FloatMatrix,
+    *,
+    num_results: int = 3,
+    oversample: int = 8,
+    config: RAGLiteConfig | None = None,
 ) -> tuple[list[ChunkId], list[float]]:
     """Search chunks using ANN vector search."""
     # Read the config.
@@ -57,7 +61,9 @@ def vector_search(
             )
             distance = distance_func(query_embedding).label("distance")
             results = session.exec(
-                select(ChunkEmbedding.chunk_id, distance).order_by(distance).limit(8 * num_results)
+                select(ChunkEmbedding.chunk_id, distance)
+                .order_by(distance)
+                .limit(oversample * num_results)
             )
             chunk_ids_, distance = zip(*results, strict=True)
             chunk_ids, similarity = np.asarray(chunk_ids_), 1.0 - np.asarray(distance)
@@ -70,7 +76,7 @@ def vector_search(
         from pynndescent import NNDescent
 
         multi_vector_indices, distance = cast(NNDescent, index).query(
-            query_embedding[np.newaxis, :], k=8 * num_results
+            query_embedding[np.newaxis, :], k=oversample * num_results
         )
         similarity = 1 - distance[0, :]
         # Transform the multi-vector indices into chunk indices, and then to chunk ids.
@@ -105,36 +111,32 @@ def keyword_search(
         if db_backend == "postgresql":
             # Convert the query to a tsquery [1].
             # [1] https://www.postgresql.org/docs/current/textsearch-controls.html
-            query_escaped = re.sub(r"[&|!():<>\"]", " ", query)
+            query_escaped = re.sub(f"[{re.escape(string.punctuation)}]", " ", query)
             tsv_query = " | ".join(query_escaped.split())
             # Perform keyword search with tsvector.
-            statement = text(
-                """
+            statement = text("""
                 SELECT id as chunk_id, ts_rank(to_tsvector('simple', body), to_tsquery('simple', :query)) AS score
                 FROM chunk
                 WHERE to_tsvector('simple', body) @@ to_tsquery('simple', :query)
                 ORDER BY score DESC
                 LIMIT :limit;
-            """
-            )
+                """)
             results = session.execute(statement, params={"query": tsv_query, "limit": num_results})
         elif db_backend == "sqlite":
             # Convert the query to an FTS5 query [1].
             # [1] https://www.sqlite.org/fts5.html#full_text_query_syntax
-            query_escaped = re.sub(f"[{re.escape(string.punctuation)}]", "", query)
+            query_escaped = re.sub(f"[{re.escape(string.punctuation)}]", " ", query)
             fts5_query = " OR ".join(query_escaped.split())
             # Perform keyword search with FTS5. In FTS5, BM25 scores are negative [1], so we
             # negate them to make them positive.
             # [1] https://www.sqlite.org/fts5.html#the_bm25_function
-            statement = text(
-                """
+            statement = text("""
                 SELECT chunk.id as chunk_id, -bm25(keyword_search_chunk_index) as score
                 FROM chunk JOIN keyword_search_chunk_index ON chunk.rowid = keyword_search_chunk_index.rowid
                 WHERE keyword_search_chunk_index MATCH :match
                 ORDER BY score DESC
                 LIMIT :limit;
-            """
-            )
+                """)
             results = session.execute(statement, params={"match": fts5_query, "limit": num_results})
         # Unpack the results.
         results = list(results)  # type: ignore[assignment]
@@ -162,12 +164,12 @@ def reciprocal_rank_fusion(
 
 
 def hybrid_search(
-    query: str, *, num_results: int = 3, num_rerank: int = 100, config: RAGLiteConfig | None = None
+    query: str, *, num_results: int = 3, oversample: int = 4, config: RAGLiteConfig | None = None
 ) -> tuple[list[ChunkId], list[float]]:
     """Search chunks by combining ANN vector search with BM25 keyword search."""
     # Run both searches.
-    vs_chunk_ids, _ = vector_search(query, num_results=num_rerank, config=config)
-    ks_chunk_ids, _ = keyword_search(query, num_results=num_rerank, config=config)
+    vs_chunk_ids, _ = vector_search(query, num_results=oversample * num_results, config=config)
+    ks_chunk_ids, _ = keyword_search(query, num_results=oversample * num_results, config=config)
     # Combine the results with Reciprocal Rank Fusion (RRF).
     chunk_ids, hybrid_score = reciprocal_rank_fusion([vs_chunk_ids, ks_chunk_ids])
     chunk_ids, hybrid_score = chunk_ids[:num_results], hybrid_score[:num_results]
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -69,7 +69,7 @@ def database(request: pytest.FixtureRequest) -> str:
     scope="session",
     params=[
         pytest.param(
-            "llama-cpp-python/lm-kit/bge-m3-gguf/*Q4_K_M.gguf",
+            "llama-cpp-python/lm-kit/bge-m3-gguf/*Q4_K_M.gguf@1024",  # More context degrades performance.
             id="bge_m3",
         ),
         pytest.param(
diff --git a/tests/test_rerank.py b/tests/test_rerank.py
@@ -1,12 +1,24 @@
 """Test RAGLite's reranking functionality."""
 
+import random
+from typing import TypeVar
+
 import pytest
 from rerankers.models.flashrank_ranker import FlashRankRanker
 from rerankers.models.ranker import BaseRanker
+from scipy.stats import kendalltau
 
 from raglite import RAGLiteConfig, hybrid_search, rerank_chunks, retrieve_chunks
 from raglite._database import Chunk
 
+T = TypeVar("T")
+
+
+def kendall_tau(a: list[T], b: list[T]) -> float:
+    """Measure the Kendall rank correlation coefficient between two orderings of the same items."""
+    τ: float = kendalltau(range(len(a)), [a.index(el) for el in b])[0]  # noqa: PLC2401
+    return τ
+
 
 @pytest.fixture(
     params=[
@@ -40,16 +52,19 @@ def test_reranker(
     )
     # Search for a query.
     query = "What does it mean for two events to be simultaneous?"
-    chunk_ids, _ = hybrid_search(query, num_results=3, config=raglite_test_config)
+    chunk_ids, _ = hybrid_search(query, num_results=20, config=raglite_test_config)
     # Retrieve the chunks.
     chunks = retrieve_chunks(chunk_ids, config=raglite_test_config)
     assert all(isinstance(chunk, Chunk) for chunk in chunks)
     assert all(chunk_id == chunk.id for chunk_id, chunk in zip(chunk_ids, chunks, strict=True))
-    # Rerank the chunks given an inverted chunk order.
-    reranked_chunks = rerank_chunks(query, chunks[::-1], config=raglite_test_config)
-    if reranker is not None and "text-embedding-3-small" not in raglite_test_config.embedder:
-        assert reranked_chunks[0] == chunks[0]
-    # Test that we can also rerank given the chunk_ids only.
-    reranked_chunks = rerank_chunks(query, chunk_ids[::-1], config=raglite_test_config)
-    if reranker is not None and "text-embedding-3-small" not in raglite_test_config.embedder:
-        assert reranked_chunks[0] == chunks[0]
+    # Randomly shuffle the chunks.
+    random.seed(42)
+    chunks_random = random.sample(chunks, len(chunks))
+    # Rerank the chunks starting from a pathological order and verify that it improves the ranking.
+    for arg in (chunks[::-1], chunk_ids[::-1]):
+        reranked_chunks = rerank_chunks(query, arg, config=raglite_test_config)
+        if reranker:
+            τ_search = kendall_tau(chunks, reranked_chunks)  # noqa: PLC2401
+            τ_inverse = kendall_tau(chunks[::-1], reranked_chunks)  # noqa: PLC2401
+            τ_random = kendall_tau(chunks_random, reranked_chunks)  # noqa: PLC2401
+            assert τ_search >= τ_random >= τ_inverse

Original file line number	Diff line number	Diff line change
`@@ -33,9 +33,9 @@ class RAGLiteConfig:`
`33`	`33`	`# Embedder config used for indexing.`
`34`	`34`	`embedder: str = field(`
`35`	`35`	`default_factory=lambda: ( # Nomic-embed may be better if only English is used.`
`36`		`- "llama-cpp-python/lm-kit/bge-m3-gguf/*F16.gguf"`
	`36`	`+ "llama-cpp-python/lm-kit/bge-m3-gguf/*F16.gguf@1024"`
`37`	`37`	`if llama_supports_gpu_offload() or (os.cpu_count() or 1) >= 4 # noqa: PLR2004`
`38`		`- else "llama-cpp-python/lm-kit/bge-m3-gguf/*Q4_K_M.gguf"`
	`38`	`+ else "llama-cpp-python/lm-kit/bge-m3-gguf/*Q4_K_M.gguf@1024"`
`39`	`39`	`)`
`40`	`40`	`)`
`41`	`41`	`embedder_normalize: bool = True`