 14 |  14 |     GenericStreamingChunk,
 15 |  15 |     ModelResponse,
 16 |  16 |     convert_to_model_response_object,
    |  17 | +    get_model_info,
 17 |  18 | )
 18 |  19 | from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
 19 |  20 | from llama_cpp import (  # type: ignore[attr-defined]
 24 |  25 |     LlamaRAMCache,
 25 |  26 | )
 26 |  27 |
    |  28 | +from raglite._config import RAGLiteConfig
    |  29 | +
 27 |  30 | # Reduce the logging level for LiteLLM and flashrank.
 28 |  31 | logging.getLogger("litellm").setLevel(logging.WARNING)
 29 |  32 | logging.getLogger("flashrank").setLevel(logging.WARNING)
@@ -259,3 +262,54 @@ async def astreaming( # type: ignore[misc,override] # noqa: PLR0913
259 | 262 |     {"provider": "llama-cpp-python", "custom_handler": LlamaCppPythonLLM()}
260 | 263 | )
261 | 264 | litellm.suppress_debug_info = True
    | 265 | +
    | 266 | +
    | 267 | +@cache
    | 268 | +def get_context_size(config: RAGLiteConfig, *, fallback: int = 2048) -> int:
    | 269 | +    """Get the context size for the configured LLM."""
    | 270 | +    # If the user has configured a llama-cpp-python model, we ensure that LiteLLM's model info is up
    | 271 | +    # to date by loading that LLM.
    | 272 | +    if config.llm.startswith("llama-cpp-python"):
    | 273 | +        _ = LlamaCppPythonLLM.llm(config.llm)
    | 274 | +    # Attempt to read the context size from LiteLLM's model info.
    | 275 | +    llm_provider = "llama-cpp-python" if config.llm.startswith("llama-cpp") else None
    | 276 | +    model_info = get_model_info(config.llm, custom_llm_provider=llm_provider)
    | 277 | +    max_tokens = model_info.get("max_tokens")
    | 278 | +    if isinstance(max_tokens, int) and max_tokens > 0:
    | 279 | +        return max_tokens
    | 280 | +    # Fall back to a default context size if the model info is not available.
    | 281 | +    if fallback > 0:
    | 282 | +        warnings.warn(
    | 283 | +            f"Could not determine the context size of {config.llm} from LiteLLM's model_info, using {fallback}.",
    | 284 | +            stacklevel=2,
    | 285 | +        )
    | 286 | +        return fallback
    | 287 | +    error_message = f"Could not determine the context size of {config.llm}."
    | 288 | +    raise ValueError(error_message)
    | 289 | +
    | 290 | +
    | 291 | +@cache
    | 292 | +def get_embedding_dim(config: RAGLiteConfig, *, fallback: bool = True) -> int:
    | 293 | +    """Get the embedding dimension for the configured embedder."""
    | 294 | +    # If the user has configured a llama-cpp-python model, we ensure that LiteLLM's model info is up
    | 295 | +    # to date by loading that model.
    | 296 | +    if config.embedder.startswith("llama-cpp-python"):
    | 297 | +        _ = LlamaCppPythonLLM.llm(config.embedder, embedding=True)
    | 298 | +    # Attempt to read the embedding dimension from LiteLLM's model info.
    | 299 | +    llm_provider = "llama-cpp-python" if config.embedder.startswith("llama-cpp") else None
    | 300 | +    model_info = get_model_info(config.embedder, custom_llm_provider=llm_provider)
    | 301 | +    embedding_dim = model_info.get("output_vector_size")
    | 302 | +    if isinstance(embedding_dim, int) and embedding_dim > 0:
    | 303 | +        return embedding_dim
    | 304 | +    # If that fails, fall back to embedding a single sentence and reading its embedding dimension.
    | 305 | +    if fallback:
    | 306 | +        from raglite._embed import embed_sentences
    | 307 | +
    | 308 | +        warnings.warn(
    | 309 | +            f"Could not determine the embedding dimension of {config.embedder} from LiteLLM's model_info, using fallback.",
    | 310 | +            stacklevel=2,
    | 311 | +        )
    | 312 | +        fallback_embeddings = embed_sentences(["Hello world"], config=config)
    | 313 | +        return fallback_embeddings.shape[1]
    | 314 | +    error_message = f"Could not determine the embedding dimension of {config.embedder}."
    | 315 | +    raise ValueError(error_message)
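
For reference, a minimal usage sketch of the two new helpers follows. The import path and the model identifier strings are assumptions for illustration; only `get_context_size`, `get_embedding_dim`, and the `RAGLiteConfig.llm` / `RAGLiteConfig.embedder` fields come from the diff above.

```python
# Minimal usage sketch; the import path and model identifiers below are assumptions.
from raglite._config import RAGLiteConfig
from raglite._litellm import get_context_size, get_embedding_dim  # assumed module path

# Hypothetical config with illustrative model identifiers.
my_config = RAGLiteConfig(
    llm="gpt-4o-mini",
    embedder="text-embedding-3-small",
)

# Read the context size from LiteLLM's model_info; warns and returns the fallback (2048) if unknown.
context_size = get_context_size(my_config)

# Read the embedding dimension from model_info; else embed a test sentence and use its dimension.
embedding_dim = get_embedding_dim(my_config)

print(f"context size: {context_size}, embedding dimension: {embedding_dim}")
```

Because both helpers are decorated with `functools.cache`, repeated lookups with the same config are memoized; this also assumes that `RAGLiteConfig` is hashable (e.g. a frozen dataclass).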