Merge pull request #22 from shcherbak-ai/dev

SergiiShcherbak · web-flow · commit 0f01b14b97bf · 2025-05-20T13:15:19.000+02:00
v0.4.0
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 
 - **Refactor**: Code reorganization that doesn't change functionality but improves structure or maintainability
 
+## [0.4.0](https://github.com/shcherbak-ai/contextgem/releases/tag/v0.4.0) - 2025-05-20
+### Added
+- Support for local SaT model paths in Document's `sat_model_id` parameter
+
 ## [0.3.0](https://github.com/shcherbak-ai/contextgem/releases/tag/v0.3.0) - 2025-05-19
 ### Added
 - Expanded JsonObjectConcept to support nested class hierarchies, nested dictionary structures, lists containing objects, and literal types.
diff --git a/contextgem/__init__.py b/contextgem/__init__.py
@@ -20,7 +20,7 @@
 ContextGem - Effortless LLM extraction from documents
 """
 
-__version__ = "0.3.0"
+__version__ = "0.4.0"
 __author__ = "Shcherbak AI AS"
 
 from contextgem.public import (
diff --git a/contextgem/internal/__init__.py b/contextgem/internal/__init__.py
@@ -72,6 +72,7 @@
     NonEmptyStr,
     ReferenceDepth,
     SaTModelId,
+    StandardSaTModelId,
     _deserialize_type_hint,
     _dynamic_pydantic_model,
     _format_dict_structure,
@@ -126,6 +127,7 @@
     "DefaultPromptType",
     "ReferenceDepth",
     "SaTModelId",
+    "StandardSaTModelId",
     "LanguageRequirement",
     "JustificationDepth",
     "AsyncCalsAndKwargs",
diff --git a/contextgem/internal/typings/__init__.py b/contextgem/internal/typings/__init__.py
@@ -29,6 +29,7 @@
     NonEmptyStr,
     ReferenceDepth,
     SaTModelId,
+    StandardSaTModelId,
 )
 from contextgem.internal.typings.strings_to_types import _deserialize_type_hint
 from contextgem.internal.typings.typed_class_utils import (
@@ -58,6 +59,7 @@
     "DefaultPromptType",
     "ReferenceDepth",
     "SaTModelId",
+    "StandardSaTModelId",
     "LanguageRequirement",
     "JustificationDepth",
     "AsyncCalsAndKwargs",
diff --git a/contextgem/internal/typings/aliases.py b/contextgem/internal/typings/aliases.py
@@ -27,7 +27,8 @@
 
 import sys
 from decimal import Decimal
-from typing import Annotated, Any, Callable, Coroutine, Literal, TypeVar
+from pathlib import Path
+from typing import Annotated, Any, Callable, Coroutine, Literal, TypeVar, Union
 
 from pydantic import Field, StrictStr, StringConstraints
 
@@ -54,7 +55,8 @@
 
 ReferenceDepth = Literal["paragraphs", "sentences"]
 
-SaTModelId = Literal[
+# Define standard SaT model IDs as a separate type
+StandardSaTModelId = Literal[
     "sat-1l",
     "sat-1l-sm",
     "sat-3l",
@@ -66,6 +68,13 @@
     "sat-12l-sm",
 ]
 
+# Combined type for sat_model_id parameter: a standard model ID or a local model path
+SaTModelId = Union[
+    StandardSaTModelId,
+    str,  # Local path as a string (note: `str` also admits any string for type checkers)
+    Path,  # Local path as a Path object
+]
+
 LanguageRequirement = Literal["en", "adapt"]
 
 JustificationDepth = Literal["brief", "balanced", "comprehensive"]
diff --git a/contextgem/internal/utils.py b/contextgem/internal/utils.py
@@ -29,7 +29,7 @@
 from collections import defaultdict
 from functools import lru_cache
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Coroutine, Literal, TypeVar
+from typing import TYPE_CHECKING, Any, Callable, Coroutine, Literal, TypeVar, get_args
 
 from jinja2 import Environment, Template, nodes
 from wtpsplit import SaT
@@ -51,6 +51,7 @@
     ExtractedInstanceType,
     ReferenceDepth,
     SaTModelId,
+    StandardSaTModelId,
 )
 
 T = TypeVar("T")
@@ -586,17 +587,69 @@ def _validate_parsed_llm_output(
 def _get_sat_model(model_id: SaTModelId = "sat-3l-sm") -> SaT:
     """
     Retrieves and caches a SaT model to be used for paragraphs and sentence segmentation.
+    Performs validation of the model ID or path before attempting to load the model.
 
     :param model_id:
         The identifier of the SaT model. Defaults to "sat-3l-sm".
+        Can be:
+        - A standard SaT model ID (e.g., "sat-3l-sm")
+        - A local path to a SaT model directory (as a string or Path object)
 
     :return:
         An instance of the SaT model associated with the given `model_id`.
-    """
-    logger.info(f"Loading SaT model {model_id}...")
-    model = SaT(model_id)
-    logger.info(f"SaT model {model_id} loaded.")
-    return model
+
+    :raises ValueError:
+        If `model_id` is not a standard model ID and does not point to an
+        existing directory.
+    :raises RuntimeError:
+        If the directory exists but a SaT model cannot be loaded from it.
+    :raises Exception:
+        Errors raised while loading a standard model ID are logged and
+        re-raised unchanged.
+    """
+    # Normalize Path objects so the rest of the function deals with plain strings
+    if isinstance(model_id, Path):
+        model_id = str(model_id)
+
+    # Any string that is not one of the standard IDs declared in
+    # StandardSaTModelId is interpreted as a path to a local model directory
+    standard_model_ids = get_args(StandardSaTModelId)
+    is_local_path = (
+        isinstance(model_id, str) and model_id not in standard_model_ids
+    )
+
+    if is_local_path:
+        # Validate up front so that a bad path is reported as a ValueError
+        # before any load is attempted. Path.is_dir() is already False for
+        # nonexistent paths, so a separate exists() check is unnecessary.
+        if not Path(model_id).is_dir():
+            raise ValueError(
+                f"The provided SaT model path '{model_id}' does not exist or is not a directory."
+            )
+        logger.info(f"Loading SaT model from local path {model_id}...")
+    else:
+        logger.info(f"Loading SaT model {model_id}...")
+
+    # Keep the try body limited to the one call that is expected to fail, so
+    # that only loader errors are logged and translated below
+    try:
+        model = SaT(model_id)
+    except Exception as e:
+        if is_local_path:
+            # The directory exists, so the failure means its contents are not a
+            # loadable SaT model; surface that as a RuntimeError with the cause
+            logger.error(f"Failed to load SaT model from path '{model_id}': {e}")
+            raise RuntimeError(
+                f"The directory at '{model_id}' exists but does not contain a valid SaT model. "
+                f"Error: {e}"
+            ) from e
+        # Standard model IDs (or unexpected input types): log and re-raise the
+        # original exception unchanged
+        logger.error(f"Failed to load SaT model '{model_id}': {e}")
+        raise
+
+    logger.info("SaT model loaded successfully.")
+    return model
 
 
 def _group_instances_by_fields(
diff --git a/contextgem/public/documents.py b/contextgem/public/documents.py
@@ -39,6 +39,7 @@
 import itertools
 import warnings
 from copy import deepcopy
+from pathlib import Path
 from typing import Any, Literal, Optional
 
 from pydantic import Field, field_validator, model_validator
@@ -81,8 +82,10 @@ class Document(_AssignedInstancesProcessor):
     :ivar paragraph_segmentation_mode: Mode for paragraph segmentation. When set to "sat",
         uses a SaT (Segment Any Text https://arxiv.org/abs/2406.16678) model. Defaults to "newlines".
     :type paragraph_segmentation_mode: Literal["newlines", "sat"]
-    :ivar sat_model_id: SaT model ID for paragraph/sentence segmentation.
-        Defaults to "sat-3l-sm". See https://github.com/segment-any-text/wtpsplit for the list of available models.
+    :ivar sat_model_id: SaT model ID for paragraph/sentence segmentation or a local path to a SaT model.
+        For model IDs, defaults to "sat-3l-sm". See https://github.com/segment-any-text/wtpsplit
+        for the list of available models. For local paths, provide either a string path or a Path
+        object pointing to the directory containing the SaT model.
     :type sat_model_id: SaTModelId
 
     Note:
@@ -285,6 +288,21 @@ def _validate_images(cls, images: list[Image]) -> list[Image]:
             seen.add(image.base64_data)
         return images
 
+    @field_validator("sat_model_id")
+    @classmethod
+    def _validate_sat_model_id(cls, sat_model_id: SaTModelId) -> str:
+        """
+        Normalizes `sat_model_id` to a plain string.
+        A Path input is converted to its string representation so that the
+        document stays serializable; any other input is returned unchanged.
+
+        :param sat_model_id: The SaT model ID or local model path to normalize
+        :return: The model ID or path as a string
+        """
+        if not isinstance(sat_model_id, Path):
+            return sat_model_id
+        return str(sat_model_id)
+
     @model_validator(mode="before")
     @classmethod
     def _validate_document_pre(cls, data: Any) -> Any:
diff --git a/docs/docs-raw-for-llm.txt b/docs/docs-raw-for-llm.txt
@@ -5575,8 +5575,11 @@ class contextgem.public.documents.Document(**data)
         "newlines".
 
       * **sat_model_id** -- SaT model ID for paragraph/sentence
-        segmentation. Defaults to "sat-3l-sm". See https://github.com
-        /segment-any-text/wtpsplit for the list of available models.
+        segmentation or a local path to a SaT model. For model IDs,
+        defaults to "sat-3l-sm". See https://github.com/segment-any-
+        text/wtpsplit for the list of available models. For local
+        paths, provide either a string path or a Path object pointing
+        to the directory containing the SaT model.
 
    Parameters:
       * **custom_data** (*dict*)
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -22,7 +22,7 @@
 project = "ContextGem"
 copyright = "2025, Shcherbak AI AS"
 author = "Sergii Shcherbak"
-release = "0.3.0"
+release = "0.4.0"
 
 
 # Add path to the package
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "contextgem"
-version = "0.3.0"
+version = "0.4.0"
 description = "Effortless LLM extraction from documents"
 authors = [
     {name = "shcherbak-ai", email = "sergii@shcherbak.ai"}
diff --git a/tests/test_all.py b/tests/test_all.py
@@ -24,6 +24,7 @@
 
 import os
 import sys
+import tempfile
 import warnings
 import xml.etree.ElementTree as ET
 import zipfile
@@ -71,7 +72,7 @@
     dedicated_stream,
     logger,
 )
-from contextgem.internal.utils import _split_text_into_paragraphs
+from contextgem.internal.utils import _get_sat_model, _split_text_into_paragraphs
 from contextgem.public.utils import JsonObjectClassStruct
 from tests.utils import (
     VCR_FILTER_HEADERS,
@@ -2511,6 +2512,50 @@ def test_init_document_and_pipeline(self, context: Document | DocumentPipeline):
         with pytest.raises(ValueError):
             context.add_concepts([concept, concept])
 
+    def test_local_sat_model(self):
+        """
+        Tests path validation and error handling when loading a local SaT model.
+        """
+
+        # Test nonexistent path (checks stay outside the `raises` block so they run)
+        non_existent_path = "/nonexistent/path/to/model"
+        with pytest.raises(ValueError) as exc_info:
+            _get_sat_model(non_existent_path)
+        assert "does not exist or is not a directory" in str(exc_info.value)
+        # Document creation should also fail
+        with pytest.raises(ValueError):
+            Document(
+                raw_text="Sample text",
+                paragraph_segmentation_mode="sat",
+                sat_model_id=non_existent_path,
+            )
+
+        # Test file path (not a directory)
+        with tempfile.NamedTemporaryFile() as temp_file:
+            with pytest.raises(ValueError) as exc_info:
+                _get_sat_model(temp_file.name)
+            assert "does not exist or is not a directory" in str(exc_info.value)
+            # Document creation should also fail
+            with pytest.raises(ValueError):
+                Document(
+                    raw_text="Sample text",
+                    paragraph_segmentation_mode="sat",
+                    sat_model_id=temp_file.name,
+                )
+
+        # Test valid path but invalid model
+        with tempfile.TemporaryDirectory() as temp_dir:
+            with pytest.raises(RuntimeError) as exc_info:
+                _get_sat_model(temp_dir)
+            assert "does not contain a valid SaT model" in str(exc_info.value)
+            # Document creation should also fail
+            with pytest.raises(RuntimeError):
+                Document(
+                    raw_text="Sample text",
+                    paragraph_segmentation_mode="sat",
+                    sat_model_id=temp_dir,
+                )
+
     @pytest.mark.vcr()
     def test_system_messages(self):
         """