Changes from all commits

CONTRIBUTING.md (8 additions, 1 deletion)

@@ -105,7 +105,14 @@ For full testing across Python versions:
 tox # runs pylint + pytest on Python 3.10 and 3.11
 ```
 
-### 5. Submit Your Pull Request
+### 5. Adding Custom Model Providers
+
+If you want to add support for a new LLM provider, please refer to the [Provider System Documentation](langextract/providers/README.md). The recommended approach is to create an external plugin package rather than modifying the core library. This allows for:
+- Independent versioning and releases
+- Faster iteration without core review cycles
+- Custom dependencies without affecting core users
+
+### 6. Submit Your Pull Request
 
 All submissions, including submissions by project members, require review. We
 use [GitHub pull requests](https://docs.github.com/articles/about-pull-requests)
README.md (12 additions, 5 deletions)

@@ -255,7 +255,7 @@ result = lx.extract(

 ## Using OpenAI Models
 
-LangExtract also supports OpenAI models. Example OpenAI configuration:
+LangExtract supports OpenAI models (requires optional dependency: `pip install langextract[openai]`):
 
 ```python
 import langextract as lx
@@ -264,8 +264,7 @@ result = lx.extract(
     text_or_documents=input_text,
     prompt_description=prompt,
     examples=examples,
-    language_model_type=lx.inference.OpenAILanguageModel,
-    model_id="gpt-4o",
+    model_id="gpt-4o",  # Automatically selects OpenAI provider
     api_key=os.environ.get('OPENAI_API_KEY'),
     fence_output=True,
     use_schema_constraints=False
@@ -285,8 +284,7 @@ result = lx.extract(
     text_or_documents=input_text,
     prompt_description=prompt,
     examples=examples,
-    language_model_type=lx.inference.OllamaLanguageModel,
-    model_id="gemma2:2b",  # or any Ollama model
+    model_id="gemma2:2b",  # Automatically selects Ollama provider
     model_url="http://localhost:11434",
     fence_output=False,
     use_schema_constraints=False
@@ -328,6 +326,15 @@ with development, testing, and pull requests. You must sign a
 [Contributor License Agreement](https://cla.developers.google.com/about)
 before submitting patches.

+### Adding Custom Model Providers
+
+LangExtract supports custom LLM providers through a plugin system. You can add support for new models by creating an external Python package that registers with LangExtract's provider registry. This allows you to:
+- Add new model support without modifying the core library
+- Distribute your provider independently
+- Maintain custom dependencies
+
+For detailed instructions, see the [Provider System Documentation](langextract/providers/README.md).
+
 ## Testing
 
 To run tests locally from the source:
examples/custom_provider_plugin/README.md (new file, 88 lines)

# Custom Provider Plugin Example

This example demonstrates how to create a custom provider plugin that extends LangExtract with your own model backend.

**Note**: This is an example included in the LangExtract repository for reference. It is not part of the LangExtract package and won't be installed when you `pip install langextract`.

## Structure

```
custom_provider_plugin/
├── pyproject.toml # Package configuration and metadata
├── README.md # This file
├── langextract_provider_example/ # Package directory
│ ├── __init__.py # Package initialization
│ └── provider.py # Custom provider implementation
└── test_example_provider.py # Test script
```

## Key Components

### Provider Implementation (`provider.py`)

```python
@lx.providers.registry.register(
    r'^gemini',  # Pattern for model IDs this provider handles
)
class CustomGeminiProvider(lx.inference.BaseLanguageModel):
    def __init__(self, model_id: str, **kwargs):
        # Initialize your backend client
        ...

    def infer(self, batch_prompts, **kwargs):
        # Call your backend API and return results
        ...
```

### Package Configuration (`pyproject.toml`)

```toml
[project.entry-points."langextract.providers"]
custom_gemini = "langextract_provider_example:CustomGeminiProvider"
```

This entry point allows LangExtract to automatically discover your provider.
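
For context, entry-point discovery generally works like the standard-library sketch below. This is illustrative only; LangExtract's actual loading code may differ in its details:

```python
# Sketch: enumerating plugins registered under the "langextract.providers"
# entry-point group (the group= keyword requires Python 3.10+).
from importlib.metadata import entry_points

for ep in entry_points(group="langextract.providers"):
    provider_cls = ep.load()  # imports the module and returns the class;
    # importing the module also runs its @register decorator
    print(f"{ep.name} -> {provider_cls.__name__}")
```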

## Installation

```bash
# Navigate to this example directory first
cd examples/custom_provider_plugin

# Install in development mode
pip install -e .

# Test the provider (must be run from this directory)
python test_example_provider.py
```

## Usage

Since this example registers the same pattern as the default Gemini provider, you must explicitly specify it:

```python
import langextract as lx

config = lx.factory.ModelConfig(
    model_id="gemini-2.5-flash",
    provider="CustomGeminiProvider",
    provider_kwargs={"api_key": "your-api-key"}
)
model = lx.factory.create_model(config)

result = lx.extract(
    text_or_documents="Your text here",
    model=model,
    prompt_description="Extract key information"
)
```
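
For contrast, when a plugin registers a unique pattern, explicit selection is unnecessary; the model ID alone picks the provider. A minimal sketch, assuming a hypothetical installed plugin whose class `MyProvider` registered the pattern `^mymodel`:

```python
import langextract as lx

# Implicit selection: the registry matches the model ID against registered
# patterns, so the hypothetical r'^mymodel' provider handles this directly.
config = lx.factory.ModelConfig(model_id="mymodel-large")
model = lx.factory.create_model(config)
```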

## Creating Your Own Provider

1. Copy this example as a starting point
2. Update the provider class name and registration pattern
3. Replace the Gemini implementation with your own backend
4. Update package name in `pyproject.toml`
5. Install and test your plugin
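
As a rough starting point for steps 2-3, a minimal provider might look like the sketch below. `MyProvider`, the `^mymodel` pattern, and the echoed response are placeholders rather than a real backend:

```python
import langextract as lx


@lx.providers.registry.register(r'^mymodel')  # step 2: your own model-ID pattern
class MyProvider(lx.inference.BaseLanguageModel):
    """Skeleton provider; replace the echo logic with your backend (step 3)."""

    def __init__(self, model_id: str, **kwargs):
        self.model_id = model_id
        super().__init__()

    def infer(self, batch_prompts, **kwargs):
        # Yield one list of ScoredOutputs per prompt, matching the base-class contract.
        for prompt in batch_prompts:
            text = f"echo: {prompt}"  # placeholder for a real API call
            yield [lx.inference.ScoredOutput(score=1.0, output=text)]
```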

## License

Apache License 2.0
examples/custom_provider_plugin/langextract_provider_example/__init__.py (new file, 20 lines)

# Copyright 2025 Google LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Example custom provider plugin for LangExtract."""

from langextract_provider_example.provider import CustomGeminiProvider

__all__ = ["CustomGeminiProvider"]
__version__ = "0.1.0"
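
The re-export above is what makes the entry-point value `langextract_provider_example:CustomGeminiProvider` in `pyproject.toml` resolvable: an entry point of the form `module:attr` is loaded by importing `module` and looking up `attr` on it.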
examples/custom_provider_plugin/langextract_provider_example/provider.py (new file, 125 lines)

# Copyright 2025 Google LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Minimal example of a custom provider plugin for LangExtract."""

from __future__ import annotations

import dataclasses
from typing import Any, Iterator, Sequence

import langextract as lx


@lx.providers.registry.register(
    r'^gemini',  # Matches Gemini model IDs (same as default provider)
)
@dataclasses.dataclass(init=False)
class CustomGeminiProvider(lx.inference.BaseLanguageModel):
    """Example custom LangExtract provider implementation.

    This demonstrates how to create a custom provider for LangExtract
    that can intercept and handle model requests. This example uses
    Gemini as the backend, but you would replace this with your own
    API or model implementation.

    Note: Since this registers the same pattern as the default Gemini provider,
    you must explicitly specify this provider when creating a model:

        config = lx.factory.ModelConfig(
            model_id="gemini-2.5-flash",
            provider="CustomGeminiProvider"
        )
        model = lx.factory.create_model(config)
    """

    model_id: str
    api_key: str | None
    temperature: float
    _client: Any = dataclasses.field(repr=False, compare=False)

    def __init__(
        self,
        model_id: str = 'gemini-2.5-flash',
        api_key: str | None = None,
        temperature: float = 0.0,
        **kwargs: Any,
    ) -> None:
        """Initialize the custom provider.

        Args:
            model_id: The model ID.
            api_key: API key for the service.
            temperature: Sampling temperature.
            **kwargs: Additional parameters.
        """
        # TODO: Replace with your own client initialization
        try:
            from google import genai  # pylint: disable=import-outside-toplevel
        except ImportError as e:
            raise lx.exceptions.InferenceConfigError(
                'This example requires google-genai package. '
                'Install with: pip install google-genai'
            ) from e

        self.model_id = model_id
        self.api_key = api_key
        self.temperature = temperature

        # Store any additional kwargs for potential use
        self._extra_kwargs = kwargs

        if not self.api_key:
            raise lx.exceptions.InferenceConfigError(
                'API key required. Set GEMINI_API_KEY or pass api_key parameter.'
            )

        self._client = genai.Client(api_key=self.api_key)

        super().__init__()

    def infer(
        self, batch_prompts: Sequence[str], **kwargs: Any
    ) -> Iterator[Sequence[lx.inference.ScoredOutput]]:
        """Run inference on a batch of prompts.

        Args:
            batch_prompts: Input prompts to process.
            **kwargs: Additional generation parameters.

        Yields:
            Lists of ScoredOutputs, one per prompt.
        """
        config = {
            'temperature': kwargs.get('temperature', self.temperature),
        }

        # Add other parameters if provided
        for key in ['max_output_tokens', 'top_p', 'top_k']:
            if key in kwargs:
                config[key] = kwargs[key]

        for prompt in batch_prompts:
            try:
                # TODO: Replace this with your own API/model calls
                response = self._client.models.generate_content(
                    model=self.model_id, contents=prompt, config=config
                )
                output = response.text.strip()
                yield [lx.inference.ScoredOutput(score=1.0, output=output)]

            except Exception as e:
                raise lx.exceptions.InferenceRuntimeError(
                    f'API error: {str(e)}', original=e
                ) from e
examples/custom_provider_plugin/pyproject.toml (new file, 38 lines)

# Copyright 2025 Google LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "langextract-provider-example" # Change to your package name
version = "0.1.0" # Update version for releases
description = "Example custom provider plugin for LangExtract"
readme = "README.md"
requires-python = ">=3.10"
license = {text = "Apache-2.0"}
dependencies = [
    # Uncomment when creating a standalone plugin package:
    # "langextract",  # Will install latest version
    "google-genai>=0.2.0",  # Replace with your backend's SDK
]

# Register the provider with LangExtract's plugin system
[project.entry-points."langextract.providers"]
custom_gemini = "langextract_provider_example:CustomGeminiProvider"

[tool.setuptools.packages.find]
where = ["."]
include = ["langextract_provider_example*"]
examples/custom_provider_plugin/test_example_provider.py (new file, 57 lines)

#!/usr/bin/env python3
# Copyright 2025 Google LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Simple test for the custom provider plugin."""

import os

# Import the provider to trigger registration with LangExtract
# Note: This manual import is only needed when running without installation.
# After `pip install -e .`, the entry point system handles this automatically.
from langextract_provider_example import CustomGeminiProvider # noqa: F401

import langextract as lx


def main():
    """Test the custom provider."""
    api_key = os.getenv("GEMINI_API_KEY") or os.getenv("LANGEXTRACT_API_KEY")

    if not api_key:
        print("Set GEMINI_API_KEY or LANGEXTRACT_API_KEY to test")
        return

    # Create model using explicit provider selection
    config = lx.factory.ModelConfig(
        model_id="gemini-2.5-flash",
        provider="CustomGeminiProvider",
        provider_kwargs={"api_key": api_key},
    )
    model = lx.factory.create_model(config)

    print(f"✓ Created {model.__class__.__name__}")

    # Test inference
    prompts = ["Say hello"]
    results = list(model.infer(prompts))

    if results and results[0]:
        print(f"✓ Inference worked: {results[0][0].output[:50]}...")
    else:
        print("✗ No response")


if __name__ == "__main__":
    main()