Add API rate limit handler #371

Open · wants to merge 10 commits into base: main
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -7,6 +7,7 @@
- Support for Python 3.13
- Added support for automatic schema extraction from text using LLMs. In the `SimpleKGPipeline`, when the user provides no schema, the automatic schema extraction is enabled by default.
- Added ability to return a user-defined message if context is empty in GraphRAG (which skips the LLM call).
- Added automatic rate limiting with retry logic and exponential backoff for all LLM providers using tenacity. The `RateLimitHandler` interface allows for custom rate limiting strategies, including the ability to disable rate limiting entirely.

### Fixed

31 changes: 31 additions & 0 deletions docs/source/api.rst
@@ -347,6 +347,28 @@ MistralAILLM
:members:


Rate Limiting
=============

RateLimitHandler
----------------

.. autoclass:: neo4j_graphrag.llm.rate_limit.RateLimitHandler
:members:

RetryRateLimitHandler
---------------------

.. autoclass:: neo4j_graphrag.llm.rate_limit.RetryRateLimitHandler
:members:

NoOpRateLimitHandler
--------------------

.. autoclass:: neo4j_graphrag.llm.rate_limit.NoOpRateLimitHandler
:members:


PromptTemplate
==============

@@ -473,6 +495,8 @@ Errors

* :class:`neo4j_graphrag.exceptions.LLMGenerationError`

* :class:`neo4j_graphrag.exceptions.RateLimitError`

* :class:`neo4j_graphrag.exceptions.SchemaValidationError`

* :class:`neo4j_graphrag.exceptions.PdfLoaderError`
@@ -597,6 +621,13 @@ LLMGenerationError
:show-inheritance:


RateLimitError
==============

.. autoclass:: neo4j_graphrag.exceptions.RateLimitError
:show-inheritance:


SchemaValidationError
=====================

85 changes: 85 additions & 0 deletions docs/source/user_guide_rag.rst
@@ -294,6 +294,91 @@ Here's an example using the Python Ollama client:
See :ref:`llminterface`.


Rate Limit Handling
===================

All LLM implementations include automatic rate limiting that uses retry logic with exponential backoff by default. This feature helps handle API rate limits from LLM providers gracefully by automatically retrying failed requests with increasing wait times between attempts.

Default Rate Limit Handler
--------------------------

Rate limiting is enabled by default for all LLM instances with the following configuration:

- **Max attempts**: 3
- **Min wait**: 1.0 seconds
- **Max wait**: 60.0 seconds
- **Multiplier**: 2.0 (exponential backoff)

.. code:: python

from neo4j_graphrag.llm import OpenAILLM

# Rate limiting is automatically enabled
llm = OpenAILLM(model_name="gpt-4o")

# The LLM will automatically retry on rate limit errors
response = llm.invoke("Hello, world!")
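
These defaults correspond roughly to the tenacity policy sketched below. This is an illustration of the behaviour, not the library's internal code, and it assumes rate limit failures are surfaced as `RateLimitError` before being retried:

.. code:: python

    from tenacity import (
        retry,
        retry_if_exception_type,
        stop_after_attempt,
        wait_exponential,
    )

    from neo4j_graphrag.exceptions import RateLimitError

    @retry(
        retry=retry_if_exception_type(RateLimitError),
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=2.0, min=1.0, max=60.0),
        reraise=True,
    )
    def call_provider() -> str:
        # Placeholder for the underlying provider request
        ...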

.. note::

To change the default configuration of `RetryRateLimitHandler`:

.. code:: python

from neo4j_graphrag.llm import OpenAILLM
from neo4j_graphrag.llm.rate_limit import RetryRateLimitHandler

# Customize rate limiting parameters
llm = OpenAILLM(
model_name="gpt-4o",
rate_limit_handler=RetryRateLimitHandler(
max_attempts=10, # Increase max retry attempts
min_wait=2.0, # Increase minimum wait time
max_wait=120.0, # Increase maximum wait time
multiplier=3.0 # More aggressive backoff
)
)

Custom Rate Limiting
--------------------

You can customize the rate limiting behavior by subclassing `RateLimitHandler` and implementing its two hooks, `handle_sync` and `handle_async`; each receives the underlying call and returns a wrapped version of it:

.. code:: python

    from neo4j_graphrag.llm import AnthropicLLM
    from neo4j_graphrag.llm.rate_limit import RateLimitHandler

    class CustomRateLimitHandler(RateLimitHandler):
        """Implement your custom rate limiting strategy."""

        def handle_sync(self, func):
            # Return a wrapped version of the synchronous call, adding
            # your own throttling or retry logic around it.
            def wrapper(*args, **kwargs):
                return func(*args, **kwargs)

            return wrapper

        def handle_async(self, func):
            # Same contract for asynchronous calls.
            async def wrapper(*args, **kwargs):
                return await func(*args, **kwargs)

            return wrapper

    # Create the custom rate limit handler and pass it to the LLM interface
    custom_handler = CustomRateLimitHandler()

    llm = AnthropicLLM(
        model_name="claude-3-sonnet-20240229",
        rate_limit_handler=custom_handler,
    )
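
Because the hooks wrap the call itself, strategies such as token buckets, semaphores, or client-side request pacing can be implemented without touching the LLM classes. The handler above is a pass-through skeleton intended as a starting point.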

Disabling Rate Limiting
-----------------------

For high-throughput applications or when you handle rate limiting externally, you can disable it:

.. code:: python

from neo4j_graphrag.llm import CohereLLM, NoOpRateLimitHandler

# Disable rate limiting completely
llm = CohereLLM(
model_name="command-r-plus",
rate_limit_handler=NoOpRateLimitHandler(),
)
llm.invoke("Hello, world!")


Configuring the Prompt
========================

10 changes: 5 additions & 5 deletions poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -60,6 +60,7 @@ scipy = [
{ version = "^1.13.0", python = ">=3.9,<3.13" },
{ version = "^1.15.0", python = ">=3.13,<3.14" }
]
tenacity = "^9.1.2"

[tool.poetry.group.dev.dependencies]
urllib3 = "<2"
4 changes: 4 additions & 0 deletions src/neo4j_graphrag/exceptions.py
@@ -138,3 +138,7 @@ class InvalidHybridSearchRankerError(Neo4jGraphRagError):

class SearchQueryParseError(Neo4jGraphRagError):
"""Exception raised when there is a query parse error in the text search string."""


class RateLimitError(LLMGenerationError):
"""Exception raised when API rate limit is exceeded."""
13 changes: 13 additions & 0 deletions src/neo4j_graphrag/llm/__init__.py
@@ -18,6 +18,13 @@
from .mistralai_llm import MistralAILLM
from .ollama_llm import OllamaLLM
from .openai_llm import AzureOpenAILLM, OpenAILLM
from .rate_limit import (
RateLimitHandler,
NoOpRateLimitHandler,
RetryRateLimitHandler,
rate_limit_handler,
async_rate_limit_handler,
)
from .types import LLMResponse
from .vertexai_llm import VertexAILLM

@@ -31,4 +38,10 @@
"VertexAILLM",
"AzureOpenAILLM",
"MistralAILLM",
# Rate limiting components
"RateLimitHandler",
"NoOpRateLimitHandler",
"RetryRateLimitHandler",
"rate_limit_handler",
"async_rate_limit_handler",
]
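
For reference, the rate limiting components re-exported above are importable directly from `neo4j_graphrag.llm`, alongside the LLM classes:

.. code:: python

    from neo4j_graphrag.llm import (
        NoOpRateLimitHandler,
        RateLimitHandler,
        RetryRateLimitHandler,
    )
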
10 changes: 9 additions & 1 deletion src/neo4j_graphrag/llm/anthropic_llm.py
@@ -19,6 +19,11 @@

from neo4j_graphrag.exceptions import LLMGenerationError
from neo4j_graphrag.llm.base import LLMInterface
from neo4j_graphrag.llm.rate_limit import (
RateLimitHandler,
rate_limit_handler,
async_rate_limit_handler,
)
from neo4j_graphrag.llm.types import (
BaseMessage,
LLMResponse,
@@ -62,6 +67,7 @@ def __init__(
self,
model_name: str,
model_params: Optional[dict[str, Any]] = None,
rate_limit_handler: Optional[RateLimitHandler] = None,
**kwargs: Any,
):
try:
@@ -71,7 +77,7 @@
"""Could not import Anthropic Python client.
Please install it with `pip install "neo4j-graphrag[anthropic]"`."""
)
super().__init__(model_name, model_params)
super().__init__(model_name, model_params, rate_limit_handler)
self.anthropic = anthropic
self.client = anthropic.Anthropic(**kwargs)
self.async_client = anthropic.AsyncAnthropic(**kwargs)
@@ -93,6 +99,7 @@ def get_messages(
messages.append(UserMessage(content=input).model_dump())
return messages # type: ignore

@rate_limit_handler
def invoke(
self,
input: str,
@@ -129,6 +136,7 @@ def invoke(
except self.anthropic.APIError as e:
raise LLMGenerationError(e)

@async_rate_limit_handler
async def ainvoke(
self,
input: str,
12 changes: 12 additions & 0 deletions src/neo4j_graphrag/llm/base.py
@@ -21,28 +21,40 @@
from neo4j_graphrag.types import LLMMessage

from .types import LLMResponse, ToolCallResponse
from .rate_limit import (
DEFAULT_RATE_LIMIT_HANDLER,
)

from neo4j_graphrag.tool import Tool

from .rate_limit import RateLimitHandler


class LLMInterface(ABC):
"""Interface for large language models.

Args:
model_name (str): The name of the language model.
model_params (Optional[dict]): Additional parameters passed to the model when text is sent to it. Defaults to None.
rate_limit_handler (Optional[RateLimitHandler]): Handler for rate limiting. Defaults to retry with exponential backoff.
**kwargs (Any): Arguments passed to the model when the class is initialised. Defaults to None.
"""

def __init__(
self,
model_name: str,
model_params: Optional[dict[str, Any]] = None,
rate_limit_handler: Optional[RateLimitHandler] = None,
**kwargs: Any,
):
self.model_name = model_name
self.model_params = model_params or {}

if rate_limit_handler is not None:
self._rate_limit_handler = rate_limit_handler
else:
self._rate_limit_handler = DEFAULT_RATE_LIMIT_HANDLER

@abstractmethod
def invoke(
self,
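
To make the wiring concrete, here is a minimal sketch of how a decorator like `rate_limit_handler` could resolve the per-instance handler at call time. This is an assumed implementation for illustration only; the actual decorator lives in `neo4j_graphrag.llm.rate_limit`:

.. code:: python

    import functools
    from typing import Any, Callable

    def rate_limit_handler(func: Callable[..., Any]) -> Callable[..., Any]:
        @functools.wraps(func)
        def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any:
            # Resolve the handler stored on the instance by LLMInterface.__init__,
            # then let it wrap the bound method call.
            bound = functools.partial(func, self)
            return self._rate_limit_handler.handle_sync(bound)(*args, **kwargs)

        return wrapper
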
10 changes: 9 additions & 1 deletion src/neo4j_graphrag/llm/cohere_llm.py
@@ -20,6 +20,11 @@

from neo4j_graphrag.exceptions import LLMGenerationError
from neo4j_graphrag.llm.base import LLMInterface
from neo4j_graphrag.llm.rate_limit import (
RateLimitHandler,
rate_limit_handler,
async_rate_limit_handler,
)
from neo4j_graphrag.llm.types import (
BaseMessage,
LLMResponse,
@@ -60,6 +65,7 @@ def __init__(
self,
model_name: str = "",
model_params: Optional[dict[str, Any]] = None,
rate_limit_handler: Optional[RateLimitHandler] = None,
**kwargs: Any,
) -> None:
try:
Expand All @@ -69,7 +75,7 @@ def __init__(
"""Could not import cohere python client.
Please install it with `pip install "neo4j-graphrag[cohere]"`."""
)
super().__init__(model_name, model_params)
super().__init__(model_name, model_params, rate_limit_handler)
self.cohere = cohere
self.cohere_api_error = cohere.core.api_error.ApiError

@@ -96,6 +102,7 @@ def get_messages(
messages.append(UserMessage(content=input).model_dump())
return messages # type: ignore

@rate_limit_handler
def invoke(
self,
input: str,
@@ -127,6 +134,7 @@ def invoke(
content=res.message.content[0].text if res.message.content else "",
)

@async_rate_limit_handler
async def ainvoke(
self,
input: str,