exa docs and python package update (#31307)

theishangoswami · ccurme · web-flow · commit f16456139bb9 · 2025-05-21T21:33:30.000-04:00
Added support for new Exa API features. Updated Exa docs and Python package (langchain-exa). Description: Added support for new Exa API features in the langchain-exa package: - Added max_characters option for text content - Added support for summary and custom summary prompts - Added livecrawl option with "always", "fallback", "never" settings - Added "auto" option for search type - Updated documentation and tests. Dependencies: No new dependencies required. Using existing features from exa-py. twitter: @theishangoswami --------- Co-authored-by: Chester Curme <chester.curme@gmail.com>
diff --git a/libs/partners/exa/README.md b/libs/partners/exa/README.md
@@ -27,6 +27,30 @@ results  = exa.invoke("What is the capital of France?")
 print(results)
 ```
 
+### Advanced Features
+
+You can use advanced features like text limits, summaries, and live crawling:
+
+```python
+from langchain_exa import ExaSearchRetriever, TextContentsOptions
+
+# Create a new instance with advanced options
+exa = ExaSearchRetriever(
+    exa_api_key="YOUR API KEY",
+    k=20,  # Number of results (1-100)
+    type="auto",  # Can be "neural", "keyword", or "auto"
+    livecrawl="always",  # Can be "always", "fallback", or "never"
+    summary=True,  # Get an AI-generated summary of each result
+    text_contents_options={"max_characters": 3000}  # Limit text length
+)
+
+# Search for a query with custom summary prompt
+exa_with_custom_summary = ExaSearchRetriever(
+    exa_api_key="YOUR API KEY",
+    summary={"query": "generate one line summary in simple words."}  # Custom summary prompt
+)
+```
+
 ## Exa Search Results
 
 You can run the ExaSearchResults module as follows
@@ -48,6 +72,33 @@ search_results = search_tool._run(
 print("Search Results:", search_results)
 ```
 
+### Advanced Features
+
+You can use advanced features like text limits, summaries, and live crawling:
+
+```python
+from langchain_exa import ExaSearchResults
+
+# Initialize the ExaSearchResults tool
+search_tool = ExaSearchResults(exa_api_key="YOUR API KEY")
+
+# Perform a search query with advanced options
+search_results = search_tool._run(
+    query="Latest AI research papers",
+    num_results=10,  # Number of results (1-100)
+    type="auto",  # Can be "neural", "keyword", or "auto"
+    livecrawl="always",  # Can be "always", "fallback", or "never"
+    summary=True,  # Get an AI-generated summary of each result
+    text_contents_options={"max_characters": 2000}  # Limit text length
+)
+
+# With custom summary prompt
+search_results_with_custom_summary = search_tool._run(
+    query="Latest AI research papers",
+    summary={"query": "generate one liner"}  # Custom summary prompt
+)
+```
+
 ## Exa Find Similar Results
 
 You can run the ExaFindSimilarResults module as follows
@@ -67,4 +118,22 @@ similar_results = find_similar_tool._run(
 )
 
 print("Similar Results:", similar_results)
+```
+
+### Advanced Features
+
+```python
+from langchain_exa import ExaFindSimilarResults
+
+# Initialize the ExaFindSimilarResults tool
+find_similar_tool = ExaFindSimilarResults(exa_api_key="YOUR API KEY")
+
+# Find similar results with advanced options
+similar_results = find_similar_tool._run(
+    url="http://espn.com",
+    num_results=10,  # Number of results (1-100)
+    livecrawl="fallback",  # Can be "always", "fallback", or "never"
+    summary=True,  # Get an AI-generated summary of each result
+    text_contents_options={"max_characters": 1500}  # Limit text length
+)
 ```
diff --git a/libs/partners/exa/langchain_exa/retrievers.py b/libs/partners/exa/langchain_exa/retrievers.py
@@ -27,14 +27,16 @@ def _get_metadata(result: Any) -> dict[str, Any]:
         metadata["highlights"] = result.highlights
     if getattr(result, "highlight_scores"):
         metadata["highlight_scores"] = result.highlight_scores
+    if getattr(result, "summary"):
+        metadata["summary"] = result.summary
     return metadata
 
 
 class ExaSearchRetriever(BaseRetriever):
     """Exa Search retriever."""
 
     k: int = 10  # num_results
-    """The number of search results to return."""
+    """The number of search results to return (1 to 100)."""
     include_domains: Optional[list[str]] = None
     """A list of domains to include in the search."""
     exclude_domains: Optional[list[str]] = None
@@ -50,11 +52,20 @@ class ExaSearchRetriever(BaseRetriever):
     use_autoprompt: Optional[bool] = None
     """Whether to use autoprompt for the search."""
     type: str = "neural"
-    """The type of search, 'keyword' or 'neural'. Default: neural"""
+    """The type of search, 'keyword', 'neural', or 'auto'. Default: neural"""
     highlights: Optional[Union[HighlightsContentsOptions, bool]] = None
     """Whether to set the page content to the highlights of the results."""
-    text_contents_options: Union[TextContentsOptions, Literal[True]] = True
-    """How to set the page content of the results"""
+    text_contents_options: Union[TextContentsOptions, dict[str, Any], Literal[True]] = (
+        True
+    )
+    """How to set the page content of the results. Can be True or a dict with options
+    like max_characters."""
+    livecrawl: Optional[Literal["always", "fallback", "never"]] = None
+    """Option to crawl live webpages if content is not in the index. Options: "always",
+    "fallback", "never"."""
+    summary: Optional[Union[bool, dict[str, str]]] = None
+    """Whether to include a summary of the content. Can be a boolean or a dict with a
+    custom query."""
 
     client: Exa = Field(default=None)
     exa_api_key: SecretStr = Field(default=None)
@@ -82,6 +93,9 @@ def _get_relevant_documents(
             start_published_date=self.start_published_date,
             end_published_date=self.end_published_date,
             use_autoprompt=self.use_autoprompt,
+            livecrawl=self.livecrawl,
+            summary=self.summary,
+            type=self.type,
         )
 
         results = response.results
diff --git a/libs/partners/exa/langchain_exa/tools.py b/libs/partners/exa/langchain_exa/tools.py
@@ -1,6 +1,6 @@
 """Tool for the Exa Search API."""
 
-from typing import Any, Optional, Union
+from typing import Any, Literal, Optional, Union
 
 from exa_py import Exa  # type: ignore[untyped-import]
 from exa_py.api import (
@@ -74,8 +74,10 @@ def validate_environment(cls, values: dict) -> Any:
     def _run(
         self,
         query: str,
-        num_results: int,
-        text_contents_options: Optional[Union[TextContentsOptions, bool]] = None,
+        num_results: int = 10,
+        text_contents_options: Optional[
+            Union[TextContentsOptions, dict[str, Any], bool]
+        ] = None,
         highlights: Optional[Union[HighlightsContentsOptions, bool]] = None,
         include_domains: Optional[list[str]] = None,
         exclude_domains: Optional[list[str]] = None,
@@ -84,9 +86,30 @@ def _run(
         start_published_date: Optional[str] = None,
         end_published_date: Optional[str] = None,
         use_autoprompt: Optional[bool] = None,
+        livecrawl: Optional[Literal["always", "fallback", "never"]] = None,
+        summary: Optional[Union[bool, dict[str, str]]] = None,
+        type: Optional[Literal["neural", "keyword", "auto"]] = None,
         run_manager: Optional[CallbackManagerForToolRun] = None,
     ) -> Union[list[dict], str]:
-        """Use the tool."""
+        """Use the tool.
+
+        Args:
+            query: The search query.
+            num_results: The number of search results to return (1 to 100). Default: 10
+            text_contents_options: How to set the page content of the results. Can be True or a dict with options like max_characters.
+            highlights: Whether to include highlights in the results.
+            include_domains: A list of domains to include in the search.
+            exclude_domains: A list of domains to exclude from the search.
+            start_crawl_date: The start date for the crawl (in YYYY-MM-DD format).
+            end_crawl_date: The end date for the crawl (in YYYY-MM-DD format).
+            start_published_date: The start date for when the document was published (in YYYY-MM-DD format).
+            end_published_date: The end date for when the document was published (in YYYY-MM-DD format).
+            use_autoprompt: Whether to use autoprompt for the search.
+            livecrawl: Option to crawl live webpages if content is not in the index. Options: "always", "fallback", "never"
+            summary: Whether to include a summary of the content. Can be a boolean or a dict with a custom query.
+            type: The type of search, 'keyword', 'neural', or 'auto'.
+            run_manager: The run manager for callbacks.
+        """  # noqa: E501
         try:
             return self.client.search_and_contents(
                 query,
@@ -100,6 +123,9 @@ def _run(
                 start_published_date=start_published_date,
                 end_published_date=end_published_date,
                 use_autoprompt=use_autoprompt,
+                livecrawl=livecrawl,
+                summary=summary,
+                type=type,
             )  # type: ignore
         except Exception as e:
             return repr(e)
@@ -128,8 +154,10 @@ def validate_environment(cls, values: dict) -> Any:
     def _run(
         self,
         url: str,
-        num_results: int,
-        text_contents_options: Optional[Union[TextContentsOptions, bool]] = None,
+        num_results: int = 10,
+        text_contents_options: Optional[
+            Union[TextContentsOptions, dict[str, Any], bool]
+        ] = None,
         highlights: Optional[Union[HighlightsContentsOptions, bool]] = None,
         include_domains: Optional[list[str]] = None,
         exclude_domains: Optional[list[str]] = None,
@@ -139,9 +167,29 @@ def _run(
         end_published_date: Optional[str] = None,
         exclude_source_domain: Optional[bool] = None,
         category: Optional[str] = None,
+        livecrawl: Optional[Literal["always", "fallback", "never"]] = None,
+        summary: Optional[Union[bool, dict[str, str]]] = None,
         run_manager: Optional[CallbackManagerForToolRun] = None,
     ) -> Union[list[dict], str]:
-        """Use the tool."""
+        """Use the tool.
+
+        Args:
+            url: The URL to find similar pages for.
+            num_results: The number of search results to return (1 to 100). Default: 10
+            text_contents_options: How to set the page content of the results. Can be True or a dict with options like max_characters.
+            highlights: Whether to include highlights in the results.
+            include_domains: A list of domains to include in the search.
+            exclude_domains: A list of domains to exclude from the search.
+            start_crawl_date: The start date for the crawl (in YYYY-MM-DD format).
+            end_crawl_date: The end date for the crawl (in YYYY-MM-DD format).
+            start_published_date: The start date for when the document was published (in YYYY-MM-DD format).
+            end_published_date: The end date for when the document was published (in YYYY-MM-DD format).
+            exclude_source_domain: If True, exclude pages from the same domain as the source URL.
+            category: Filter for similar pages by category.
+            livecrawl: Option to crawl live webpages if content is not in the index. Options: "always", "fallback", "never"
+            summary: Whether to include a summary of the content. Can be a boolean or a dict with a custom query.
+            run_manager: The run manager for callbacks.
+        """  # noqa: E501
         try:
             return self.client.find_similar_and_contents(
                 url,
@@ -156,6 +204,8 @@ def _run(
                 end_published_date=end_published_date,
                 exclude_source_domain=exclude_source_domain,
                 category=category,
+                livecrawl=livecrawl,
+                summary=summary,
             )  # type: ignore
         except Exception as e:
             return repr(e)
diff --git a/libs/partners/exa/pyproject.toml b/libs/partners/exa/pyproject.toml
@@ -6,9 +6,9 @@ build-backend = "pdm.backend"
 authors = []
 license = { text = "MIT" }
 requires-python = ">=3.9"
-dependencies = ["langchain-core<1.0.0,>=0.3.15", "exa-py<2.0.0,>=1.0.8"]
+dependencies = ["langchain-core<1.0.0,>=0.3.60", "exa-py<2.0.0,>=1.0.8"]
 name = "langchain-exa"
-version = "0.2.1"
+version = "0.3.0"
 description = "An integration package connecting Exa and LangChain"
 readme = "README.md"
 
diff --git a/libs/partners/exa/tests/integration_tests/test_retriever.py b/libs/partners/exa/tests/integration_tests/test_retriever.py
@@ -26,3 +26,19 @@ def test_exa_retriever_highlights() -> None:
     assert isinstance(highlight_scores, list)
     assert isinstance(highlights[0], str)
     assert isinstance(highlight_scores[0], float)
+
+
+def test_exa_retriever_advanced_features() -> None:
+    retriever = ExaSearchRetriever(
+        k=3, text_contents_options={"max_characters": 1000}, summary=True, type="auto"
+    )
+    res = retriever.invoke("best time to visit japan")
+    print(res)  # noqa: T201
+    assert len(res) == 3  # requested k=3
+    assert isinstance(res, list)
+    assert isinstance(res[0], Document)
+    # Verify summary is in metadata
+    assert "summary" in res[0].metadata
+    assert isinstance(res[0].metadata["summary"], str)
+    # Verify text was limited
+    assert len(res[0].page_content) <= 1000
diff --git a/libs/partners/exa/tests/integration_tests/test_search_tool.py b/libs/partners/exa/tests/integration_tests/test_search_tool.py
@@ -8,3 +8,23 @@ def test_search_tool() -> None:
     res = tool.invoke({"query": "best time to visit japan", "num_results": 5})
     print(res)  # noqa: T201
     assert not isinstance(res, str)  # str means error for this tool\
+
+
+def test_search_tool_advanced_features() -> None:
+    tool = ExaSearchResults()
+    res = tool.invoke(
+        {
+            "query": "best time to visit japan",
+            "num_results": 3,
+            "text_contents_options": {"max_characters": 1000},
+            "summary": True,
+            "type": "auto",
+        }
+    )
+    print(res)  # noqa: T201
+    assert not isinstance(res, str)  # str means error for this tool
+    assert len(res.results) == 3
+    # Verify summary exists
+    assert hasattr(res.results[0], "summary")
+    # Verify text was limited
+    assert len(res.results[0].text) <= 1000
diff --git a/libs/partners/exa/uv.lock b/libs/partners/exa/uv.lock