feat: add arxiv toolkit (#994)

Wendong-Fan · SubjectAlphaChisato · web-flow · commit 3978d4704754 · 2024-10-16T17:34:09.000+08:00
Co-authored-by: dreamflyfreya <dreamfly@sas.upenn.edu>
diff --git a/camel/toolkits/__init__.py b/camel/toolkits/__init__.py
@@ -20,22 +20,26 @@
 )
 from .open_api_specs.security_config import openapi_security_config
 
-from .google_maps_toolkit import GoogleMapsToolkit
+
 from .math_toolkit import MathToolkit, MATH_FUNCS
-from .open_api_toolkit import OpenAPIToolkit
-from .retrieval_toolkit import RetrievalToolkit
 from .search_toolkit import SearchToolkit, SEARCH_FUNCS
-from .twitter_toolkit import TwitterToolkit
 from .weather_toolkit import WeatherToolkit, WEATHER_FUNCS
-from .slack_toolkit import SlackToolkit
 from .dalle_toolkit import DalleToolkit, DALLE_FUNCS
-from .linkedin_toolkit import LinkedInToolkit
-from .reddit_toolkit import RedditToolkit
 
+from .base import BaseToolkit
+from .google_maps_toolkit import GoogleMapsToolkit
 from .code_execution import CodeExecutionToolkit
 from .github_toolkit import GithubToolkit
+from .arxiv_toolkit import ArxivToolkit
+from .linkedin_toolkit import LinkedInToolkit
+from .reddit_toolkit import RedditToolkit
+from .slack_toolkit import SlackToolkit
+from .twitter_toolkit import TwitterToolkit
+from .open_api_toolkit import OpenAPIToolkit
+from .retrieval_toolkit import RetrievalToolkit
 
 __all__ = [
+    'BaseToolkit',
     'FunctionTool',
     'OpenAIFunction',
     'get_openai_function_schema',
@@ -54,6 +58,7 @@
     'LinkedInToolkit',
     'RedditToolkit',
     'CodeExecutionToolkit',
+    'ArxivToolkit',
     'MATH_FUNCS',
     'SEARCH_FUNCS',
     'WEATHER_FUNCS',
diff --git a/camel/toolkits/arxiv_toolkit.py b/camel/toolkits/arxiv_toolkit.py
@@ -0,0 +1,148 @@
+# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
+
+from typing import Dict, Generator, List, Optional
+
+from camel.toolkits.base import BaseToolkit
+from camel.toolkits.function_tool import FunctionTool
+from camel.utils import dependencies_required
+
+
+class ArxivToolkit(BaseToolkit):
+    r"""A toolkit for interacting with the arXiv API to search and download
+    academic papers.
+    """
+
+    @dependencies_required('arxiv')
+    def __init__(self) -> None:
+        r"""Initializes the ArxivToolkit and sets up the arXiv client."""
+        import arxiv
+
+        self.client = arxiv.Client()
+
+    def _get_search_results(
+        self,
+        query: str,
+        paper_ids: Optional[List[str]] = None,
+        max_results: Optional[int] = 5,
+    ) -> Generator:
+        r"""Retrieves search results from the arXiv API based on the provided
+        query and optional paper IDs.
+
+        Args:
+            query (str): The search query string used to search for papers on
+                arXiv.
+            paper_ids (List[str], optional): A list of specific arXiv paper
+                IDs to search for. (default: :obj:`None`)
+            max_results (int, optional): The maximum number of search results
+                to retrieve. (default: :obj:`5`)
+
+        Returns:
+            Generator: A generator that yields results from the arXiv search
+                query, which includes metadata about each paper matching the
+                query.
+        """
+        import arxiv
+
+        paper_ids = paper_ids or []
+        search_query = arxiv.Search(
+            query=query,
+            id_list=paper_ids,
+            max_results=max_results,
+        )
+        return self.client.results(search_query)
+
+    def search_papers(
+        self,
+        query: str,
+        paper_ids: Optional[List[str]] = None,
+        max_results: Optional[int] = 5,
+    ) -> List[Dict[str, str]]:
+        r"""Searches for academic papers on arXiv using a query string and
+        optional paper IDs.
+
+        Args:
+            query (str): The search query string.
+            paper_ids (List[str], optional): A list of specific arXiv paper
+                IDs to search for. (default: :obj:`None`)
+            max_results (int, optional): The maximum number of search results
+                to return. (default: :obj:`5`)
+
+        Returns:
+            List[Dict[str, str]]: A list of dictionaries, each containing
+                information about a paper, including title, published date,
+                authors, entry ID, summary, and extracted text from the paper.
+        """
+        from arxiv2text import arxiv_to_text
+
+        search_results = self._get_search_results(
+            query, paper_ids, max_results
+        )
+        papers_data = []
+
+        for paper in search_results:
+            paper_info = {
+                "title": paper.title,
+                "published_date": paper.updated.date().isoformat(),
+                "authors": [author.name for author in paper.authors],
+                "entry_id": paper.entry_id,
+                "summary": paper.summary,
+                # TODO: Use chunkr instead of arxiv_to_text for better
+                # performance
+                "paper_text": arxiv_to_text(paper.pdf_url),
+            }
+            papers_data.append(paper_info)
+
+        return papers_data
+
+    def download_papers(
+        self,
+        query: str,
+        paper_ids: Optional[List[str]] = None,
+        max_results: Optional[int] = 5,
+        output_dir: Optional[str] = "./",
+    ) -> str:
+        r"""Downloads PDFs of academic papers from arXiv based on the provided
+        query.
+
+        Args:
+            query (str): The search query string.
+            paper_ids (List[str], optional): A list of specific arXiv paper
+                IDs to download. (default: :obj:`None`)
+            max_results (int, optional): The maximum number of search results
+                to download. (default: :obj:`5`)
+            output_dir (str, optional): The directory to save the downloaded
+                PDFs. (default: :obj:`./`)
+
+        Returns:
+            str: Status message indicating success or failure.
+        """
+        try:
+            search_results = self._get_search_results(
+                query, paper_ids, max_results
+            )
+
+            for paper in search_results:
+                paper.download_pdf(
+                    dirpath=output_dir, filename=f"{paper.title}" + ".pdf"
+                )
+            return "papers downloaded successfully"
+        except Exception as e:
+            return f"An error occurred: {e}"
+
+    def get_tools(self) -> List[FunctionTool]:
+        return [
+            FunctionTool(self.search_papers),
+            FunctionTool(self.download_papers),
+        ]
diff --git a/camel/toolkits/base.py b/camel/toolkits/base.py
@@ -14,10 +14,9 @@
 
 from typing import List
 
+from camel.toolkits import FunctionTool
 from camel.utils import AgentOpsMeta
 
-from .function_tool import FunctionTool
-
 
 class BaseToolkit(metaclass=AgentOpsMeta):
     def get_tools(self) -> List[FunctionTool]:
diff --git a/camel/toolkits/code_execution.py b/camel/toolkits/code_execution.py
@@ -15,8 +15,7 @@
 
 from camel.interpreters import InternalPythonInterpreter
 from camel.toolkits import FunctionTool
-
-from .base import BaseToolkit
+from camel.toolkits.base import BaseToolkit
 
 
 class CodeExecutionToolkit(BaseToolkit):
diff --git a/camel/toolkits/github_toolkit.py b/camel/toolkits/github_toolkit.py
@@ -18,11 +18,10 @@
 
 from pydantic import BaseModel
 
+from camel.toolkits import FunctionTool
+from camel.toolkits.base import BaseToolkit
 from camel.utils import dependencies_required
 
-from .base import BaseToolkit
-from .function_tool import FunctionTool
-
 
 class GithubIssue(BaseModel):
     r"""Represents a GitHub issue.
diff --git a/examples/tool_call/arxiv_toolkit_example.py b/examples/tool_call/arxiv_toolkit_example.py
@@ -0,0 +1,93 @@
+# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
+
+from camel.agents import ChatAgent
+from camel.configs.openai_config import ChatGPTConfig
+from camel.messages import BaseMessage
+from camel.models import ModelFactory
+from camel.toolkits import ArxivToolkit
+from camel.types import ModelPlatformType, ModelType
+
+# Define system message
+sys_msg = BaseMessage.make_assistant_message(
+    role_name='Tools calling opertor', content='You are a helpful assistant'
+)
+
+# Set model config
+tools = ArxivToolkit().get_tools()
+model_config_dict = ChatGPTConfig(
+    temperature=0.0,
+).as_dict()
+
+model = ModelFactory.create(
+    model_platform=ModelPlatformType.OPENAI,
+    model_type=ModelType.GPT_4O_MINI,
+    model_config_dict=model_config_dict,
+)
+
+# Set agent
+camel_agent = ChatAgent(
+    system_message=sys_msg,
+    model=model,
+    tools=tools,
+)
+camel_agent.reset()
+
+# Define a user message
+usr_msg = BaseMessage.make_user_message(
+    role_name="CAMEL User",
+    content="""Search paper 'attention is all you need' for me""",
+)
+
+# Get response information
+response = camel_agent.step(usr_msg)
+print(str(response.info['tool_calls'])[:1000])
+'''
+===============================================================================
+[FunctionCallingRecord(func_name='search_papers', args={'query': 'attention is 
+all you need'}, result=[{'title': "Attention Is All You Need But You Don't 
+Need All Of It For Inference of Large Language Models", 'published_date': 
+'2024-07-22', 'authors': ['Georgy Tyukin', 'Gbetondji J-S Dovonon', 'Jean 
+Kaddour', 'Pasquale Minervini'], 'entry_id': 'http://arxiv.org/abs/2407.
+15516v1', 'summary': 'The inference demand for LLMs has skyrocketed in recent 
+months, and serving\nmodels with low latencies remains challenging due to the 
+quadratic input length\ncomplexity of the attention layers. In this work, we 
+investigate the effect of\ndropping MLP and attention layers at inference time 
+on the performance of\nLlama-v2 models. We find that dropping dreeper 
+attention layers only marginally\ndecreases performance but leads to the best 
+speedups alongside dropping entire\nlayers. For example, removing 33\\% of 
+attention layers in a 13B Llama2 model\nresults in a 1.8\\% drop in average 
+performance ove...
+===============================================================================
+'''
+
+
+# Define a user message
+usr_msg = BaseMessage.make_user_message(
+    role_name="CAMEL User",
+    content="""Download paper "attention is all you need" for me to my 
+    local path '/Users/enrei/Desktop/camel0826/camel/examples/tool_call'""",
+)
+
+# Get response information
+response = camel_agent.step(usr_msg)
+print(str(response.info['tool_calls'])[:1000])
+'''
+===============================================================================
+[FunctionCallingRecord(func_name='download_papers', args={'query': 'attention 
+is all you need', 'output_dir': '/Users/enrei/Desktop/camel0826/camel/examples/
+tool_call', 'paper_ids': ['2407.15516v1', '2107.08000v1', '2306.01926v1', 
+'2112.05993v1', '1912.11959v2']}, result='papers downloaded successfully')]
+===============================================================================
+'''
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
diff --git a/test/toolkits/test_arxiv_functions.py b/test/toolkits/test_arxiv_functions.py