Pre/beta #200

Closed

wants to merge 16 commits into from
19 changes: 19 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,22 @@
## [0.10.0-beta.4](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.10.0-beta.3...v0.10.0-beta.4) (2024-05-09)


### Features

* Add support for passing pdf path as source ([f10f3b1](https://github.com/VinciGit00/Scrapegraph-ai/commit/f10f3b1438e0c625b7f2fa52faeb5a6c12116113))


### Bug Fixes

* limit python version to < 3.12 ([a37fbbc](https://github.com/VinciGit00/Scrapegraph-ai/commit/a37fbbcbcfc3ddd0cc66f586f279676b52c4abfe))

## [0.10.0-beta.3](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.10.0-beta.2...v0.10.0-beta.3) (2024-05-09)


### Features

* update info ([4ed0fb8](https://github.com/VinciGit00/Scrapegraph-ai/commit/4ed0fb89c3e6068190a7775bedcb6ae65ba59d18))

## [0.10.0-beta.2](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.10.0-beta.1...v0.10.0-beta.2) (2024-05-08)


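For context on the headline feature above ("passing pdf path as source"), a hedged usage sketch; it assumes PDFScraperGraph keeps the same prompt/source/config constructor as the other graphs, and the model name, config shape, and file path are illustrative:

```python
# Hypothetical sketch: pass a local PDF path as the source.
from scrapegraphai.graphs import PDFScraperGraph

graph_config = {
    "llm": {"model": "ollama/llama3", "temperature": 0},  # assumed provider/model
}

pdf_scraper = PDFScraperGraph(
    prompt="Summarize the key findings of this document.",
    source="documents/report.pdf",  # hypothetical local PDF path
    config=graph_config,
)
print(pdf_scraper.run())
```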
3,346 changes: 3,346 additions & 0 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,7 +1,7 @@
[tool.poetry]
name = "scrapegraphai"

version = "0.10.0b2"
version = "0.10.0b4"

description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
authors = [
@@ -23,7 +23,7 @@ classifiers = [
]

[tool.poetry.dependencies]
python = "^3.9"
python = ">=3.9, <3.12"
langchain = "0.1.15"
langchain-openai = "^0.1.6"
langchain-google-genai = "^1.0.3"
1 change: 1 addition & 0 deletions requirements.txt
@@ -18,3 +18,4 @@ playwright==1.43.0
langchain-aws==0.1.2
langchain-anthropic==0.1.11
yahoo-search-py==0.3
pypdf==4.2.0
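pypdf is the backend for the PyPDFLoader-based PDF branch added to FetchNode below; a minimal stand-alone sketch (the file path is illustrative):

```python
# Load a PDF page by page via LangChain's PyPDFLoader (backed by pypdf).
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("documents/report.pdf")  # hypothetical path
pages = loader.load()  # one Document per page, with page-number metadata
print(len(pages), pages[0].page_content[:200])
```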
8 changes: 4 additions & 4 deletions scrapegraphai/graphs/abstract_graph.py
@@ -4,16 +4,16 @@
from abc import ABC, abstractmethod
from typing import Optional
from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceHubEmbeddings, OllamaEmbeddings
from langchain_community.embeddings import HuggingFaceHubEmbeddings, OllamaEmbeddings, BedrockEmbeddings
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from ..helpers import models_tokens
from ..models import AzureOpenAI, Bedrock, Gemini, Groq, HuggingFace, Ollama, OpenAI, Anthropic
from ..models import AzureOpenAI, Bedrock, Gemini, Groq, HuggingFace, Ollama, OpenAI, Anthropic, Claude


class AbstractGraph(ABC):
"""
Scaffolding class for creating a graph representation and executing it.

Attributes:
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
@@ -162,7 +162,7 @@ def _create_llm(self, llm_config: dict, chat=False) -> object:
try:
self.model_token = models_tokens["ollama"][llm_params["model"]]
except KeyError as exc:
raise KeyError("Model not supported") from exc
self.model_token = 8192
else:
self.model_token = 8192
except AttributeError:
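The except-clause change above means an Ollama model missing from models_tokens no longer aborts with KeyError; it silently falls back to an 8192-token default. A stand-alone sketch of the new behaviour, simplified from the diff (the model name is hypothetical):

```python
# Mirrors the fallback pattern in _create_llm (simplified, assumption-based).
models_tokens = {"ollama": {"llama3": 8192, "mistral": 8192}}

def resolve_model_token(model_name: str, default: int = 8192) -> int:
    try:
        return models_tokens["ollama"][model_name]
    except KeyError:
        # Previously: raise KeyError("Model not supported") from exc
        return default

assert resolve_model_token("my-custom-finetune") == 8192  # unknown model no longer fails
```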
4 changes: 2 additions & 2 deletions scrapegraphai/graphs/csv_scraper_graph.py
@@ -30,7 +30,7 @@ def _create_graph(self):
Creates the graph of nodes representing the workflow for web scraping.
"""
fetch_node = FetchNode(
input="csv_dir",
input="csv",
output=["doc"],
)
parse_node = ParseNode(
@@ -78,4 +78,4 @@ def run(self) -> str:
inputs = {"user_prompt": self.prompt, self.input_key: self.source}
self.final_state, self.execution_info = self.graph.execute(inputs)

return self.final_state.get("answer", "No answer found.")
return self.final_state.get("answer", "No answer found.")
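With the FetchNode input key now "csv", a hedged end-to-end sketch; it assumes CSVScraperGraph shares the prompt/source/config constructor used by the other graphs, and the path, model, and config shape are illustrative:

```python
# Hypothetical sketch of querying a local CSV file.
from scrapegraphai.graphs import CSVScraperGraph

csv_scraper = CSVScraperGraph(
    prompt="List the five products with the highest revenue.",
    source="data/sales.csv",  # hypothetical local CSV path
    config={"llm": {"model": "ollama/llama3"}},  # assumed config shape
)
print(csv_scraper.run())
```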
13 changes: 3 additions & 10 deletions scrapegraphai/graphs/pdf_scraper_graph.py
@@ -56,36 +56,29 @@ def _create_graph(self) -> BaseGraph:
"""

fetch_node = FetchNode(
input="pdf_dir",
input='pdf',
output=["doc"],
node_config={
"headless": self.headless,
"verbose": self.verbose
}
)
parse_node = ParseNode(
input="doc",
output=["parsed_doc"],
node_config={
"chunk_size": self.model_token,
"verbose": self.verbose
}
)
rag_node = RAGNode(
input="user_prompt & (parsed_doc | doc)",
output=["relevant_chunks"],
node_config={
"llm": self.llm_model,
"llm_model": self.llm_model,
"embedder_model": self.embedder_model,
"verbose": self.verbose
}
)
generate_answer_node = GenerateAnswerNode(
input="user_prompt & (relevant_chunks | parsed_doc | doc)",
output=["answer"],
node_config={
"llm": self.llm_model,
"verbose": self.verbose
"llm_model": self.llm_model,
}
)

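Note the node_config key rename from "llm" to "llm_model" in both nodes above. A hedged fragment for anyone wiring these nodes into a custom pipeline (the function and variable names are illustrative):

```python
# Hypothetical custom-pipeline fragment: nodes that previously took
# node_config={"llm": ...} now expect "llm_model", per the diff above.
from scrapegraphai.nodes import RAGNode, GenerateAnswerNode

def build_answer_stage(llm_model, embedder_model):
    rag_node = RAGNode(
        input="user_prompt & (parsed_doc | doc)",
        output=["relevant_chunks"],
        node_config={
            "llm_model": llm_model,            # renamed from "llm"
            "embedder_model": embedder_model,
        },
    )
    generate_answer_node = GenerateAnswerNode(
        input="user_prompt & (relevant_chunks | parsed_doc | doc)",
        output=["answer"],
        node_config={"llm_model": llm_model},  # renamed from "llm"
    )
    return rag_node, generate_answer_node
```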
2 changes: 1 addition & 1 deletion scrapegraphai/graphs/script_creator_graph.py
@@ -110,4 +110,4 @@ def run(self) -> str:
inputs = {"user_prompt": self.prompt, self.input_key: self.source}
self.final_state, self.execution_info = self.graph.execute(inputs)

return self.final_state.get("answer", "No answer found.")
return self.final_state.get("answer", "No answer found ")
4 changes: 2 additions & 2 deletions scrapegraphai/graphs/xml_scraper_graph.py
@@ -56,7 +56,7 @@ def _create_graph(self) -> BaseGraph:
"""

fetch_node = FetchNode(
input="xml_dir",
input="xml",
output=["doc"]
)
parse_node = ParseNode(
@@ -108,4 +108,4 @@ def run(self) -> str:
inputs = {"user_prompt": self.prompt, self.input_key: self.source}
self.final_state, self.execution_info = self.graph.execute(inputs)

return self.final_state.get("answer", "No answer found.")
return self.final_state.get("answer", "No answer found.")
3 changes: 2 additions & 1 deletion scrapegraphai/helpers/models_tokens.py
@@ -39,7 +39,8 @@
"dolphin-mixtral": 32000,
"mistral-openorca": 32000,
"stablelm-zephyr": 8192,
"nomic-embed-text": 8192,
"mxbai-embed-large": 8192
},
"groq": {
"llama3-8b-8192": 8192,
1 change: 1 addition & 0 deletions scrapegraphai/models/__init__.py
@@ -12,3 +12,4 @@
from .groq import Groq
from .bedrock import Bedrock
from .anthropic import Anthropic
from .claude import Claude
19 changes: 19 additions & 0 deletions scrapegraphai/models/claude.py
@@ -0,0 +1,19 @@
"""
Claude model
"""
from langchain_anthropic import ChatAnthropic


class Claude(ChatAnthropic):
"""Wrapper class for the ChatAnthropic (Claude) model."""

def __init__(self, llm_config: dict):
"""
A wrapper for the Claude class that provides default configuration
and could be extended with additional methods if needed.

Args:
llm_config (dict): Configuration parameters for the language model.
"""
# Initialize the superclass (ChatAnthropic) with provided config parameters
super().__init__(**llm_config)
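The new wrapper forwards its config dict straight into ChatAnthropic, so any ChatAnthropic keyword should work. A hedged usage sketch (the model id and key are placeholders):

```python
# Hypothetical usage of the new Claude wrapper; config keys follow
# langchain_anthropic.ChatAnthropic, and the model id is an assumption.
from scrapegraphai.models import Claude

llm = Claude({
    "model": "claude-3-haiku-20240307",   # assumed model id
    "temperature": 0,
    "anthropic_api_key": "YOUR_API_KEY",  # placeholder
})
print(llm.invoke("Reply with the word: ready").content)
```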
35 changes: 28 additions & 7 deletions scrapegraphai/nodes/fetch_node.py
@@ -1,10 +1,11 @@
"""
FetchNode Module
"""

import pandas as pd
from typing import List, Optional
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader
from .base_node import BaseNode
from ..utils.remover import remover

@@ -21,19 +22,21 @@ class FetchNode(BaseNode):
Attributes:
headless (bool): A flag indicating whether the browser should run in headless mode.
verbose (bool): A flag indicating whether to print verbose output during execution.

Args:
input (str): Boolean expression defining the input keys needed from the state.
output (List[str]): List of output keys to be updated in the state.
node_config (Optional[dict]): Additional configuration for the node.
node_name (str): The unique identifier name for the node, defaulting to "Fetch".
"""

def __init__(self, input: str, output: List[str], node_config: Optional[dict]=None, node_name: str = "Fetch"):
def __init__(self, input: str, output: List[str], node_config: Optional[dict] = None, node_name: str = "Fetch"):
super().__init__(node_name, "node", input, output, 1)

self.headless = True if node_config is None else node_config.get("headless", True)
self.verbose = False if node_config is None else node_config.get("verbose", False)
self.headless = True if node_config is None else node_config.get(
"headless", True)
self.verbose = False if node_config is None else node_config.get(
"verbose", False)

def execute(self, state):
"""
@@ -56,7 +59,6 @@ def execute(self, state):

# Interpret input keys based on the provided input expression
input_keys = self.get_input_keys(state)

# Fetching data from the state based on the input keys
input_data = [state[key] for key in input_keys]

@@ -66,14 +68,33 @@
"source": "local_dir"
})]
# if it is a local directory

# handling for pdf
elif self.input == "pdf":
loader = PyPDFLoader(source)
compressed_document = loader.load()

elif self.input == "csv":
compressed_document = [Document(page_content=pd.read_csv(source).to_string(), metadata={
"source": "csv"
})]
elif self.input == "xml":
with open(source, 'r', encoding='utf-8') as f:
data = f.read()
compressed_document = [Document(page_content=data, metadata={
"source": "xml"
})]
elif self.input == "pdf_dir":
pass

elif not source.startswith("http"):
compressed_document = [Document(page_content=remover(source), metadata={
"source": "local_dir"
})]

else:
if self.node_config is not None and self.node_config.get("endpoint") is not None:

loader = AsyncChromiumLoader(
[source],
proxies={"http": self.node_config["endpoint"]},
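Taken together, FetchNode now dispatches on its input expression to pick a loader. A hedged sketch of exercising the new branches directly; it assumes the input expression resolves to a state key of the same name, and the paths are illustrative:

```python
# Hypothetical direct use of FetchNode's new source types.
from scrapegraphai.nodes import FetchNode

fetch_pdf = FetchNode(input="pdf", output=["doc"])
state = fetch_pdf.execute({"pdf": "documents/report.pdf"})  # PyPDFLoader branch

fetch_csv = FetchNode(input="csv", output=["doc"])
state = fetch_csv.execute({"csv": "data/table.csv"})        # pandas branch

fetch_xml = FetchNode(input="xml", output=["doc"])
state = fetch_xml.execute({"xml": "data/feed.xml"})         # raw-text branch

print(state["doc"][0].metadata)  # assumes execute() writes the output key back to state
```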