Pre/beta #200

Closed

wants to merge 16 commits into from
19 changes: 19 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,22 @@
## [0.10.0-beta.4](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.10.0-beta.3...v0.10.0-beta.4) (2024-05-09)


### Features

* Add support for passing pdf path as source ([f10f3b1](https://github.com/VinciGit00/Scrapegraph-ai/commit/f10f3b1438e0c625b7f2fa52faeb5a6c12116113))


### Bug Fixes

* limit python version to < 3.12 ([a37fbbc](https://github.com/VinciGit00/Scrapegraph-ai/commit/a37fbbcbcfc3ddd0cc66f586f279676b52c4abfe))

## [0.10.0-beta.3](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.10.0-beta.2...v0.10.0-beta.3) (2024-05-09)


### Features

* update info ([4ed0fb8](https://github.com/VinciGit00/Scrapegraph-ai/commit/4ed0fb89c3e6068190a7775bedcb6ae65ba59d18))

## [0.10.0-beta.2](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.10.0-beta.1...v0.10.0-beta.2) (2024-05-08)


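For context on the headline feature above ("passing pdf path as source"), a hedged usage sketch; it assumes PDFScraperGraph keeps the same prompt/source/config constructor as the other graphs, and the model name, config shape, and file path are illustrative:

```python
# Hypothetical sketch: pass a local PDF path as the source.
from scrapegraphai.graphs import PDFScraperGraph

graph_config = {
    "llm": {"model": "ollama/llama3", "temperature": 0},  # assumed provider/model
}

pdf_scraper = PDFScraperGraph(
    prompt="Summarize the key findings of this document.",
    source="documents/report.pdf",  # hypothetical local PDF path
    config=graph_config,
)
print(pdf_scraper.run())
```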
3,346 changes: 3,346 additions & 0 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,7 +1,7 @@
[tool.poetry]
name = "scrapegraphai"

version = "0.10.0b2"
version = "0.10.0b4"

description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
authors = [
@@ -23,7 +23,7 @@ classifiers = [
]

[tool.poetry.dependencies]
python = "^3.9"
python = ">=3.9, <3.12"
langchain = "0.1.15"
langchain-openai = "^0.1.6"
langchain-google-genai = "^1.0.3"
1 change: 1 addition & 0 deletions requirements.txt
@@ -18,3 +18,4 @@ playwright==1.43.0
langchain-aws==0.1.2
langchain-anthropic==0.1.11
yahoo-search-py==0.3
pypdf==4.2.0
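pypdf is the backend for the PyPDFLoader-based PDF branch added to FetchNode below; a minimal stand-alone sketch (the file path is illustrative):

```python
# Load a PDF page by page via LangChain's PyPDFLoader (backed by pypdf).
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("documents/report.pdf")  # hypothetical path
pages = loader.load()  # one Document per page, with page-number metadata
print(len(pages), pages[0].page_content[:200])
```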
8 changes: 4 additions & 4 deletions scrapegraphai/graphs/abstract_graph.py
@@ -4,16 +4,16 @@
from abc import ABC, abstractmethod
from typing import Optional
from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceHubEmbeddings, OllamaEmbeddings
from langchain_community.embeddings import HuggingFaceHubEmbeddings, OllamaEmbeddings, BedrockEmbeddings
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from ..helpers import models_tokens
from ..models import AzureOpenAI, Bedrock, Gemini, Groq, HuggingFace, Ollama, OpenAI, Anthropic
from ..models import AzureOpenAI, Bedrock, Gemini, Groq, HuggingFace, Ollama, OpenAI, Anthropic, Claude


class AbstractGraph(ABC):
"""
Scaffolding class for creating a graph representation and executing it.

Attributes:
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
@@ -162,7 +162,7 @@ def _create_llm(self, llm_config: dict, chat=False) -> object:
try:
self.model_token = models_tokens["ollama"][llm_params["model"]]
except KeyError as exc:
raise KeyError("Model not supported") from exc
self.model_token = 8192
else:
self.model_token = 8192
except AttributeError:
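The except-clause change above means an Ollama model missing from models_tokens no longer aborts with KeyError; it silently falls back to an 8192-token default. A stand-alone sketch of the new behaviour, simplified from the diff (the model name is hypothetical):

```python
# Mirrors the fallback pattern in _create_llm (simplified, assumption-based).
models_tokens = {"ollama": {"llama3": 8192, "mistral": 8192}}

def resolve_model_token(model_name: str, default: int = 8192) -> int:
    try:
        return models_tokens["ollama"][model_name]
    except KeyError:
        # Previously: raise KeyError("Model not supported") from exc
        return default

assert resolve_model_token("my-custom-finetune") == 8192  # unknown model no longer fails
```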
4 changes: 2 additions & 2 deletions scrapegraphai/graphs/csv_scraper_graph.py
@@ -30,7 +30,7 @@ def _create_graph(self):
Creates the graph of nodes representing the workflow for web scraping.
"""
fetch_node = FetchNode(
input="csv_dir",
input="csv",
output=["doc"],
)
parse_node = ParseNode(
@@ -78,4 +78,4 @@ def run(self) -> str:
inputs = {"user_prompt": self.prompt, self.input_key: self.source}
self.final_state, self.execution_info = self.graph.execute(inputs)

return self.final_state.get("answer", "No answer found.")
return self.final_state.get("answer", "No answer found.")
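With the FetchNode input key now "csv", a hedged end-to-end sketch; it assumes CSVScraperGraph shares the prompt/source/config constructor used by the other graphs, and the path, model, and config shape are illustrative:

```python
# Hypothetical sketch of querying a local CSV file.
from scrapegraphai.graphs import CSVScraperGraph

csv_scraper = CSVScraperGraph(
    prompt="List the five products with the highest revenue.",
    source="data/sales.csv",  # hypothetical local CSV path
    config={"llm": {"model": "ollama/llama3"}},  # assumed config shape
)
print(csv_scraper.run())
```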
13 changes: 3 additions & 10 deletions scrapegraphai/graphs/pdf_scraper_graph.py
@@ -56,36 +56,29 @@ def _create_graph(self) -> BaseGraph:
"""

fetch_node = FetchNode(
input="pdf_dir",
input='pdf',
output=["doc"],
node_config={
"headless": self.headless,
"verbose": self.verbose
}
)
parse_node = ParseNode(
input="doc",
output=["parsed_doc"],
node_config={
"chunk_size": self.model_token,
"verbose": self.verbose
}
)
rag_node = RAGNode(
input="user_prompt & (parsed_doc | doc)",
output=["relevant_chunks"],
node_config={
"llm": self.llm_model,
"llm_model": self.llm_model,
"embedder_model": self.embedder_model,
"verbose": self.verbose
}
)
generate_answer_node = GenerateAnswerNode(
input="user_prompt & (relevant_chunks | parsed_doc | doc)",
output=["answer"],
node_config={
"llm": self.llm_model,
"verbose": self.verbose
"llm_model": self.llm_model,
}
)

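Note the node_config key rename from "llm" to "llm_model" in both nodes above. A hedged fragment for anyone wiring these nodes into a custom pipeline (the function and variable names are illustrative):

```python
# Hypothetical custom-pipeline fragment: nodes that previously took
# node_config={"llm": ...} now expect "llm_model", per the diff above.
from scrapegraphai.nodes import RAGNode, GenerateAnswerNode

def build_answer_stage(llm_model, embedder_model):
    rag_node = RAGNode(
        input="user_prompt & (parsed_doc | doc)",
        output=["relevant_chunks"],
        node_config={
            "llm_model": llm_model,            # renamed from "llm"
            "embedder_model": embedder_model,
        },
    )
    generate_answer_node = GenerateAnswerNode(
        input="user_prompt & (relevant_chunks | parsed_doc | doc)",
        output=["answer"],
        node_config={"llm_model": llm_model},  # renamed from "llm"
    )
    return rag_node, generate_answer_node
```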
2 changes: 1 addition & 1 deletion scrapegraphai/graphs/script_creator_graph.py
@@ -110,4 +110,4 @@ def run(self) -> str:
inputs = {"user_prompt": self.prompt, self.input_key: self.source}
self.final_state, self.execution_info = self.graph.execute(inputs)

return self.final_state.get("answer", "No answer found.")
return self.final_state.get("answer", "No answer found ")
4 changes: 2 additions & 2 deletions scrapegraphai/graphs/xml_scraper_graph.py
@@ -56,7 +56,7 @@ def _create_graph(self) -> BaseGraph:
"""

fetch_node = FetchNode(
input="xml_dir",
input="xml",
output=["doc"]
)
parse_node = ParseNode(
@@ -108,4 +108,4 @@ def run(self) -> str:
inputs = {"user_prompt": self.prompt, self.input_key: self.source}
self.final_state, self.execution_info = self.graph.execute(inputs)

return self.final_state.get("answer", "No answer found.")
return self.final_state.get("answer", "No answer found.")
3 changes: 2 additions & 1 deletion scrapegraphai/helpers/models_tokens.py
@@ -39,7 +39,8 @@
"dolphin-mixtral": 32000,
"mistral-openorca": 32000,
"stablelm-zephyr": 8192,
"nomic-embed-text": 8192,
"mxbai-embed-large": 8192
},
"groq": {
"llama3-8b-8192": 8192,
1 change: 1 addition & 0 deletions scrapegraphai/models/__init__.py
@@ -12,3 +12,4 @@
from .groq import Groq
from .bedrock import Bedrock
from .anthropic import Anthropic
from .claude import Claude
19 changes: 19 additions & 0 deletions scrapegraphai/models/claude.py
@@ -0,0 +1,19 @@
"""
Claude model
"""
from langchain_anthropic import ChatAnthropic


class Claude(ChatAnthropic):
"""Wrapper class for the ChatAnthropic (Claude) model."""

def __init__(self, llm_config: dict):
"""
A wrapper for the Claude class that provides default configuration
and could be extended with additional methods if needed.

Args:
llm_config (dict): Configuration parameters for the language model.
"""
# Initialize the superclass (ChatAnthropic) with provided config parameters
super().__init__(**llm_config)
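The new wrapper forwards its config dict straight into ChatAnthropic, so any ChatAnthropic keyword should work. A hedged usage sketch (the model id and key are placeholders):

```python
# Hypothetical usage of the new Claude wrapper; config keys follow
# langchain_anthropic.ChatAnthropic, and the model id is an assumption.
from scrapegraphai.models import Claude

llm = Claude({
    "model": "claude-3-haiku-20240307",   # assumed model id
    "temperature": 0,
    "anthropic_api_key": "YOUR_API_KEY",  # placeholder
})
print(llm.invoke("Reply with the word: ready").content)
```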
35 changes: 28 additions & 7 deletions scrapegraphai/nodes/fetch_node.py
@@ -1,10 +1,11 @@
"""
FetchNode Module
"""

import pandas as pd
from typing import List, Optional
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader
from .base_node import BaseNode
from ..utils.remover import remover

@@ -21,19 +22,21 @@ class FetchNode(BaseNode):
Attributes:
headless (bool): A flag indicating whether the browser should run in headless mode.
verbose (bool): A flag indicating whether to print verbose output during execution.

Args:
input (str): Boolean expression defining the input keys needed from the state.
output (List[str]): List of output keys to be updated in the state.
node_config (Optional[dict]): Additional configuration for the node.
node_name (str): The unique identifier name for the node, defaulting to "Fetch".
"""

def __init__(self, input: str, output: List[str], node_config: Optional[dict]=None, node_name: str = "Fetch"):
def __init__(self, input: str, output: List[str], node_config: Optional[dict] = None, node_name: str = "Fetch"):
super().__init__(node_name, "node", input, output, 1)

self.headless = True if node_config is None else node_config.get("headless", True)
self.verbose = False if node_config is None else node_config.get("verbose", False)
self.headless = True if node_config is None else node_config.get(
"headless", True)
self.verbose = False if node_config is None else node_config.get(
"verbose", False)

def execute(self, state):
"""
@@ -56,7 +59,6 @@ def execute(self, state):

# Interpret input keys based on the provided input expression
input_keys = self.get_input_keys(state)

# Fetching data from the state based on the input keys
input_data = [state[key] for key in input_keys]

@@ -66,14 +68,33 @@
"source": "local_dir"
})]
# if it is a local directory

# handling for pdf
elif self.input == "pdf":
loader = PyPDFLoader(source)
compressed_document = loader.load()

elif self.input == "csv":
compressed_document = [Document(page_content=pd.read_csv(source).to_string(), metadata={
"source": "csv"
})]
elif self.input == "xml":
with open(source, 'r', encoding='utf-8') as f:
data = f.read()
compressed_document = [Document(page_content=data, metadata={
"source": "xml"
})]
elif self.input == "pdf_dir":
pass

elif not source.startswith("http"):
compressed_document = [Document(page_content=remover(source), metadata={
"source": "local_dir"
})]

else:
if self.node_config is not None and self.node_config.get("endpoint") is not None:

loader = AsyncChromiumLoader(
[source],
proxies={"http": self.node_config["endpoint"]},
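Taken together, FetchNode now dispatches on its input expression to pick a loader. A hedged sketch of exercising the new branches directly; it assumes the input expression resolves to a state key of the same name, and the paths are illustrative:

```python
# Hypothetical direct use of FetchNode's new source types.
from scrapegraphai.nodes import FetchNode

fetch_pdf = FetchNode(input="pdf", output=["doc"])
state = fetch_pdf.execute({"pdf": "documents/report.pdf"})  # PyPDFLoader branch

fetch_csv = FetchNode(input="csv", output=["doc"])
state = fetch_csv.execute({"csv": "data/table.csv"})        # pandas branch

fetch_xml = FetchNode(input="xml", output=["doc"])
state = fetch_xml.execute({"xml": "data/feed.xml"})         # raw-text branch

print(state["doc"][0].metadata)  # assumes execute() writes the output key back to state
```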