Skip to content

Commit 3d8c3a3

Browse files
committed
feat: add explore graph
1 parent ce089be commit 3d8c3a3

File tree

8 files changed

+291
-18
lines changed

8 files changed

+291
-18
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg?style=for-the-badge)](https://opensource.org/licenses/MIT)
1010
[![](https://dcbadge.vercel.app/api/server/gkxQDAjfeX)](https://discord.gg/gkxQDAjfeX)
1111

12-
ScrapeGraphAI is a *web scraping* python library that uses LLM and direct graph logic to create scraping pipelines for websites and local documents (XML, HTML, JSON, etc.).
12+
ScrapeGraphAI is a *web scraping* python library that uses LLM and direct graph logic to create scraping pipelines for websites and local documents (XML, HTML, JSON, CSV, etc.).
1313

1414
Just say which information you want to extract and the library will do it for you!
1515

scrapegraphai/graphs/explore_graph.py

Lines changed: 10 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -19,40 +19,37 @@
1919

2020
class ExploreGraph(AbstractGraph):
2121
"""
22-
SmartScraper is a scraping pipeline that automates the process of
23-
extracting information from web pages
24-
using a natural language model to interpret and answer prompts.
22+
ExploreGraph is a web scraping pipeline that automates the extraction of information
23+
from web pages using natural language models to interpret and respond to prompts.
2524
2625
Attributes:
2726
prompt (str): The prompt for the graph.
28-
source (str): The source of the graph.
27+
source (str): The source URL or local directory for the graph.
2928
config (dict): Configuration parameters for the graph.
3029
schema (str): The schema for the graph output.
31-
llm_model: An instance of a language model client, configured for generating answers.
32-
embedder_model: An instance of an embedding model client,
33-
configured for generating embeddings.
30+
llm_model: An instance of a language model client for generating answers.
31+
embedder_model: An instance of an embedding model client for generating embeddings.
3432
verbose (bool): A flag indicating whether to show print statements during execution.
3533
headless (bool): A flag indicating whether to run the graph in headless mode.
3634
3735
Args:
3836
prompt (str): The prompt for the graph.
39-
source (str): The source of the graph.
37+
source (str): The source URL or local directory for the graph.
4038
config (dict): Configuration parameters for the graph.
41-
schema (str): The schema for the graph output.
39+
schema (Optional[BaseModel]): The schema for the graph output.
4240
4341
Example:
44-
>>> smart_scraper = ExploreGraph(
42+
>>> explore_graph = ExploreGraph(
4543
... "List me all the attractions in Chioggia.",
4644
... "https://en.wikipedia.org/wiki/Chioggia",
4745
... {"llm": {"model": "gpt-3.5-turbo"}}
4846
... )
49-
>>> result = smart_scraper.run()
50-
)
47+
>>> result = explore_graph.run()
48+
>>> print(result)
5149
"""
5250

5351
def __init__(self, prompt: str, source: str, config: dict, schema: Optional[BaseModel] = None):
5452
super().__init__(prompt, config, source, schema)
55-
5653
self.input_key = "url" if source.startswith("http") else "local_dir"
5754

5855
def _create_graph(self) -> BaseGraph:
@@ -109,7 +106,6 @@ def _create_graph(self) -> BaseGraph:
109106
search_link_node,
110107
generate_answer_node,
111108
],
112-
113109
edges=[
114110
(fetch_node, parse_node),
115111
(parse_node, rag_node),
@@ -126,7 +122,6 @@ def run(self) -> str:
126122
Returns:
127123
str: The answer to the prompt.
128124
"""
129-
130125
inputs = {"user_prompt": self.prompt, self.input_key: self.source}
131126
self.final_state, self.execution_info = self.graph.execute(inputs)
132127

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
"""
2+
ParallelSearchGraph Module
3+
"""
4+
from copy import copy, deepcopy
5+
from typing import Optional
6+
from pydantic import BaseModel
7+
8+
from .base_graph import BaseGraph
9+
from .abstract_graph import AbstractGraph
10+
11+
from ..nodes import (
12+
GraphIteratorNode,
13+
ReRankNode,
14+
MergeExploreGraphsNode
15+
)
16+
17+
from ..graphs.explore_graph import ExploreGraph
18+
19+
20+
class ParallelSearchGraph(AbstractGraph):
    """
    ParallelSearchGraph wires three nodes into a pipeline: a ReRankNode, a
    GraphIteratorNode that runs an ExploreGraph instance, and a
    MergeExploreGraphsNode that writes the final "answer" into the state.

    Attributes:
        prompt (str): The prompt for the graph.
        source (str): The source URL or local directory for the graph.
        config (dict): Configuration parameters for the graph.
        schema (Optional[BaseModel]): The schema for the graph output.
        copy_config (dict): Private copy of ``config`` handed to the inner ExploreGraph.
        copy_schema (Optional[BaseModel]): Deep copy of ``schema``.
        input_key (str): "url" when ``source`` starts with "http", otherwise "local_dir".
        llm_model: Language model client passed to the merge node
            (presumably created by AbstractGraph; verify against the base class).

    Args:
        prompt (str): The prompt for the graph.
        source (str): The source URL or local directory for the graph.
        config (dict): Configuration parameters for the graph.
        schema (Optional[BaseModel]): The schema for the graph output.

    Example:
        >>> parallel_search_graph = ParallelSearchGraph(
        ...     "List me all the attractions in Chioggia.",
        ...     "https://en.wikipedia.org/wiki/Chioggia",
        ...     {"llm": {"model": "gpt-3.5-turbo"}}
        ... )
        >>> result = parallel_search_graph.run()
        >>> print(result)
    """

    def __init__(self, prompt: str, source: str, config: dict, schema: Optional[BaseModel] = None):
        # Snapshot the configuration BEFORE the base initialiser runs, so that
        # copy_config already exists if the base class builds the graph during
        # initialisation (presumably via _create_graph(); verify in AbstractGraph).
        # A shallow copy is enough when every top-level value is a plain string.
        if all(isinstance(value, str) for value in config.values()):
            self.copy_config = copy(config)
        else:
            self.copy_config = deepcopy(config)
        self.copy_schema = deepcopy(schema)

        self.input_key = "url" if source.startswith("http") else "local_dir"

        # Single base-class call using the same argument order as ExploreGraph:
        # (prompt, config, source, schema).
        super().__init__(prompt, config, source, schema)

    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for web scraping.

        Returns:
            BaseGraph: A graph whose entry point is the re-rank node, followed by
            the graph iterator node and the merge node.
        """

        # Template graph handed to the nodes; prompt and source are left empty here.
        explore_graph_instance = ExploreGraph(
            prompt="",
            source="",
            config=self.copy_config,
        )

        # NOTE(review): both nodes below read "urls" from the state, but run()
        # only supplies "user_prompt" and the url/local_dir key - confirm which
        # upstream step is expected to populate "urls".
        rerank_link_node = ReRankNode(
            input="user_prompt & urls",
            output=["results"],
            node_config={
                "graph_instance": explore_graph_instance,
                # ReRankNode's constructor reads this key.
                "llm_model": self.llm_model,
            }
        )

        graph_iterator_node = GraphIteratorNode(
            input="user_prompt & urls",
            output=["results"],
            node_config={
                "graph_instance": explore_graph_instance,
            }
        )

        merge_explore_graphs_node = MergeExploreGraphsNode(
            input="user_prompt & results",
            output=["answer"],
            node_config={
                "llm_model": self.llm_model,
                "schema": self.schema
            }
        )

        return BaseGraph(
            nodes=[
                rerank_link_node,
                graph_iterator_node,
                merge_explore_graphs_node,
            ],
            edges=[
                (rerank_link_node, graph_iterator_node),
                (graph_iterator_node, merge_explore_graphs_node),
            ],
            entry_point=rerank_link_node
        )

    def run(self) -> str:
        """
        Executes the scraping process and returns the answer to the prompt.

        Returns:
            str: The value stored under "answer" in the final state, or
            "No answer found." when that key is absent.
        """

        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)

        return self.final_state.get("answer", "No answer found.")

scrapegraphai/graphs/search_graph.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,6 @@ def __init__(self, prompt: str, config: dict, schema: Optional[BaseModel] = None
5151
self.copy_config = copy(config)
5252
else:
5353
self.copy_config = deepcopy(config)
54-
5554
self.copy_schema = deepcopy(schema)
5655

5756
super().__init__(prompt, config, schema)

scrapegraphai/nodes/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,5 @@
2020
from .graph_iterator_node import GraphIteratorNode
2121
from .merge_answers_node import MergeAnswersNode
2222
from .generate_answer_omni_node import GenerateAnswerOmniNode
23+
from .merge_explore_graphs_node import MergeExploreGraphsNode
24+
from .rerank_node import ReRankNode

scrapegraphai/nodes/graph_iterator_node.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import asyncio
66
import copy
77
from typing import List, Optional
8-
8+
from ..graphs.explore_graph import ExploreGraph
99
from tqdm.asyncio import tqdm
1010

1111
from ..utils.logging import get_logger
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
"""
2+
MergeExploreGraphsNode Module
3+
"""
4+
5+
# Imports from standard library
6+
from typing import List, Optional
7+
8+
# Imports from Langchain
9+
from langchain.prompts import PromptTemplate
10+
from langchain_core.output_parsers import JsonOutputParser
11+
from langchain_core.runnables import RunnableParallel
12+
from tqdm import tqdm
13+
14+
15+
from ..utils.logging import get_logger
16+
from ..models import Ollama
17+
# Imports from the library
18+
from .base_node import BaseNode
19+
from ..helpers import template_chunks, template_no_chunks, template_merge
20+
21+
22+
class MergeExploreGraphsNode(BaseNode):
    """
    A node intended to merge the results produced by explore graphs into a single
    answer. The merge logic is not implemented yet: ``execute`` currently logs its
    invocation and stores a placeholder string under the first output key.

    Attributes:
        llm_model: The language model client taken from ``node_config["llm_model"]``.
        verbose (bool): A flag indicating whether to show print statements during execution.

    Args:
        input (str): Boolean expression defining the input keys needed from the state.
        output (List[str]): List of output keys to be updated in the state.
        node_config (dict): Additional configuration for the node; must contain "llm_model".
        node_name (str): The unique identifier name for the node, defaulting to "GenerateAnswer".

    Raises:
        KeyError: If ``node_config`` is missing or has no "llm_model" entry.
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "GenerateAnswer",
    ):
        super().__init__(node_name, "node", input, output, 2, node_config)

        # Treat a missing config as empty so the failure below is a clear
        # KeyError("llm_model") instead of "'NoneType' is not subscriptable".
        config = node_config or {}

        self.llm_model = config["llm_model"]

        # Ollama clients are switched to JSON output.
        if isinstance(self.llm_model, Ollama):
            self.llm_model.format = "json"

        self.verbose = config.get("verbose", False)

    def execute(self, state: dict) -> dict:
        """
        Logs the execution and writes a placeholder value into the state.

        Args:
            state (dict): The current state of the graph.

        Returns:
            dict: The same state object, updated so that the first output key
            holds the placeholder string.
        """

        self.logger.info(f"--- Executing {self.node_name} Node ---")

        # TODO: implement the actual merge. The literal below (typo included) is
        # the placeholder downstream consumers currently receive.
        state.update({self.output[0]: "answaer"})
        return state

scrapegraphai/nodes/rerank_node.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
"""
2+
ReRankNode Module
3+
"""
4+
5+
6+
from typing import List, Optional
7+
from ..utils.logging import get_logger
8+
from .base_node import BaseNode
9+
10+
11+
class ReRankNode(BaseNode):
    """
    A node intended to re-rank inputs before they are explored. The re-ranking
    logic is not implemented yet: ``execute`` currently logs its invocation and
    stores a placeholder string under the first output key.

    Attributes:
        llm_model: The language model client from ``node_config["llm_model"]``,
            or None when not provided.
        embedder_model: The embedding model client from
            ``node_config["embedder_model"]``, or None when not provided.
        verbose (bool): A flag indicating whether to show print statements during execution.

    Args:
        input (str): Boolean expression defining the input keys needed from the state.
        output (List[str]): List of output keys to be updated in the state.
        node_config (dict): Additional configuration for the node.
        node_name (str): The unique identifier name for the node, defaulting to "RAG".
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "RAG",
    ):
        super().__init__(node_name, "node", input, output, 2, node_config)

        # node_config is optional and callers may omit the model entries
        # (ParallelSearchGraph passes only "graph_instance"), so read every
        # setting with a fallback instead of subscripting.
        config = node_config or {}

        self.llm_model = config.get("llm_model")
        self.embedder_model = config.get("embedder_model")
        self.verbose = config.get("verbose", False)

    def execute(self, state: dict) -> dict:
        """
        Logs the execution and writes a placeholder value into the state.

        Args:
            state (dict): The current state of the graph.

        Returns:
            dict: The same state object, updated so that the first output key
            holds the placeholder string "compressed_docs".
        """

        self.logger.info(f"--- Executing {self.node_name} Node ---")

        # TODO: implement the re-ranking step; nothing is compressed or stored
        # yet even though the log line below says so.
        self.logger.info("--- (tokens compressed and vector stored) ---")

        state.update({self.output[0]: "compressed_docs"})
        return state

0 commit comments

Comments
 (0)