
Commit 0441657

VinciGit00 committed

feat: add explore graph

Co-Authored-By: Federico Minutoli <40361744+DiTo97@users.noreply.github.com>
Co-Authored-By: Matteo Vedovati <68272450+vedovati-matteo@users.noreply.github.com>
1 parent e5bb5ae commit 0441657

File tree

scrapegraphai/graphs/__init__.py
scrapegraphai/graphs/explore_graph.py
scrapegraphai/nodes/search_link_node.py

3 files changed: +147 −5 lines


scrapegraphai/graphs/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -20,3 +20,4 @@
 from .json_scraper_multi import JSONScraperMultiGraph
 from .csv_scraper_graph_multi import CSVScraperMultiGraph
 from .xml_scraper_graph_multi import XMLScraperMultiGraph
+from .explore_graph import ExploreGraph
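
With this re-export, ExploreGraph becomes importable from the graphs package root like the other pipelines; a one-line sanity check (assuming the scrapegraphai package is installed):

    from scrapegraphai.graphs import ExploreGraph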

scrapegraphai/graphs/explore_graph.py

Lines changed: 138 additions & 0 deletions
@@ -0,0 +1,138 @@
+"""
+ExploreGraph Module
+"""
+
+from copy import copy, deepcopy
+from typing import Optional
+from pydantic import BaseModel
+
+from .base_graph import BaseGraph
+from .abstract_graph import AbstractGraph
+from .smart_scraper_graph import SmartScraperGraph
+
+from ..nodes import (
+    FetchNode,
+    ParseNode,
+    RAGNode,
+    GenerateAnswerNode,
+    SearchLinkNode
+)
+
+
+class ExploreGraph(AbstractGraph):
+    """
+    ExploreGraph is a scraping pipeline that searches the internet for answers to a given prompt.
+    It only requires a user prompt to search the internet and generate an answer.
+
+    Attributes:
+        prompt (str): The user prompt to search the internet.
+        llm_model (dict): The configuration for the language model.
+        embedder_model (dict): The configuration for the embedder model.
+        headless (bool): A flag to run the browser in headless mode.
+        verbose (bool): A flag to display the execution information.
+        model_token (int): The token limit for the language model.
+
+    Args:
+        prompt (str): The user prompt to search the internet.
+        config (dict): Configuration parameters for the graph.
+        schema (Optional[BaseModel]): The schema for the graph output.
+
+    Example:
+        >>> explore_graph = ExploreGraph(
+        ...     "What is Chioggia famous for?",
+        ...     {"llm": {"model": "gpt-3.5-turbo"}}
+        ... )
+        >>> result = explore_graph.run()
+    """
+
+    def __init__(self, prompt: str, config: dict, schema: Optional[BaseModel] = None):
+
+        self.max_results = config.get("max_results", 3)
+
+        # a flat, all-string config can be shallow-copied; nested configs need a deep copy
+        if all(isinstance(value, str) for value in config.values()):
+            self.copy_config = copy(config)
+        else:
+            self.copy_config = deepcopy(config)
+
+        self.copy_schema = deepcopy(schema)
+
+        super().__init__(prompt, config, schema)
+
+    def _create_graph(self) -> BaseGraph:
+        """
+        Creates the graph of nodes representing the workflow for web scraping and searching.
+
+        Returns:
+            BaseGraph: A graph instance representing the web scraping and searching workflow.
+        """
+
+        # ************************************************
+        # Define the nodes of the pipeline
+        # ************************************************
+
+        fetch_node = FetchNode(
+            input="url | local_dir",
+            output=["doc", "link_urls", "img_urls"],
+            node_config={
+                "loader_kwargs": self.config.get("loader_kwargs", {}),
+            }
+        )
+        parse_node = ParseNode(
+            input="doc",
+            output=["parsed_doc"],
+            node_config={
+                "chunk_size": self.model_token
+            }
+        )
+        rag_node = RAGNode(
+            input="user_prompt & (parsed_doc | doc)",
+            output=["relevant_chunks"],
+            node_config={
+                "llm_model": self.llm_model,
+                "embedder_model": self.embedder_model
+            }
+        )
+        generate_answer_node = GenerateAnswerNode(
+            input="user_prompt & (relevant_chunks | parsed_doc | doc)",
+            output=["answer"],
+            node_config={
+                "llm_model": self.llm_model,
+                "schema": self.schema,
+            }
+        )
+
+        search_link_node = SearchLinkNode(
+            input="doc",
+            output=[{"link": "description"}],
+            node_config={
+                "llm_model": self.llm_model,
+            }
+        )
+
+        return BaseGraph(
+            nodes=[
+                fetch_node,
+                parse_node,
+                rag_node,
+                generate_answer_node,
+                search_link_node,  # referenced by the last edge, so it must be listed here
+            ],
+            edges=[
+                (fetch_node, parse_node),
+                (parse_node, rag_node),
+                (rag_node, generate_answer_node),
+                (generate_answer_node, search_link_node)
+            ],
+            entry_point=fetch_node
+        )
+
+    def run(self) -> str:
+        """
+        Executes the web scraping and searching process.
+
+        Returns:
+            str: The answer to the prompt.
+        """
+        inputs = {"user_prompt": self.prompt}
+        self.final_state, self.execution_info = self.graph.execute(inputs)
+
+        return self.final_state.get("answer", "No answer found.")
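
Taken together, the new pipeline fetches a page, parses it into chunks, retrieves the chunks relevant to the prompt, generates an answer, and finally extracts candidate links to explore next. A minimal usage sketch, assuming the LLM configuration from the docstring example ("max_results" is optional and defaults to 3):

    from scrapegraphai.graphs import ExploreGraph

    explore_graph = ExploreGraph(
        prompt="What is Chioggia famous for?",
        config={"llm": {"model": "gpt-3.5-turbo"}, "max_results": 3},
    )

    # run() executes the graph and returns final_state["answer"],
    # falling back to "No answer found." when nothing was produced
    print(explore_graph.run())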

scrapegraphai/nodes/search_link_node.py

Lines changed: 8 additions & 5 deletions
@@ -87,15 +87,18 @@ def execute(self, state: dict) -> dict:
         Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain
         whether the content at the link is directly relevant.
 
+        The output should be a dictionary whose key is the link and whose value is a short description or a slug relevant
+        for the link; if no such description or slug can be learnt from the scraped content, just leave it null
+
         Output only a list of relevant links in the format:
-        [
-            "link1",
-            "link2",
-            "link3",
+        {
+            "link1": "description link 1",
+            "link2": "description link 2",
+            "link3": "description link 3",
             .
             .
             .
-        ]
+        }
         """
         relevant_links = []

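The updated prompt asks the model for a JSON object keyed by link rather than a bare list of URLs, with null for links whose description cannot be inferred. A purely hypothetical response in the new format (URLs and descriptions invented for illustration):

    {
        "https://example.com/attractions": "top attractions in Chioggia",
        "https://example.com/history": "a short history of the town",
        "https://example.com/tickets": null
    }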