Commit 300fc05

add explore graph
1 parent 0441657 commit 300fc05

3 files changed (+92 -57 lines changed)
Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
"""
Basic example of a scraping pipeline using ExploreGraph
"""
from scrapegraphai.graphs import ExploreGraph
from scrapegraphai.utils import prettify_exec_info
# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        "model": "ollama/mistral",
        "temperature": 0,
        "format": "json",  # Ollama needs the format to be specified explicitly
        # "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
    },
    "embeddings": {
        "model": "ollama/nomic-embed-text",
        "temperature": 0,
        # "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
    },
    "verbose": True,
    "headless": False
}

# ************************************************
# Create the ExploreGraph instance and run it
# ************************************************

explore_graph = ExploreGraph(
    prompt="List me all the titles",
    # also accepts a string with the already downloaded HTML code
    source="https://www.wired.com/",
    config=graph_config
)

result = explore_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = explore_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))
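The inline comment above notes that source also accepts a string with already downloaded HTML instead of a URL. A minimal sketch of that variant, reusing the graph_config defined in the example; the HTML snippet is a made-up placeholder, not part of the commit:

from scrapegraphai.graphs import ExploreGraph  # same import as in the example above

# Made-up HTML standing in for a page that was already downloaded
html_snippet = """
<html><body>
  <h1>First title</h1>
  <h2>Second title</h2>
</body></html>
"""

local_graph = ExploreGraph(
    prompt="List me all the titles",
    source=html_snippet,   # raw HTML instead of a URL
    config=graph_config    # reuses the Ollama config defined above
)
print(local_graph.run())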

scrapegraphai/graphs/explore_graph.py

Lines changed: 35 additions & 40 deletions
@@ -1,14 +1,12 @@
-"""
+"""
 ExploreGraph Module
 """

-from copy import copy, deepcopy
 from typing import Optional
 from pydantic import BaseModel

 from .base_graph import BaseGraph
 from .abstract_graph import AbstractGraph
-from .smart_scraper_graph import SmartScraperGraph

 from ..nodes import (
     FetchNode,
@@ -20,56 +18,50 @@


 class ExploreGraph(AbstractGraph):
-    """
-    ExploreGraph is a scraping pipeline that searches the internet for answers to a given prompt.
-    It only requires a user prompt to search the internet and generate an answer.
+    """
+    SmartScraper is a scraping pipeline that automates the process of
+    extracting information from web pages
+    using a natural language model to interpret and answer prompts.

     Attributes:
-        prompt (str): The user prompt to search the internet.
-        llm_model (dict): The configuration for the language model.
-        embedder_model (dict): The configuration for the embedder model.
-        headless (bool): A flag to run the browser in headless mode.
-        verbose (bool): A flag to display the execution information.
-        model_token (int): The token limit for the language model.
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+        schema (str): The schema for the graph output.
+        llm_model: An instance of a language model client, configured for generating answers.
+        embedder_model: An instance of an embedding model client,
+        configured for generating embeddings.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+        headless (bool): A flag indicating whether to run the graph in headless mode.

     Args:
-        prompt (str): The user prompt to search the internet.
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
         config (dict): Configuration parameters for the graph.
-        schema (Optional[str]): The schema for the graph output.
+        schema (str): The schema for the graph output.

     Example:
-        >>> search_graph = ExploreGraph(
-        ...     "What is Chioggia famous for?",
+        >>> smart_scraper = ExploreGraph(
+        ...     "List me all the attractions in Chioggia.",
+        ...     "https://en.wikipedia.org/wiki/Chioggia",
         ...     {"llm": {"model": "gpt-3.5-turbo"}}
         ... )
-        >>> result = search_graph.run()
+        >>> result = smart_scraper.run()
+        )
     """

-    def __init__(self, prompt: str, config: dict, schema: Optional[BaseModel] = None):
-
-        self.max_results = config.get("max_results", 3)
+    def __init__(self, prompt: str, source: str, config: dict, schema: Optional[BaseModel] = None):
+        super().__init__(prompt, config, source, schema)

-        if all(isinstance(value, str) for value in config.values()):
-            self.copy_config = copy(config)
-        else:
-            self.copy_config = deepcopy(config)
-
-        self.copy_schema = deepcopy(schema)
-
-        super().__init__(prompt, config, schema)
+        self.input_key = "url" if source.startswith("http") else "local_dir"

     def _create_graph(self) -> BaseGraph:
         """
-        Creates the graph of nodes representing the workflow for web scraping and searching.
+        Creates the graph of nodes representing the workflow for web scraping.

         Returns:
-            BaseGraph: A graph instance representing the web scraping and searching workflow.
+            BaseGraph: A graph instance representing the web scraping workflow.
         """
-
-        # ************************************************
-        # Create a SmartScraperGraph instance
-        # ************************************************
-
         fetch_node = FetchNode(
             input="url | local_dir",
             output=["doc", "link_urls", "img_urls"],
@@ -100,7 +92,7 @@ def _create_graph(self) -> BaseGraph:
                 "schema": self.schema,
             }
         )
-
+
         search_link_node = SearchLinkNode(
             input="doc",
             output=[{"link": "description"}],
@@ -114,25 +106,28 @@ def _create_graph(self) -> BaseGraph:
                 fetch_node,
                 parse_node,
                 rag_node,
+                search_link_node,
                 generate_answer_node,
             ],
+
             edges=[
                 (fetch_node, parse_node),
                 (parse_node, rag_node),
-                (rag_node, generate_answer_node),
-                (generate_answer_node, search_link_node)
+                (rag_node, search_link_node),
+                (search_link_node, generate_answer_node)
             ],
             entry_point=fetch_node
         )

     def run(self) -> str:
         """
-        Executes the web scraping and searching process.
+        Executes the scraping process and returns the answer to the prompt.

         Returns:
             str: The answer to the prompt.
         """
-        inputs = {"user_prompt": self.prompt}
+
+        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
         self.final_state, self.execution_info = self.graph.execute(inputs)

         return self.final_state.get("answer", "No answer found.")
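With this change ExploreGraph is constructed like SmartScraperGraph: it takes a source alongside the prompt and dispatches on it to set input_key ("url" for http(s) sources, "local_dir" otherwise), and the node pipeline now runs fetch, parse, rag, search_link, then generate_answer. A minimal usage sketch of the new signature; the gpt-3.5-turbo config and the ArticleTitles Pydantic model are illustrative assumptions, and passing the model class as schema follows the convention used by the library's other graphs rather than anything shown in this commit:

from typing import List
from pydantic import BaseModel
from scrapegraphai.graphs import ExploreGraph

# Hypothetical output schema, only to illustrate the optional schema argument
class ArticleTitles(BaseModel):
    titles: List[str]

graph = ExploreGraph(
    prompt="List me all the titles",
    source="https://www.wired.com/",             # startswith("http"), so input_key becomes "url"
    config={"llm": {"model": "gpt-3.5-turbo"}},  # illustrative LLM config
    schema=ArticleTitles                         # assumption: model class passed as output schema
)
result = graph.run()
print(result)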

scrapegraphai/nodes/search_link_node.py

Lines changed: 12 additions & 17 deletions
@@ -67,17 +67,15 @@ def execute(self, state: dict) -> dict:

         self.logger.info(f"--- Executing {self.node_name} Node ---")

-        # Interpret input keys based on the provided input expression
-        input_keys = self.get_input_keys(state)

-        user_prompt = state[input_keys[0]]
-        parsed_content_chunks = state[input_keys[1]]
+        user_prompt = state.get("user_prompt")
+        links = state.get("link_urls")
+        parsed_content_chunks = state.get("parsed_doc")
         output_parser = JsonOutputParser()

         prompt_relevant_links = """
         You are a website scraper and you have just scraped the following content from a website.
-        Content: {content}
-
+
         You are now tasked with identifying all hyper links within the content that are potentially
         relevant to the user task: {user_prompt}

@@ -87,19 +85,15 @@ def execute(self, state: dict) -> dict:
         Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain
         whether the content at the link is directly relevant.

+        This is the list of links: {links}
+
+        Content: {content}
+
         The output should be a dictionary whose key is the link and whose value is a short description or a slug relevant
         for the link; if no such description or slug can be learnt from the scraped content, just leave it null

-        Output only a list of relevant links in the format:
-        {
-            "link1": "description link 1",
-            "link2": "description link 2",
-            "link3": "description link 3",
-            .
-            .
-            .
-        }
         """
+
         relevant_links = []

         for i, chunk in enumerate(
@@ -111,12 +105,13 @@ def execute(self, state: dict) -> dict:
         ):
             merge_prompt = PromptTemplate(
                 template=prompt_relevant_links,
-                input_variables=["content", "user_prompt"],
+                input_variables=["content", "user_prompt", "links"],
             )
             merge_chain = merge_prompt | self.llm_model | output_parser
             # merge_chain = merge_prompt | self.llm_model
             answer = merge_chain.invoke(
-                {"content": chunk.page_content, "user_prompt": user_prompt}
+                {"content": chunk, "links": links,
+                 "user_prompt": user_prompt}
             )
             relevant_links += answer
         state.update({self.output[0]: relevant_links})
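The prompt used by SearchLinkNode now interpolates three variables: the chunked page content, the list of links taken from the graph state, and the user task. A standalone sketch of that pattern with LangChain's PromptTemplate; the sample content, links, and task are made-up values and the template text is abbreviated, not the node's actual prompt:

from langchain_core.prompts import PromptTemplate

# Made-up stand-ins for the values SearchLinkNode reads from the graph state
chunk_text = "<h2>Chioggia</h2> <a href='/attractions'>Attractions</a> ..."
links = ["https://example.com/attractions", "https://example.com/history"]
user_prompt = "List me all the attractions in Chioggia."

# Abbreviated version of the node's prompt, with the same three input variables
relevant_links_prompt = PromptTemplate(
    template=(
        "You have just scraped the following content: {content}\n"
        "This is the list of links: {links}\n"
        "List only the links relevant to the user task: {user_prompt}"
    ),
    input_variables=["content", "user_prompt", "links"],
)

# The node pipes this prompt into self.llm_model | JsonOutputParser();
# here we just render the filled-in prompt text.
print(relevant_links_prompt.format(
    content=chunk_text, links=links, user_prompt=user_prompt
))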
