
Commit db54d69

refactoring of code for pylint integration
1 parent c91975e commit db54d69

File tree: 3 files changed (+107, -89 lines)


scrapegraphai/nodes/description_node.py

Lines changed: 0 additions & 1 deletion

@@ -34,7 +34,6 @@ def __init__(
         node_name: str = "DESCRIPTION",
     ):
         super().__init__(node_name, "node", input, output, 2, node_config)
-
         self.llm_model = node_config["llm_model"]
         self.verbose = (
             False if node_config is None else node_config.get("verbose", False)
scrapegraphai/nodes/fetch_node_level_k.py

Lines changed: 101 additions & 77 deletions

@@ -1,6 +1,3 @@
-"""
-FetchNodeLevelK Module
-"""
 from typing import List, Optional
 from .base_node import BaseNode
 from ..docloaders import ChromiumLoader
@@ -18,14 +15,21 @@ class FetchNodeLevelK(BaseNode):
     (with proxy protection).
 
     Attributes:
-        llm_model: An instance of a language model client, configured for generating answers.
+        embedder_model: An optional model for embedding the fetched content.
         verbose (bool): A flag indicating whether to show print statements during execution.
+        cache_path (str): Path to cache fetched content.
+        headless (bool): Whether to run the Chromium browser in headless mode.
+        loader_kwargs (dict): Additional arguments for the content loader.
+        browser_base (dict): Optional configuration for the browser base API.
+        depth (int): Maximum depth of hyperlink graph traversal.
+        only_inside_links (bool): Whether to fetch only internal links.
+        min_input_len (int): Minimum required length of input data.
 
     Args:
         input (str): Boolean expression defining the input keys needed from the state.
         output (List[str]): List of output keys to be updated in the state.
         node_config (dict): Additional configuration for the node.
-        node_name (str): The unique identifier name for the node, defaulting to "Parse".
+        node_name (str): The unique identifier name for the node, defaulting to "FetchLevelK".
     """
 
     def __init__(
@@ -35,81 +39,68 @@ def __init__(
         node_config: Optional[dict] = None,
         node_name: str = "FetchLevelK",
     ):
+        """
+        Initializes the FetchNodeLevelK instance.
+
+        Args:
+            input (str): Boolean expression defining the input keys needed from the state.
+            output (List[str]): List of output keys to be updated in the state.
+            node_config (Optional[dict]): Additional configuration for the node.
+            node_name (str): The name of the node (default is "FetchLevelK").
+        """
         super().__init__(node_name, "node", input, output, 2, node_config)
-
+
         self.embedder_model = node_config.get("embedder_model", None)
-
-        self.verbose = (
-            False if node_config is None else node_config.get("verbose", False)
-        )
-
+        self.verbose = node_config.get("verbose", False) if node_config else False
         self.cache_path = node_config.get("cache_path", False)
-
-        self.headless = (
-            True if node_config is None else node_config.get("headless", True)
-        )
-
-        self.loader_kwargs = (
-            {} if node_config is None else node_config.get("loader_kwargs", {})
-        )
-
-        self.browser_base = (
-            None if node_config is None else node_config.get("browser_base", None)
-        )
-
-        self.depth = (
-            1 if node_config is None else node_config.get("depth", 1)
-        )
-
-        self.only_inside_links = (
-            False if node_config is None else node_config.get("only_inside_links", False)
-        )
-
+        self.headless = node_config.get("headless", True) if node_config else True
+        self.loader_kwargs = node_config.get("loader_kwargs", {}) if node_config else {}
+        self.browser_base = node_config.get("browser_base", None)
+        self.depth = node_config.get("depth", 1) if node_config else 1
+        self.only_inside_links = node_config.get("only_inside_links", False) if node_config else False
         self.min_input_len = 1
 
     def execute(self, state: dict) -> dict:
         """
-        Executes the node's logic to fetch the HTML content of a specified URL and all its sub-links
-        and update the graph's state with the content.
+        Executes the node's logic to fetch the HTML content of a specified URL and its sub-links
+        recursively, then updates the graph's state with the fetched content.
 
         Args:
-            state (dict): The current state of the graph. The input keys will be used
-                          to fetch the correct data types from the state.
+            state (dict): The current state of the graph.
 
         Returns:
             dict: The updated state with a new output key containing the fetched HTML content.
 
         Raises:
-            KeyError: If the input key is not found in the state, indicating that the
-                      necessary information to perform the operation is missing.
+            KeyError: If the input key is not found in the state.
         """
-
         self.logger.info(f"--- Executing {self.node_name} Node ---")
-
-        # Interpret input keys based on the provided input expression
+
         input_keys = self.get_input_keys(state)
-        # Fetching data from the state based on the input keys
         input_data = [state[key] for key in input_keys]
-
         source = input_data[0]
-
+
         documents = [{"source": source}]
-
-        loader_kwargs = {}
+        loader_kwargs = self.node_config.get("loader_kwargs", {}) if self.node_config else {}
 
-        if self.node_config is not None:
-            loader_kwargs = self.node_config.get("loader_kwargs", {})
-
         for _ in range(self.depth):
             documents = self.obtain_content(documents, loader_kwargs)
-
+
         filtered_documents = [doc for doc in documents if 'document' in doc]
-
         state.update({self.output[0]: filtered_documents})
-
         return state
-
+
     def fetch_content(self, source: str, loader_kwargs) -> Optional[str]:
+        """
+        Fetches the HTML content of a given source URL.
+
+        Args:
+            source (str): The URL to fetch content from.
+            loader_kwargs (dict): Additional arguments for the content loader.
+
+        Returns:
+            Optional[str]: The fetched HTML content or None if fetching failed.
+        """
         self.logger.info(f"--- (Fetching HTML from: {source}) ---")
 
         if self.browser_base is not None:
@@ -119,63 +110,96 @@ def fetch_content(self, source: str, loader_kwargs) -> Optional[str]:
             raise ImportError("""The browserbase module is not installed.
                               Please install it using `pip install browserbase`.""")
 
-            data = browser_base_fetch(self.browser_base.get("api_key"),
-                                      self.browser_base.get("project_id"), [source])
-
-            document = [Document(page_content=content,
-                                 metadata={"source": source}) for content in data]
-
+            data = browser_base_fetch(self.browser_base.get("api_key"),
+                                      self.browser_base.get("project_id"), [source])
+            document = [Document(page_content=content, metadata={"source": source}) for content in data]
         else:
             loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
-
             document = loader.load()
-
         return document
-
+
     def extract_links(self, html_content: str) -> list:
+        """
+        Extracts all hyperlinks from the HTML content.
+
+        Args:
+            html_content (str): The HTML content to extract links from.
+
+        Returns:
+            list: A list of extracted hyperlinks.
+        """
         soup = BeautifulSoup(html_content, 'html.parser')
         links = [link['href'] for link in soup.find_all('a', href=True)]
         self.logger.info(f"Extracted {len(links)} links.")
         return links
-
+
     def get_full_links(self, base_url: str, links: list) -> list:
+        """
+        Converts relative URLs to full URLs based on the base URL.
+
+        Args:
+            base_url (str): The base URL for resolving relative links.
+            links (list): A list of links to convert.
+
+        Returns:
+            list: A list of full URLs.
+        """
         full_links = []
         for link in links:
             if self.only_inside_links and link.startswith("http"):
                 continue
             full_link = link if link.startswith("http") else urljoin(base_url, link)
             full_links.append(full_link)
         return full_links
-
+
     def obtain_content(self, documents: List, loader_kwargs) -> List:
+        """
+        Iterates through documents, fetching and updating content recursively.
+
+        Args:
+            documents (List): A list of documents containing the source URLs.
+            loader_kwargs (dict): Additional arguments for the content loader.
+
+        Returns:
+            List: The updated list of documents with fetched content.
+        """
         new_documents = []
         for doc in documents:
             source = doc['source']
             if 'document' not in doc:
                 document = self.fetch_content(source, loader_kwargs)
-
+
                 if not document or not document[0].page_content.strip():
                     self.logger.warning(f"Failed to fetch content for {source}")
                     documents.remove(doc)
                     continue
-
-                #doc['document'] = document[0].page_content
+
                 doc['document'] = document
-
                 links = self.extract_links(doc['document'][0].page_content)
                 full_links = self.get_full_links(source, links)
-
-                # Check if the links are already present in other documents
+
                 for link in full_links:
-                    # Check if any document is from the same link
                     if not any(d.get('source', '') == link for d in documents) and not any(d.get('source', '') == link for d in new_documents):
-                        # Add the document
                        new_documents.append({"source": link})
-
+
        documents.extend(new_documents)
        return documents
-
-    def process_links(self, base_url: str, links: list, loader_kwargs, depth: int, current_depth: int = 1) -> dict:
+
+    def process_links(self, base_url: str, links: list,
+                      loader_kwargs, depth: int, current_depth: int = 1) -> dict:
+        """
+        Processes a list of links recursively up to a given depth.
+
+        Args:
+            base_url (str): The base URL for resolving relative links.
+            links (list): A list of links to process.
+            loader_kwargs (dict): Additional arguments for the content loader.
+            depth (int): The maximum depth for recursion.
+            current_depth (int): The current depth of recursion (default is 1).
+
+        Returns:
+            dict: A dictionary containing processed link content.
+        """
         content_dict = {}
         for idx, link in enumerate(links, start=1):
             full_link = link if link.startswith("http") else urljoin(base_url, link)
@@ -184,7 +208,7 @@ def process_links(self, base_url: str, links: list, loader_kwargs, depth: int, current_depth: int = 1) -> dict:
 
             if current_depth < depth:
                 new_links = self.extract_links(link_content)
-                content_dict.update(self.process_links(full_link, new_links, depth, current_depth + 1))
+                content_dict.update(self.process_links(full_link, new_links, loader_kwargs, depth, current_depth + 1))
             else:
                 self.logger.warning(f"Failed to fetch content for {full_link}")
-        return content_dict
+        return content_dict
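
The __init__ refactor above collapses each four-line `X if node_config is None else node_config.get(...)` block into a single conditional expression, the shape pylint's formatting and complexity checks tend to favor. Below is a minimal sketch of the resulting defaulting behavior; resolve_fetch_options is a hypothetical stand-alone helper for illustration, not part of scrapegraphai:

# Hypothetical helper (not in scrapegraphai) illustrating the defaulting
# pattern the refactor converges on: each option falls back to its
# documented default when node_config is None or lacks the key.
from typing import Optional

def resolve_fetch_options(node_config: Optional[dict]) -> dict:
    cfg = node_config or {}  # normalize once instead of repeating "if node_config"
    return {
        "verbose": cfg.get("verbose", False),
        "cache_path": cfg.get("cache_path", False),
        "headless": cfg.get("headless", True),
        "loader_kwargs": cfg.get("loader_kwargs", {}),
        "browser_base": cfg.get("browser_base", None),
        "depth": cfg.get("depth", 1),
        "only_inside_links": cfg.get("only_inside_links", False),
    }

print(resolve_fetch_options(None))          # all defaults
print(resolve_fetch_options({"depth": 2}))  # only depth overridden

Normalizing once would also cover the `embedder_model` and `browser_base` assignments, which in the refactored diff still call `.get` on `node_config` without a guard and would raise AttributeError if `node_config` were actually None.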

scrapegraphai/nodes/parse_node_depth_k.py

Lines changed: 6 additions & 11 deletions

@@ -1,11 +1,9 @@
 """
 ParseNodeDepthK Module
 """
-import re
-from typing import List, Optional, Tuple
-from .base_node import BaseNode
-from ..utils.convert_to_md import convert_to_md
+from typing import List, Optional
 from langchain_community.document_transformers import Html2TextTransformer
+from .base_node import BaseNode
 
 class ParseNodeDepthK(BaseNode):
     """
@@ -54,19 +52,16 @@ def execute(self, state: dict) -> dict:
         """
 
         self.logger.info(f"--- Executing {self.node_name} Node ---")
-
-        # Interpret input keys based on the provided input expression
+
         input_keys = self.get_input_keys(state)
-        # Fetching data from the state based on the input keys
         input_data = [state[key] for key in input_keys]
 
         documents = input_data[0]
-
+
         for doc in documents:
             document_md = Html2TextTransformer(ignore_links=True).transform_documents(doc["document"])
-            #document_md = convert_to_md(doc["document"])
             doc["document"] = document_md[0].page_content
-
+
         state.update({self.output[0]: documents})
-
+
         return state
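
With the unused imports and the commented-out `convert_to_md` fallback removed, the node's HTML-to-text step now rests entirely on langchain's Html2TextTransformer. A minimal sketch of that conversion in isolation, assuming langchain_community and the html2text package it wraps are installed, with a hand-built Document standing in for the documents fetched by FetchNodeLevelK:

# Stand-alone sketch of the conversion ParseNodeDepthK performs per document.
from langchain_community.document_transformers import Html2TextTransformer
from langchain_core.documents import Document

html_doc = Document(
    page_content="<html><body><h1>Title</h1><p>See <a href='/x'>this link</a>.</p></body></html>",
    metadata={"source": "https://example.com"},
)

# ignore_links=True mirrors the node: hyperlink targets are dropped from the text.
document_md = Html2TextTransformer(ignore_links=True).transform_documents([html_doc])
print(document_md[0].page_content)  # markdown-style text, roughly "# Title\n\nSee this link."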
