-"""
-FetchNodeLevelK Module
-"""
 from typing import List, Optional
 from .base_node import BaseNode
 from ..docloaders import ChromiumLoader
@@ -18,14 +15,21 @@ class FetchNodeLevelK(BaseNode):
     (with proxy protection).

     Attributes:
-        llm_model: An instance of a language model client, configured for generating answers.
+        embedder_model: An optional model for embedding the fetched content.
         verbose (bool): A flag indicating whether to show print statements during execution.
+        cache_path (str): Path to cache fetched content.
+        headless (bool): Whether to run the Chromium browser in headless mode.
+        loader_kwargs (dict): Additional arguments for the content loader.
+        browser_base (dict): Optional configuration for the browser base API.
+        depth (int): Maximum depth of hyperlink graph traversal.
+        only_inside_links (bool): Whether to fetch only internal links.
+        min_input_len (int): Minimum required length of input data.

     Args:
         input (str): Boolean expression defining the input keys needed from the state.
         output (List[str]): List of output keys to be updated in the state.
         node_config (dict): Additional configuration for the node.
-        node_name (str): The unique identifier name for the node, defaulting to "Parse".
+        node_name (str): The unique identifier name for the node, defaulting to "FetchLevelK".
     """

     def __init__(
@@ -35,81 +39,68 @@ def __init__(
         node_config: Optional[dict] = None,
         node_name: str = "FetchLevelK",
     ):
+        """
+        Initializes the FetchNodeLevelK instance.
+
+        Args:
+            input (str): Boolean expression defining the input keys needed from the state.
+            output (List[str]): List of output keys to be updated in the state.
+            node_config (Optional[dict]): Additional configuration for the node.
+            node_name (str): The name of the node (default is "FetchLevelK").
+        """
         super().__init__(node_name, "node", input, output, 2, node_config)
-
+
         self.embedder_model = node_config.get("embedder_model", None)
-
-        self.verbose = (
-            False if node_config is None else node_config.get("verbose", False)
-        )
-
+        self.verbose = node_config.get("verbose", False) if node_config else False
         self.cache_path = node_config.get("cache_path", False)
-
-        self.headless = (
-            True if node_config is None else node_config.get("headless", True)
-        )
-
-        self.loader_kwargs = (
-            {} if node_config is None else node_config.get("loader_kwargs", {})
-        )
-
-        self.browser_base = (
-            None if node_config is None else node_config.get("browser_base", None)
-        )
-
-        self.depth = (
-            1 if node_config is None else node_config.get("depth", 1)
-        )
-
-        self.only_inside_links = (
-            False if node_config is None else node_config.get("only_inside_links", False)
-        )
-
+        self.headless = node_config.get("headless", True) if node_config else True
+        self.loader_kwargs = node_config.get("loader_kwargs", {}) if node_config else {}
+        self.browser_base = node_config.get("browser_base", None)
+        self.depth = node_config.get("depth", 1) if node_config else 1
+        self.only_inside_links = node_config.get("only_inside_links", False) if node_config else False
         self.min_input_len = 1

     def execute(self, state: dict) -> dict:
         """
-        Executes the node's logic to fetch the HTML content of a specified URL and all its sub-links
-        and update the graph's state with the content.
+        Executes the node's logic to fetch the HTML content of a specified URL and its sub-links
+        recursively, then updates the graph's state with the fetched content.

         Args:
-            state (dict): The current state of the graph. The input keys will be used
-                to fetch the correct data types from the state.
+            state (dict): The current state of the graph.

         Returns:
             dict: The updated state with a new output key containing the fetched HTML content.

         Raises:
-            KeyError: If the input key is not found in the state, indicating that the
-                necessary information to perform the operation is missing.
+            KeyError: If the input key is not found in the state.
         """
-
         self.logger.info(f"--- Executing {self.node_name} Node ---")
-
-        # Interpret input keys based on the provided input expression
+
         input_keys = self.get_input_keys(state)
-        # Fetching data from the state based on the input keys
         input_data = [state[key] for key in input_keys]
-
         source = input_data[0]
-
+
         documents = [{"source": source}]
-
-        loader_kwargs = {}
+        loader_kwargs = self.node_config.get("loader_kwargs", {}) if self.node_config else {}

-        if self.node_config is not None:
-            loader_kwargs = self.node_config.get("loader_kwargs", {})
-
         for _ in range(self.depth):
             documents = self.obtain_content(documents, loader_kwargs)
-
+
         filtered_documents = [doc for doc in documents if 'document' in doc]
-
         state.update({self.output[0]: filtered_documents})
-
         return state
-
+
     def fetch_content(self, source: str, loader_kwargs) -> Optional[str]:
+        """
+        Fetches the HTML content of a given source URL.
+
+        Args:
+            source (str): The URL to fetch content from.
+            loader_kwargs (dict): Additional arguments for the content loader.
+
+        Returns:
+            Optional[str]: The fetched HTML content or None if fetching failed.
+        """
         self.logger.info(f"--- (Fetching HTML from: {source}) ---")

         if self.browser_base is not None:
@@ -119,63 +110,96 @@ def fetch_content(self, source: str, loader_kwargs) -> Optional[str]:
                 raise ImportError("""The browserbase module is not installed.
                                   Please install it using `pip install browserbase`.""")

-            data = browser_base_fetch(self.browser_base.get("api_key"),
-                                      self.browser_base.get("project_id"), [source])
-
-            document = [Document(page_content=content,
-                                 metadata={"source": source}) for content in data]
-
+            data = browser_base_fetch(self.browser_base.get("api_key"),
+                                      self.browser_base.get("project_id"), [source])
+            document = [Document(page_content=content, metadata={"source": source}) for content in data]
         else:
             loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
-
             document = loader.load()
-
         return document
-
+
     def extract_links(self, html_content: str) -> list:
+        """
+        Extracts all hyperlinks from the HTML content.
+
+        Args:
+            html_content (str): The HTML content to extract links from.
+
+        Returns:
+            list: A list of extracted hyperlinks.
+        """
         soup = BeautifulSoup(html_content, 'html.parser')
         links = [link['href'] for link in soup.find_all('a', href=True)]
         self.logger.info(f"Extracted {len(links)} links.")
         return links
-
+
     def get_full_links(self, base_url: str, links: list) -> list:
+        """
+        Converts relative URLs to full URLs based on the base URL.
+
+        Args:
+            base_url (str): The base URL for resolving relative links.
+            links (list): A list of links to convert.
+
+        Returns:
+            list: A list of full URLs.
+        """
         full_links = []
         for link in links:
             if self.only_inside_links and link.startswith("http"):
                 continue
             full_link = link if link.startswith("http") else urljoin(base_url, link)
             full_links.append(full_link)
         return full_links
-
+
     def obtain_content(self, documents: List, loader_kwargs) -> List:
+        """
+        Iterates through documents, fetching and updating content recursively.
+
+        Args:
+            documents (List): A list of documents containing the source URLs.
+            loader_kwargs (dict): Additional arguments for the content loader.
+
+        Returns:
+            List: The updated list of documents with fetched content.
+        """
         new_documents = []
         for doc in documents:
             source = doc['source']
             if 'document' not in doc:
                 document = self.fetch_content(source, loader_kwargs)
-
+
                 if not document or not document[0].page_content.strip():
                     self.logger.warning(f"Failed to fetch content for {source}")
                     documents.remove(doc)
                     continue
-
-                #doc['document'] = document[0].page_content
+
                 doc['document'] = document
-
                 links = self.extract_links(doc['document'][0].page_content)
                 full_links = self.get_full_links(source, links)
-
-                # Check if the links are already present in other documents
+
                 for link in full_links:
-                    # Check if any document is from the same link
                     if not any(d.get('source', '') == link for d in documents) and not any(d.get('source', '') == link for d in new_documents):
-                        # Add the document
                         new_documents.append({"source": link})
-
+
         documents.extend(new_documents)
         return documents
-
-    def process_links(self, base_url: str, links: list, loader_kwargs, depth: int, current_depth: int = 1) -> dict:
+
+    def process_links(self, base_url: str, links: list,
+                      loader_kwargs, depth: int, current_depth: int = 1) -> dict:
+        """
+        Processes a list of links recursively up to a given depth.
+
+        Args:
+            base_url (str): The base URL for resolving relative links.
+            links (list): A list of links to process.
+            loader_kwargs (dict): Additional arguments for the content loader.
+            depth (int): The maximum depth for recursion.
+            current_depth (int): The current depth of recursion (default is 1).
+
+        Returns:
+            dict: A dictionary containing processed link content.
+        """
         content_dict = {}
         for idx, link in enumerate(links, start=1):
             full_link = link if link.startswith("http") else urljoin(base_url, link)
@@ -184,7 +208,7 @@ def process_links(self, base_url: str, links: list, loader_kwargs, depth: int, c
 
             if current_depth < depth:
                 new_links = self.extract_links(link_content)
-                content_dict.update(self.process_links(full_link, new_links, depth, current_depth + 1))
+                content_dict.update(self.process_links(full_link, new_links, loader_kwargs, depth, current_depth + 1))
             else:
                 self.logger.warning(f"Failed to fetch content for {full_link}")
-        return content_dict
+        return content_dict
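
For reference, a minimal usage sketch of the node changed in this diff (not part of the commit). The import path and the surrounding graph wiring are assumptions, since they are not shown here; the node_config keys are exactly those read in __init__ above, and execute() resolves the source URL from the state via the input expression and writes the crawled documents to the first output key.

# Hypothetical usage sketch. The import path below is an assumption;
# the node_config keys mirror those read in FetchNodeLevelK.__init__.
from scrapegraphai.nodes import FetchNodeLevelK

fetch_node = FetchNodeLevelK(
    input="url",                    # state key holding the start URL
    output=["docs"],                # state key that receives the fetched documents
    node_config={
        "headless": True,           # run Chromium without a visible window
        "depth": 2,                 # follow hyperlinks two levels deep
        "only_inside_links": True,  # keep only links internal to the site
        "loader_kwargs": {},        # extra arguments forwarded to ChromiumLoader
    },
)

# execute() reads the source from the state, crawls self.depth levels,
# and stores every fetched page under the first output key.
state = fetch_node.execute({"url": "https://example.com"})
for doc in state["docs"]:
    print(doc["source"], len(doc["document"][0].page_content))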