Skip to content

Commit 015c6fd

Browse files
remove link from markdown
1 parent f755d56 commit 015c6fd

File tree

2 files changed

+9
-6
lines changed

2 files changed

+9
-6
lines changed

scrapegraphai/nodes/fetch_node_level_k.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -95,8 +95,6 @@ def execute(self, state: dict) -> dict:
95 95

96 96
documents = [{"source": source}]
97 97

98 -
self.logger.info(f"--- (Fetching HTML from: {source}) ---")
99 -
100 98
loader_kwargs = {}
101 99

102 100
if self.node_config is not None:
@@ -112,6 +110,8 @@ def execute(self, state: dict) -> dict:
112 110
return state
113 111

114 112
def fetch_content(self, source: str, loader_kwargs) -> Optional[str]:
113 +
self.logger.info(f"--- (Fetching HTML from: {source}) ---")
114 +
115 115
if self.browser_base is not None:
116 116
try:
117 117
from ..docloaders.browser_base import browser_base_fetch
@@ -159,9 +159,10 @@ def obtain_content(self, documents: List, loader_kwargs) -> List:
159 159
documents.remove(doc)
160 160
continue
161 161

162 -
doc['document'] = document[0].page_content
162 +
#doc['document'] = document[0].page_content
163 +
doc['document'] = document
163 164

164 -
links = self.extract_links(doc['document'])
165 +
links = self.extract_links(doc['document'][0].page_content)
165 166
full_links = self.get_full_links(source, links)
166 167

167 168
# Check if the links are already present in other documents

scrapegraphai/nodes/parse_node_depth_k.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
5 5
from typing import List, Optional, Tuple
6 6
from .base_node import BaseNode
7 7
from ..utils.convert_to_md import convert_to_md
8 +
from langchain_community.document_transformers import Html2TextTransformer
8 9

9 10
class ParseNodeDepthK(BaseNode):
10 11
"""
@@ -62,8 +63,9 @@ def execute(self, state: dict) -> dict:
62 63
documents = input_data[0]
63 64

64 65
for doc in documents:
65 -
document_md = convert_to_md(doc["document"])
66 -
doc["document_md"] = document_md
66 +
document_md = Html2TextTransformer(ignore_links=True).transform_documents(doc["document"])
67 +
#document_md = convert_to_md(doc["document"])
68 +
doc["document"] = document_md[0].page_content
67 69

68 70
state.update({self.output[0]: documents})
69 71


0 commit comments

Comments
 (0)