Remove links from markdown

vedovati-matteo · vedovati-matteo · commit 015c6fd90504 · 2024-10-02T13:06:00.000+02:00
diff --git a/scrapegraphai/nodes/fetch_node_level_k.py b/scrapegraphai/nodes/fetch_node_level_k.py
@@ -95,8 +95,6 @@ def execute(self, state: dict) -> dict:
         
         documents = [{"source": source}]
         
-        self.logger.info(f"--- (Fetching HTML from: {source}) ---")
-        
         loader_kwargs = {}
 
         if self.node_config is not None:
@@ -112,6 +110,8 @@ def execute(self, state: dict) -> dict:
         return state
     
     def fetch_content(self, source: str, loader_kwargs) -> Optional[str]:
+        self.logger.info(f"--- (Fetching HTML from: {source}) ---")
+        
         if self.browser_base is not None:
             try:
                 from ..docloaders.browser_base import browser_base_fetch
@@ -159,9 +159,10 @@ def obtain_content(self, documents: List, loader_kwargs) -> List:
                     documents.remove(doc)
                     continue
                 
-                doc['document'] = document[0].page_content
+                #doc['document'] = document[0].page_content
+                doc['document'] = document
                 
-                links = self.extract_links(doc['document'])
+                links = self.extract_links(doc['document'][0].page_content)
                 full_links = self.get_full_links(source, links)
                 
                 # Check if the links are already present in other documents
diff --git a/scrapegraphai/nodes/parse_node_depth_k.py b/scrapegraphai/nodes/parse_node_depth_k.py
@@ -5,6 +5,7 @@
 from typing import List, Optional, Tuple
 from .base_node import BaseNode
 from ..utils.convert_to_md import convert_to_md
+from langchain_community.document_transformers import Html2TextTransformer
 
 class ParseNodeDepthK(BaseNode):
     """
@@ -62,8 +63,9 @@ def execute(self, state: dict) -> dict:
         documents = input_data[0]
         
         for doc in documents:
-            document_md = convert_to_md(doc["document"])
-            doc["document_md"] = document_md
+            document_md = Html2TextTransformer(ignore_links=True).transform_documents(doc["document"])
+            #document_md = convert_to_md(doc["document"])
+            doc["document"] = document_md[0].page_content
         
         state.update({self.output[0]: documents})