Commit 32a102a

Merge pull request #648 from ScrapeGraphAI/637-it-can´t-scrape-urls-from-the-source
637 it can´t scrape urls from the source
2 parents 8a0d46b + f2bb22d commit 32a102a

File tree

2 files changed: +5 -83 lines changed


scrapegraphai/nodes/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -22,4 +22,4 @@
 from .merge_generated_scripts import MergeGeneratedScriptsNode
 from .fetch_screen_node import FetchScreenNode
 from .generate_answer_from_image_node import GenerateAnswerFromImageNode
-from .concat_answers_node import ConcatAnswersNode
+from .concat_answers_node import ConcatAnswersNode

scrapegraphai/nodes/parse_node.py

Lines changed: 4 additions & 82 deletions
@@ -1,15 +1,11 @@
 """
 ParseNode Module
 """
-from typing import Tuple, List, Optional
-from urllib.parse import urljoin
+from typing import List, Optional
 from semchunk import chunk
 from langchain_community.document_transformers import Html2TextTransformer
 from langchain_core.documents import Document
 from .base_node import BaseNode
-from ..helpers import default_filters
-
-import re
 
 class ParseNode(BaseNode):
     """
@@ -44,67 +40,6 @@ def __init__(
         self.parse_html = (
             True if node_config is None else node_config.get("parse_html", True)
         )
-        self.llm_model = node_config['llm_model']
-        self.parse_urls = (
-            False if node_config is None else node_config.get("parse_urls", False)
-        )
-
-    def _clean_urls(self, urls: List[str]) -> List[str]:
-        """
-        Cleans the URLs extracted from the text.
-
-        Args:
-            urls (List[str]): The list of URLs to clean.
-
-        Returns:
-            List[str]: The cleaned URLs.
-        """
-        cleaned_urls = []
-        for url in urls:
-            # Remove any leading 'thumbnail](' or similar patterns
-            url = re.sub(r'.*?\]\(', '', url)
-
-            # Remove any trailing parentheses or brackets
-            url = url.rstrip(').')
-
-            cleaned_urls.append(url)
-
-        return cleaned_urls
-
-    def extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]:
-        """
-        Extracts URLs from the given text.
-
-        Args:
-            text (str): The text to extract URLs from.
-
-        Returns:
-            Tuple[List[str], List[str]]: A tuple containing the extracted link URLs and image URLs.
-        """
-        # Return empty lists if the URLs are not to be parsed
-        if not self.parse_urls:
-            return [], []
-
-        # Regular expression to find URLs (both links and images)
-        image_extensions = default_filters.filter_dict["img_exts"]
-        image_extension_seq = '|'.join(image_extensions).replace('.','')
-        url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + image_extension_seq + '))')
-
-        # Find all URLs in the string
-        all_urls = url_pattern.findall(text)
-        all_urls = self._clean_urls(all_urls)
-
-        if not source.startswith("http"):
-            # Remove any URLs that is not complete
-            all_urls = [url for url in all_urls if url.startswith("http")]
-        else:
-            # Add to local URLs the source URL
-            all_urls = [urljoin(source, url) for url in all_urls]
-
-        images = [url for url in all_urls if any(url.endswith(ext) for ext in image_extensions)]
-        links = [url for url in all_urls if url not in images]
-
-        return links, images
 
     def execute(self, state: dict) -> dict:
         """
@@ -127,46 +62,33 @@ def execute(self, state: dict) -> dict:
         input_keys = self.get_input_keys(state)
 
         input_data = [state[key] for key in input_keys]
-
         docs_transformed = input_data[0]
-        source = input_data[1] if self.parse_urls else None
-
-        def count_tokens(text):
-            from ..utils import token_count
-            return token_count(text, self.llm_model.model_name)
 
         if self.parse_html:
             docs_transformed = Html2TextTransformer().transform_documents(input_data[0])
             docs_transformed = docs_transformed[0]
 
-            link_urls, img_urls = self.extract_urls(docs_transformed.page_content, source)
-
             chunks = chunk(text=docs_transformed.page_content,
                            chunk_size=self.node_config.get("chunk_size", 4096)-250,
-                           token_counter=count_tokens,
+                           token_counter=lambda text: len(text.split()),
                            memoize=False)
         else:
             docs_transformed = docs_transformed[0]
 
-            link_urls, img_urls = self.extract_urls(docs_transformed.page_content, source)
-
             chunk_size = self.node_config.get("chunk_size", 4096)
             chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))
 
             if isinstance(docs_transformed, Document):
                 chunks = chunk(text=docs_transformed.page_content,
                                chunk_size=chunk_size,
-                               token_counter=count_tokens,
+                               token_counter=lambda text: len(text.split()),
                                memoize=False)
             else:
                 chunks = chunk(text=docs_transformed,
                                chunk_size=chunk_size,
-                               token_counter=count_tokens,
+                               token_counter=lambda text: len(text.split()),
                                memoize=False)
 
         state.update({self.output[0]: chunks})
-        if self.parse_urls:
-            state.update({self.output[1]: link_urls})
-            state.update({self.output[2]: img_urls})
 
         return state
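Within the chunk() calls themselves, the change is the token counter: instead of calling the model tokenizer through a local count_tokens helper, the node now estimates tokens by counting whitespace-separated words. A minimal standalone sketch of that call, assuming semchunk is installed; the sample text and the 4096 default chunk size are illustrative:

    from semchunk import chunk

    # Stand-in for the page content ParseNode takes from the transformed document.
    text = "Example page content converted to plain text by Html2TextTransformer."

    chunks = chunk(
        text=text,
        chunk_size=4096 - 250,                   # same margin the node applies in its parse_html branch
        token_counter=lambda t: len(t.split()),  # rough token estimate: whitespace-separated word count
        memoize=False,
    )
    print(chunks)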
