Commit 32a102a

Merge pull request #648 from ScrapeGraphAI/637-it-can´t-scrape-urls-from-the-source
637 it can´t scrape urls from the source
2 parents 8a0d46b + f2bb22d commit 32a102a

File tree

2 files changed: +5 -83 lines changed


scrapegraphai/nodes/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -22,4 +22,4 @@
 from .merge_generated_scripts import MergeGeneratedScriptsNode
 from .fetch_screen_node import FetchScreenNode
 from .generate_answer_from_image_node import GenerateAnswerFromImageNode
-from .concat_answers_node import ConcatAnswersNode
+from .concat_answers_node import ConcatAnswersNode

scrapegraphai/nodes/parse_node.py

Lines changed: 4 additions & 82 deletions
@@ -1,15 +1,11 @@
 """
 ParseNode Module
 """
-from typing import Tuple, List, Optional
-from urllib.parse import urljoin
+from typing import List, Optional
 from semchunk import chunk
 from langchain_community.document_transformers import Html2TextTransformer
 from langchain_core.documents import Document
 from .base_node import BaseNode
-from ..helpers import default_filters
-
-import re
 
 class ParseNode(BaseNode):
     """
@@ -44,67 +40,6 @@ def __init__(
         self.parse_html = (
             True if node_config is None else node_config.get("parse_html", True)
         )
-        self.llm_model = node_config['llm_model']
-        self.parse_urls = (
-            False if node_config is None else node_config.get("parse_urls", False)
-        )
-
-    def _clean_urls(self, urls: List[str]) -> List[str]:
-        """
-        Cleans the URLs extracted from the text.
-
-        Args:
-            urls (List[str]): The list of URLs to clean.
-
-        Returns:
-            List[str]: The cleaned URLs.
-        """
-        cleaned_urls = []
-        for url in urls:
-            # Remove any leading 'thumbnail](' or similar patterns
-            url = re.sub(r'.*?\]\(', '', url)
-
-            # Remove any trailing parentheses or brackets
-            url = url.rstrip(').')
-
-            cleaned_urls.append(url)
-
-        return cleaned_urls
-
-    def extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]:
-        """
-        Extracts URLs from the given text.
-
-        Args:
-            text (str): The text to extract URLs from.
-
-        Returns:
-            Tuple[List[str], List[str]]: A tuple containing the extracted link URLs and image URLs.
-        """
-        # Return empty lists if the URLs are not to be parsed
-        if not self.parse_urls:
-            return [], []
-
-        # Regular expression to find URLs (both links and images)
-        image_extensions = default_filters.filter_dict["img_exts"]
-        image_extension_seq = '|'.join(image_extensions).replace('.','')
-        url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + image_extension_seq + '))')
-
-        # Find all URLs in the string
-        all_urls = url_pattern.findall(text)
-        all_urls = self._clean_urls(all_urls)
-
-        if not source.startswith("http"):
-            # Remove any URLs that is not complete
-            all_urls = [url for url in all_urls if url.startswith("http")]
-        else:
-            # Add to local URLs the source URL
-            all_urls = [urljoin(source, url) for url in all_urls]
-
-        images = [url for url in all_urls if any(url.endswith(ext) for ext in image_extensions)]
-        links = [url for url in all_urls if url not in images]
-
-        return links, images
 
     def execute(self, state: dict) -> dict:
         """
@@ -127,46 +62,33 @@ def execute(self, state: dict) -> dict:
         input_keys = self.get_input_keys(state)
 
         input_data = [state[key] for key in input_keys]
-
         docs_transformed = input_data[0]
-        source = input_data[1] if self.parse_urls else None
-
-        def count_tokens(text):
-            from ..utils import token_count
-            return token_count(text, self.llm_model.model_name)
 
         if self.parse_html:
             docs_transformed = Html2TextTransformer().transform_documents(input_data[0])
             docs_transformed = docs_transformed[0]
 
-            link_urls, img_urls = self.extract_urls(docs_transformed.page_content, source)
-
             chunks = chunk(text=docs_transformed.page_content,
                            chunk_size=self.node_config.get("chunk_size", 4096)-250,
-                           token_counter=count_tokens,
+                           token_counter=lambda text: len(text.split()),
                            memoize=False)
         else:
             docs_transformed = docs_transformed[0]
 
-            link_urls, img_urls = self.extract_urls(docs_transformed.page_content, source)
-
             chunk_size = self.node_config.get("chunk_size", 4096)
             chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))
 
             if isinstance(docs_transformed, Document):
                 chunks = chunk(text=docs_transformed.page_content,
                                chunk_size=chunk_size,
-                               token_counter=count_tokens,
+                               token_counter=lambda text: len(text.split()),
                                memoize=False)
             else:
                 chunks = chunk(text=docs_transformed,
                                chunk_size=chunk_size,
-                               token_counter=count_tokens,
+                               token_counter=lambda text: len(text.split()),
                                memoize=False)
 
         state.update({self.output[0]: chunks})
-        if self.parse_urls:
-            state.update({self.output[1]: link_urls})
-            state.update({self.output[2]: img_urls})
 
         return state
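Within the chunk() calls themselves, the change is the token counter: instead of calling the model tokenizer through a local count_tokens helper, the node now estimates tokens by counting whitespace-separated words. A minimal standalone sketch of that call, assuming semchunk is installed; the sample text and the 4096 default chunk size are illustrative:

    from semchunk import chunk

    # Stand-in for the page content ParseNode takes from the transformed document.
    text = "Example page content converted to plain text by Html2TextTransformer."

    chunks = chunk(
        text=text,
        chunk_size=4096 - 250,                   # same margin the node applies in its parse_html branch
        token_counter=lambda t: len(t.split()),  # rough token estimate: whitespace-separated word count
        memoize=False,
    )
    print(chunks)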
