
637 it can't scrape urls from the source #648


Merged (2 commits) on Sep 9, 2024
scrapegraphai/nodes/__init__.py (2 changes: 1 addition & 1 deletion)
@@ -22,4 +22,4 @@
from .merge_generated_scripts import MergeGeneratedScriptsNode
from .fetch_screen_node import FetchScreenNode
from .generate_answer_from_image_node import GenerateAnswerFromImageNode
from .concat_answers_node import ConcatAnswersNode
from .concat_answers_node import ConcatAnswersNode
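For reference, re-exporting a node in scrapegraphai/nodes/__init__.py lets callers import it straight from the package; a minimal, hypothetical usage line (not part of this diff):

```python
# Hypothetical import relying on the re-export in scrapegraphai/nodes/__init__.py.
from scrapegraphai.nodes import ConcatAnswersNode
```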
scrapegraphai/nodes/parse_node.py (86 changes: 4 additions & 82 deletions)
@@ -1,15 +1,11 @@
"""
ParseNode Module
"""
from typing import Tuple, List, Optional
from urllib.parse import urljoin
from typing import List, Optional
from semchunk import chunk
from langchain_community.document_transformers import Html2TextTransformer
from langchain_core.documents import Document
from .base_node import BaseNode
from ..helpers import default_filters

import re

class ParseNode(BaseNode):
"""
@@ -44,67 +40,6 @@ def __init__(
self.parse_html = (
True if node_config is None else node_config.get("parse_html", True)
)
self.llm_model = node_config['llm_model']
self.parse_urls = (
False if node_config is None else node_config.get("parse_urls", False)
)

def _clean_urls(self, urls: List[str]) -> List[str]:
"""
Cleans the URLs extracted from the text.

Args:
urls (List[str]): The list of URLs to clean.

Returns:
List[str]: The cleaned URLs.
"""
cleaned_urls = []
for url in urls:
# Remove any leading 'thumbnail](' or similar patterns
url = re.sub(r'.*?\]\(', '', url)

# Remove any trailing parentheses or brackets
url = url.rstrip(').')

cleaned_urls.append(url)

return cleaned_urls
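As a side note, a minimal standalone sketch (not part of this diff) of how a cleanup pass like _clean_urls behaves on markdown-style link residue; the sample strings below are invented for illustration:

```python
import re

def clean_urls(urls):
    # Strip anything up to and including a markdown-style "](" prefix,
    # then drop trailing ")" or "." characters left over from link syntax.
    cleaned = []
    for url in urls:
        url = re.sub(r'.*?\]\(', '', url)
        url = url.rstrip(').')
        cleaned.append(url)
    return cleaned

# Invented inputs, roughly what Html2Text output can look like.
print(clean_urls(["thumbnail](https://example.com/a.png)",
                  "https://example.com/page)."]))
# ['https://example.com/a.png', 'https://example.com/page']
```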

def extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]:
"""
Extracts URLs from the given text.

Args:
text (str): The text to extract URLs from.

Returns:
Tuple[List[str], List[str]]: A tuple containing the extracted link URLs and image URLs.
"""
# Return empty lists if the URLs are not to be parsed
if not self.parse_urls:
return [], []

# Regular expression to find URLs (both links and images)
image_extensions = default_filters.filter_dict["img_exts"]
image_extension_seq = '|'.join(image_extensions).replace('.','')
url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + image_extension_seq + '))')

# Find all URLs in the string
all_urls = url_pattern.findall(text)
all_urls = self._clean_urls(all_urls)

if not source.startswith("http"):
# Remove any URL that is not complete
all_urls = [url for url in all_urls if url.startswith("http")]
else:
# Add to local URLs the source URL
all_urls = [urljoin(source, url) for url in all_urls]

images = [url for url in all_urls if any(url.endswith(ext) for ext in image_extensions)]
links = [url for url in all_urls if url not in images]

return links, images
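The same extraction idea in isolation, as a rough sketch rather than the PR's code: a combined link/image regex, urljoin to resolve relative paths against the source page, then a split into image URLs and plain links. The extension list and sample text below are assumptions:

```python
import re
from urllib.parse import urljoin

# Assumed subset of the img_exts filter; the real list lives in helpers.default_filters.
image_extensions = [".jpg", ".png", ".gif"]
image_extension_seq = "|".join(image_extensions).replace(".", "")
url_pattern = re.compile(r"(https?://[^\s]+|\S+\.(?:" + image_extension_seq + "))")

text = "See https://example.com/docs and the logo images/logo.png here."
source = "https://example.com/"

all_urls = url_pattern.findall(text)               # ['https://example.com/docs', 'images/logo.png']
all_urls = [urljoin(source, u) for u in all_urls]  # resolve relative URLs against the source page

images = [u for u in all_urls if any(u.endswith(ext) for ext in image_extensions)]
links = [u for u in all_urls if u not in images]
print(links)   # ['https://example.com/docs']
print(images)  # ['https://example.com/images/logo.png']
```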

def execute(self, state: dict) -> dict:
"""
@@ -127,46 +62,33 @@ def execute(self, state: dict) -> dict:
input_keys = self.get_input_keys(state)

input_data = [state[key] for key in input_keys]

docs_transformed = input_data[0]
source = input_data[1] if self.parse_urls else None

def count_tokens(text):
from ..utils import token_count
return token_count(text, self.llm_model.model_name)

if self.parse_html:
docs_transformed = Html2TextTransformer().transform_documents(input_data[0])
docs_transformed = docs_transformed[0]

link_urls, img_urls = self.extract_urls(docs_transformed.page_content, source)

chunks = chunk(text=docs_transformed.page_content,
chunk_size=self.node_config.get("chunk_size", 4096)-250,
token_counter=count_tokens,
token_counter=lambda text: len(text.split()),
memoize=False)
else:
docs_transformed = docs_transformed[0]

link_urls, img_urls = self.extract_urls(docs_transformed.page_content, source)

chunk_size = self.node_config.get("chunk_size", 4096)
chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))

if isinstance(docs_transformed, Document):
chunks = chunk(text=docs_transformed.page_content,
chunk_size=chunk_size,
token_counter=count_tokens,
token_counter=lambda text: len(text.split()),
memoize=False)
else:
chunks = chunk(text=docs_transformed,
chunk_size=chunk_size,
token_counter=count_tokens,
token_counter=lambda text: len(text.split()),
memoize=False)

state.update({self.output[0]: chunks})
if self.parse_urls:
state.update({self.output[1]: link_urls})
state.update({self.output[2]: img_urls})

return state
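Finally, the chunking step on its own, as a minimal sketch: either token counter seen in this diff can be plugged into semchunk's chunk; the whitespace word count is used here, and the text and chunk size are placeholders:

```python
from semchunk import chunk

text = "Some long page content converted to plain text..."  # placeholder document text
chunk_size = 4096
# Leave headroom below the nominal size, mirroring min(chunk_size - 500, chunk_size * 0.9).
effective_size = min(chunk_size - 500, int(chunk_size * 0.9))

# Whitespace word count as a cheap token counter (the lambda appearing in this diff);
# a model-aware counter could be substituted here instead.
chunks = chunk(text=text,
               chunk_size=effective_size,
               token_counter=lambda t: len(t.split()),
               memoize=False)
print(len(chunks), chunks[:1])
```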