diff --git a/CHANGELOG.md b/CHANGELOG.md
index 60e964c4..dffb9062 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,3 @@
-## [0.10.0-beta.6](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.10.0-beta.5...v0.10.0-beta.6) (2024-05-09)
-
 ### Bug Fixes
 
@@ -8,8 +6,10 @@
 
 ## [0.10.0-beta.5](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.10.0-beta.4...v0.10.0-beta.5) (2024-05-09)
 
+
 ### Bug Fixes
 
+
 * fixed bugs for csv and xml ([324e977](https://github.com/VinciGit00/Scrapegraph-ai/commit/324e977b853ecaa55bac4bf86e7cd927f7f43d0d))
 
 ## [0.10.0-beta.4](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.10.0-beta.3...v0.10.0-beta.4) (2024-05-09)
diff --git a/docs/source/getting_started/examples.rst b/docs/source/getting_started/examples.rst
index 11fb5a05..b6e2eb36 100644
--- a/docs/source/getting_started/examples.rst
+++ b/docs/source/getting_started/examples.rst
@@ -44,9 +44,12 @@ Local models
 Remember to have `ollama <https://ollama.com/>`_ installed on your machine.
 Remember to pull the right models for the LLM and for the embeddings, for example:
+
 .. code-block:: bash
 
    ollama pull llama3
+   ollama pull nomic-embed-text
+   ollama pull mistral
 
 After that, you can run the following code, using only your machine's resources, brum brum brum:
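The docs hunk above stops just before the code it refers to. As a minimal sketch of the local-model setup, the usage below follows ScrapeGraphAI's `SmartScraperGraph` README-style API; treat the exact config keys (`base_url`, the `ollama/` model prefixes) and the target URL as assumptions, not the library's definitive interface:

```python
# Minimal sketch: a scraping graph running entirely on local ollama models.
# Assumes the models pulled above and ollama serving on its default port.
from scrapegraphai.graphs import SmartScraperGraph

graph_config = {
    "llm": {
        "model": "ollama/mistral",             # pulled with `ollama pull mistral`
        "temperature": 0,
        "base_url": "http://localhost:11434",  # default ollama endpoint (assumption)
    },
    "embeddings": {
        "model": "ollama/nomic-embed-text",    # pulled with `ollama pull nomic-embed-text`
        "base_url": "http://localhost:11434",
    },
}

smart_scraper_graph = SmartScraperGraph(
    prompt="List me all the articles on the page",
    source="https://example.com/blog",         # hypothetical target page
    config=graph_config,
)

print(smart_scraper_graph.run())
```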
diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py
index 52266b42..eeb2d0b4 100644
--- a/scrapegraphai/nodes/fetch_node.py
+++ b/scrapegraphai/nodes/fetch_node.py
@@ -8,7 +8,9 @@
 from langchain_core.documents import Document
 from langchain_community.document_loaders import PyPDFLoader
 from .base_node import BaseNode
-from ..utils.remover import remover
+from ..utils.cleanup_html import cleanup_html
+import requests
+from bs4 import BeautifulSoup
 
 
 class FetchNode(BaseNode):
@@ -34,6 +36,7 @@ class FetchNode(BaseNode):
     def __init__(self, input: str, output: List[str], node_config: Optional[dict] = None,
                  node_name: str = "Fetch"):
         super().__init__(node_name, "node", input, output, 1)
+        self.useSoup = False if node_config is None else node_config.get("useSoup", False)
         self.headless = True if node_config is None else node_config.get(
             "headless", True)
         self.verbose = False if node_config is None else node_config.get(
@@ -94,10 +97,22 @@ def execute(self, state):
             pass
 
         elif not source.startswith("http"):
-            compressed_document = [Document(page_content=remover(source), metadata={
+            compressed_document = [Document(page_content=cleanup_html(source), metadata={
                 "source": "local_dir"
             })]
 
+        elif self.useSoup:
+            response = requests.get(source)
+            if response.status_code == 200:
+                soup = BeautifulSoup(response.text, 'html.parser')
+                links = soup.find_all('a')
+                link_urls = []
+                for link in links:
+                    if 'href' in link.attrs:
+                        link_urls.append(link['href'])
+                compressed_document = [Document(page_content=cleanup_html(soup.prettify(), link_urls))]
+            else:
+                print(f"Failed to retrieve contents from the webpage at url: {source}")
 
         else:
             if self.node_config is not None and self.node_config.get("endpoint") is not None:
@@ -114,7 +129,7 @@ def execute(self, state):
             document = loader.load()
 
         compressed_document = [
-            Document(page_content=remover(str(document[0].page_content)))]
+            Document(page_content=cleanup_html(str(document[0].page_content)))]
 
         state.update({self.output[0]: compressed_document})
         return state
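The new `useSoup` branch in `FetchNode.execute` boils down to the standalone sketch below; the helper name `fetch_with_soup` is illustrative, and the non-200 case is turned into an exception rather than the node's bare `print`:

```python
# Standalone sketch of the new requests + BeautifulSoup fetch path.
import requests
from bs4 import BeautifulSoup

from scrapegraphai.utils.cleanup_html import cleanup_html

def fetch_with_soup(source: str) -> str:
    """Fetch a page, collect its outgoing links, and compress it for the LLM."""
    response = requests.get(source)
    if response.status_code != 200:
        raise ValueError(f"Failed to retrieve contents from the webpage at url: {source}")
    soup = BeautifulSoup(response.text, "html.parser")
    # Keep every href so downstream nodes also see the page's links
    link_urls = [link["href"] for link in soup.find_all("a") if "href" in link.attrs]
    return cleanup_html(soup.prettify(), link_urls)
```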

diff --git a/scrapegraphai/utils/remover.py b/scrapegraphai/utils/cleanup_html.py
similarity index 78%
rename from scrapegraphai/utils/remover.py
rename to scrapegraphai/utils/cleanup_html.py
index 5e203249..aab1db65 100644
--- a/scrapegraphai/utils/remover.py
+++ b/scrapegraphai/utils/cleanup_html.py
@@ -5,7 +5,7 @@
 from minify_html import minify
 
 
-def remover(html_content: str) -> str:
+def cleanup_html(html_content: str, urls: list = []) -> str:
     """
     Processes HTML content by removing unnecessary tags, minifying the HTML,
     and extracting the title and body content.
@@ -17,7 +17,7 @@ def remover(html_content: str) -> str:
 
     Example:
         >>> html_content = "<html><head><title>Example</title></head><body><p>Hello World!</p></body></html>"
-        >>> remover(html_content)
+        >>> cleanup_html(html_content)
         'Title: Example, Body: <body><p>Hello World!</p></body>'
 
     This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized.
@@ -35,9 +35,12 @@ def remover(html_content: str) -> str:
     # Body Extraction (if it exists)
     body_content = soup.find('body')
+    urls_content = ""
+    if urls:
+        urls_content = f", URLs in page: {urls}"
     if body_content:
         # Minify the HTML within the body tag
         minimized_body = minify(str(body_content))
-        return "Title: " + title + ", Body: " + minimized_body
+        return "Title: " + title + ", Body: " + minimized_body + urls_content
 
-    return "Title: " + title + ", Body: No body content found"
+    return "Title: " + title + ", Body: No body content found" + urls_content
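For reference, a quick usage sketch of the renamed helper; the expected outputs follow directly from the docstring and the `urls_content` logic above:

```python
from scrapegraphai.utils.cleanup_html import cleanup_html

html = "<html><head><title>Example</title></head><body><p>Hello World!</p></body></html>"

# The old remover() behaviour is unchanged when no URLs are passed:
print(cleanup_html(html))
# Title: Example, Body: <body><p>Hello World!</p></body>

# With a list of links (as collected by FetchNode's useSoup path),
# a ", URLs in page: [...]" suffix is appended to the result:
print(cleanup_html(html, urls=["https://example.com/a"]))
# Title: Example, Body: <body><p>Hello World!</p></body>, URLs in page: ['https://example.com/a']
```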