diff --git a/CHANGELOG.md b/CHANGELOG.md
index 60e964c4..dffb9062 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,3 @@
-## [0.10.0-beta.6](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.10.0-beta.5...v0.10.0-beta.6) (2024-05-09)
-
### Bug Fixes
@@ -8,8 +6,10 @@
## [0.10.0-beta.5](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.10.0-beta.4...v0.10.0-beta.5) (2024-05-09)
+
### Bug Fixes
+
* fixed bugs for csv and xml ([324e977](https://github.com/VinciGit00/Scrapegraph-ai/commit/324e977b853ecaa55bac4bf86e7cd927f7f43d0d))
## [0.10.0-beta.4](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.10.0-beta.3...v0.10.0-beta.4) (2024-05-09)
diff --git a/docs/source/getting_started/examples.rst b/docs/source/getting_started/examples.rst
index 11fb5a05..b6e2eb36 100644
--- a/docs/source/getting_started/examples.rst
+++ b/docs/source/getting_started/examples.rst
@@ -44,9 +44,12 @@ Local models
Remember to have Ollama installed on your machine: `ollama <https://ollama.com/>`_
Remember to pull the right models for the LLM and for the embeddings, for example:
+
.. code-block:: bash

   ollama pull llama3
+   ollama pull nomic-embed-text
+   ollama pull mistral
After that, you can run the following code, using only your machine resources brum brum brum:
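A minimal sketch of that code, assuming the `SmartScraperGraph` entry point and the `ollama/...` model naming used in the project's examples (the prompt and source URL here are made up):

```python
from scrapegraphai.graphs import SmartScraperGraph

# Local-only configuration: both the LLM and the embedder go through Ollama.
graph_config = {
    "llm": {
        "model": "ollama/llama3",
        "temperature": 0,
        "format": "json",                      # ask Ollama for JSON output
        "base_url": "http://localhost:11434",  # default Ollama endpoint
    },
    "embeddings": {
        "model": "ollama/nomic-embed-text",
        "base_url": "http://localhost:11434",
    },
}

smart_scraper_graph = SmartScraperGraph(
    prompt="List all the articles on the page",
    source="https://example.com",
    config=graph_config,
)

print(smart_scraper_graph.run())
```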
diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py
index 52266b42..eeb2d0b4 100644
--- a/scrapegraphai/nodes/fetch_node.py
+++ b/scrapegraphai/nodes/fetch_node.py
@@ -8,7 +8,9 @@
from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader
from .base_node import BaseNode
-from ..utils.remover import remover
+from ..utils.cleanup_html import cleanup_html
+import requests
+from bs4 import BeautifulSoup
class FetchNode(BaseNode):
@@ -34,6 +36,7 @@ class FetchNode(BaseNode):
    def __init__(self, input: str, output: List[str], node_config: Optional[dict] = None, node_name: str = "Fetch"):
        super().__init__(node_name, "node", input, output, 1)
+
        self.headless = True if node_config is None else node_config.get(
            "headless", True)
+        self.useSoup = False if node_config is None else node_config.get(
+            "useSoup", False)
        self.verbose = False if node_config is None else node_config.get(
@@ -94,10 +97,22 @@ def execute(self, state):
            pass
        elif not source.startswith("http"):
-            compressed_document = [Document(page_content=remover(source), metadata={
+            compressed_document = [Document(page_content=cleanup_html(source), metadata={
                "source": "local_dir"
            })]
+        elif self.useSoup:
+            response = requests.get(source)
+            if response.status_code == 200:
+                soup = BeautifulSoup(response.text, 'html.parser')
+                # Collect every anchor's href so cleanup_html can list the links.
+                links = soup.find_all('a')
+                link_urls = []
+                for link in links:
+                    if 'href' in link.attrs:
+                        link_urls.append(link['href'])
+                compressed_document = [Document(page_content=cleanup_html(soup.prettify(), link_urls))]
+            else:
+                print(f"Failed to retrieve contents from the webpage at url: {source}")
        else:
            if self.node_config is not None and self.node_config.get("endpoint") is not None:
@@ -114,7 +129,7 @@ def execute(self, state):
            document = loader.load()
            compressed_document = [
-                Document(page_content=remover(str(document[0].page_content)))]
+                Document(page_content=cleanup_html(str(document[0].page_content)))]

        state.update({self.output[0]: compressed_document})
        return state
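Stripped of the node plumbing, the new `useSoup` path boils down to the standalone sketch below (the `fetch_with_soup` helper name is invented for illustration). Raising on a non-200 response is a deliberate deviation: in the node itself the failing branch only prints, leaves `compressed_document` unset, and would crash later at `state.update`.

```python
import requests
from bs4 import BeautifulSoup

from scrapegraphai.utils.cleanup_html import cleanup_html


def fetch_with_soup(source: str) -> str:
    """Fetch a page without a headless browser and compress it for the LLM."""
    response = requests.get(source)
    if response.status_code != 200:
        # Fail loudly instead of printing, so callers never see an
        # unbound result further down the pipeline.
        raise RuntimeError(
            f"Failed to retrieve contents from the webpage at url: {source}")

    soup = BeautifulSoup(response.text, "html.parser")
    # Gather every href so downstream nodes can follow the page's links.
    link_urls = [a["href"] for a in soup.find_all("a") if "href" in a.attrs]
    return cleanup_html(soup.prettify(), link_urls)
```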
diff --git a/scrapegraphai/utils/remover.py b/scrapegraphai/utils/cleanup_html.py
similarity index 78%
rename from scrapegraphai/utils/remover.py
rename to scrapegraphai/utils/cleanup_html.py
index 5e203249..aab1db65 100644
--- a/scrapegraphai/utils/remover.py
+++ b/scrapegraphai/utils/cleanup_html.py
@@ -5,7 +5,7 @@
from minify_html import minify
-def remover(html_content: str) -> str:
+def cleanup_html(html_content: str, urls: list = None) -> str:
"""
Processes HTML content by removing unnecessary tags, minifying the HTML, and extracting the title and body content.
@@ -17,7 +17,7 @@ def remover(html_content: str) -> str:
    Example:
        >>> html_content = "<html><head><title>Example</title></head><body><p>Hello World!</p></body></html>"
-        >>> remover(html_content)
+        >>> cleanup_html(html_content)
        'Title: Example, Body: <p>Hello World!</p>'

    This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized.
@@ -35,9 +35,12 @@ def remover(html_content: str) -> str:
    # Body Extraction (if it exists)
    body_content = soup.find('body')

+    urls_content = ""
+    if urls:
+        urls_content = f", URLs in page: {urls}"
+
    if body_content:
        # Minify the HTML within the body tag
        minimized_body = minify(str(body_content))
-        return "Title: " + title + ", Body: " + minimized_body
+        return "Title: " + title + ", Body: " + minimized_body + urls_content

-    return "Title: " + title + ", Body: No body content found"
+    return "Title: " + title + ", Body: No body content found" + urls_content