From 134c94e0ed203e82bf76d3871d9caf792572fc34 Mon Sep 17 00:00:00 2001 From: "EURAC\\marperini" Date: Wed, 17 Apr 2024 17:12:59 +0200 Subject: [PATCH 01/30] added first asdt implementation --- pyproject.toml | 1 - scrapegraphai/utils/asdt.py | 114 +++++++++++++++++++++++++++++++ scrapegraphai/utils/tree_base.py | 59 ++++++++++++++++ 3 files changed, 173 insertions(+), 1 deletion(-) create mode 100644 scrapegraphai/utils/asdt.py create mode 100644 scrapegraphai/utils/tree_base.py diff --git a/pyproject.toml b/pyproject.toml index 3c1e272d..0321cd7f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,6 @@ pandas = "2.0.3" python-dotenv = "1.0.1" tiktoken = {version = ">=0.5.2,<0.6.0"} tqdm = "4.66.1" -graphviz = "0.20.1" google = "3.0.0" minify-html = "0.15.0" diff --git a/scrapegraphai/utils/asdt.py b/scrapegraphai/utils/asdt.py new file mode 100644 index 00000000..515a2048 --- /dev/null +++ b/scrapegraphai/utils/asdt.py @@ -0,0 +1,114 @@ +from bs4 import BeautifulSoup, NavigableString +from graphviz import Digraph +from langchain_community.document_loaders import AsyncHtmlLoader + +def tag_structure(tag, include_scripts=True): + """ + Recursively get a tag's structure, including its attributes, children, and textual content. + :param tag: BeautifulSoup tag object + :param include_scripts: Include or exclude