From e1f045b2809fc7db0c252f4c6f2f9a435c66ba91 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Sat, 8 Jun 2024 11:44:09 +0200
Subject: [PATCH] feat: add new chunking function

---
 pyproject.toml                    |  3 ++-
 requirements-dev.lock             | 29 +++++------------------------
 requirements.lock                 | 12 +++---------
 requirements.txt                  |  1 +
 scrapegraphai/nodes/parse_node.py | 15 +++++----------
 5 files changed, 16 insertions(+), 44 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 70d28bfd..ebfafa8e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,6 +31,7 @@ dependencies = [
     "playwright==1.43.0",
     "google==3.0.0",
     "undetected-playwright==0.3.0",
+    "semchunk==1.0.1",
 ]
 
 license = "MIT"
@@ -80,4 +81,4 @@ dev-dependencies = [
     "pytest-mock==3.14.0",
     "-e file:.[burr]",
     "-e file:.[docs]",
-]
\ No newline at end of file
+]
diff --git a/requirements-dev.lock b/requirements-dev.lock
index a1e9a303..50b675e5 100644
--- a/requirements-dev.lock
+++ b/requirements-dev.lock
@@ -30,9 +30,6 @@ anyio==4.3.0
     # via openai
     # via starlette
     # via watchfiles
-async-timeout==4.0.3
-    # via aiohttp
-    # via langchain
 attrs==23.2.0
     # via aiohttp
     # via jsonschema
@@ -51,7 +48,6 @@ botocore==1.34.113
     # via boto3
     # via s3transfer
 burr==0.19.1
-    # via burr
     # via scrapegraphai
 cachetools==5.3.3
     # via google-auth
@@ -67,13 +63,6 @@ click==8.1.7
     # via streamlit
     # via typer
     # via uvicorn
-colorama==0.4.6
-    # via click
-    # via loguru
-    # via pytest
-    # via sphinx
-    # via tqdm
-    # via uvicorn
 contourpy==1.2.1
     # via matplotlib
 cycler==0.12.1
     # via matplotlib
@@ -93,9 +82,6 @@ docutils==0.19
     # via sphinx
 email-validator==2.1.1
     # via fastapi
-exceptiongroup==1.2.1
-    # via anyio
-    # via pytest
 faiss-cpu==1.8.0
     # via scrapegraphai
 fastapi==0.111.0
@@ -150,7 +136,6 @@ graphviz==0.20.3
     # via scrapegraphai
 greenlet==3.0.3
     # via playwright
-    # via sqlalchemy
 groq==0.8.0
     # via langchain-groq
 grpcio==1.64.0
@@ -388,6 +373,8 @@ rsa==4.9
     # via google-auth
 s3transfer==0.10.1
     # via boto3
+semchunk==1.0.1
+    # via scrapegraphai
 sf-hamilton==1.63.0
     # via burr
 shellingham==1.5.4
@@ -443,8 +430,6 @@ tokenizers==0.19.1
     # via anthropic
 toml==0.10.2
     # via streamlit
-tomli==2.0.1
-    # via pytest
 toolz==0.12.1
     # via altair
 tornado==6.4
@@ -454,12 +439,11 @@ tqdm==4.66.4
     # via huggingface-hub
     # via openai
     # via scrapegraphai
+    # via semchunk
 typer==0.12.3
     # via fastapi-cli
 typing-extensions==4.12.0
-    # via altair
     # via anthropic
-    # via anyio
     # via fastapi
     # via fastapi-pagination
     # via google-generativeai
@@ -474,7 +458,6 @@ typing-extensions==4.12.0
     # via streamlit
     # via typer
     # via typing-inspect
-    # via uvicorn
 typing-inspect==0.9.0
     # via dataclasses-json
     # via sf-hamilton
@@ -492,13 +475,11 @@ urllib3==1.26.18
 uvicorn==0.29.0
     # via burr
     # via fastapi
-watchdog==4.0.1
-    # via streamlit
+uvloop==0.19.0
+    # via uvicorn
 watchfiles==0.21.0
     # via uvicorn
 websockets==12.0
     # via uvicorn
-win32-setctime==1.1.0
-    # via loguru
 yarl==1.9.4
     # via aiohttp
diff --git a/requirements.lock b/requirements.lock
index 8a9dcdfd..1dc6ef4f 100644
--- a/requirements.lock
+++ b/requirements.lock
@@ -22,9 +22,6 @@ anyio==4.3.0
     # via groq
     # via httpx
     # via openai
-async-timeout==4.0.3
-    # via aiohttp
-    # via langchain
 attrs==23.2.0
     # via aiohttp
 beautifulsoup4==4.12.3
@@ -43,8 +40,6 @@ certifi==2024.2.2
     # via requests
 charset-normalizer==3.3.2
     # via requests
-colorama==0.4.6
-    # via tqdm
 dataclasses-json==0.6.6
     # via langchain
     # via langchain-community
@@ -54,8 +49,6 @@ distro==1.9.0
     # via anthropic
     # via groq
     # via openai
-exceptiongroup==1.2.1
-    # via anyio
 faiss-cpu==1.8.0
     # via scrapegraphai
 filelock==3.14.0
@@ -94,7 +87,6 @@ graphviz==0.20.3
     # via scrapegraphai
 greenlet==3.0.3
     # via playwright
-    # via sqlalchemy
 groq==0.8.0
     # via langchain-groq
 grpcio==1.64.0
@@ -246,6 +238,8 @@ rsa==4.9
     # via google-auth
 s3transfer==0.10.1
     # via boto3
+semchunk==1.0.1
+    # via scrapegraphai
 six==1.16.0
     # via python-dateutil
 sniffio==1.3.1
@@ -273,9 +267,9 @@ tqdm==4.66.4
     # via huggingface-hub
     # via openai
     # via scrapegraphai
+    # via semchunk
 typing-extensions==4.12.0
     # via anthropic
-    # via anyio
     # via google-generativeai
     # via groq
     # via huggingface-hub
diff --git a/requirements.txt b/requirements.txt
index 254f9f1a..a2b95acb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,3 +18,4 @@ playwright==1.43.0
 langchain-aws==0.1.2
 yahoo-search-py==0.3
 undetected-playwright==0.3.0
+semchunk==1.0.1
\ No newline at end of file
diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py
index 9c9a89b0..3e77b3e9 100644
--- a/scrapegraphai/nodes/parse_node.py
+++ b/scrapegraphai/nodes/parse_node.py
@@ -3,8 +3,7 @@
 """
 
 from typing import List, Optional
-
-from langchain.text_splitter import RecursiveCharacterTextSplitter
+from semchunk import chunk
 from langchain_community.document_transformers import Html2TextTransformer
 from ..utils.logging import get_logger
 from .base_node import BaseNode
@@ -67,20 +66,16 @@ def execute(self, state: dict) -> dict:
 
         # Fetching data from the state based on the input keys
         input_data = [state[key] for key in input_keys]
-
-        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
-            chunk_size=self.node_config.get("chunk_size", 4096),
-            chunk_overlap=0,
-        )
-
         # Parse the document
         docs_transformed = input_data[0]
         if self.parse_html:
             docs_transformed = Html2TextTransformer().transform_documents(input_data[0])
             docs_transformed = docs_transformed[0]
 
-        chunks = text_splitter.split_text(docs_transformed.page_content)
-
+        chunks = chunk(text=docs_transformed.page_content,
+                       chunk_size=self.node_config.get("chunk_size", 4096),
+                       token_counter=lambda x: len(x.split()),
+                       memoize=False)
         state.update({self.output[0]: chunks})
 
         return state
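Note on the new chunking path: the parse_node.py hunk replaces LangChain's
tiktoken-based RecursiveCharacterTextSplitter with semchunk's chunk() and
counts tokens as whitespace-separated words (len(x.split())), so chunk_size
now caps words per chunk rather than model tokens. Below is a minimal
standalone sketch of the same call, assuming semchunk==1.0.1 as pinned
above; the sample text, the small chunk_size, and the word_counter helper
are illustrative only and not part of the patch:

    # demo.py -- hypothetical usage sketch, not part of the commit
    from semchunk import chunk

    def word_counter(text: str) -> int:
        # Same heuristic as the lambda in ParseNode.execute:
        # count whitespace-separated words, not model tokens.
        return len(text.split())

    sample = ("ScrapeGraphAI converts the fetched HTML to text and splits "
              "it into chunks before prompting an LLM. ") * 40

    chunks = chunk(text=sample,
                   chunk_size=32,               # word budget per chunk (the patch defaults to 4096)
                   token_counter=word_counter,
                   memoize=False)               # matches the patch: no caching of token counts

    # semchunk splits recursively, so every chunk stays within the budget.
    assert all(word_counter(c) <= 32 for c in chunks)
    print(len(chunks), "chunks; largest has", max(map(word_counter, chunks)), "words")

One tuning consequence: with the unchanged default of 4096, a chunk measured
in words will generally be larger in model tokens than the old
tiktoken-measured chunks, so downstream prompts may grow unless
node_config["chunk_size"] is lowered accordingly.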