From 2ae19aee56bed32f5ae34715c29f464bfa329f8e Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Mon, 22 Jul 2024 16:50:58 +0200 Subject: [PATCH 1/5] update function --- pyproject.toml | 3 +- requirements-dev.lock | 585 -------------------- requirements-dev.txt | 4 - requirements.lock | 366 ------------ scrapegraphai/nodes/generate_answer_node.py | 68 +-- scrapegraphai/utils/__init__.py | 1 + scrapegraphai/utils/merge_results.py | 30 + 7 files changed, 70 insertions(+), 987 deletions(-) delete mode 100644 requirements-dev.lock delete mode 100644 requirements-dev.txt delete mode 100644 requirements.lock create mode 100644 scrapegraphai/utils/merge_results.py diff --git a/pyproject.toml b/pyproject.toml index e5b997ba..30725709 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,8 @@ dependencies = [ "undetected-playwright==0.3.0", "semchunk==1.0.1", "html2text==2024.2.26", - "langchain-fireworks==0.1.3" + "langchain-fireworks==0.1.3", + "langchain-community==0.2.9" ] license = "MIT" diff --git a/requirements-dev.lock b/requirements-dev.lock deleted file mode 100644 index b0bcaaa0..00000000 --- a/requirements-dev.lock +++ /dev/null @@ -1,585 +0,0 @@ -# generated by rye -# use `rye lock` or `rye sync` to update this lockfile -# -# last locked with the following flags: -# pre: false -# features: [] -# all-features: false -# with-sources: false - --e file:. -aiofiles==23.2.1 - # via burr -aiohttp==3.9.5 - # via langchain - # via langchain-community - # via langchain-fireworks -aiosignal==1.3.1 - # via aiohttp -alabaster==0.7.16 - # via sphinx -altair==5.3.0 - # via streamlit -annotated-types==0.7.0 - # via pydantic -anthropic==0.26.1 - # via langchain-anthropic -anyio==4.3.0 - # via anthropic - # via groq - # via httpx - # via openai - # via starlette - # via watchfiles -astroid==3.2.2 - # via pylint -async-timeout==4.0.3 - # via aiohttp - # via langchain -attrs==23.2.0 - # via aiohttp - # via jsonschema - # via referencing -babel==2.15.0 - # via sphinx -beautifulsoup4==4.12.3 - # via furo - # via google - # via scrapegraphai -blinker==1.8.2 - # via streamlit -boto3==1.34.113 - # via langchain-aws -botocore==1.34.113 - # via boto3 - # via s3transfer -burr==0.22.1 - # via scrapegraphai -cachetools==5.3.3 - # via google-auth - # via streamlit -certifi==2024.2.2 - # via httpcore - # via httpx - # via requests -charset-normalizer==3.3.2 - # via requests -click==8.1.7 - # via burr - # via streamlit - # via typer - # via uvicorn -contourpy==1.2.1 - # via matplotlib -cycler==0.12.1 - # via matplotlib -dataclasses-json==0.6.6 - # via langchain - # via langchain-community -defusedxml==0.7.1 - # via langchain-anthropic -dill==0.3.8 - # via pylint -distro==1.9.0 - # via anthropic - # via groq - # via openai -dnspython==2.6.1 - # via email-validator -docstring-parser==0.16 - # via google-cloud-aiplatform -docutils==0.19 - # via sphinx -email-validator==2.1.1 - # via fastapi -exceptiongroup==1.2.1 - # via anyio - # via pytest -faiss-cpu==1.8.0 - # via scrapegraphai -fastapi==0.111.0 - # via burr - # via fastapi-pagination -fastapi-cli==0.0.4 - # via fastapi -fastapi-pagination==0.12.24 - # via burr -filelock==3.14.0 - # via huggingface-hub -fireworks-ai==0.14.0 - # via langchain-fireworks -fonttools==4.52.1 - # via matplotlib -free-proxy==1.1.1 - # via scrapegraphai -frozenlist==1.4.1 - # via aiohttp - # via aiosignal -fsspec==2024.5.0 - # via huggingface-hub -furo==2024.5.6 - # via scrapegraphai -gitdb==4.0.11 - # via gitpython -gitpython==3.1.43 - # via streamlit -google==3.0.0 - # via scrapegraphai -google-ai-generativelanguage==0.6.4 - # via google-generativeai -google-api-core==2.19.0 - # via google-ai-generativelanguage - # via google-api-python-client - # via google-cloud-aiplatform - # via google-cloud-bigquery - # via google-cloud-core - # via google-cloud-resource-manager - # via google-cloud-storage - # via google-generativeai -google-api-python-client==2.130.0 - # via google-generativeai -google-auth==2.29.0 - # via google-ai-generativelanguage - # via google-api-core - # via google-api-python-client - # via google-auth-httplib2 - # via google-cloud-aiplatform - # via google-cloud-bigquery - # via google-cloud-core - # via google-cloud-resource-manager - # via google-cloud-storage - # via google-generativeai -google-auth-httplib2==0.2.0 - # via google-api-python-client -google-cloud-aiplatform==1.58.0 - # via langchain-google-vertexai -google-cloud-bigquery==3.25.0 - # via google-cloud-aiplatform -google-cloud-core==2.4.1 - # via google-cloud-bigquery - # via google-cloud-storage -google-cloud-resource-manager==1.12.3 - # via google-cloud-aiplatform -google-cloud-storage==2.17.0 - # via google-cloud-aiplatform - # via langchain-google-vertexai -google-crc32c==1.5.0 - # via google-cloud-storage - # via google-resumable-media -google-generativeai==0.5.4 - # via langchain-google-genai -google-resumable-media==2.7.1 - # via google-cloud-bigquery - # via google-cloud-storage -googleapis-common-protos==1.63.0 - # via google-api-core - # via grpc-google-iam-v1 - # via grpcio-status -graphviz==0.20.3 - # via burr - # via scrapegraphai -greenlet==3.0.3 - # via playwright -groq==0.8.0 - # via langchain-groq -grpc-google-iam-v1==0.13.1 - # via google-cloud-resource-manager -grpcio==1.64.0 - # via google-api-core - # via googleapis-common-protos - # via grpc-google-iam-v1 - # via grpcio-status -grpcio-status==1.62.2 - # via google-api-core -h11==0.14.0 - # via httpcore - # via uvicorn -html2text==2024.2.26 - # via scrapegraphai -httpcore==1.0.5 - # via httpx -httplib2==0.22.0 - # via google-api-python-client - # via google-auth-httplib2 -httptools==0.6.1 - # via uvicorn -httpx==0.27.0 - # via anthropic - # via fastapi - # via fireworks-ai - # via groq - # via openai -httpx-sse==0.4.0 - # via fireworks-ai -huggingface-hub==0.23.1 - # via tokenizers -idna==3.7 - # via anyio - # via email-validator - # via httpx - # via requests - # via yarl -imagesize==1.4.1 - # via sphinx -importlib-metadata==8.0.0 - # via sphinx -importlib-resources==6.4.0 - # via matplotlib -iniconfig==2.0.0 - # via pytest -isort==5.13.2 - # via pylint -jinja2==3.1.4 - # via altair - # via burr - # via fastapi - # via pydeck - # via sphinx -jiter==0.4.0 - # via anthropic -jmespath==1.0.1 - # via boto3 - # via botocore -jsonpatch==1.33 - # via langchain - # via langchain-core -jsonpointer==2.4 - # via jsonpatch -jsonschema==4.22.0 - # via altair -jsonschema-specifications==2023.12.1 - # via jsonschema -kiwisolver==1.4.5 - # via matplotlib -langchain==0.1.15 - # via scrapegraphai -langchain-anthropic==0.1.11 - # via scrapegraphai -langchain-aws==0.1.3 - # via scrapegraphai -langchain-community==0.0.38 - # via langchain -langchain-core==0.1.52 - # via langchain - # via langchain-anthropic - # via langchain-aws - # via langchain-community - # via langchain-fireworks - # via langchain-google-genai - # via langchain-google-vertexai - # via langchain-groq - # via langchain-openai - # via langchain-text-splitters -langchain-fireworks==0.1.3 - # via scrapegraphai -langchain-google-genai==1.0.3 - # via scrapegraphai -langchain-google-vertexai==1.0.4 - # via scrapegraphai -langchain-groq==0.1.3 - # via scrapegraphai -langchain-openai==0.1.6 - # via scrapegraphai -langchain-text-splitters==0.0.2 - # via langchain -langsmith==0.1.63 - # via langchain - # via langchain-community - # via langchain-core -loguru==0.7.2 - # via burr -lxml==5.2.2 - # via free-proxy -markdown-it-py==3.0.0 - # via rich -markupsafe==2.1.5 - # via jinja2 -marshmallow==3.21.2 - # via dataclasses-json -matplotlib==3.9.0 - # via burr -mccabe==0.7.0 - # via pylint -mdurl==0.1.2 - # via markdown-it-py -minify-html==0.15.0 - # via scrapegraphai -multidict==6.0.5 - # via aiohttp - # via yarl -mypy-extensions==1.0.0 - # via typing-inspect -numpy==1.26.4 - # via altair - # via contourpy - # via faiss-cpu - # via langchain - # via langchain-aws - # via langchain-community - # via matplotlib - # via pandas - # via pyarrow - # via pydeck - # via sf-hamilton - # via shapely - # via streamlit -openai==1.30.3 - # via burr - # via langchain-fireworks - # via langchain-openai -orjson==3.10.3 - # via fastapi - # via langsmith -packaging==23.2 - # via altair - # via google-cloud-aiplatform - # via google-cloud-bigquery - # via huggingface-hub - # via langchain-core - # via marshmallow - # via matplotlib - # via pytest - # via sphinx - # via streamlit -pandas==2.2.2 - # via altair - # via scrapegraphai - # via sf-hamilton - # via streamlit -pillow==10.3.0 - # via fireworks-ai - # via matplotlib - # via streamlit -platformdirs==4.2.2 - # via pylint -playwright==1.43.0 - # via scrapegraphai - # via undetected-playwright -pluggy==1.5.0 - # via pytest -proto-plus==1.23.0 - # via google-ai-generativelanguage - # via google-api-core - # via google-cloud-aiplatform - # via google-cloud-resource-manager -protobuf==4.25.3 - # via google-ai-generativelanguage - # via google-api-core - # via google-cloud-aiplatform - # via google-cloud-resource-manager - # via google-generativeai - # via googleapis-common-protos - # via grpc-google-iam-v1 - # via grpcio-status - # via proto-plus - # via streamlit -pyarrow==16.1.0 - # via streamlit -pyasn1==0.6.0 - # via pyasn1-modules - # via rsa -pyasn1-modules==0.4.0 - # via google-auth -pydantic==2.7.1 - # via anthropic - # via burr - # via fastapi - # via fastapi-pagination - # via fireworks-ai - # via google-cloud-aiplatform - # via google-generativeai - # via groq - # via langchain - # via langchain-core - # via langsmith - # via openai -pydantic-core==2.18.2 - # via pydantic -pydeck==0.9.1 - # via streamlit -pyee==11.1.0 - # via playwright -pygments==2.18.0 - # via furo - # via rich - # via sphinx -pylint==3.2.5 -pyparsing==3.1.2 - # via httplib2 - # via matplotlib -pytest==8.0.0 - # via pytest-mock -pytest-mock==3.14.0 -python-dateutil==2.9.0.post0 - # via botocore - # via google-cloud-bigquery - # via matplotlib - # via pandas -python-dotenv==1.0.1 - # via scrapegraphai - # via uvicorn -python-multipart==0.0.9 - # via fastapi -pytz==2024.1 - # via pandas -pyyaml==6.0.1 - # via huggingface-hub - # via langchain - # via langchain-community - # via langchain-core - # via uvicorn -referencing==0.35.1 - # via jsonschema - # via jsonschema-specifications -regex==2024.5.15 - # via tiktoken -requests==2.32.2 - # via burr - # via free-proxy - # via google-api-core - # via google-cloud-bigquery - # via google-cloud-storage - # via huggingface-hub - # via langchain - # via langchain-community - # via langchain-fireworks - # via langsmith - # via sphinx - # via streamlit - # via tiktoken -rich==13.7.1 - # via streamlit - # via typer -rpds-py==0.18.1 - # via jsonschema - # via referencing -rsa==4.9 - # via google-auth -s3transfer==0.10.1 - # via boto3 -semchunk==1.0.1 - # via scrapegraphai -sf-hamilton==1.63.0 - # via burr -shapely==2.0.4 - # via google-cloud-aiplatform -shellingham==1.5.4 - # via typer -six==1.16.0 - # via python-dateutil -smmap==5.0.1 - # via gitdb -sniffio==1.3.1 - # via anthropic - # via anyio - # via groq - # via httpx - # via openai -snowballstemmer==2.2.0 - # via sphinx -soupsieve==2.5 - # via beautifulsoup4 -sphinx==6.0.0 - # via furo - # via scrapegraphai - # via sphinx-basic-ng -sphinx-basic-ng==1.0.0b2 - # via furo -sphinxcontrib-applehelp==1.0.8 - # via sphinx -sphinxcontrib-devhelp==1.0.6 - # via sphinx -sphinxcontrib-htmlhelp==2.0.5 - # via sphinx -sphinxcontrib-jsmath==1.0.1 - # via sphinx -sphinxcontrib-qthelp==1.0.7 - # via sphinx -sphinxcontrib-serializinghtml==1.1.10 - # via sphinx -sqlalchemy==2.0.30 - # via langchain - # via langchain-community -starlette==0.37.2 - # via fastapi -streamlit==1.35.0 - # via burr -tenacity==8.3.0 - # via langchain - # via langchain-community - # via langchain-core - # via streamlit -tiktoken==0.7.0 - # via langchain-openai - # via scrapegraphai -tokenizers==0.19.1 - # via anthropic -toml==0.10.2 - # via streamlit -tomli==2.0.1 - # via pylint - # via pytest -tomlkit==0.12.5 - # via pylint -toolz==0.12.1 - # via altair -tornado==6.4 - # via streamlit -tqdm==4.66.4 - # via google-generativeai - # via huggingface-hub - # via openai - # via scrapegraphai - # via semchunk -typer==0.12.3 - # via fastapi-cli -typing-extensions==4.12.0 - # via altair - # via anthropic - # via anyio - # via astroid - # via fastapi - # via fastapi-pagination - # via google-generativeai - # via groq - # via huggingface-hub - # via openai - # via pydantic - # via pydantic-core - # via pyee - # via pylint - # via sf-hamilton - # via sqlalchemy - # via starlette - # via streamlit - # via typer - # via typing-inspect - # via uvicorn -typing-inspect==0.9.0 - # via dataclasses-json - # via sf-hamilton -tzdata==2024.1 - # via pandas -ujson==5.10.0 - # via fastapi -undetected-playwright==0.3.0 - # via scrapegraphai -uritemplate==4.1.1 - # via google-api-python-client -urllib3==1.26.18 - # via botocore - # via requests -uvicorn==0.29.0 - # via burr - # via fastapi -uvloop==0.19.0 - # via uvicorn -watchfiles==0.21.0 - # via uvicorn -websockets==12.0 - # via uvicorn -yarl==1.9.4 - # via aiohttp -zipp==3.19.2 - # via importlib-metadata - # via importlib-resources diff --git a/requirements-dev.txt b/requirements-dev.txt deleted file mode 100644 index d33296d5..00000000 --- a/requirements-dev.txt +++ /dev/null @@ -1,4 +0,0 @@ -sphinx==7.1.2 -furo==2024.5.6 -pytest==8.0.0 -burr[start]==0.22.1 \ No newline at end of file diff --git a/requirements.lock b/requirements.lock deleted file mode 100644 index 7a8bb455..00000000 --- a/requirements.lock +++ /dev/null @@ -1,366 +0,0 @@ -# generated by rye -# use `rye lock` or `rye sync` to update this lockfile -# -# last locked with the following flags: -# pre: false -# features: [] -# all-features: false -# with-sources: false - --e file:. -aiohttp==3.9.5 - # via langchain - # via langchain-community - # via langchain-fireworks -aiosignal==1.3.1 - # via aiohttp -annotated-types==0.7.0 - # via pydantic -anthropic==0.26.1 - # via langchain-anthropic -anyio==4.3.0 - # via anthropic - # via groq - # via httpx - # via openai -async-timeout==4.0.3 - # via aiohttp - # via langchain -attrs==23.2.0 - # via aiohttp -beautifulsoup4==4.12.3 - # via google - # via scrapegraphai -boto3==1.34.113 - # via langchain-aws -botocore==1.34.113 - # via boto3 - # via s3transfer -cachetools==5.3.3 - # via google-auth -certifi==2024.2.2 - # via httpcore - # via httpx - # via requests -charset-normalizer==3.3.2 - # via requests -dataclasses-json==0.6.6 - # via langchain - # via langchain-community -defusedxml==0.7.1 - # via langchain-anthropic -distro==1.9.0 - # via anthropic - # via groq - # via openai -docstring-parser==0.16 - # via google-cloud-aiplatform -exceptiongroup==1.2.1 - # via anyio -faiss-cpu==1.8.0 - # via scrapegraphai -filelock==3.14.0 - # via huggingface-hub -fireworks-ai==0.14.0 - # via langchain-fireworks -free-proxy==1.1.1 - # via scrapegraphai -frozenlist==1.4.1 - # via aiohttp - # via aiosignal -fsspec==2024.5.0 - # via huggingface-hub -google==3.0.0 - # via scrapegraphai -google-ai-generativelanguage==0.6.4 - # via google-generativeai -google-api-core==2.19.0 - # via google-ai-generativelanguage - # via google-api-python-client - # via google-cloud-aiplatform - # via google-cloud-bigquery - # via google-cloud-core - # via google-cloud-resource-manager - # via google-cloud-storage - # via google-generativeai -google-api-python-client==2.130.0 - # via google-generativeai -google-auth==2.29.0 - # via google-ai-generativelanguage - # via google-api-core - # via google-api-python-client - # via google-auth-httplib2 - # via google-cloud-aiplatform - # via google-cloud-bigquery - # via google-cloud-core - # via google-cloud-resource-manager - # via google-cloud-storage - # via google-generativeai -google-auth-httplib2==0.2.0 - # via google-api-python-client -google-cloud-aiplatform==1.58.0 - # via langchain-google-vertexai -google-cloud-bigquery==3.25.0 - # via google-cloud-aiplatform -google-cloud-core==2.4.1 - # via google-cloud-bigquery - # via google-cloud-storage -google-cloud-resource-manager==1.12.3 - # via google-cloud-aiplatform -google-cloud-storage==2.17.0 - # via google-cloud-aiplatform - # via langchain-google-vertexai -google-crc32c==1.5.0 - # via google-cloud-storage - # via google-resumable-media -google-generativeai==0.5.4 - # via langchain-google-genai -google-resumable-media==2.7.1 - # via google-cloud-bigquery - # via google-cloud-storage -googleapis-common-protos==1.63.0 - # via google-api-core - # via grpc-google-iam-v1 - # via grpcio-status -graphviz==0.20.3 - # via scrapegraphai -greenlet==3.0.3 - # via playwright -groq==0.8.0 - # via langchain-groq -grpc-google-iam-v1==0.13.1 - # via google-cloud-resource-manager -grpcio==1.64.0 - # via google-api-core - # via googleapis-common-protos - # via grpc-google-iam-v1 - # via grpcio-status -grpcio-status==1.62.2 - # via google-api-core -h11==0.14.0 - # via httpcore -html2text==2024.2.26 - # via scrapegraphai -httpcore==1.0.5 - # via httpx -httplib2==0.22.0 - # via google-api-python-client - # via google-auth-httplib2 -httpx==0.27.0 - # via anthropic - # via fireworks-ai - # via groq - # via openai -httpx-sse==0.4.0 - # via fireworks-ai -huggingface-hub==0.23.1 - # via tokenizers -idna==3.7 - # via anyio - # via httpx - # via requests - # via yarl -jiter==0.4.0 - # via anthropic -jmespath==1.0.1 - # via boto3 - # via botocore -jsonpatch==1.33 - # via langchain - # via langchain-core -jsonpointer==2.4 - # via jsonpatch -langchain==0.1.15 - # via scrapegraphai -langchain-anthropic==0.1.11 - # via scrapegraphai -langchain-aws==0.1.3 - # via scrapegraphai -langchain-community==0.0.38 - # via langchain -langchain-core==0.1.52 - # via langchain - # via langchain-anthropic - # via langchain-aws - # via langchain-community - # via langchain-fireworks - # via langchain-google-genai - # via langchain-google-vertexai - # via langchain-groq - # via langchain-openai - # via langchain-text-splitters -langchain-fireworks==0.1.3 - # via scrapegraphai -langchain-google-genai==1.0.3 - # via scrapegraphai -langchain-google-vertexai==1.0.4 - # via scrapegraphai -langchain-groq==0.1.3 - # via scrapegraphai -langchain-openai==0.1.6 - # via scrapegraphai -langchain-text-splitters==0.0.2 - # via langchain -langsmith==0.1.63 - # via langchain - # via langchain-community - # via langchain-core -lxml==5.2.2 - # via free-proxy -marshmallow==3.21.2 - # via dataclasses-json -minify-html==0.15.0 - # via scrapegraphai -multidict==6.0.5 - # via aiohttp - # via yarl -mypy-extensions==1.0.0 - # via typing-inspect -numpy==1.26.4 - # via faiss-cpu - # via langchain - # via langchain-aws - # via langchain-community - # via pandas - # via shapely -openai==1.30.3 - # via langchain-fireworks - # via langchain-openai -orjson==3.10.3 - # via langsmith -packaging==23.2 - # via google-cloud-aiplatform - # via google-cloud-bigquery - # via huggingface-hub - # via langchain-core - # via marshmallow -pandas==2.2.2 - # via scrapegraphai -pillow==10.3.0 - # via fireworks-ai -playwright==1.43.0 - # via scrapegraphai - # via undetected-playwright -proto-plus==1.23.0 - # via google-ai-generativelanguage - # via google-api-core - # via google-cloud-aiplatform - # via google-cloud-resource-manager -protobuf==4.25.3 - # via google-ai-generativelanguage - # via google-api-core - # via google-cloud-aiplatform - # via google-cloud-resource-manager - # via google-generativeai - # via googleapis-common-protos - # via grpc-google-iam-v1 - # via grpcio-status - # via proto-plus -pyasn1==0.6.0 - # via pyasn1-modules - # via rsa -pyasn1-modules==0.4.0 - # via google-auth -pydantic==2.7.1 - # via anthropic - # via fireworks-ai - # via google-cloud-aiplatform - # via google-generativeai - # via groq - # via langchain - # via langchain-core - # via langsmith - # via openai -pydantic-core==2.18.2 - # via pydantic -pyee==11.1.0 - # via playwright -pyparsing==3.1.2 - # via httplib2 -python-dateutil==2.9.0.post0 - # via botocore - # via google-cloud-bigquery - # via pandas -python-dotenv==1.0.1 - # via scrapegraphai -pytz==2024.1 - # via pandas -pyyaml==6.0.1 - # via huggingface-hub - # via langchain - # via langchain-community - # via langchain-core -regex==2024.5.15 - # via tiktoken -requests==2.32.2 - # via free-proxy - # via google-api-core - # via google-cloud-bigquery - # via google-cloud-storage - # via huggingface-hub - # via langchain - # via langchain-community - # via langchain-fireworks - # via langsmith - # via tiktoken -rsa==4.9 - # via google-auth -s3transfer==0.10.1 - # via boto3 -semchunk==1.0.1 - # via scrapegraphai -shapely==2.0.4 - # via google-cloud-aiplatform -six==1.16.0 - # via python-dateutil -sniffio==1.3.1 - # via anthropic - # via anyio - # via groq - # via httpx - # via openai -soupsieve==2.5 - # via beautifulsoup4 -sqlalchemy==2.0.30 - # via langchain - # via langchain-community -tenacity==8.3.0 - # via langchain - # via langchain-community - # via langchain-core -tiktoken==0.7.0 - # via langchain-openai - # via scrapegraphai -tokenizers==0.19.1 - # via anthropic -tqdm==4.66.4 - # via google-generativeai - # via huggingface-hub - # via openai - # via scrapegraphai - # via semchunk -typing-extensions==4.12.0 - # via anthropic - # via anyio - # via google-generativeai - # via groq - # via huggingface-hub - # via openai - # via pydantic - # via pydantic-core - # via pyee - # via sqlalchemy - # via typing-inspect -typing-inspect==0.9.0 - # via dataclasses-json -tzdata==2024.1 - # via pandas -undetected-playwright==0.3.0 - # via scrapegraphai -uritemplate==4.1.1 - # via google-api-python-client -urllib3==1.26.18 - # via botocore - # via requests -yarl==1.9.4 - # via aiohttp diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index f764e58b..eb440a75 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -7,6 +7,8 @@ from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel from tqdm import tqdm +import asyncio +from ..utils.merge_results import merge_results from ..utils.logging import get_logger from ..models import Ollama, OpenAI from .base_node import BaseNode @@ -109,42 +111,46 @@ def execute(self, state: dict) -> dict: chains_dict = {} + if len(doc) == 1: + prompt = PromptTemplate( + template=template_no_chunks_prompt, + input_variables=["question"], + partial_variables={"context": doc, + "format_instructions": format_instructions}) + chain = prompt | self.llm_model | output_parser + answer = chain.invoke({"question": user_prompt}) + + state.update({self.output[0]: answer}) + return state + # Use tqdm to add progress bar for i, chunk in enumerate(tqdm(doc, desc="Processing chunks", disable=not self.verbose)): - if len(doc) == 1: - prompt = PromptTemplate( - template=template_no_chunks_prompt, - input_variables=["question"], - partial_variables={"context": chunk, - "format_instructions": format_instructions}) - chain = prompt | self.llm_model | output_parser - answer = chain.invoke({"question": user_prompt}) - break prompt = PromptTemplate( - template=template_chunks_prompt, - input_variables=["question"], - partial_variables={"context": chunk, - "chunk_id": i + 1, - "format_instructions": format_instructions}) - # Dynamically name the chains based on their index + template=template_chunks, + input_variables=["question"], + partial_variables={"context": chunk, + "chunk_id": i + 1, + "format_instructions": format_instructions}) + # Add chain to dictionary with dynamic name chain_name = f"chunk{i+1}" chains_dict[chain_name] = prompt | self.llm_model | output_parser - if len(chains_dict) > 1: - # Use dictionary unpacking to pass the dynamically named chains to RunnableParallel - map_chain = RunnableParallel(**chains_dict) - # Chain - answer = map_chain.invoke({"question": user_prompt}) - # Merge the answers from the chunks - merge_prompt = PromptTemplate( - template = template_merge_prompt, - input_variables=["context", "question"], - partial_variables={"format_instructions": format_instructions}, - ) - merge_chain = merge_prompt | self.llm_model | output_parser - answer = merge_chain.invoke({"context": answer, "question": user_prompt}) - - # Update the state with the generated answer - state.update({self.output[0]: answer}) + + async def process_chains(): + async_runner = RunnableParallel() + for chain_name, chain in chains_dict.items(): + async_runner.add(chain.ainvoke([{"question": user_prompt}] * len(doc))) + + batch_results = await async_runner.run() + return batch_results + + loop = asyncio.get_event_loop() + batch_answers = loop.run_until_complete(process_chains()) + + # Merge batch results (assuming same structure) + merged_answer = merge_results(batch_answers) + answers = merged_answer + + state.update({self.output[0]: answers}) return state diff --git a/scrapegraphai/utils/__init__.py b/scrapegraphai/utils/__init__.py index 707d2b18..15fd6886 100644 --- a/scrapegraphai/utils/__init__.py +++ b/scrapegraphai/utils/__init__.py @@ -11,3 +11,4 @@ from .cleanup_html import cleanup_html from .logging import * from .convert_to_md import convert_to_md +from .merge_results import merge_results diff --git a/scrapegraphai/utils/merge_results.py b/scrapegraphai/utils/merge_results.py new file mode 100644 index 00000000..ff5728fa --- /dev/null +++ b/scrapegraphai/utils/merge_results.py @@ -0,0 +1,30 @@ +def merge_results(batch_answers): + """ + Merges the results from single-chunk processing and batch processing, and adds separators between the chunks. + Parameters: + ----------- + answers : list of str + A list of strings containing the results from single-chunk processing. + + batch_answers : list of dict + A list of dictionaries, where each dictionary contains a key "text" with the batch processing result as a string. + + Returns: + -------- + str + A single string containing all merged results, with each result separated by a newline character. + + Example: + -------- + >>> answers = ["Result from single-chunk 1", "Result from single-chunk 2"] + >>> batch_answers = [{"text": "Result from batch 1"}, {"text": "Result from batch 2"}] + >>> merge_results(answers, batch_answers) + 'Result from single-chunk 1\nResult from single-chunk 2\nResult from batch 1\nResult from batch 2' + """ + # Combine answers from single-chunk processing and batch processing + merged_answers = [answer["text"] for answer in batch_answers] + + # Add separators between chunks + merged_answers = "\n".join(merged_answers) + + return merged_answers \ No newline at end of file From 0c4b2908d98efbb2b0a6faf68618a801d726bb5f Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Mon, 22 Jul 2024 19:58:33 +0200 Subject: [PATCH 2/5] feat: add generate_answer node paralellization Co-Authored-By: Federico Minutoli <40361744+DiTo97@users.noreply.github.com> --- scrapegraphai/nodes/generate_answer_node.py | 26 +++++++++------------ 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index eb440a75..9cd5dce5 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -1,13 +1,12 @@ """ GenerateAnswerNode Module """ - +import asyncio from typing import List, Optional from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel from tqdm import tqdm -import asyncio from ..utils.merge_results import merge_results from ..utils.logging import get_logger from ..models import Ollama, OpenAI @@ -136,21 +135,18 @@ def execute(self, state: dict) -> dict: chain_name = f"chunk{i+1}" chains_dict[chain_name] = prompt | self.llm_model | output_parser + async_runner = RunnableParallel(**chains_dict) - async def process_chains(): - async_runner = RunnableParallel() - for chain_name, chain in chains_dict.items(): - async_runner.add(chain.ainvoke([{"question": user_prompt}] * len(doc))) - - batch_results = await async_runner.run() - return batch_results + batch_results = async_runner.invoke({"question": user_prompt}) - loop = asyncio.get_event_loop() - batch_answers = loop.run_until_complete(process_chains()) + merge_prompt = PromptTemplate( + template = template_merge_prompt, + input_variables=["context", "question"], + partial_variables={"format_instructions": format_instructions}, + ) - # Merge batch results (assuming same structure) - merged_answer = merge_results(batch_answers) - answers = merged_answer + merge_chain = merge_prompt | self.llm_model | output_parser + answer = merge_chain.invoke({"context": batch_results, "question": user_prompt}) - state.update({self.output[0]: answers}) + state.update({self.output[0]: answer}) return state From cf2734392cda6ef6eda50517671d44d4b06e26c7 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Tue, 23 Jul 2024 13:05:50 +0200 Subject: [PATCH 3/5] removed unused function --- scrapegraphai/nodes/generate_answer_node.py | 1 - scrapegraphai/utils/__init__.py | 1 - scrapegraphai/utils/merge_results.py | 30 --------------------- 3 files changed, 32 deletions(-) delete mode 100644 scrapegraphai/utils/merge_results.py diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index 9cd5dce5..d864124e 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -7,7 +7,6 @@ from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel from tqdm import tqdm -from ..utils.merge_results import merge_results from ..utils.logging import get_logger from ..models import Ollama, OpenAI from .base_node import BaseNode diff --git a/scrapegraphai/utils/__init__.py b/scrapegraphai/utils/__init__.py index 15fd6886..707d2b18 100644 --- a/scrapegraphai/utils/__init__.py +++ b/scrapegraphai/utils/__init__.py @@ -11,4 +11,3 @@ from .cleanup_html import cleanup_html from .logging import * from .convert_to_md import convert_to_md -from .merge_results import merge_results diff --git a/scrapegraphai/utils/merge_results.py b/scrapegraphai/utils/merge_results.py deleted file mode 100644 index ff5728fa..00000000 --- a/scrapegraphai/utils/merge_results.py +++ /dev/null @@ -1,30 +0,0 @@ -def merge_results(batch_answers): - """ - Merges the results from single-chunk processing and batch processing, and adds separators between the chunks. - Parameters: - ----------- - answers : list of str - A list of strings containing the results from single-chunk processing. - - batch_answers : list of dict - A list of dictionaries, where each dictionary contains a key "text" with the batch processing result as a string. - - Returns: - -------- - str - A single string containing all merged results, with each result separated by a newline character. - - Example: - -------- - >>> answers = ["Result from single-chunk 1", "Result from single-chunk 2"] - >>> batch_answers = [{"text": "Result from batch 1"}, {"text": "Result from batch 2"}] - >>> merge_results(answers, batch_answers) - 'Result from single-chunk 1\nResult from single-chunk 2\nResult from batch 1\nResult from batch 2' - """ - # Combine answers from single-chunk processing and batch processing - merged_answers = [answer["text"] for answer in batch_answers] - - # Add separators between chunks - merged_answers = "\n".join(merged_answers) - - return merged_answers \ No newline at end of file From df1ecc00192a48abc6bbbe16444507c4bdf6362c Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Tue, 23 Jul 2024 13:06:59 +0200 Subject: [PATCH 4/5] Update generate_answer_node.py --- scrapegraphai/nodes/generate_answer_node.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index d864124e..81812598 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -107,8 +107,6 @@ def execute(self, state: dict) -> dict: template_chunks_prompt = self.additional_info + template_chunks_prompt template_merge_prompt = self.additional_info + template_merge_prompt - chains_dict = {} - if len(doc) == 1: prompt = PromptTemplate( template=template_no_chunks_prompt, @@ -121,7 +119,7 @@ def execute(self, state: dict) -> dict: state.update({self.output[0]: answer}) return state - # Use tqdm to add progress bar + chains_dict = {} for i, chunk in enumerate(tqdm(doc, desc="Processing chunks", disable=not self.verbose)): prompt = PromptTemplate( From 2edad66788cbd92f197e3b37db13c44bfa39e36a Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Tue, 23 Jul 2024 20:51:49 +0200 Subject: [PATCH 5/5] chore: rebuild requirements --- requirements.txt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 124840e5..440bf78a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,5 @@ langchain>=0.2.10 -langchain_community>=0.2.9 langchain-google-genai>=1.0.7 -langchain-fireworks>=0.1.3 langchain-google-vertexai langchain-openai>=0.1.17 langchain-groq>=0.1.3 @@ -22,4 +20,5 @@ playwright>=1.43.0 google>=3.0.0 undetected-playwright>=0.3.0 semchunk>=1.0.1 - +langchain-fireworks>=0.1.3 +langchain-community>=0.2.9