From 030b8601a34878fbc5aeaac807ecef23e93a9e18 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Sat, 29 Jun 2024 20:26:25 +0200 Subject: [PATCH 01/10] Update generate_answer_node.py --- scrapegraphai/nodes/generate_answer_node.py | 81 ++++++++++++--------- 1 file changed, 47 insertions(+), 34 deletions(-) diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index 476421f0..724e387c 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -1,16 +1,28 @@ """ GenerateAnswerNode Module """ - +import asyncio from typing import List, Optional from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser -from langchain_core.runnables import RunnableParallel +from langchain_core.runnables import AsyncRunnable from tqdm import tqdm from ..utils.logging import get_logger from ..models import Ollama, OpenAI from .base_node import BaseNode -from ..helpers import template_chunks, template_no_chunks, template_merge, template_chunks_md, template_no_chunks_md, template_merge_md +from ..helpers import ( +template_chunks, template_no_chunks, template_merge, +template_chunks_md, template_no_chunks_md, template_merge_md +) + +def merge_results(answers, batch_answers): + # Combine answers from single-chunk processing and batch processing + merged_answers = answers + [answer["text"] for answer in batch_answers] + + # Add separators between chunks + merged_answers = "\n".join(merged_answers) + + return merged_answers class GenerateAnswerNode(BaseNode): """ @@ -38,12 +50,9 @@ def __init__( node_name: str = "GenerateAnswer", ): super().__init__(node_name, "node", input, output, 2, node_config) - + self.llm_model = node_config["llm_model"] - if isinstance(node_config["llm_model"], Ollama): - self.llm_model.format="json" - self.verbose = ( True if node_config is None else node_config.get("verbose", False) ) @@ -89,7 +98,7 @@ def execute(self, state: dict) -> dict: format_instructions = output_parser.get_format_instructions() - if isinstance(self.llm_model, OpenAI) and not self.script_creator or self.force and not self.script_creator: + if isinstance(self.llm_model, OpenAI) and not self.script_creator or self.force and not self.script_creator: template_no_chunks_prompt = template_no_chunks_md template_chunks_prompt = template_chunks_md template_merge_prompt = template_merge_md @@ -99,44 +108,48 @@ def execute(self, state: dict) -> dict: template_merge_prompt = template_merge chains_dict = {} + answers = [] # Use tqdm to add progress bar for i, chunk in enumerate(tqdm(doc, desc="Processing chunks", disable=not self.verbose)): if len(doc) == 1: + # No batching needed for single chunk prompt = PromptTemplate( - template=template_no_chunks_prompt, + template=template_no_chunks, input_variables=["question"], partial_variables={"context": chunk.page_content, - "format_instructions": format_instructions}) - chain = prompt | self.llm_model | output_parser + "format_instructions": format_instructions}) + chain = prompt | self.llm_model | output_parser answer = chain.invoke({"question": user_prompt}) - + else: + # Prepare prompt with chunk information prompt = PromptTemplate( - template=template_chunks_prompt, + template=template_chunks, input_variables=["question"], partial_variables={"context": chunk.page_content, - "chunk_id": i + 1, - "format_instructions": format_instructions}) - - # Dynamically name the chains based on their index - chain_name = f"chunk{i+1}" - 
chains_dict[chain_name] = prompt | self.llm_model | output_parser + "chunk_id": i + 1, + "format_instructions": format_instructions}) + # Add chain to dictionary with dynamic name + chain_name = f"chunk{i+1}" + chains_dict[chain_name] = prompt | self.llm_model | output_parser + # Batch process chains if there are multiple chunks if len(chains_dict) > 1: - # Use dictionary unpacking to pass the dynamically named chains to RunnableParallel - map_chain = RunnableParallel(**chains_dict) - # Chain - answer = map_chain.invoke({"question": user_prompt}) - # Merge the answers from the chunks - merge_prompt = PromptTemplate( - template = template_merge_prompt, - input_variables=["context", "question"], - partial_variables={"format_instructions": format_instructions}, - ) - merge_chain = merge_prompt | self.llm_model | output_parser - answer = merge_chain.invoke({"context": answer, "question": user_prompt}) - - # Update the state with the generated answer - state.update({self.output[0]: answer}) + async def process_chains(): + async_runner = AsyncRunnable() + for chain_name, chain in chains_dict.items(): + async_runner.add(chain.abatch([{"question": user_prompt}] * len(doc))) + batch_results = await async_runner.run() + return batch_results + + # Run asynchronous batch processing and get results + loop = asyncio.get_event_loop() + batch_answers = loop.run_until_complete(process_chains()) + + # Merge batch results (assuming same structure) + merged_answer = merge_results(answers, batch_answers) + answers = merged_answer + + state.update({self.output[0]: answers}) return state From 0d6d43fd86a8bfa91579f43bb12b5708993a2426 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Sun, 30 Jun 2024 13:27:01 +0200 Subject: [PATCH 02/10] refactoring of merge_results function --- scrapegraphai/nodes/generate_answer_node.py | 10 +------ scrapegraphai/utils/__init__.py | 1 + scrapegraphai/utils/merge_results.py | 31 +++++++++++++++++++++ 3 files changed, 33 insertions(+), 9 deletions(-) create mode 100644 scrapegraphai/utils/merge_results.py diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index 724e387c..8c4e8855 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -7,6 +7,7 @@ from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import AsyncRunnable from tqdm import tqdm +from ..utils.merge_results import merge_results from ..utils.logging import get_logger from ..models import Ollama, OpenAI from .base_node import BaseNode @@ -15,15 +16,6 @@ template_chunks_md, template_no_chunks_md, template_merge_md ) -def merge_results(answers, batch_answers): - # Combine answers from single-chunk processing and batch processing - merged_answers = answers + [answer["text"] for answer in batch_answers] - - # Add separators between chunks - merged_answers = "\n".join(merged_answers) - - return merged_answers - class GenerateAnswerNode(BaseNode): """ A node that generates an answer using a large language model (LLM) based on the user's input diff --git a/scrapegraphai/utils/__init__.py b/scrapegraphai/utils/__init__.py index 707d2b18..c76813e2 100644 --- a/scrapegraphai/utils/__init__.py +++ b/scrapegraphai/utils/__init__.py @@ -11,3 +11,4 @@ from .cleanup_html import cleanup_html from .logging import * from .convert_to_md import convert_to_md +from .merge_results import merge_results \ No newline at end of file diff --git a/scrapegraphai/utils/merge_results.py
b/scrapegraphai/utils/merge_results.py new file mode 100644 index 00000000..7f09ebda --- /dev/null +++ b/scrapegraphai/utils/merge_results.py @@ -0,0 +1,31 @@ +def merge_results(answers, batch_answers): + """ + Merges the results from single-chunk processing and batch processing, and adds separators between the chunks. + + Parameters: + ----------- + answers : list of str + A list of strings containing the results from single-chunk processing. + + batch_answers : list of dict + A list of dictionaries, where each dictionary contains a key "text" with the batch processing result as a string. + + Returns: + -------- + str + A single string containing all merged results, with each result separated by a newline character. + + Example: + -------- + >>> answers = ["Result from single-chunk 1", "Result from single-chunk 2"] + >>> batch_answers = [{"text": "Result from batch 1"}, {"text": "Result from batch 2"}] + >>> merge_results(answers, batch_answers) + 'Result from single-chunk 1\nResult from single-chunk 2\nResult from batch 1\nResult from batch 2' + """ + # Combine answers from single-chunk processing and batch processing + merged_answers = answers + [answer["text"] for answer in batch_answers] + + # Add separators between chunks + merged_answers = "\n".join(merged_answers) + + return merged_answers From 3a46e726811fb0a076fd285b859b34777de632e0 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Sat, 20 Jul 2024 18:22:26 +0000 Subject: [PATCH 03/10] ci(release): 1.10.0 [skip ci] ## [1.10.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.9.2...v1.10.0) (2024-07-20) ### Features * add gpt4o omni ([431edb7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/431edb7bb2504f4c1335c3ae3ce2f91867fa7222)) * add searchngx integration ([5c92186](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/5c9218608140bf694fbfd96aa90276bc438bb475)) * refactoring_to_md function ([602dd00](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/602dd00209ee1d72a1223fc4793759450921fcf9)) ### Bug Fixes * add gpt o mini for azure ([77777c8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/77777c898d1fad40f340b06c5b36d35b65409ea6)) * parse_node ([07f1e23](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/07f1e23d235db1a0db2cb155f10b73b0bf882269)) * search link node ([cf3ab55](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/cf3ab5564ae5c415c63d1771b32ea68f5169ca82)) ### chore * correct search engine name ([7ba2f6a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7ba2f6ae0b9d2e9336e973e1f57ab8355c739e27)) * remove unused import ([fd1b7cb](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/fd1b7cb24a7c252277607abde35826e3c58e34ef)) * **ci:** upgrade lockfiles ([c7b05a4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/c7b05a4993df14d6ed4848121a3cd209571232f7)) * upgrade tiktoken ([7314bc3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7314bc383068db590662bf7e512f799529308991)) ### Docs * **gpt-4o-mini:** added new gpt, fixed chromium lazy loading, ([99dc849](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/99dc8497d85289759286a973e4aecc3f924d3ada)) ### CI * **release:** 1.10.0-beta.1 [skip ci] ([8f619de](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/8f619de23540216934b53bcf3426702e56c48f31)) * **release:** 1.10.0-beta.2 [skip ci] ([aa7d4f0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/aa7d4f0ebfc2623a51ce1e4887ff26c9906b0a95)) * **release:** 1.10.0-beta.3 [skip ci] 
([bf0a2f3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bf0a2f386f38cbe81d1e5ea3e05357f8ecabcab2)) * **release:** 1.10.0-beta.4 [skip ci] ([a91807a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/a91807a20cc07b15feb1ddd5cf7a1c323ff32b46)) * **release:** 1.10.0-beta.5 [skip ci] ([0d5f925](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/0d5f9259d8fb148de7c95cf6f67f9562c5d2c880)) * **release:** 1.9.0-beta.3 [skip ci] ([d3e63d9](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/d3e63d91be79f74e8a3fdb00e692d546c24cead5)) * **release:** 1.9.0-beta.4 [skip ci] ([2fa04b5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/2fa04b58159abf7af890ebc0768fe23d51bf177f)) * **release:** 1.9.0-beta.5 [skip ci] ([bb62439](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bb624399cfc3924825892dd48697fc298ad3b002)) * **release:** 1.9.0-beta.6 [skip ci] ([54a69de](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/54a69de69e8077e02fd5584783ca62cc2e0ec5bb)) --- CHANGELOG.md | 42 ++++++++++++++++++++++++++++++++++++++++++ pyproject.toml | 2 +- 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 01fb0c3a..b79488b7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,45 @@ +## [1.10.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.9.2...v1.10.0) (2024-07-20) + + +### Features + +* add gpt4o omni ([431edb7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/431edb7bb2504f4c1335c3ae3ce2f91867fa7222)) +* add searchngx integration ([5c92186](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/5c9218608140bf694fbfd96aa90276bc438bb475)) +* refactoring_to_md function ([602dd00](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/602dd00209ee1d72a1223fc4793759450921fcf9)) + + +### Bug Fixes + +* add gpt o mini for azure ([77777c8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/77777c898d1fad40f340b06c5b36d35b65409ea6)) +* parse_node ([07f1e23](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/07f1e23d235db1a0db2cb155f10b73b0bf882269)) +* search link node ([cf3ab55](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/cf3ab5564ae5c415c63d1771b32ea68f5169ca82)) + + +### chore + +* correct search engine name ([7ba2f6a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7ba2f6ae0b9d2e9336e973e1f57ab8355c739e27)) +* remove unused import ([fd1b7cb](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/fd1b7cb24a7c252277607abde35826e3c58e34ef)) +* **ci:** upgrade lockfiles ([c7b05a4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/c7b05a4993df14d6ed4848121a3cd209571232f7)) +* upgrade tiktoken ([7314bc3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7314bc383068db590662bf7e512f799529308991)) + + +### Docs + +* **gpt-4o-mini:** added new gpt, fixed chromium lazy loading, ([99dc849](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/99dc8497d85289759286a973e4aecc3f924d3ada)) + + +### CI + +* **release:** 1.10.0-beta.1 [skip ci] ([8f619de](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/8f619de23540216934b53bcf3426702e56c48f31)) +* **release:** 1.10.0-beta.2 [skip ci] ([aa7d4f0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/aa7d4f0ebfc2623a51ce1e4887ff26c9906b0a95)) +* **release:** 1.10.0-beta.3 [skip ci] ([bf0a2f3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bf0a2f386f38cbe81d1e5ea3e05357f8ecabcab2)) +* **release:** 1.10.0-beta.4 [skip ci] ([a91807a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/a91807a20cc07b15feb1ddd5cf7a1c323ff32b46)) +* 
**release:** 1.10.0-beta.5 [skip ci] ([0d5f925](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/0d5f9259d8fb148de7c95cf6f67f9562c5d2c880)) * **release:** 1.9.0-beta.3 [skip ci] ([d3e63d9](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/d3e63d91be79f74e8a3fdb00e692d546c24cead5)) * **release:** 1.9.0-beta.4 [skip ci] ([2fa04b5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/2fa04b58159abf7af890ebc0768fe23d51bf177f)) * **release:** 1.9.0-beta.5 [skip ci] ([bb62439](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bb624399cfc3924825892dd48697fc298ad3b002)) * **release:** 1.9.0-beta.6 [skip ci] ([54a69de](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/54a69de69e8077e02fd5584783ca62cc2e0ec5bb)) ## [1.10.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.10.0-beta.4...v1.10.0-beta.5) (2024-07-20) diff --git a/pyproject.toml b/pyproject.toml index c42bf33b..daa040c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "scrapegraphai" -version = "1.10.0b5" +version = "1.10.0" From 208ab267ceda30b4527222d9dfd61e5c5ed243c3 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Sat, 20 Jul 2024 21:05:31 +0200 Subject: [PATCH 04/10] Create requirements.txt --- requirements.txt | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..502e9df5 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,25 @@ +beautifulsoup4==4.12.3 +burr==0.24.0 +free_proxy==1.1.1 +google==3.0.0 +graphviz==0.20.3 +html2text==2024.2.26 +langchain==0.2.10 +langchain_anthropic==0.1.20 +langchain_aws==0.1.11 +langchain_community==0.2.9 +langchain_core==0.2.22 +langchain_fireworks==0.1.5 +langchain_google_genai==1.0.7 +langchain_google_vertexai==1.0.6 +langchain_groq==0.1.6 +langchain_openai==0.1.17 +minify_html==0.15.0 +pandas==2.2.2 +playwright==1.43.0 +pydantic==2.8.2 +Requests==2.32.3 +semchunk==2.2.0 +tiktoken==0.7.0 +tqdm==4.66.4 +undetected_playwright==0.3.0 From ce6be37fbc1095afe4df6a2fc206923e477190e5 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Sun, 21 Jul 2024 18:37:15 +0200 Subject: [PATCH 05/10] fix: abstract_graph model token bug --- scrapegraphai/graphs/abstract_graph.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index 7f8ec4ea..ca2b2e81 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -148,6 +148,10 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: # If model instance is passed directly instead of the model details if "model_instance" in llm_params: + try: + self.model_token = llm_params["model_tokens"] + except KeyError as exc: + raise KeyError("model_tokens not specified") from exc return llm_params["model_instance"] # Instantiate the language model based on the model name From 03f528a34e385f16fed2070ddf64717de0036fb2 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Sun, 21 Jul 2024 16:38:44 +0000 Subject: [PATCH 06/10] ci(release): 1.10.1 [skip ci] ## [1.10.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.10.0...v1.10.1) (2024-07-21) ### Bug Fixes * abstract_graph model token bug ([ce6be37](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ce6be37fbc1095afe4df6a2fc206923e477190e5)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index
b79488b7..a942a57d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.10.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.10.0...v1.10.1) (2024-07-21) + + +### Bug Fixes + +* abstract_graph model token bug ([ce6be37](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ce6be37fbc1095afe4df6a2fc206923e477190e5)) + ## [1.10.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.9.2...v1.10.0) (2024-07-20) diff --git a/pyproject.toml b/pyproject.toml index daa040c5..60fe1ef9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "scrapegraphai" -version = "1.10.0" +version = "1.10.1" From b49f9866fd86f0fc873227a4f7eecd8248c7b80f Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Sun, 21 Jul 2024 19:20:19 +0200 Subject: [PATCH 07/10] fixed telemetry version --- scrapegraphai/telemetry/telemetry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapegraphai/telemetry/telemetry.py b/scrapegraphai/telemetry/telemetry.py index d1c8a367..3bf6d90f 100644 --- a/scrapegraphai/telemetry/telemetry.py +++ b/scrapegraphai/telemetry/telemetry.py @@ -91,7 +91,7 @@ def _check_config_and_environ_for_telemetry_flag( "python_version": f"{platform.python_version()}/{platform.python_implementation()}", "distinct_id": g_anonymous_id, "scrapegraphai_version": VERSION, - "telemetry_version": "0.0.1", + "telemetry_version": "0.0.2", } From b0418b679cf45e1e680d2daadcc47e6e4f585575 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Sun, 21 Jul 2024 19:23:40 +0200 Subject: [PATCH 08/10] fix: telemetry version From da451e5f1618e70eb1bd48a4ff3ad65ba31b0a84 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Sun, 21 Jul 2024 17:25:05 +0000 Subject: [PATCH 09/10] ci(release): 1.10.2 [skip ci] ## [1.10.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.10.1...v1.10.2) (2024-07-21) ### Bug Fixes * telemetry version ([b0418b6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b0418b679cf45e1e680d2daadcc47e6e4f585575)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a942a57d..0b35e2c2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.10.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.10.1...v1.10.2) (2024-07-21) + + +### Bug Fixes + +* telemetry version ([b0418b6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b0418b679cf45e1e680d2daadcc47e6e4f585575)) + ## [1.10.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.10.0...v1.10.1) (2024-07-21) diff --git a/pyproject.toml b/pyproject.toml index 60fe1ef9..02c8d0bc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "scrapegraphai" -version = "1.10.1" +version = "1.10.2" From f9e7cc427a1c48879647791facbaa821aa247053 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Mon, 22 Jul 2024 16:00:17 +0200 Subject: [PATCH 10/10] fixed bugs --- examples/local_models/smart_scraper_ollama.py | 2 +- pyproject.toml | 1 + requirements-dev.lock | 138 ++++++++++-------- requirements.lock | 102 ++++++++----- scrapegraphai/nodes/generate_answer_node.py | 37 +++-- scrapegraphai/nodes/parse_node.py | 2 +- 6 files changed, 165 insertions(+), 117 deletions(-) diff --git a/examples/local_models/smart_scraper_ollama.py b/examples/local_models/smart_scraper_ollama.py index 0b3fcbfc..5b415873 100644 --- a/examples/local_models/smart_scraper_ollama.py +++ b/examples/local_models/smart_scraper_ollama.py @@ -28,7 +28,7 @@ #
************************************************ smart_scraper_graph = SmartScraperGraph( - prompt="List me all the titles", + prompt="List me all the projects with their descriptions", source="https://perinim.github.io/projects", config=graph_config ) diff --git a/pyproject.toml b/pyproject.toml index 62ac7225..7972103f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ dependencies = [ "semchunk>=1.0.1", "html2text>=2024.2.26", "langchain-fireworks>=0.1.3", + "langchain-community==0.2.9" ] license = "MIT" diff --git a/requirements-dev.lock b/requirements-dev.lock index 9f298273..0f521ea9 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -8,10 +8,11 @@ # with-sources: false -e file:. -aiofiles==23.2.1 +aiofiles==24.1.0 # via burr aiohttp==3.9.5 # via langchain + # via langchain-community # via langchain-fireworks aiosignal==1.3.1 # via aiohttp @@ -23,14 +24,14 @@ annotated-types==0.7.0 # via pydantic anthropic==0.31.2 # via langchain-anthropic -anyio==4.3.0 +anyio==4.4.0 # via anthropic # via groq # via httpx # via openai # via starlette # via watchfiles -astroid==3.2.2 +astroid==3.2.4 # via pylint async-timeout==4.0.3 # via aiohttp @@ -54,10 +55,10 @@ botocore==1.34.145 # via s3transfer burr==0.22.1 # via scrapegraphai -cachetools==5.3.3 +cachetools==5.4.0 # via google-auth # via streamlit -certifi==2024.2.2 +certifi==2024.7.4 # via httpcore # via httpx # via requests @@ -72,9 +73,12 @@ contourpy==1.2.1 # via matplotlib cycler==0.12.1 # via matplotlib +dataclasses-json==0.6.7 + # via langchain-community defusedxml==0.7.1 # via langchain-anthropic dill==0.3.8 + # via multiprocess # via pylint distro==1.9.0 # via anthropic @@ -86,32 +90,31 @@ docstring-parser==0.16 # via google-cloud-aiplatform docutils==0.19 # via sphinx -email-validator==2.1.1 +email-validator==2.2.0 # via fastapi -exceptiongroup==1.2.1 +exceptiongroup==1.2.2 # via anyio # via pytest -faiss-cpu==1.8.0 +faiss-cpu==1.8.0.post1 # via scrapegraphai -fastapi==0.111.0 +fastapi==0.111.1 # via burr - # via fastapi-pagination fastapi-cli==0.0.4 # via fastapi -fastapi-pagination==0.12.24 +fastapi-pagination==0.12.26 # via burr -filelock==3.14.0 +filelock==3.15.4 # via huggingface-hub fireworks-ai==0.14.0 # via langchain-fireworks -fonttools==4.52.1 +fonttools==4.53.1 # via matplotlib free-proxy==1.1.1 # via scrapegraphai frozenlist==1.4.1 # via aiohttp # via aiosignal -fsspec==2024.5.0 +fsspec==2024.6.1 # via huggingface-hub furo==2024.5.6 # via scrapegraphai @@ -123,7 +126,7 @@ google==3.0.0 # via scrapegraphai google-ai-generativelanguage==0.6.6 # via google-generativeai -google-api-core==2.19.0 +google-api-core==2.19.1 # via google-ai-generativelanguage # via google-api-python-client # via google-cloud-aiplatform @@ -132,9 +135,9 @@ google-api-core==2.19.0 # via google-cloud-resource-manager # via google-cloud-storage # via google-generativeai -google-api-python-client==2.130.0 +google-api-python-client==2.137.0 # via google-generativeai -google-auth==2.29.0 +google-auth==2.32.0 # via google-ai-generativelanguage # via google-api-core # via google-api-python-client @@ -147,14 +150,14 @@ google-auth==2.29.0 # via google-generativeai google-auth-httplib2==0.2.0 # via google-api-python-client -google-cloud-aiplatform==1.58.0 +google-cloud-aiplatform==1.59.0 # via langchain-google-vertexai google-cloud-bigquery==3.25.0 # via google-cloud-aiplatform google-cloud-core==2.4.1 # via google-cloud-bigquery # via google-cloud-storage -google-cloud-resource-manager==1.12.3 
+google-cloud-resource-manager==1.12.4 # via google-cloud-aiplatform google-cloud-storage==2.17.0 # via google-cloud-aiplatform @@ -167,7 +170,7 @@ google-generativeai==0.7.2 google-resumable-media==2.7.1 # via google-cloud-bigquery # via google-cloud-storage -googleapis-common-protos==1.63.0 +googleapis-common-protos==1.63.2 # via google-api-core # via grpc-google-iam-v1 # via grpcio-status @@ -176,12 +179,11 @@ graphviz==0.20.3 # via scrapegraphai greenlet==3.0.3 # via playwright - # via sqlalchemy -groq==0.8.0 +groq==0.9.0 # via langchain-groq grpc-google-iam-v1==0.13.1 # via google-cloud-resource-manager -grpcio==1.64.0 +grpcio==1.65.1 # via google-api-core # via googleapis-common-protos # via grpc-google-iam-v1 @@ -208,7 +210,7 @@ httpx==0.27.0 # via openai httpx-sse==0.4.0 # via fireworks-ai -huggingface-hub==0.23.1 +huggingface-hub==0.24.0 # via tokenizers idna==3.7 # via anyio @@ -232,42 +234,46 @@ jinja2==3.1.4 # via fastapi # via pydeck # via sphinx -jiter==0.4.0 +jiter==0.5.0 # via anthropic jmespath==1.0.1 # via boto3 # via botocore jsonpatch==1.33 # via langchain-core -jsonpointer==2.4 +jsonpointer==3.0.0 # via jsonpatch -jsonschema==4.22.0 +jsonschema==4.23.0 # via altair jsonschema-specifications==2023.12.1 # via jsonschema kiwisolver==1.4.5 # via matplotlib langchain==0.2.10 + # via langchain-community # via scrapegraphai langchain-anthropic==0.1.20 # via scrapegraphai langchain-aws==0.1.11 # via scrapegraphai +langchain-community==0.2.9 + # via scrapegraphai langchain-core==0.2.22 # via langchain # via langchain-anthropic # via langchain-aws + # via langchain-community # via langchain-fireworks # via langchain-google-genai # via langchain-google-vertexai # via langchain-groq # via langchain-openai # via langchain-text-splitters -langchain-fireworks==0.1.3 +langchain-fireworks==0.1.5 # via scrapegraphai langchain-google-genai==1.0.7 # via scrapegraphai -langchain-google-vertexai==1.0.4 +langchain-google-vertexai==1.0.6 # via scrapegraphai langchain-groq==0.1.6 # via scrapegraphai @@ -277,6 +283,7 @@ langchain-text-splitters==0.2.2 # via langchain langsmith==0.1.93 # via langchain + # via langchain-community # via langchain-core loguru==0.7.2 # via burr @@ -286,7 +293,9 @@ markdown-it-py==3.0.0 # via rich markupsafe==2.1.5 # via jinja2 -matplotlib==3.9.0 +marshmallow==3.21.3 + # via dataclasses-json +matplotlib==3.9.1 # via burr mccabe==0.7.0 # via pylint @@ -294,9 +303,13 @@ mdurl==0.1.2 # via markdown-it-py minify-html==0.15.0 # via scrapegraphai +mpire==2.10.2 + # via semchunk multidict==6.0.5 # via aiohttp # via yarl +multiprocess==0.70.16 + # via mpire mypy-extensions==1.0.0 # via typing-inspect numpy==1.26.4 @@ -305,6 +318,7 @@ numpy==1.26.4 # via faiss-cpu # via langchain # via langchain-aws + # via langchain-community # via matplotlib # via pandas # via pyarrow @@ -316,15 +330,16 @@ openai==1.36.1 # via burr # via langchain-fireworks # via langchain-openai -orjson==3.10.3 - # via fastapi +orjson==3.10.6 # via langsmith -packaging==23.2 +packaging==24.1 # via altair + # via faiss-cpu # via google-cloud-aiplatform # via google-cloud-bigquery # via huggingface-hub # via langchain-core + # via marshmallow # via matplotlib # via pytest # via sphinx @@ -334,18 +349,18 @@ pandas==2.2.2 # via scrapegraphai # via sf-hamilton # via streamlit -pillow==10.3.0 +pillow==10.4.0 # via fireworks-ai # via matplotlib # via streamlit platformdirs==4.2.2 # via pylint -playwright==1.43.0 +playwright==1.45.0 # via scrapegraphai # via undetected-playwright pluggy==1.5.0 # via pytest 
-proto-plus==1.23.0 +proto-plus==1.24.0 # via google-ai-generativelanguage # via google-api-core # via google-cloud-aiplatform @@ -361,14 +376,14 @@ protobuf==4.25.3 # via grpcio-status # via proto-plus # via streamlit -pyarrow==16.1.0 +pyarrow==17.0.0 # via streamlit pyasn1==0.6.0 # via pyasn1-modules # via rsa pyasn1-modules==0.4.0 # via google-auth -pydantic==2.7.1 +pydantic==2.8.2 # via anthropic # via burr # via fastapi @@ -381,7 +396,7 @@ pydantic==2.7.1 # via langchain-core # via langsmith # via openai -pydantic-core==2.18.2 +pydantic-core==2.20.1 # via pydantic pydeck==0.9.1 # via streamlit @@ -389,9 +404,10 @@ pyee==11.1.0 # via playwright pygments==2.18.0 # via furo + # via mpire # via rich # via sphinx -pylint==3.2.5 +pylint==3.2.6 pyparsing==3.1.2 # via httplib2 # via matplotlib @@ -413,6 +429,7 @@ pytz==2024.1 pyyaml==6.0.1 # via huggingface-hub # via langchain + # via langchain-community # via langchain-core # via uvicorn referencing==0.35.1 @@ -420,7 +437,7 @@ referencing==0.35.1 # via jsonschema-specifications regex==2024.5.15 # via tiktoken -requests==2.32.2 +requests==2.32.3 # via burr # via free-proxy # via google-api-core @@ -428,6 +445,7 @@ requests==2.32.2 # via google-cloud-storage # via huggingface-hub # via langchain + # via langchain-community # via langchain-fireworks # via langsmith # via sphinx @@ -436,18 +454,18 @@ requests==2.32.2 rich==13.7.1 # via streamlit # via typer -rpds-py==0.18.1 +rpds-py==0.19.0 # via jsonschema # via referencing rsa==4.9 # via google-auth -s3transfer==0.10.1 +s3transfer==0.10.2 # via boto3 -semchunk==1.0.1 +semchunk==2.2.0 # via scrapegraphai -sf-hamilton==1.63.0 +sf-hamilton==1.71.0 # via burr -shapely==2.0.4 +shapely==2.0.5 # via google-cloud-aiplatform shellingham==1.5.4 # via typer @@ -475,22 +493,24 @@ sphinxcontrib-applehelp==1.0.8 # via sphinx sphinxcontrib-devhelp==1.0.6 # via sphinx -sphinxcontrib-htmlhelp==2.0.5 +sphinxcontrib-htmlhelp==2.0.6 # via sphinx sphinxcontrib-jsmath==1.0.1 # via sphinx -sphinxcontrib-qthelp==1.0.7 +sphinxcontrib-qthelp==1.0.8 # via sphinx sphinxcontrib-serializinghtml==1.1.10 # via sphinx -sqlalchemy==2.0.30 +sqlalchemy==2.0.31 # via langchain + # via langchain-community starlette==0.37.2 # via fastapi -streamlit==1.35.0 +streamlit==1.36.0 # via burr -tenacity==8.3.0 +tenacity==8.5.0 # via langchain + # via langchain-community # via langchain-core # via streamlit tiktoken==0.7.0 @@ -503,21 +523,22 @@ toml==0.10.2 tomli==2.0.1 # via pylint # via pytest -tomlkit==0.12.5 +tomlkit==0.13.0 # via pylint toolz==0.12.1 # via altair -tornado==6.4 +tornado==6.4.1 # via streamlit tqdm==4.66.4 # via google-generativeai # via huggingface-hub + # via mpire # via openai # via scrapegraphai # via semchunk typer==0.12.3 # via fastapi-cli -typing-extensions==4.12.0 +typing-extensions==4.12.2 # via altair # via anthropic # via anyio @@ -540,24 +561,23 @@ typing-extensions==4.12.0 # via typing-inspect # via uvicorn typing-inspect==0.9.0 + # via dataclasses-json # via sf-hamilton tzdata==2024.1 # via pandas -ujson==5.10.0 - # via fastapi undetected-playwright==0.3.0 # via scrapegraphai uritemplate==4.1.1 # via google-api-python-client -urllib3==1.26.18 +urllib3==1.26.19 # via botocore # via requests -uvicorn==0.29.0 +uvicorn==0.30.3 # via burr # via fastapi uvloop==0.19.0 # via uvicorn -watchfiles==0.21.0 +watchfiles==0.22.0 # via uvicorn websockets==12.0 # via uvicorn diff --git a/requirements.lock b/requirements.lock index 881432d6..d7046ae5 100644 --- a/requirements.lock +++ b/requirements.lock @@ -10,6 +10,7 @@ 
-e file:. aiohttp==3.9.5 # via langchain + # via langchain-community # via langchain-fireworks aiosignal==1.3.1 # via aiohttp @@ -17,7 +18,7 @@ annotated-types==0.7.0 # via pydantic anthropic==0.31.2 # via langchain-anthropic -anyio==4.3.0 +anyio==4.4.0 # via anthropic # via groq # via httpx @@ -35,27 +36,31 @@ boto3==1.34.145 botocore==1.34.145 # via boto3 # via s3transfer -cachetools==5.3.3 +cachetools==5.4.0 # via google-auth -certifi==2024.2.2 +certifi==2024.7.4 # via httpcore # via httpx # via requests charset-normalizer==3.3.2 # via requests +dataclasses-json==0.6.7 + # via langchain-community defusedxml==0.7.1 # via langchain-anthropic +dill==0.3.8 + # via multiprocess distro==1.9.0 # via anthropic # via groq # via openai docstring-parser==0.16 # via google-cloud-aiplatform -exceptiongroup==1.2.1 +exceptiongroup==1.2.2 # via anyio -faiss-cpu==1.8.0 +faiss-cpu==1.8.0.post1 # via scrapegraphai -filelock==3.14.0 +filelock==3.15.4 # via huggingface-hub fireworks-ai==0.14.0 # via langchain-fireworks @@ -64,13 +69,13 @@ free-proxy==1.1.1 frozenlist==1.4.1 # via aiohttp # via aiosignal -fsspec==2024.5.0 +fsspec==2024.6.1 # via huggingface-hub google==3.0.0 # via scrapegraphai google-ai-generativelanguage==0.6.6 # via google-generativeai -google-api-core==2.19.0 +google-api-core==2.19.1 # via google-ai-generativelanguage # via google-api-python-client # via google-cloud-aiplatform @@ -79,9 +84,9 @@ google-api-core==2.19.0 # via google-cloud-resource-manager # via google-cloud-storage # via google-generativeai -google-api-python-client==2.130.0 +google-api-python-client==2.137.0 # via google-generativeai -google-auth==2.29.0 +google-auth==2.32.0 # via google-ai-generativelanguage # via google-api-core # via google-api-python-client @@ -94,14 +99,14 @@ google-auth==2.29.0 # via google-generativeai google-auth-httplib2==0.2.0 # via google-api-python-client -google-cloud-aiplatform==1.58.0 +google-cloud-aiplatform==1.59.0 # via langchain-google-vertexai google-cloud-bigquery==3.25.0 # via google-cloud-aiplatform google-cloud-core==2.4.1 # via google-cloud-bigquery # via google-cloud-storage -google-cloud-resource-manager==1.12.3 +google-cloud-resource-manager==1.12.4 # via google-cloud-aiplatform google-cloud-storage==2.17.0 # via google-cloud-aiplatform @@ -114,7 +119,7 @@ google-generativeai==0.7.2 google-resumable-media==2.7.1 # via google-cloud-bigquery # via google-cloud-storage -googleapis-common-protos==1.63.0 +googleapis-common-protos==1.63.2 # via google-api-core # via grpc-google-iam-v1 # via grpcio-status @@ -122,12 +127,11 @@ graphviz==0.20.3 # via scrapegraphai greenlet==3.0.3 # via playwright - # via sqlalchemy -groq==0.8.0 +groq==0.9.0 # via langchain-groq grpc-google-iam-v1==0.13.1 # via google-cloud-resource-manager -grpcio==1.64.0 +grpcio==1.65.1 # via google-api-core # via googleapis-common-protos # via grpc-google-iam-v1 @@ -150,43 +154,47 @@ httpx==0.27.0 # via openai httpx-sse==0.4.0 # via fireworks-ai -huggingface-hub==0.23.1 +huggingface-hub==0.24.0 # via tokenizers idna==3.7 # via anyio # via httpx # via requests # via yarl -jiter==0.4.0 +jiter==0.5.0 # via anthropic jmespath==1.0.1 # via boto3 # via botocore jsonpatch==1.33 # via langchain-core -jsonpointer==2.4 +jsonpointer==3.0.0 # via jsonpatch langchain==0.2.10 + # via langchain-community # via scrapegraphai langchain-anthropic==0.1.20 # via scrapegraphai langchain-aws==0.1.11 # via scrapegraphai +langchain-community==0.2.9 + # via scrapegraphai langchain-core==0.2.22 # via langchain # via langchain-anthropic # via 
langchain-aws + # via langchain-community # via langchain-fireworks # via langchain-google-genai # via langchain-google-vertexai # via langchain-groq # via langchain-openai # via langchain-text-splitters -langchain-fireworks==0.1.3 +langchain-fireworks==0.1.5 # via scrapegraphai langchain-google-genai==1.0.7 # via scrapegraphai -langchain-google-vertexai==1.0.4 +langchain-google-vertexai==1.0.6 # via scrapegraphai langchain-groq==0.1.6 # via scrapegraphai @@ -196,38 +204,50 @@ langchain-text-splitters==0.2.2 # via langchain langsmith==0.1.93 # via langchain + # via langchain-community # via langchain-core lxml==5.2.2 # via free-proxy +marshmallow==3.21.3 + # via dataclasses-json minify-html==0.15.0 # via scrapegraphai +mpire==2.10.2 + # via semchunk multidict==6.0.5 # via aiohttp # via yarl +multiprocess==0.70.16 + # via mpire +mypy-extensions==1.0.0 + # via typing-inspect numpy==1.26.4 # via faiss-cpu # via langchain # via langchain-aws + # via langchain-community # via pandas # via shapely openai==1.36.1 # via langchain-fireworks # via langchain-openai -orjson==3.10.3 +orjson==3.10.6 # via langsmith -packaging==23.2 +packaging==24.1 + # via faiss-cpu # via google-cloud-aiplatform # via google-cloud-bigquery # via huggingface-hub # via langchain-core + # via marshmallow pandas==2.2.2 # via scrapegraphai -pillow==10.3.0 +pillow==10.4.0 # via fireworks-ai -playwright==1.43.0 +playwright==1.45.0 # via scrapegraphai # via undetected-playwright -proto-plus==1.23.0 +proto-plus==1.24.0 # via google-ai-generativelanguage # via google-api-core # via google-cloud-aiplatform @@ -247,7 +267,7 @@ pyasn1==0.6.0 # via rsa pyasn1-modules==0.4.0 # via google-auth -pydantic==2.7.1 +pydantic==2.8.2 # via anthropic # via fireworks-ai # via google-cloud-aiplatform @@ -257,10 +277,12 @@ pydantic==2.7.1 # via langchain-core # via langsmith # via openai -pydantic-core==2.18.2 +pydantic-core==2.20.1 # via pydantic pyee==11.1.0 # via playwright +pygments==2.18.0 + # via mpire pyparsing==3.1.2 # via httplib2 python-dateutil==2.9.0.post0 @@ -274,26 +296,28 @@ pytz==2024.1 pyyaml==6.0.1 # via huggingface-hub # via langchain + # via langchain-community # via langchain-core regex==2024.5.15 # via tiktoken -requests==2.32.2 +requests==2.32.3 # via free-proxy # via google-api-core # via google-cloud-bigquery # via google-cloud-storage # via huggingface-hub # via langchain + # via langchain-community # via langchain-fireworks # via langsmith # via tiktoken rsa==4.9 # via google-auth -s3transfer==0.10.1 +s3transfer==0.10.2 # via boto3 -semchunk==1.0.1 +semchunk==2.2.0 # via scrapegraphai -shapely==2.0.4 +shapely==2.0.5 # via google-cloud-aiplatform six==1.16.0 # via python-dateutil @@ -305,10 +329,12 @@ sniffio==1.3.1 # via openai soupsieve==2.5 # via beautifulsoup4 -sqlalchemy==2.0.30 +sqlalchemy==2.0.31 # via langchain -tenacity==8.3.0 + # via langchain-community +tenacity==8.5.0 # via langchain + # via langchain-community # via langchain-core tiktoken==0.7.0 # via langchain-openai @@ -318,10 +344,11 @@ tokenizers==0.19.1 tqdm==4.66.4 # via google-generativeai # via huggingface-hub + # via mpire # via openai # via scrapegraphai # via semchunk -typing-extensions==4.12.0 +typing-extensions==4.12.2 # via anthropic # via anyio # via google-generativeai @@ -332,13 +359,16 @@ typing-extensions==4.12.0 # via pydantic-core # via pyee # via sqlalchemy + # via typing-inspect +typing-inspect==0.9.0 + # via dataclasses-json tzdata==2024.1 # via pandas undetected-playwright==0.3.0 # via scrapegraphai uritemplate==4.1.1 # via 
google-api-python-client -urllib3==1.26.18 +urllib3==1.26.19 # via botocore # via requests yarl==1.9.4 diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index e70ba194..569a2ebc 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -5,7 +5,7 @@ from typing import List, Optional from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser -from langchain_core.runnables import AsyncRunnable +from langchain_core.runnables import RunnableParallel from tqdm import tqdm from ..utils.merge_results import merge_results from ..utils.logging import get_logger @@ -42,7 +42,7 @@ def __init__( node_name: str = "GenerateAnswer", ): super().__init__(node_name, "node", input, output, 2, node_config) - + self.llm_model = node_config["llm_model"] self.verbose = ( @@ -112,34 +112,31 @@ def execute(self, state: dict) -> dict: chains_dict = {} answers = [] - # Use tqdm to add progress bar - for i, chunk in enumerate(tqdm(doc, desc="Processing chunks", disable=not self.verbose)): - if len(doc) == 1: + if len(doc) == 1: # No batching needed for single chunk prompt = PromptTemplate( template=template_no_chunks, input_variables=["question"], - partial_variables={"context": chunk, - "format_instructions": format_instructions}) - chain = prompt | self.llm_model | output_parser + partial_variables={"context": doc, + "format_instructions": format_instructions}) + chain = prompt | self.llm_model | output_parser answer = chain.invoke({"question": user_prompt}) - break - prompt = PromptTemplate( - template=template_chunks_prompt, - input_variables=["question"], - partial_variables={"context": chunk, + else: + for i, chunk in enumerate(tqdm(doc, desc="Processing chunks", disable=not self.verbose)): + prompt = PromptTemplate( + template=template_chunks, + input_variables=["question"], + partial_variables={"context": chunk, "chunk_id": i + 1, "format_instructions": format_instructions}) - # Dynamically name the chains based on their index - chain_name = f"chunk{i+1}" - chains_dict[chain_name] = prompt | self.llm_model | output_parser - + # Add chain to dictionary with dynamic name + chain_name = f"chunk{i+1}" + chains_dict[chain_name] = prompt | self.llm_model | output_parser - # Batch process chains if there are multiple chunks - if len(chains_dict) > 1: + async def process_chains(): - async_runner = AsyncRunnable() + async_runner = RunnableParallel() for chain_name, chain in chains_dict.items(): async_runner.add(chain.abatch([{"question": user_prompt}] * len(doc))) batch_results = await async_runner.run() diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py index cdca1b55..f099662d 100644 --- a/scrapegraphai/nodes/parse_node.py +++ b/scrapegraphai/nodes/parse_node.py @@ -41,7 +41,7 @@ def __init__( False if node_config is None else node_config.get("verbose", False) ) self.parse_html = ( - True if node_config is None else node_config.get("parse_html", True) + True if node_config is None else node_config.get("parse_html", False) ) def execute(self, state: dict) -> dict:
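
A note on the chunk fan-out that PATCH 01/10 introduces and PATCH 10/10 reworks: `AsyncRunnable` is not a name exported by `langchain_core.runnables`, and `RunnableParallel` exposes neither an `add()` nor a `run()` method, so `process_chains()` would fail at runtime in both versions. A minimal sketch of the same fan-out built only on the standard Runnable interface (`ainvoke`) and the standard library could look as follows; `chains_dict`, `user_prompt`, `answers`, and `merge_results` are the objects already defined in `generate_answer_node.py`, and the `{"text": ...}` wrapping is an illustrative adaptation to the dictionary shape `merge_results` expects, not confirmed project API:

    import asyncio

    async def process_chains(chains_dict: dict, user_prompt: str) -> list:
        # Every LangChain Runnable exposes ainvoke, so the per-chunk chains can
        # simply be gathered; asyncio.gather returns results in the order the
        # tasks were passed, keeping answers aligned with their chunk ids.
        tasks = [
            chain.ainvoke({"question": user_prompt})
            for chain in chains_dict.values()
        ]
        return await asyncio.gather(*tasks)

    # Inside execute(), in place of the get_event_loop()/run_until_complete pair:
    batch_results = asyncio.run(process_chains(chains_dict, user_prompt))
    batch_answers = [{"text": str(result)} for result in batch_results]
    answers = merge_results(answers, batch_answers)

`asyncio.run()` is used here because `execute()` is synchronous and `asyncio.get_event_loop()` is deprecated outside a running event loop in recent Python versions.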
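
Similarly, PATCH 05/10 makes `model_tokens` mandatory whenever a pre-built model is passed through `model_instance`. A hypothetical graph configuration satisfying the new check might look like this (`my_llm` is a placeholder for any pre-instantiated LangChain model):

    graph_config = {
        "llm": {
            "model_instance": my_llm,  # pre-built model object, bypasses name-based instantiation
            "model_tokens": 8192,      # now required; _create_llm raises KeyError without it
        },
    }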