diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6c230b5e..cdad3161 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,40 @@
+## [1.31.1-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.31.1-beta.3...v1.31.1-beta.4) (2024-11-21)
+
+
+### Bug Fixes
+
+* add new model instance ([2f3cafe](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/2f3cafeab0bce38571fa10d71f454b2a31766ddc))
+
+## [1.31.1-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.31.1-beta.2...v1.31.1-beta.3) (2024-11-21)
+
+
+### Bug Fixes
+
+* fetch node regex ([e2af232](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e2af2326f6c56e2abcc7dd5de9acdfb710507e0a))
+
+## [1.31.1-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.31.1-beta.1...v1.31.1-beta.2) (2024-11-20)
+
+
+### Bug Fixes
+
+* generate answer node timeout ([32ef554](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/32ef5547f1d864c750cd47c115be6f38a1931d2c))
+
+## [1.31.1-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.31.0...v1.31.1-beta.1) (2024-11-20)
+
+
+### Bug Fixes
+
+* timeout ([c243106](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/c243106552cec3b1df254c0d0a45401eb2f5c89d))
+
+
+### CI
+
+* **release:** 1.31.0-beta.1 [skip ci] ([1df7eb0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/1df7eb0bcd923bc62fd19dddc0ce9b757e9742cf)), closes [#805](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/805) [#805](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/805)
+
 ## [1.31.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.30.0...v1.31.0) (2024-11-19)
 
+
 ### Features
 
 * refactoring of generate answer node ([1f465e6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/1f465e636d2869e4e36555124767de026d3a66ae))
diff --git a/pyproject.toml b/pyproject.toml
index 9097f162..6baff74f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,9 +3,7 @@
 name = "scrapegraphai"
-version = "1.31.0"
-
-
+version = "1.31.1b4"
diff --git a/requirements-dev.lock b/requirements-dev.lock
index 61bd3e2b..b2d32e41 100644
--- a/requirements-dev.lock
+++ b/requirements-dev.lock
@@ -30,6 +30,8 @@ anyio==4.4.0
 astroid==3.2.4
     # via pylint
 async-timeout==4.0.3
+    # via aiohttp
+    # via langchain
     # via scrapegraphai
 attrs==24.2.0
     # via aiohttp
@@ -78,6 +80,9 @@ distro==1.9.0
     # via openai
 docutils==0.19
     # via sphinx
+exceptiongroup==1.2.2
+    # via anyio
+    # via pytest
 fastapi==0.112.0
     # via burr
 fastapi-pagination==0.12.26
@@ -131,7 +136,6 @@ graphviz==0.20.3
     # via burr
 greenlet==3.0.3
     # via playwright
-    # via sqlalchemy
 grpcio==1.65.4
     # via google-api-core
     # via grpcio-status
@@ -500,6 +504,9 @@ tokenizers==0.19.1
     # via transformers
 toml==0.10.2
     # via streamlit
+tomli==2.1.0
+    # via pylint
+    # via pytest
 tomlkit==0.13.0
     # via pylint
 tornado==6.4.1
@@ -517,6 +524,8 @@ transformers==4.44.2
     # via scrapegraphai
 typing-extensions==4.12.2
     # via altair
+    # via anyio
+    # via astroid
     # via fastapi
     # via fastapi-pagination
     # via google-generativeai
@@ -531,6 +540,7 @@ typing-extensions==4.12.2
     # via sqlalchemy
     # via streamlit
     # via typing-inspect
+    # via uvicorn
 typing-inspect==0.9.0
     # via dataclasses-json
     # via sf-hamilton
diff --git a/requirements.lock b/requirements.lock
index c2c40996..38be6e68 100644
--- a/requirements.lock
+++ b/requirements.lock
@@ -19,6 +19,8 @@ anyio==4.4.0
    # via httpx
    # via openai
 async-timeout==4.0.3
+   # via aiohttp
+   # via langchain
    # via scrapegraphai
 attrs==23.2.0
    # via aiohttp
@@ -48,6 +50,8 @@ dill==0.3.8
    # via multiprocess
 distro==1.9.0
    # via openai
+exceptiongroup==1.2.2
+   # via anyio
 fastembed==0.3.6
    # via scrapegraphai
 filelock==3.15.4
@@ -87,7 +91,6 @@ googlesearch-python==1.2.5
    # via scrapegraphai
 greenlet==3.0.3
    # via playwright
-   # via sqlalchemy
 grpcio==1.65.1
    # via google-api-core
    # via grpcio-status
@@ -368,6 +371,7 @@ tqdm==4.66.4
 transformers==4.44.2
    # via scrapegraphai
 typing-extensions==4.12.2
+   # via anyio
    # via google-generativeai
    # via huggingface-hub
    # via langchain-core
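The `pyproject.toml` bump above writes the version as `1.31.1b4`, which is the PEP 440 normalization of the `1.31.1-beta.4` tag used in the changelog headings; both spellings denote the same pre-release. A minimal sketch of that equivalence, assuming the third-party `packaging` library is available (it is not a dependency shown in this diff):

```python
# Sketch: the semantic-release tag "1.31.1-beta.4" and the pyproject.toml
# string "1.31.1b4" normalize to the same PEP 440 version.
from packaging.version import Version  # assumed installed: pip install packaging

v = Version("1.31.1b4")
print(v.release)        # (1, 31, 1)
print(v.pre)            # ('b', 4) -- the "beta.4" component
print(v.is_prerelease)  # True
assert Version("1.31.1-beta.4") == Version("1.31.1b4")
```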
diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py
index 8f367ceb..705e2969 100644
--- a/scrapegraphai/helpers/models_tokens.py
+++ b/scrapegraphai/helpers/models_tokens.py
@@ -161,6 +161,7 @@
         "claude-3-sonnet-20240229": 200000,
         "claude-3-haiku-20240307": 200000,
         "claude-3-5-sonnet-20240620": 200000,
+        "claude-3-5-haiku-latest": 200000,
         "claude-3-haiku-20240307": 4000,
     },
     "bedrock": {
@@ -168,6 +169,7 @@
         "anthropic.claude-3-sonnet-20240229-v1:0": 200000,
         "anthropic.claude-3-opus-20240229-v1:0": 200000,
         "anthropic.claude-3-5-sonnet-20240620-v1:0": 200000,
+        "claude-3-5-haiku-latest": 200000,
         "anthropic.claude-v2:1": 200000,
         "anthropic.claude-v2": 100000,
         "anthropic.claude-instant-v1": 100000,
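The `models_tokens.py` change registers `claude-3-5-haiku-latest` with a 200,000-token context window under both the `anthropic` and `bedrock` providers. Two details visible in the surrounding context lines are worth knowing: the new `bedrock` key lacks the `anthropic.` prefix its neighbors carry, and the pre-existing duplicate `"claude-3-haiku-20240307"` key silently resolves to its last value (4000) in a Python dict literal. A minimal sketch of how such a nested provider/model table is consulted; the trimmed dict and the `lookup` helper below are illustrative, not the library's API:

```python
# Illustrative excerpt of the nested provider -> model -> context-window table.
models_tokens = {
    "anthropic": {
        "claude-3-5-sonnet-20240620": 200000,
        "claude-3-5-haiku-latest": 200000,  # added by this change
    },
    "bedrock": {
        "anthropic.claude-3-5-sonnet-20240620-v1:0": 200000,
        "claude-3-5-haiku-latest": 200000,  # added by this change
    },
}

def lookup(provider: str, model: str, default: int = 8192) -> int:
    """Return the known context window for a model, or a conservative default."""
    return models_tokens.get(provider, {}).get(model, default)

print(lookup("anthropic", "claude-3-5-haiku-latest"))  # 200000
print(lookup("anthropic", "no-such-model"))            # 8192 (fallback)
```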
""" - self.logger.info(f"--- Executing {self.node_name} Node ---") - # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) - # Fetching data from the state based on the input keys input_data = [state[key] for key in input_keys] source = input_data[0] @@ -124,10 +126,16 @@ def execute(self, state): return handlers[input_type](state, input_type, source) elif self.input == "pdf_dir": return state - elif not source.startswith("http") and not source.startswith("www"): - return self.handle_local_source(state, source) - else: - return self.handle_web_source(state, source) + + # For web sources, validate URL before proceeding + try: + if self.is_valid_url(source): + return self.handle_web_source(state, source) + except ValueError as e: + # Re-raise the exception from is_valid_url + raise + + return self.handle_local_source(state, source) def handle_directory(self, state, input_type, source): """ diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index 758cdaf1..d1a42965 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -60,7 +60,22 @@ def __init__( self.script_creator = node_config.get("script_creator", False) self.is_md_scraper = node_config.get("is_md_scraper", False) self.additional_info = node_config.get("additional_info") - self.timeout = node_config.get("timeout", 30) + self.timeout = node_config.get("timeout", 120) + + def invoke_with_timeout(self, chain, inputs, timeout): + """Helper method to invoke chain with timeout""" + try: + start_time = time.time() + response = chain.invoke(inputs) + if time.time() - start_time > timeout: + raise Timeout(f"Response took longer than {timeout} seconds") + return response + except Timeout as e: + self.logger.error(f"Timeout error: {str(e)}") + raise + except Exception as e: + self.logger.error(f"Error during chain execution: {str(e)}") + raise def execute(self, state: dict) -> dict: """ @@ -116,21 +131,6 @@ def execute(self, state: dict) -> dict: template_chunks_prompt = self.additional_info + template_chunks_prompt template_merge_prompt = self.additional_info + template_merge_prompt - def invoke_with_timeout(chain, inputs, timeout): - try: - with get_openai_callback() as cb: - start_time = time.time() - response = chain.invoke(inputs) - if time.time() - start_time > timeout: - raise Timeout(f"Response took longer than {timeout} seconds") - return response - except Timeout as e: - self.logger.error(f"Timeout error: {str(e)}") - raise - except Exception as e: - self.logger.error(f"Error during chain execution: {str(e)}") - raise - if len(doc) == 1: prompt = PromptTemplate( template=template_no_chunks_prompt, @@ -138,17 +138,15 @@ def invoke_with_timeout(chain, inputs, timeout): partial_variables={"context": doc, "format_instructions": format_instructions} ) chain = prompt | self.llm_model + if output_parser: + chain = chain | output_parser try: - raw_response = invoke_with_timeout(chain, {"question": user_prompt}, self.timeout) + answer = self.invoke_with_timeout(chain, {"question": user_prompt}, self.timeout) except Timeout: state.update({self.output[0]: {"error": "Response timeout exceeded"}}) return state - if output_parser: - chain = chain | output_parser - - answer = chain.invoke({"question": user_prompt}) state.update({self.output[0]: answer}) return state @@ -168,9 +166,9 @@ def invoke_with_timeout(chain, inputs, timeout): async_runner = RunnableParallel(**chains_dict) try: - batch_results 
diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py
index 758cdaf1..d1a42965 100644
--- a/scrapegraphai/nodes/generate_answer_node.py
+++ b/scrapegraphai/nodes/generate_answer_node.py
@@ -60,7 +60,22 @@
         self.script_creator = node_config.get("script_creator", False)
         self.is_md_scraper = node_config.get("is_md_scraper", False)
         self.additional_info = node_config.get("additional_info")
-        self.timeout = node_config.get("timeout", 30)
+        self.timeout = node_config.get("timeout", 120)
+
+    def invoke_with_timeout(self, chain, inputs, timeout):
+        """Helper method to invoke chain with timeout"""
+        try:
+            start_time = time.time()
+            response = chain.invoke(inputs)
+            if time.time() - start_time > timeout:
+                raise Timeout(f"Response took longer than {timeout} seconds")
+            return response
+        except Timeout as e:
+            self.logger.error(f"Timeout error: {str(e)}")
+            raise
+        except Exception as e:
+            self.logger.error(f"Error during chain execution: {str(e)}")
+            raise
 
     def execute(self, state: dict) -> dict:
         """
@@ -116,21 +131,6 @@
             template_chunks_prompt = self.additional_info + template_chunks_prompt
             template_merge_prompt = self.additional_info + template_merge_prompt
 
-        def invoke_with_timeout(chain, inputs, timeout):
-            try:
-                with get_openai_callback() as cb:
-                    start_time = time.time()
-                    response = chain.invoke(inputs)
-                    if time.time() - start_time > timeout:
-                        raise Timeout(f"Response took longer than {timeout} seconds")
-                return response
-            except Timeout as e:
-                self.logger.error(f"Timeout error: {str(e)}")
-                raise
-            except Exception as e:
-                self.logger.error(f"Error during chain execution: {str(e)}")
-                raise
-
         if len(doc) == 1:
             prompt = PromptTemplate(
                 template=template_no_chunks_prompt,
@@ -138,17 +138,15 @@ def invoke_with_timeout(chain, inputs, timeout):
                 partial_variables={"context": doc, "format_instructions": format_instructions}
             )
             chain = prompt | self.llm_model
+            if output_parser:
+                chain = chain | output_parser
 
             try:
-                raw_response = invoke_with_timeout(chain, {"question": user_prompt}, self.timeout)
+                answer = self.invoke_with_timeout(chain, {"question": user_prompt}, self.timeout)
             except Timeout:
                 state.update({self.output[0]: {"error": "Response timeout exceeded"}})
                 return state
 
-            if output_parser:
-                chain = chain | output_parser
-
-            answer = chain.invoke({"question": user_prompt})
             state.update({self.output[0]: answer})
             return state
 
@@ -168,9 +166,9 @@ def invoke_with_timeout(chain, inputs, timeout):
         async_runner = RunnableParallel(**chains_dict)
 
         try:
-            batch_results = invoke_with_timeout(
-                async_runner,
-                {"question": user_prompt},
+            batch_results = self.invoke_with_timeout(
+                async_runner,
+                {"question": user_prompt},
                 self.timeout
             )
         except Timeout:
@@ -187,7 +185,7 @@ def invoke_with_timeout(chain, inputs, timeout):
         if output_parser:
             merge_chain = merge_chain | output_parser
         try:
-            answer = invoke_with_timeout(
+            answer = self.invoke_with_timeout(
                 merge_chain,
                 {"context": batch_results, "question": user_prompt},
                 self.timeout
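The `generate_answer_node.py` refactor (commits c243106 and 32ef554) promotes `invoke_with_timeout` from a closure inside `execute` to an instance method, drops the `get_openai_callback` wrapper, raises the default timeout from 30 to 120 seconds, and attaches the optional `output_parser` before the timed invocation instead of re-invoking the chain afterwards. One property worth knowing: the timeout is checked only after `chain.invoke` returns, so it converts slow successes into `Timeout` errors but cannot interrupt a call that hangs. A minimal, self-contained sketch of that semantic; `Timeout` and `FakeChain` here are stand-ins, since the node's actual `Timeout` import is not shown in this diff:

```python
import time

class Timeout(Exception):
    """Stand-in for the Timeout exception the node raises."""

class FakeChain:
    """Stand-in for a LangChain runnable exposing .invoke()."""
    def __init__(self, delay: float):
        self.delay = delay

    def invoke(self, inputs: dict) -> str:
        time.sleep(self.delay)
        return f"answer to {inputs['question']!r}"

def invoke_with_timeout(chain, inputs, timeout):
    # Mirrors the method in the diff: elapsed time is measured only
    # AFTER invoke() returns, so a hung call still blocks indefinitely.
    start_time = time.time()
    response = chain.invoke(inputs)
    if time.time() - start_time > timeout:
        raise Timeout(f"Response took longer than {timeout} seconds")
    return response

print(invoke_with_timeout(FakeChain(0.1), {"question": "fast?"}, timeout=1.0))
try:
    invoke_with_timeout(FakeChain(0.5), {"question": "slow?"}, timeout=0.2)
except Timeout as exc:
    print(f"raised as expected: {exc}")  # the 0.5 s call still ran to completion
```

A preemptive cutoff would need a worker thread or async cancellation (for example `concurrent.futures` with `future.result(timeout=...)`); the approach in the diff keeps the node synchronous at the cost of only detecting overruns after the fact.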