diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4aab762e..75031131 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,5 @@
-## [1.16.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.16.0-beta.3...v1.16.0-beta.4) (2024-09-02)
+## [1.16.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.15.2...v1.16.0) (2024-09-01)
+
 
 
 ### Features
@@ -11,6 +12,9 @@
 * deepcopy fail for coping model_instance config ([cd07418](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/cd07418474112cecd53ab47866262f2f31294223))
 * fix pydantic object copy ([553527a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/553527a269cdd70c0c174ad5c78cbf35c00b22c1))
 
+## [1.15.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.15.1...v1.15.2) (2024-09-01)
+
+
 ## [1.16.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.16.0-beta.2...v1.16.0-beta.3) (2024-09-01)
 
 
@@ -27,6 +31,7 @@
 
 
+
 ### Bug Fixes
 
 * pyproject.toml ([360ce1c](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/360ce1c0e468c959e63555120ac7cecf55563846))
 
diff --git a/pyproject.toml b/pyproject.toml
index 69d91465..c4509951 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,6 @@
 [project]
 name = "scrapegraphai"
+
 version = "1.16.0b4"
 description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py
index 189f72f2..c7ec3988 100644
--- a/scrapegraphai/graphs/abstract_graph.py
+++ b/scrapegraphai/graphs/abstract_graph.py
@@ -141,7 +141,7 @@ def _create_llm(self, llm_config: dict) -> object:
         try:
             self.model_token = models_tokens[llm_params["model_provider"]][llm_params["model"]]
         except KeyError:
-            print("Model not found, using default token size (8192)")
+            print(f"Model {llm_params['model_provider']}/{llm_params['model']} not found, using default token size (8192)")
             self.model_token = 8192
 
         try:
diff --git a/scrapegraphai/graphs/deep_scraper_graph.py b/scrapegraphai/graphs/deep_scraper_graph.py
index d07a5276..0b0c84f8 100644
--- a/scrapegraphai/graphs/deep_scraper_graph.py
+++ b/scrapegraphai/graphs/deep_scraper_graph.py
@@ -75,7 +75,8 @@ def _create_repeated_graph(self) -> BaseGraph:
             input="doc",
             output=["parsed_doc"],
             node_config={
-                "chunk_size": self.model_token
+                "chunk_size": self.model_token,
+                "llm_model": self.llm_model
             }
         )
 
diff --git a/scrapegraphai/graphs/markdown_scraper_graph.py b/scrapegraphai/graphs/markdown_scraper_graph.py
index c177facd..82bd8a75 100644
--- a/scrapegraphai/graphs/markdown_scraper_graph.py
+++ b/scrapegraphai/graphs/markdown_scraper_graph.py
@@ -60,7 +60,8 @@ def _create_graph(self) -> BaseGraph:
             output=["parsed_doc"],
             node_config={
                 "parse_html": False,
-                "chunk_size": self.model_token
+                "chunk_size": self.model_token,
+                "llm_model": self.llm_model
             }
         )
         generate_answer_node = GenerateAnswerNode(
diff --git a/scrapegraphai/graphs/omni_scraper_graph.py b/scrapegraphai/graphs/omni_scraper_graph.py
index 8b5f7fc9..006533d9 100644
--- a/scrapegraphai/graphs/omni_scraper_graph.py
+++ b/scrapegraphai/graphs/omni_scraper_graph.py
@@ -74,7 +74,8 @@ def _create_graph(self) -> BaseGraph:
             input="doc",
             output=["parsed_doc"],
             node_config={
-                "chunk_size": self.model_token
+                "chunk_size": self.model_token,
+                "llm_model": self.llm_model
             }
         )
         image_to_text_node = ImageToTextNode(
diff --git a/scrapegraphai/graphs/pdf_scraper_graph.py b/scrapegraphai/graphs/pdf_scraper_graph.py
index ae783aba..9b54136c 100644
--- a/scrapegraphai/graphs/pdf_scraper_graph.py
+++ b/scrapegraphai/graphs/pdf_scraper_graph.py
@@ -68,7 +68,8 @@ def _create_graph(self) -> BaseGraph:
             output=["parsed_doc"],
             node_config={
                 "parse_html": False,
-                "chunk_size": self.model_token
+                "chunk_size": self.model_token,
+                "llm_model": self.llm_model
             }
         )
 
diff --git a/scrapegraphai/graphs/script_creator_graph.py b/scrapegraphai/graphs/script_creator_graph.py
index bb5629c5..73c709eb 100644
--- a/scrapegraphai/graphs/script_creator_graph.py
+++ b/scrapegraphai/graphs/script_creator_graph.py
@@ -73,11 +73,12 @@ def _create_graph(self) -> BaseGraph:
             input="doc",
             output=["parsed_doc"],
             node_config={"chunk_size": self.model_token,
-                         "parse_html": False
+                         "parse_html": False,
+                         "llm_model": self.llm_model
                          }
         )
         generate_scraper_node = GenerateScraperNode(
-            input="user_prompt & (doc)",
+            input="user_prompt & (parsed_doc)",
             output=["answer"],
             node_config={
                 "llm_model": self.llm_model,
diff --git a/scrapegraphai/graphs/search_link_graph.py b/scrapegraphai/graphs/search_link_graph.py
index 66b2f223..566569a8 100644
--- a/scrapegraphai/graphs/search_link_graph.py
+++ b/scrapegraphai/graphs/search_link_graph.py
@@ -64,7 +64,8 @@ def _create_graph(self) -> BaseGraph:
             input="doc",
             output=["parsed_doc"],
             node_config={
-                "chunk_size": self.model_token
+                "chunk_size": self.model_token,
+                "llm_model": self.llm_model
             }
         )
         search_link_node = SearchLinkNode(
diff --git a/scrapegraphai/graphs/speech_graph.py b/scrapegraphai/graphs/speech_graph.py
index 8d77621a..8c1ff278 100644
--- a/scrapegraphai/graphs/speech_graph.py
+++ b/scrapegraphai/graphs/speech_graph.py
@@ -68,7 +68,8 @@ def _create_graph(self) -> BaseGraph:
             input="doc",
             output=["parsed_doc"],
             node_config={
-                "chunk_size": self.model_token
+                "chunk_size": self.model_token,
+                "llm_model": self.llm_model
             }
         )
         generate_answer_node = GenerateAnswerNode(
diff --git a/scrapegraphai/nodes/generate_scraper_node.py b/scrapegraphai/nodes/generate_scraper_node.py
index a7c5e5bb..ce6259df 100644
--- a/scrapegraphai/nodes/generate_scraper_node.py
+++ b/scrapegraphai/nodes/generate_scraper_node.py
@@ -102,9 +102,19 @@ def execute(self, state: dict) -> dict:
             TEMPLATE_NO_CHUNKS += self.additional_info
 
         if len(doc) > 1:
-            raise NotImplementedError(
-                "Currently GenerateScraperNode cannot handle more than 1 context chunks"
-            )
+            # Short term partial fix for issue #543 (Context length exceeded)
+            # If there are more than one chunks returned by ParseNode we just use the first one
+            # on the basis that the structure of the remainder of the HTML page is probably
+            # very similar to the first chunk therefore the generated script should still work.
+            # The better fix is to generate multiple scripts then use the LLM to merge them.
+
+            #raise NotImplementedError(
+            #    "Currently GenerateScraperNode cannot handle more than 1 context chunks"
+            #)
+            self.logger.warn(f"Warning: {self.node_name} Node provided with {len(doc)} chunks but can only "
+                             "support 1, ignoring remaining chunks")
+            doc = [doc[0]]
+            template = TEMPLATE_NO_CHUNKS
         else:
             template = TEMPLATE_NO_CHUNKS
 
diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py
index 3e8ed5ac..7413229f 100644
--- a/scrapegraphai/nodes/parse_node.py
+++ b/scrapegraphai/nodes/parse_node.py
@@ -40,6 +40,7 @@ def __init__(
         self.parse_html = (
             True if node_config is None else node_config.get("parse_html", True)
         )
+        self.llm_model = node_config['llm_model']
 
     def execute(self, state: dict) -> dict:
         """
@@ -64,31 +65,33 @@ def execute(self, state: dict) -> dict:
         input_data = [state[key] for key in input_keys]
         docs_transformed = input_data[0]
 
+        def count_tokens(text):
+            from ..utils import token_count
+            return token_count(text, self.llm_model.model_name)
+
         if self.parse_html:
             docs_transformed = Html2TextTransformer().transform_documents(input_data[0])
             docs_transformed = docs_transformed[0]
 
             chunks = chunk(text=docs_transformed.page_content,
                            chunk_size=self.node_config.get("chunk_size", 4096)-250,
-                           token_counter=lambda text: len(text.split()),
+                           token_counter=count_tokens,
                            memoize=False)
         else:
             docs_transformed = docs_transformed[0]
-
             chunk_size = self.node_config.get("chunk_size", 4096)
             chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))
 
             if isinstance(docs_transformed, Document):
                 chunks = chunk(text=docs_transformed.page_content,
                                chunk_size=chunk_size,
-                               token_counter=lambda text: len(text.split()),
+                               token_counter=count_tokens,
                                memoize=False)
             else:
                 chunks = chunk(text=docs_transformed,
                                chunk_size=chunk_size,
-                               token_counter=lambda text: len(text.split()),
+                               token_counter=count_tokens,
                                memoize=False)
 
         state.update({self.output[0]: chunks})
-
         return state
diff --git a/scrapegraphai/utils/__init__.py b/scrapegraphai/utils/__init__.py
index 707d2b18..0a1b8d5b 100644
--- a/scrapegraphai/utils/__init__.py
+++ b/scrapegraphai/utils/__init__.py
@@ -11,3 +11,4 @@
 from .cleanup_html import cleanup_html
 from .logging import *
 from .convert_to_md import convert_to_md
+from .token_calculator import *
diff --git a/scrapegraphai/utils/token_calculator.py b/scrapegraphai/utils/token_calculator.py
index c5e5fbbb..2545c865 100644
--- a/scrapegraphai/utils/token_calculator.py
+++ b/scrapegraphai/utils/token_calculator.py
@@ -6,27 +6,26 @@
 from ..helpers.models_tokens import models_tokens
 
 
-def truncate_text_tokens(text: str, model: str, encoding_name: str) -> List[str]:
+def truncate_text_tokens(text: str, model: str) -> List[str]:
     """
     Truncates text into chunks that are small enough to be processed by specified llm models.
 
     Args:
         text (str): The input text to be truncated.
         model (str): The name of the llm model to determine the maximum token limit.
-        encoding_name (str): The encoding strategy used to encode the text before truncation.
 
     Returns:
         List[str]: A list of text chunks, each within the token limit of the specified model.
 
     Example:
-        >>> truncate_text_tokens("This is a sample text for truncation.", "GPT-3", "EMBEDDING_ENCODING")
+        >>> truncate_text_tokens("This is a sample text for truncation.", "gpt-4o-mini")
         ["This is a sample text", "for truncation."]
 
     This function ensures that each chunk of text can be tokenized
     by the specified model without exceeding the model's token limit.
""" - encoding = tiktoken.get_encoding(encoding_name) + encoding = tiktoken.encoding_for_model(model) max_tokens = min(models_tokens[model] - 500, int(models_tokens[model] * 0.9)) encoded_text = encoding.encode(text) @@ -36,3 +35,28 @@ def truncate_text_tokens(text: str, model: str, encoding_name: str) -> List[str] result = [encoding.decode(chunk) for chunk in chunks] return result + + +def token_count(text: str, model: str) -> List[str]: + """ + Return the number of tokens within the text, based on the encoding of the specified model. + + Args: + text (str): The input text to be counted. + model (str): The name of the llm model to determine the encoding. + + Returns: + int: Number of tokens. + + Example: + >>> token_count("This is a sample text for counting.", "gpt-4o-mini") + 9 + + This function ensures that each chunk of text can be tokenized + by the specified model without exceeding the model's token limit. + """ + + encoding = tiktoken.encoding_for_model(model) + num_tokens = len(encoding.encode(text)) + + return num_tokens