
543 script creator graph only use first chunk #619


Merged
7 changes: 6 additions & 1 deletion CHANGELOG.md
@@ -1,4 +1,5 @@
+## [1.16.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.16.0-beta.3...v1.16.0-beta.4) (2024-09-02)
## [1.16.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.15.2...v1.16.0) (2024-09-01)



### Features
@@ -11,6 +12,9 @@
* deepcopy fail for coping model_instance config ([cd07418](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/cd07418474112cecd53ab47866262f2f31294223))
* fix pydantic object copy ([553527a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/553527a269cdd70c0c174ad5c78cbf35c00b22c1))

+## [1.15.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.15.1...v1.15.2) (2024-09-01)


## [1.16.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.16.0-beta.2...v1.16.0-beta.3) (2024-09-01)


@@ -27,6 +31,7 @@




### Bug Fixes

* pyproject.toml ([360ce1c](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/360ce1c0e468c959e63555120ac7cecf55563846))
1 change: 1 addition & 0 deletions pyproject.toml
@@ -1,5 +1,6 @@
[project]
name = "scrapegraphai"

version = "1.16.0b4"

description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
2 changes: 1 addition & 1 deletion scrapegraphai/graphs/abstract_graph.py
@@ -141,7 +141,7 @@ def _create_llm(self, llm_config: dict) -> object:
        try:
            self.model_token = models_tokens[llm_params["model_provider"]][llm_params["model"]]
        except KeyError:
-            print("Model not found, using default token size (8192)")
+            print(f"Model {llm_params['model_provider']}/{llm_params['model']} not found, using default token size (8192)")
            self.model_token = 8192

        try:
3 changes: 2 additions & 1 deletion scrapegraphai/graphs/deep_scraper_graph.py
@@ -75,7 +75,8 @@ def _create_repeated_graph(self) -> BaseGraph:
input="doc",
output=["parsed_doc"],
node_config={
"chunk_size": self.model_token
"chunk_size": self.model_token,
"llm_model": self.llm_model
}
)

3 changes: 2 additions & 1 deletion scrapegraphai/graphs/markdown_scraper_graph.py
@@ -60,7 +60,8 @@ def _create_graph(self) -> BaseGraph:
output=["parsed_doc"],
node_config={
"parse_html": False,
"chunk_size": self.model_token
"chunk_size": self.model_token,
"llm_model": self.llm_model
}
)
generate_answer_node = GenerateAnswerNode(
3 changes: 2 additions & 1 deletion scrapegraphai/graphs/omni_scraper_graph.py
@@ -74,7 +74,8 @@ def _create_graph(self) -> BaseGraph:
input="doc",
output=["parsed_doc"],
node_config={
"chunk_size": self.model_token
"chunk_size": self.model_token,
"llm_model": self.llm_model
}
)
image_to_text_node = ImageToTextNode(
3 changes: 2 additions & 1 deletion scrapegraphai/graphs/pdf_scraper_graph.py
@@ -68,7 +68,8 @@ def _create_graph(self) -> BaseGraph:
output=["parsed_doc"],
node_config={
"parse_html": False,
"chunk_size": self.model_token
"chunk_size": self.model_token,
"llm_model": self.llm_model
}
)

5 changes: 3 additions & 2 deletions scrapegraphai/graphs/script_creator_graph.py
@@ -73,11 +73,12 @@ def _create_graph(self) -> BaseGraph:
input="doc",
output=["parsed_doc"],
node_config={"chunk_size": self.model_token,
"parse_html": False
"parse_html": False,
"llm_model": self.llm_model
}
)
generate_scraper_node = GenerateScraperNode(
input="user_prompt & (doc)",
input="user_prompt & (parsed_doc)",
output=["answer"],
node_config={
"llm_model": self.llm_model,
3 changes: 2 additions & 1 deletion scrapegraphai/graphs/search_link_graph.py
@@ -64,7 +64,8 @@ def _create_graph(self) -> BaseGraph:
input="doc",
output=["parsed_doc"],
node_config={
"chunk_size": self.model_token
"chunk_size": self.model_token,
"llm_model": self.llm_model
}
)
search_link_node = SearchLinkNode(
3 changes: 2 additions & 1 deletion scrapegraphai/graphs/speech_graph.py
@@ -68,7 +68,8 @@ def _create_graph(self) -> BaseGraph:
input="doc",
output=["parsed_doc"],
node_config={
"chunk_size": self.model_token
"chunk_size": self.model_token,
"llm_model": self.llm_model
}
)
generate_answer_node = GenerateAnswerNode(
16 changes: 13 additions & 3 deletions scrapegraphai/nodes/generate_scraper_node.py
@@ -102,9 +102,19 @@ def execute(self, state: dict) -> dict:
            TEMPLATE_NO_CHUNKS += self.additional_info

        if len(doc) > 1:
-            raise NotImplementedError(
-                "Currently GenerateScraperNode cannot handle more than 1 context chunks"
-            )
+            # Short term partial fix for issue #543 (Context length exceeded)
+            # If there are more than one chunks returned by ParseNode we just use the first one
+            # on the basis that the structure of the remainder of the HTML page is probably
+            # very similar to the first chunk therefore the generated script should still work.
+            # The better fix is to generate multiple scripts then use the LLM to merge them.
+
+            #raise NotImplementedError(
+            #    "Currently GenerateScraperNode cannot handle more than 1 context chunks"
+            #)
+            self.logger.warn(f"Warning: {self.node_name} Node provided with {len(doc)} chunks but can only "
+                             "support 1, ignoring remaining chunks")
+            doc = [doc[0]]
+            template = TEMPLATE_NO_CHUNKS
        else:
            template = TEMPLATE_NO_CHUNKS

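The comment in the hunk above describes the preferred long-term fix only in prose: generate one script per chunk, then have the LLM reconcile them. A rough sketch of what that could look like (hypothetical, not part of this PR; the helper name, prompt wording, and the `llm.invoke(str) -> str` interface are all assumptions):

```python
# Hypothetical sketch of the "generate multiple scripts then merge" idea
# mentioned in the comment above; none of these helpers exist in scrapegraphai.
def generate_merged_script(llm, template, user_prompt, chunks):
    # Generate one candidate script per chunk, independently.
    candidates = [
        llm.invoke(template.format(question=user_prompt, context=chunk_text))
        for chunk_text in chunks
    ]
    if len(candidates) == 1:
        return candidates[0]
    # Ask the LLM to reconcile the candidates into a single script
    # that works across the whole page.
    merge_prompt = (
        "Merge the following scraping scripts into one script that "
        "handles the entire page:\n\n" + "\n\n---\n\n".join(candidates)
    )
    return llm.invoke(merge_prompt)
```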
13 changes: 8 additions & 5 deletions scrapegraphai/nodes/parse_node.py
@@ -40,6 +40,7 @@ def __init__(
        self.parse_html = (
            True if node_config is None else node_config.get("parse_html", True)
        )
+        self.llm_model = node_config['llm_model']

    def execute(self, state: dict) -> dict:
        """
@@ -64,31 +65,33 @@ def execute(self, state: dict) -> dict:
        input_data = [state[key] for key in input_keys]
        docs_transformed = input_data[0]

+        def count_tokens(text):
+            from ..utils import token_count
+            return token_count(text, self.llm_model.model_name)
+
        if self.parse_html:
            docs_transformed = Html2TextTransformer().transform_documents(input_data[0])
            docs_transformed = docs_transformed[0]

            chunks = chunk(text=docs_transformed.page_content,
                           chunk_size=self.node_config.get("chunk_size", 4096)-250,
-                          token_counter=lambda text: len(text.split()),
+                          token_counter=count_tokens,
                           memoize=False)
        else:
            docs_transformed = docs_transformed[0]

            chunk_size = self.node_config.get("chunk_size", 4096)
            chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))

            if isinstance(docs_transformed, Document):
                chunks = chunk(text=docs_transformed.page_content,
                               chunk_size=chunk_size,
-                              token_counter=lambda text: len(text.split()),
+                              token_counter=count_tokens,
                               memoize=False)
            else:
                chunks = chunk(text=docs_transformed,
                               chunk_size=chunk_size,
-                              token_counter=lambda text: len(text.split()),
+                              token_counter=count_tokens,
                               memoize=False)

        state.update({self.output[0]: chunks})

        return state
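The substantive change in this hunk is the token counter handed to `chunk`: the old `lambda text: len(text.split())` counted whitespace-separated words, which usually understates the true token count (one word is often several model tokens), so chunks sized by words could still overflow the model's context window; that is the failure mode behind issue #543. `count_tokens` instead defers to the model's own tokenizer via the new `token_count` utility. The `min(chunk_size - 500, int(chunk_size * 0.9))` line keeps headroom below the nominal limit: for the 4096-token default it yields min(3596, 3686) = 3596. A standalone sketch of the counter difference (not part of the PR; assumes a `tiktoken` version that recognises the model name, and the sample text is illustrative):

```python
# Standalone sketch: whitespace words vs. model tokens.
import tiktoken

text = "ScrapeGraphAI builds LLM-powered scraping pipelines."

words = len(text.split())  # old counter: whitespace-separated words
encoding = tiktoken.encoding_for_model("gpt-4o-mini")
tokens = len(encoding.encode(text))  # new counter: tokens under the model's encoding

# The token count is typically the larger of the two, which is why
# word-based chunking could exceed the model's real context limit.
print(words, tokens)
```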
1 change: 1 addition & 0 deletions scrapegraphai/utils/__init__.py
@@ -11,3 +11,4 @@
from .cleanup_html import cleanup_html
from .logging import *
from .convert_to_md import convert_to_md
+from .token_calculator import *
32 changes: 28 additions & 4 deletions scrapegraphai/utils/token_calculator.py
@@ -6,27 +6,26 @@
from ..helpers.models_tokens import models_tokens


-def truncate_text_tokens(text: str, model: str, encoding_name: str) -> List[str]:
+def truncate_text_tokens(text: str, model: str) -> List[str]:
    """
    Truncates text into chunks that are small enough to be processed by specified llm models.

    Args:
        text (str): The input text to be truncated.
        model (str): The name of the llm model to determine the maximum token limit.
-        encoding_name (str): The encoding strategy used to encode the text before truncation.

    Returns:
        List[str]: A list of text chunks, each within the token limit of the specified model.

    Example:
-        >>> truncate_text_tokens("This is a sample text for truncation.", "GPT-3", "EMBEDDING_ENCODING")
+        >>> truncate_text_tokens("This is a sample text for truncation.", "gpt-4o-mini")
        ["This is a sample text", "for truncation."]

    This function ensures that each chunk of text can be tokenized
    by the specified model without exceeding the model's token limit.
    """

-    encoding = tiktoken.get_encoding(encoding_name)
+    encoding = tiktoken.encoding_for_model(model)
    max_tokens = min(models_tokens[model] - 500, int(models_tokens[model] * 0.9))
    encoded_text = encoding.encode(text)

@@ -36,3 +35,28 @@ def truncate_text_tokens(text: str, model: str) -> List[str]
    result = [encoding.decode(chunk) for chunk in chunks]

    return result
+
+
+def token_count(text: str, model: str) -> int:
+    """
+    Return the number of tokens within the text, based on the encoding of the specified model.
+
+    Args:
+        text (str): The input text to be counted.
+        model (str): The name of the llm model to determine the encoding.
+
+    Returns:
+        int: Number of tokens.
+
+    Example:
+        >>> token_count("This is a sample text for counting.", "gpt-4o-mini")
+        9
+
+    The count is based on the tiktoken encoding for the specified model.
+    """
+
+    encoding = tiktoken.encoding_for_model(model)
+    num_tokens = len(encoding.encode(text))
+
+    return num_tokens
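A quick usage sketch for the two utilities (assumptions: the chosen model name appears in `models_tokens` and is recognised by the installed `tiktoken`):

```python
# Usage sketch for the token_calculator helpers; "gpt-4o-mini" is assumed
# to be present in models_tokens and known to tiktoken.
from scrapegraphai.utils.token_calculator import token_count, truncate_text_tokens

page_text = "Example scraped page content. " * 2000

n_tokens = token_count(page_text, "gpt-4o-mini")          # total tokens for this model
chunks = truncate_text_tokens(page_text, "gpt-4o-mini")   # pieces under the model's limit

print(n_tokens, len(chunks))
```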