From e2fe39c093009254d5849aea8a49fad0aea450fa Mon Sep 17 00:00:00 2001
From: Matteo Vedovati
Date: Thu, 26 Sep 2024 17:59:50 +0200
Subject: [PATCH 01/36] Reasoning node created

---
 scrapegraphai/nodes/__init__.py | 3 +-
 scrapegraphai/nodes/reasoning_node.py | 482 ++++++++++++++++++++++++++
 2 files changed, 484 insertions(+), 1 deletion(-)
 create mode 100644 scrapegraphai/nodes/reasoning_node.py

diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py
index e5427044..2a0f261a 100644
--- a/scrapegraphai/nodes/__init__.py
+++ b/scrapegraphai/nodes/__init__.py
@@ -25,4 +25,5 @@
 from .concat_answers_node import ConcatAnswersNode
 from .prompt_refiner_node import PromptRefinerNode
 from .html_analyzer_node import HtmlAnalyzerNode
-from .generate_code_node import GenerateCodeNode
\ No newline at end of file
+from .generate_code_node import GenerateCodeNode
+from .reasoning_node import ReasoningNode
\ No newline at end of file
diff --git a/scrapegraphai/nodes/reasoning_node.py b/scrapegraphai/nodes/reasoning_node.py
new file mode 100644
index 00000000..4d9b29da
--- /dev/null
+++ b/scrapegraphai/nodes/reasoning_node.py
@@ -0,0 +1,482 @@
+"""
+ReasoningNode Module
+"""
+from typing import List, Optional
+from langchain.prompts import PromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.runnables import RunnableParallel
+from langchain_core.utils.pydantic import is_basemodel_subclass
+from langchain_openai import ChatOpenAI, AzureChatOpenAI
+from langchain_mistralai import ChatMistralAI
+from langchain_community.chat_models import ChatOllama
+from tqdm import tqdm
+from .base_node import BaseNode
+from ..utils import transform_schema
+
+class ReasoningNode(BaseNode):
+    """
+    A node that refines the user prompt using the schema and any additional context, and
+    creates a precise prompt in subsequent steps that explicitly links elements in the user's
+    original input to their corresponding representations in the JSON schema.
+
+    Attributes:
+        llm_model: An instance of a language model client, configured for generating answers.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+
+    Args:
+        input (str): Boolean expression defining the input keys needed from the state.
+        output (List[str]): List of output keys to be updated in the state.
+        node_config (dict): Additional configuration for the node.
+        node_name (str): The unique identifier name for the node, defaulting to "PromptRefiner".
+    """
+
+    def __init__(
+        self,
+        input: str,
+        output: List[str],
+        node_config: Optional[dict] = None,
+        node_name: str = "PromptRefiner",
+    ):
+        super().__init__(node_name, "node", input, output, 2, node_config)
+
+        self.llm_model = node_config["llm_model"]
+
+        if isinstance(node_config["llm_model"], ChatOllama):
+            self.llm_model.format = "json"
+
+        self.verbose = (
+            True if node_config is None else node_config.get("verbose", False)
+        )
+        self.force = (
+            False if node_config is None else node_config.get("force", False)
+        )
+        self.script_creator = (
+            False if node_config is None else node_config.get("script_creator", False)
+        )
+        self.is_md_scraper = (
+            False if node_config is None else node_config.get("is_md_scraper", False)
+        )
+
+        self.additional_info = node_config.get("additional_info")
+
+        self.output_schema = node_config.get("schema")
+
+    def execute(self, state: dict) -> dict:
+        """
+        Generate a refined prompt using the user's prompt, the schema, and additional context.
+ + Args: + state (dict): The current state of the graph. The input keys will be used + to fetch the correct data from the state. + + Returns: + dict: The updated state with the output key containing the generated answer. + + Raises: + KeyError: If the input keys are not found in the state, indicating + that the necessary information for generating an answer is missing. + """ + + self.logger.info(f"--- Executing {self.node_name} Node ---") + + user_prompt = state['user_prompt'] + + self.simplefied_schema = transform_schema(self.output_schema.schema()) + + if self.additional_info is not None: + prompt = PromptTemplate( + template=TEMPLATE_REFINER_WITH_CONTEXT, + partial_variables={"user_input": user_prompt, + "json_schema": str(self.simplefied_schema), + "additional_context": self.additional_info}) + else: + prompt = PromptTemplate( + template=TEMPLATE_REFINER, + partial_variables={"user_input": user_prompt, + "json_schema": str(self.simplefied_schema)}) + + output_parser = StrOutputParser() + + chain = prompt | self.llm_model | output_parser + refined_prompt = chain.invoke({}) + + state.update({self.output[0]: refined_prompt}) + return state + + +TEMPLATE_REASONING = """ +**Task**: Analyze the user's request and the provided JSON schema to clearly map the desired data extraction.\n +Break down the user's request into key components, and then explicitly connect these components to the +corresponding elements within the JSON schema. + +**User's Request**: +{user_input} + +**Desired JSON Output Schema**: +```json +{json_schema} +``` + +**Analysis Instructions**: +1. **Break Down User Request:** +* Clearly identify the core entities or data types the user is asking for.\n +* Highlight any specific attributes or relationships mentioned in the request.\n + +2. **Map to JSON Schema**: +* For each identified element in the user request, pinpoint its exact counterpart in the JSON schema.\n +* Explain how the schema structure accommodates the user's needs. +* If applicable, mention any schema elements that are not directly addressed in the user's request.\n + +This analysis will be used to guide the HTML structure examination and ultimately inform the code generation process.\n +Please generate only the analysis and no other text. + +**Response**: +""" + +TEMPLATE_REASONING_WITH_CONTEXT = """ +**Task**: Analyze the user's request, the provided JSON schema, and the additional context the user provided to clearly map the desired data extraction.\n +Break down the user's request into key components, and then explicitly connect these components to the corresponding elements within the JSON schema.\n + +**User's Request**: +{user_input} + +**Desired JSON Output Schema**: +```json +{json_schema} +``` + +**Additional Context**: +{additional_context} + +**Analysis Instructions**: +1. **Break Down User Request:** +* Clearly identify the core entities or data types the user is asking for.\n +* Highlight any specific attributes or relationships mentioned in the request.\n + +2. **Map to JSON Schema**: +* For each identified element in the user request, pinpoint its exact counterpart in the JSON schema.\n +* Explain how the schema structure accommodates the user's needs.\n +* If applicable, mention any schema elements that are not directly addressed in the user's request.\n + +This analysis will be used to guide the HTML structure examination and ultimately inform the code generation process.\n +Please generate only the analysis and no other text. 
+ +**Response**: +""" + +# TEMPLATE_REASONING_v1 (Emphasis on Clarity) +TEMPLATE_REASONING_v1 = """ +**Task:** Meticulously analyze the user's request and the provided JSON schema to create a crystal-clear mapping for data extraction. + +**User's Request:** +{user_input} + +**Desired JSON Output Schema:** +```json +{json_schema} +``` + +**Analysis Steps:** + +1. **Deconstruct User Request:** + * Pinpoint the core data the user needs (e.g., specific entities, attributes, relationships). + * Highlight any filtering or sorting criteria mentioned in the request. + +2. **Connect to JSON Schema:** + * For each element the user wants, locate its precise match in the schema. + * Explain how the schema's structure fulfills the user's needs (e.g., nested objects, arrays). + * If any schema parts aren't relevant to the request, point them out. + +**Remember:** +* This analysis is crucial for building the HTML structure and generating code. +* Be thorough and explicit in your explanations. +* Focus solely on the analysis; avoid extraneous text. + +**Response:** +""" + +# TEMPLATE_REASONING_v2 (Focus on Data Transformation) +TEMPLATE_REASONING_v2 = """ +**Task:** Analyze the user's request and the JSON schema to determine the necessary data transformations for extraction. + +**User's Request:** +{user_input} + +**Desired JSON Output Schema:** +```json +{json_schema} +``` + +**Analysis Steps:** + +1. **Understand User's Needs:** + * Identify the specific data the user wants and how they want it presented. + * Note any calculations, formatting, or restructuring required. + +2. **Schema Mapping and Transformations:** + * Match user's needs to schema elements, noting any data type conversions needed. + * Outline the steps to transform the schema data into the user's desired format. + * If the schema lacks necessary data, clearly state this. + +**Key Points:** +* This analysis guides how we'll manipulate the schema data to match the user's request. +* Be explicit about the transformations needed (e.g., filtering, renaming, calculations). +* Focus on the analysis; no additional text is required. + +**Response:** +""" + +# TEMPLATE_REASONING_v3 (Highlighting Potential Challenges) +TEMPLATE_REASONING_v3 = """ +**Task:** Analyze the user's request and JSON schema, identifying potential challenges in data extraction. + +**User's Request:** +{user_input} + +**Desired JSON Output Schema:** +```json +{json_schema} +``` + +**Analysis Steps:** + +1. **Thorough Request Understanding:** + * Clearly identify all data elements the user wants. + * Note any ambiguities or complexities in the request. + +2. **Schema Mapping and Challenges:** + * Match user needs to schema elements, flagging any mismatches or missing data. + * Highlight any complex schema structures that might complicate extraction. + * If the request is vague, suggest clarifications needed from the user. + +**Important Notes:** +* This analysis helps us anticipate and address potential roadblocks in code generation. +* Be proactive in identifying challenges, not just mapping data. +* If the request is unclear, ask specific questions for clarification. +* Focus on the analysis; avoid any unnecessary text. + +**Response:** +""" + +# TEMPLATE_REASONING_v4 (Concise and Actionable) +TEMPLATE_REASONING_v4 = """ +**Task:** Map user request to JSON schema, providing actionable insights for data extraction. 
+ +**User's Request:** +{user_input} + +**Desired JSON Output Schema:** +```json +{json_schema} +``` + +**Analysis:** + +* **Key Data:** [List the specific data elements the user wants] +* **Schema Mapping:** [Concisely map each desired element to its schema counterpart] +* **Transformations:** [Briefly list any data manipulations needed] +* **Challenges:** [Highlight any potential issues or ambiguities] + +**Response:** +""" + +# TEMPLATE_REASONING_v5 (Schema-Centric Approach) +TEMPLATE_REASONING_v5 = """ +**Task:** Analyze the JSON schema to determine how it can fulfill the user's data request. + +**User's Request:** +{user_input} + +**Desired JSON Output Schema:** +```json +{json_schema} +``` + +**Analysis:** + +1. **Schema Structure Breakdown:** + * Describe the key entities, relationships, and nesting in the schema. + * Highlight any relevant data types or formatting within the schema. + +2. **Fulfilling User's Needs:** + * Explain how the schema's structure can provide the data the user wants. + * Point out any schema elements that directly address the user's request. + * Identify any potential gaps or challenges in fulfilling the request. + +**Remember:** +* This analysis prioritizes understanding the schema's capabilities. +* Focus on how the schema's structure can be leveraged for data extraction. +* If the schema is insufficient, clearly state this and suggest potential solutions. +* Provide only the analysis; avoid any additional text. + +**Response:** +""" + +# TEMPLATE_REASONING_WITH_CONTEXT_v1 (Clarity with Context Integration) +TEMPLATE_REASONING_WITH_CONTEXT_v1 = """ +**Task:** Carefully analyze the user's request, the provided JSON schema, and the additional context to create a precise mapping for data extraction. + +**User's Request:** +{user_input} + +**Desired JSON Output Schema:** +```json +{json_schema} +``` + +**Additional Context:** +{additional_context} + +**Analysis Steps:** + +1. **Integrate Context into Request Understanding:** + * Combine the user's explicit request with the additional context to gain a deeper understanding of their needs. + * Identify any implicit requirements or preferences hinted at in the context + +2. **Deconstruct Enhanced Request:** + * Pinpoint the core data the user needs (e.g., specific entities, attributes, relationships). + * Highlight any filtering or sorting criteria mentioned in the request or implied by the context + +3. **Connect to JSON Schema:** + * For each element the user wants, locate its precise match in the schema + * Explain how the schema's structure fulfills the user's needs (e.g., nested objects, arrays) + * If any schema parts aren't relevant to the request, point them out. + +**Remember:** +* The additional context is crucial for refining the analysis and ensuring accurate data extraction +* Be thorough and explicit in your explanations. +* Focus solely on the analysis; avoid extraneous text. + +**Response:** +""" + +# TEMPLATE_REASONING_WITH_CONTEXT_v2 (Context-Driven Data Transformation) +TEMPLATE_REASONING_WITH_CONTEXT_v2 = """ +**Task:** Analyze the user's request, JSON schema, and context to determine the data transformations needed for extraction. + +**User's Request:** +{user_input} + +**Desired JSON Output Schema:** +```json +{json_schema} +``` + +**Additional Context:** +{additional_context} + +**Analysis Steps:** + +1. 
**Contextual Understanding of User's Needs:** + * Combine the request and context to fully grasp the desired data and its presentation + * Note any calculations, formatting, or restructuring implied by the context. + +2. **Schema Mapping and Contextual Transformations:** + * Match user's needs to schema elements, considering context for data type conversions + * Outline the steps to transform schema data into the user's desired format, as informed by the context + * If the schema lacks necessary data, clearly state this + +**Key Points:** +* The context is vital for tailoring data transformations to the user's specific situation. +* Be explicit about the transformations needed, referencing the context where relevant +* Focus on the analysis; no additional text is required + +**Response:** +""" + +# TEMPLATE_REASONING_WITH_CONTEXT_v3 (Contextual Challenge Identification) +TEMPLATE_REASONING_WITH_CONTEXT_v3 = """ +**Task:** Analyze the user's request, JSON schema, and context, identifying potential challenges in data extraction + +**User's Request:** +{user_input} + +**Desired JSON Output Schema:** +```json +{json_schema} +``` + +**Additional Context:** +{additional_context} + +**Analysis Steps:** + +1. **Context-Enhanced Request Understanding:** + * Use the context to clarify any ambiguities or complexities in the request + * Identify any implicit requirements or potential conflicts highlighted by the context + +2. **Schema Mapping and Contextual Challenges:** + * Match user needs to schema elements, flagging any mismatches or missing data, considering the context + * Highlight any complex schema structures or contextual factors that might complicate extraction + * If the request remains unclear even with context, suggest specific clarifications needed from the user + +**Important Notes:** +* The context is key for anticipating and addressing potential roadblocks in code generation +* Be proactive in identifying challenges, especially those arising from the context +* If further clarification is needed, ask +specific questions tailored to the context + +* Focus on the analysis; avoid any unnecessary text + +**Response:** +""" + +# TEMPLATE_REASONING_WITH_CONTEXT_v4 (Concise and Actionable, with Context) +TEMPLATE_REASONING_WITH_CONTEXT_v4 = """ +**Task:** Map user request to JSON schema, incorporating context for actionable insights. + +**User's Request:** +{user_input} + +**Desired JSON Output Schema:** +```json +{json_schema} +``` + +**Additional Context:** +{additional_context} + +**Analysis:** + +* **Key Data (Contextualized):** [List the specific data elements the user wants, considering the context] +* **Schema Mapping (Context-Aware):** [Concisely map each desired element to its schema counterpart, noting any context-driven adjustments] +* **Transformations (Context-Informed):** [Briefly list any data manipulations needed, taking the context into account] +* **Challenges (Contextual):** [Highlight any potential issues or ambiguities arising from the request or context] + +**Response:** +""" + +# TEMPLATE_REASONING_WITH_CONTEXT_v5 (Schema-Centric with Contextual Lens) +TEMPLATE_REASONING_WITH_CONTEXT_v5 = """ +**Task:** Analyze the JSON schema through the lens of the user's request and context, determining how it can fulfill their needs + +**User's Request:** +{user_input} + +**Desired JSON Output Schema:** +```json +{json_schema} +``` + +**Additional Context:** +{additional_context} + +**Analysis:** + +1. 
**Schema Structure Breakdown (Contextualized):**
    * Describe the key entities, relationships, and nesting in the schema, highlighting those most relevant to the context
    * Point out any relevant data types or formatting within the schema that align with the context

2. **Fulfilling User's Needs (Context-Driven):**
    * Explain how the schema's structure, combined with the context, can provide the data the user wants
    * Identify any schema elements that directly or indirectly address the user's request, considering the context
    * Address any potential gaps or challenges in fulfilling the request, taking the context into account

**Remember:**
* This analysis prioritizes understanding the schema's capabilities in relation to the specific context
* Focus on how the schema's structure, combined with the context, can be leveraged for data extraction
* If the schema is insufficient even with context, clearly state this and suggest potential solutions
* Provide only the analysis; avoid any additional text

**Response:**
"""

From 3228f7dafbcde757d4dd8a27a7727c7a6f50561d Mon Sep 17 00:00:00 2001
From: Matteo Vedovati
Date: Thu, 26 Sep 2024 18:10:37 +0200
Subject: [PATCH 02/36] Update reasoning_node.py

---
 scrapegraphai/nodes/reasoning_node.py | 456 ++++----------------------
 1 file changed, 65 insertions(+), 391 deletions(-)

diff --git a/scrapegraphai/nodes/reasoning_node.py b/scrapegraphai/nodes/reasoning_node.py
index 4d9b29da..295b2d28 100644
--- a/scrapegraphai/nodes/reasoning_node.py
+++ b/scrapegraphai/nodes/reasoning_node.py
@@ -15,9 +15,7 @@

 class ReasoningNode(BaseNode):
     """
-    A node that refines the user prompt using the schema and any additional context, and
-    creates a precise prompt in subsequent steps that explicitly links elements in the user's
-    original input to their corresponding representations in the JSON schema.
+    ...

     Attributes:
         llm_model: An instance of a language model client, configured for generating answers.
@@ -50,20 +48,14 @@ def __init__(
         self.force = (
             False if node_config is None else node_config.get("force", False)
         )
-        self.script_creator = (
-            False if node_config is None else node_config.get("script_creator", False)
-        )
-        self.is_md_scraper = (
-            False if node_config is None else node_config.get("is_md_scraper", False)
-        )

-        self.additional_info = node_config.get("additional_info")
+        self.additional_info = node_config.get("additional_info", None)

         self.output_schema = node_config.get("schema")

     def execute(self, state: dict) -> dict:
         """
-        Generate a refined prompt using the user's prompt, the schema, and additional context.
+        ...

         Args:
             state (dict): The current state of the graph. The input keys will be used
             to fetch the correct data from the state.

         Returns:
             dict: The updated state with the output key containing the generated answer.

         Raises:
             KeyError: If the input keys are not found in the state, indicating
                that the necessary information for generating an answer is missing.
         """

         self.logger.info(f"--- Executing {self.node_name} Node ---")

        TEMPLATE_REASONING = """
        **Task**: Analyze the user's request and the provided JSON schema to clearly map the desired data extraction.\n
        Break down the user's request into key components, and then explicitly connect these components to the
        corresponding elements within the JSON schema.

        **User's Request**:
        {user_input}

        **Desired JSON Output Schema**:
        ```json
        {json_schema}
        ```

        **Analysis Instructions**:
        1. **Break Down User Request:**
        * Clearly identify the core entities or data types the user is asking for.\n
        * Highlight any specific attributes or relationships mentioned in the request.\n

        2. 
**Map to JSON Schema**: + * For each identified element in the user request, pinpoint its exact counterpart in the JSON schema.\n + * Explain how the schema structure accommodates the user's needs. + * If applicable, mention any schema elements that are not directly addressed in the user's request.\n + + This analysis will be used to guide the HTML structure examination and ultimately inform the code generation process.\n + Please generate only the analysis and no other text. + + **Response**: + """ + + TEMPLATE_REASONING_WITH_CONTEXT = """ + **Task**: Analyze the user's request, the provided JSON schema, and the additional context the user provided to clearly map the desired data extraction.\n + Break down the user's request into key components, and then explicitly connect these components to the corresponding elements within the JSON schema.\n + + **User's Request**: + {user_input} + + **Desired JSON Output Schema**: + ```json + {json_schema} + ``` + + **Additional Context**: + {additional_context} + + **Analysis Instructions**: + 1. **Break Down User Request:** + * Clearly identify the core entities or data types the user is asking for.\n + * Highlight any specific attributes or relationships mentioned in the request.\n + + 2. **Map to JSON Schema**: + * For each identified element in the user request, pinpoint its exact counterpart in the JSON schema.\n + * Explain how the schema structure accommodates the user's needs.\n + * If applicable, mention any schema elements that are not directly addressed in the user's request.\n + + This analysis will be used to guide the HTML structure examination and ultimately inform the code generation process.\n + Please generate only the analysis and no other text. + + **Response**: + """ + user_prompt = state['user_prompt'] self.simplefied_schema = transform_schema(self.output_schema.schema()) if self.additional_info is not None: prompt = PromptTemplate( - template=TEMPLATE_REFINER_WITH_CONTEXT, + template=TEMPLATE_REASONING_WITH_CONTEXT, partial_variables={"user_input": user_prompt, "json_schema": str(self.simplefied_schema), "additional_context": self.additional_info}) else: prompt = PromptTemplate( - template=TEMPLATE_REFINER, + template=TEMPLATE_REASONING, partial_variables={"user_input": user_prompt, "json_schema": str(self.simplefied_schema)}) @@ -102,381 +154,3 @@ def execute(self, state: dict) -> dict: state.update({self.output[0]: refined_prompt}) return state - - -TEMPLATE_REASONING = """ -**Task**: Analyze the user's request and the provided JSON schema to clearly map the desired data extraction.\n -Break down the user's request into key components, and then explicitly connect these components to the -corresponding elements within the JSON schema. - -**User's Request**: -{user_input} - -**Desired JSON Output Schema**: -```json -{json_schema} -``` - -**Analysis Instructions**: -1. **Break Down User Request:** -* Clearly identify the core entities or data types the user is asking for.\n -* Highlight any specific attributes or relationships mentioned in the request.\n - -2. **Map to JSON Schema**: -* For each identified element in the user request, pinpoint its exact counterpart in the JSON schema.\n -* Explain how the schema structure accommodates the user's needs. -* If applicable, mention any schema elements that are not directly addressed in the user's request.\n - -This analysis will be used to guide the HTML structure examination and ultimately inform the code generation process.\n -Please generate only the analysis and no other text. 
- -**Response**: -""" - -TEMPLATE_REASONING_WITH_CONTEXT = """ -**Task**: Analyze the user's request, the provided JSON schema, and the additional context the user provided to clearly map the desired data extraction.\n -Break down the user's request into key components, and then explicitly connect these components to the corresponding elements within the JSON schema.\n - -**User's Request**: -{user_input} - -**Desired JSON Output Schema**: -```json -{json_schema} -``` - -**Additional Context**: -{additional_context} - -**Analysis Instructions**: -1. **Break Down User Request:** -* Clearly identify the core entities or data types the user is asking for.\n -* Highlight any specific attributes or relationships mentioned in the request.\n - -2. **Map to JSON Schema**: -* For each identified element in the user request, pinpoint its exact counterpart in the JSON schema.\n -* Explain how the schema structure accommodates the user's needs.\n -* If applicable, mention any schema elements that are not directly addressed in the user's request.\n - -This analysis will be used to guide the HTML structure examination and ultimately inform the code generation process.\n -Please generate only the analysis and no other text. - -**Response**: -""" - -# TEMPLATE_REASONING_v1 (Emphasis on Clarity) -TEMPLATE_REASONING_v1 = """ -**Task:** Meticulously analyze the user's request and the provided JSON schema to create a crystal-clear mapping for data extraction. - -**User's Request:** -{user_input} - -**Desired JSON Output Schema:** -```json -{json_schema} -``` - -**Analysis Steps:** - -1. **Deconstruct User Request:** - * Pinpoint the core data the user needs (e.g., specific entities, attributes, relationships). - * Highlight any filtering or sorting criteria mentioned in the request. - -2. **Connect to JSON Schema:** - * For each element the user wants, locate its precise match in the schema. - * Explain how the schema's structure fulfills the user's needs (e.g., nested objects, arrays). - * If any schema parts aren't relevant to the request, point them out. - -**Remember:** -* This analysis is crucial for building the HTML structure and generating code. -* Be thorough and explicit in your explanations. -* Focus solely on the analysis; avoid extraneous text. - -**Response:** -""" - -# TEMPLATE_REASONING_v2 (Focus on Data Transformation) -TEMPLATE_REASONING_v2 = """ -**Task:** Analyze the user's request and the JSON schema to determine the necessary data transformations for extraction. - -**User's Request:** -{user_input} - -**Desired JSON Output Schema:** -```json -{json_schema} -``` - -**Analysis Steps:** - -1. **Understand User's Needs:** - * Identify the specific data the user wants and how they want it presented. - * Note any calculations, formatting, or restructuring required. - -2. **Schema Mapping and Transformations:** - * Match user's needs to schema elements, noting any data type conversions needed. - * Outline the steps to transform the schema data into the user's desired format. - * If the schema lacks necessary data, clearly state this. - -**Key Points:** -* This analysis guides how we'll manipulate the schema data to match the user's request. -* Be explicit about the transformations needed (e.g., filtering, renaming, calculations). -* Focus on the analysis; no additional text is required. 
- -**Response:** -""" - -# TEMPLATE_REASONING_v3 (Highlighting Potential Challenges) -TEMPLATE_REASONING_v3 = """ -**Task:** Analyze the user's request and JSON schema, identifying potential challenges in data extraction. - -**User's Request:** -{user_input} - -**Desired JSON Output Schema:** -```json -{json_schema} -``` - -**Analysis Steps:** - -1. **Thorough Request Understanding:** - * Clearly identify all data elements the user wants. - * Note any ambiguities or complexities in the request. - -2. **Schema Mapping and Challenges:** - * Match user needs to schema elements, flagging any mismatches or missing data. - * Highlight any complex schema structures that might complicate extraction. - * If the request is vague, suggest clarifications needed from the user. - -**Important Notes:** -* This analysis helps us anticipate and address potential roadblocks in code generation. -* Be proactive in identifying challenges, not just mapping data. -* If the request is unclear, ask specific questions for clarification. -* Focus on the analysis; avoid any unnecessary text. - -**Response:** -""" - -# TEMPLATE_REASONING_v4 (Concise and Actionable) -TEMPLATE_REASONING_v4 = """ -**Task:** Map user request to JSON schema, providing actionable insights for data extraction. - -**User's Request:** -{user_input} - -**Desired JSON Output Schema:** -```json -{json_schema} -``` - -**Analysis:** - -* **Key Data:** [List the specific data elements the user wants] -* **Schema Mapping:** [Concisely map each desired element to its schema counterpart] -* **Transformations:** [Briefly list any data manipulations needed] -* **Challenges:** [Highlight any potential issues or ambiguities] - -**Response:** -""" - -# TEMPLATE_REASONING_v5 (Schema-Centric Approach) -TEMPLATE_REASONING_v5 = """ -**Task:** Analyze the JSON schema to determine how it can fulfill the user's data request. - -**User's Request:** -{user_input} - -**Desired JSON Output Schema:** -```json -{json_schema} -``` - -**Analysis:** - -1. **Schema Structure Breakdown:** - * Describe the key entities, relationships, and nesting in the schema. - * Highlight any relevant data types or formatting within the schema. - -2. **Fulfilling User's Needs:** - * Explain how the schema's structure can provide the data the user wants. - * Point out any schema elements that directly address the user's request. - * Identify any potential gaps or challenges in fulfilling the request. - -**Remember:** -* This analysis prioritizes understanding the schema's capabilities. -* Focus on how the schema's structure can be leveraged for data extraction. -* If the schema is insufficient, clearly state this and suggest potential solutions. -* Provide only the analysis; avoid any additional text. - -**Response:** -""" - -# TEMPLATE_REASONING_WITH_CONTEXT_v1 (Clarity with Context Integration) -TEMPLATE_REASONING_WITH_CONTEXT_v1 = """ -**Task:** Carefully analyze the user's request, the provided JSON schema, and the additional context to create a precise mapping for data extraction. - -**User's Request:** -{user_input} - -**Desired JSON Output Schema:** -```json -{json_schema} -``` - -**Additional Context:** -{additional_context} - -**Analysis Steps:** - -1. **Integrate Context into Request Understanding:** - * Combine the user's explicit request with the additional context to gain a deeper understanding of their needs. - * Identify any implicit requirements or preferences hinted at in the context - -2. 
**Deconstruct Enhanced Request:** - * Pinpoint the core data the user needs (e.g., specific entities, attributes, relationships). - * Highlight any filtering or sorting criteria mentioned in the request or implied by the context - -3. **Connect to JSON Schema:** - * For each element the user wants, locate its precise match in the schema - * Explain how the schema's structure fulfills the user's needs (e.g., nested objects, arrays) - * If any schema parts aren't relevant to the request, point them out. - -**Remember:** -* The additional context is crucial for refining the analysis and ensuring accurate data extraction -* Be thorough and explicit in your explanations. -* Focus solely on the analysis; avoid extraneous text. - -**Response:** -""" - -# TEMPLATE_REASONING_WITH_CONTEXT_v2 (Context-Driven Data Transformation) -TEMPLATE_REASONING_WITH_CONTEXT_v2 = """ -**Task:** Analyze the user's request, JSON schema, and context to determine the data transformations needed for extraction. - -**User's Request:** -{user_input} - -**Desired JSON Output Schema:** -```json -{json_schema} -``` - -**Additional Context:** -{additional_context} - -**Analysis Steps:** - -1. **Contextual Understanding of User's Needs:** - * Combine the request and context to fully grasp the desired data and its presentation - * Note any calculations, formatting, or restructuring implied by the context. - -2. **Schema Mapping and Contextual Transformations:** - * Match user's needs to schema elements, considering context for data type conversions - * Outline the steps to transform schema data into the user's desired format, as informed by the context - * If the schema lacks necessary data, clearly state this - -**Key Points:** -* The context is vital for tailoring data transformations to the user's specific situation. -* Be explicit about the transformations needed, referencing the context where relevant -* Focus on the analysis; no additional text is required - -**Response:** -""" - -# TEMPLATE_REASONING_WITH_CONTEXT_v3 (Contextual Challenge Identification) -TEMPLATE_REASONING_WITH_CONTEXT_v3 = """ -**Task:** Analyze the user's request, JSON schema, and context, identifying potential challenges in data extraction - -**User's Request:** -{user_input} - -**Desired JSON Output Schema:** -```json -{json_schema} -``` - -**Additional Context:** -{additional_context} - -**Analysis Steps:** - -1. **Context-Enhanced Request Understanding:** - * Use the context to clarify any ambiguities or complexities in the request - * Identify any implicit requirements or potential conflicts highlighted by the context - -2. **Schema Mapping and Contextual Challenges:** - * Match user needs to schema elements, flagging any mismatches or missing data, considering the context - * Highlight any complex schema structures or contextual factors that might complicate extraction - * If the request remains unclear even with context, suggest specific clarifications needed from the user - -**Important Notes:** -* The context is key for anticipating and addressing potential roadblocks in code generation -* Be proactive in identifying challenges, especially those arising from the context -* If further clarification is needed, ask -specific questions tailored to the context - -* Focus on the analysis; avoid any unnecessary text - -**Response:** -""" - -# TEMPLATE_REASONING_WITH_CONTEXT_v4 (Concise and Actionable, with Context) -TEMPLATE_REASONING_WITH_CONTEXT_v4 = """ -**Task:** Map user request to JSON schema, incorporating context for actionable insights. 
- -**User's Request:** -{user_input} - -**Desired JSON Output Schema:** -```json -{json_schema} -``` - -**Additional Context:** -{additional_context} - -**Analysis:** - -* **Key Data (Contextualized):** [List the specific data elements the user wants, considering the context] -* **Schema Mapping (Context-Aware):** [Concisely map each desired element to its schema counterpart, noting any context-driven adjustments] -* **Transformations (Context-Informed):** [Briefly list any data manipulations needed, taking the context into account] -* **Challenges (Contextual):** [Highlight any potential issues or ambiguities arising from the request or context] - -**Response:** -""" - -# TEMPLATE_REASONING_WITH_CONTEXT_v5 (Schema-Centric with Contextual Lens) -TEMPLATE_REASONING_WITH_CONTEXT_v5 = """ -**Task:** Analyze the JSON schema through the lens of the user's request and context, determining how it can fulfill their needs - -**User's Request:** -{user_input} - -**Desired JSON Output Schema:** -```json -{json_schema} -``` - -**Additional Context:** -{additional_context} - -**Analysis:** - -1. **Schema Structure Breakdown (Contextualized):** - * Describe the key entities, relationships, and nesting in the schema, highlighting those most relevant to the context - * Point out any relevant data types or formatting within the schema that align with the context - -2. **Fulfilling User's Needs (Context-Driven):** - * Explain how the schema's structure, combined with the context, can provide the data the user wants - * Identify any schema elements that directly or indirectly address the user's request, considering the context - * Address any potential gaps or challenges in fulfilling the request, taking the context into account - -**Remember:** -* This analysis prioritizes understanding the schema's capabilities in relation to the specific context -* Focus on how the schema's structure, combined with the context, can be leveraged for data extraction -* If the schema is insufficient even with context, clearly state this and suggest potential solutions -* Provide only the analysis; avoid any additional text - -**Response:** -""" From 0b125896641b8da06fa987ffb11159a46916f90c Mon Sep 17 00:00:00 2001 From: Matteo Vedovati Date: Thu, 26 Sep 2024 18:28:26 +0200 Subject: [PATCH 03/36] reasoning node prompt refinement --- scrapegraphai/nodes/reasoning_node.py | 59 +++++++++++++++------------ 1 file changed, 34 insertions(+), 25 deletions(-) diff --git a/scrapegraphai/nodes/reasoning_node.py b/scrapegraphai/nodes/reasoning_node.py index 295b2d28..e054eeca 100644 --- a/scrapegraphai/nodes/reasoning_node.py +++ b/scrapegraphai/nodes/reasoning_node.py @@ -72,42 +72,42 @@ def execute(self, state: dict) -> dict: self.logger.info(f"--- Executing {self.node_name} Node ---") TEMPLATE_REASONING = """ - **Task**: Analyze the user's request and the provided JSON schema to clearly map the desired data extraction.\n - Break down the user's request into key components, and then explicitly connect these components to the - corresponding elements within the JSON schema. + **Task**: Analyze the user's request and the provided JSON schema to guide an LLM in extracting information directly from HTML. **User's Request**: {user_input} - **Desired JSON Output Schema**: + **Target JSON Schema**: ```json {json_schema} ``` **Analysis Instructions**: - 1. 
**Break Down User Request:** - * Clearly identify the core entities or data types the user is asking for.\n - * Highlight any specific attributes or relationships mentioned in the request.\n + 1. **Interpret User Request:** + * Identify the key information types or entities the user is seeking. + * Note any specific attributes, relationships, or constraints mentioned. 2. **Map to JSON Schema**: - * For each identified element in the user request, pinpoint its exact counterpart in the JSON schema.\n - * Explain how the schema structure accommodates the user's needs. - * If applicable, mention any schema elements that are not directly addressed in the user's request.\n + * For each identified element in the user request, locate its corresponding field in the JSON schema. + * Explain how the schema structure represents the requested information. + * Highlight any relevant schema elements not explicitly mentioned in the user's request. - This analysis will be used to guide the HTML structure examination and ultimately inform the code generation process.\n - Please generate only the analysis and no other text. + 3. **Data Transformation Guidance**: + * Provide guidance on any necessary transformations to align extracted data with the JSON schema requirements. - **Response**: + This analysis will be used to instruct an LLM that has the HTML content in its context. The LLM will use this guidance to extract the information and return it directly in the specified JSON format. + + **Reasoning Output**: + [Your detailed analysis based on the above instructions] """ TEMPLATE_REASONING_WITH_CONTEXT = """ - **Task**: Analyze the user's request, the provided JSON schema, and the additional context the user provided to clearly map the desired data extraction.\n - Break down the user's request into key components, and then explicitly connect these components to the corresponding elements within the JSON schema.\n + **Task**: Analyze the user's request, provided JSON schema, and additional context to guide an LLM in extracting information directly from HTML. **User's Request**: {user_input} - **Desired JSON Output Schema**: + **Target JSON Schema**: ```json {json_schema} ``` @@ -116,19 +116,28 @@ def execute(self, state: dict) -> dict: {additional_context} **Analysis Instructions**: - 1. **Break Down User Request:** - * Clearly identify the core entities or data types the user is asking for.\n - * Highlight any specific attributes or relationships mentioned in the request.\n + 1. **Interpret User Request and Context:** + * Identify the key information types or entities the user is seeking. + * Note any specific attributes, relationships, or constraints mentioned. + * Incorporate insights from the additional context to refine understanding of the task. 2. **Map to JSON Schema**: - * For each identified element in the user request, pinpoint its exact counterpart in the JSON schema.\n - * Explain how the schema structure accommodates the user's needs.\n - * If applicable, mention any schema elements that are not directly addressed in the user's request.\n + * For each identified element in the user request, locate its corresponding field in the JSON schema. + * Explain how the schema structure represents the requested information. + * Highlight any relevant schema elements not explicitly mentioned in the user's request. + + 3. **Extraction Strategy**: + * Based on the additional context, suggest specific strategies for locating and extracting the required information from the HTML. 
* Highlight any potential challenges or special considerations mentioned in the context.

        4. **Data Transformation Guidance**:
        * Provide guidance on any necessary transformations to align extracted data with the JSON schema requirements.
        * Note any special formatting, validation, or business logic considerations from the additional context.

        This analysis will be used to instruct an LLM that has the HTML content in its context. The LLM will use this guidance to extract the information and return it directly in the specified JSON format.

        **Reasoning Output**:
        [Your detailed analysis based on the above instructions, incorporating insights from the additional context]
        """

        user_prompt = state['user_prompt']

From b7b3e9660f02b346f1159b7cf7a52be6ce37b4f7 Mon Sep 17 00:00:00 2001
From: Matteo Vedovati
Date: Thu, 26 Sep 2024 18:33:22 +0200
Subject: [PATCH 04/36] reasoning node refactoring

---
 scrapegraphai/nodes/reasoning_node.py | 78 ++-----------------
 scrapegraphai/prompts/__init__.py | 3 +-
 .../prompts/reasoning_node_prompts.py | 72 +++++++++++++++++
 3 files changed, 81 insertions(+), 72 deletions(-)
 create mode 100644 scrapegraphai/prompts/reasoning_node_prompts.py

diff --git a/scrapegraphai/nodes/reasoning_node.py b/scrapegraphai/nodes/reasoning_node.py
index e054eeca..3c65bbc4 100644
--- a/scrapegraphai/nodes/reasoning_node.py
+++ b/scrapegraphai/nodes/reasoning_node.py
@@ -12,10 +12,15 @@
 from tqdm import tqdm
 from .base_node import BaseNode
 from ..utils import transform_schema
+from ..prompts import (
+    TEMPLATE_REASONING, TEMPLATE_REASONING_WITH_CONTEXT
+)

 class ReasoningNode(BaseNode):
     """
-    ...
+    A node that refines the user prompt using the schema and any additional context, and
+    creates a precise prompt in subsequent steps that explicitly links elements in the user's
+    original input to their corresponding representations in the JSON schema.

     Attributes:
         llm_model: An instance of a language model client, configured for generating answers.
@@ -55,7 +60,7 @@ def __init__(

     def execute(self, state: dict) -> dict:
         """
-        ...
+        Generate a refined prompt for the reasoning task based on the user's input and the JSON schema.

         Args:
             state (dict): The current state of the graph. The input keys will be used
             to fetch the correct data from the state.

         self.logger.info(f"--- Executing {self.node_name} Node ---")

        TEMPLATE_REASONING = """
        **Task**: Analyze the user's request and the provided JSON schema to guide an LLM in extracting information directly from HTML.

        **User's Request**:
        {user_input}

        **Target JSON Schema**:
        ```json
        {json_schema}
        ```

        **Analysis Instructions**:
        1. **Interpret User Request:**
        * Identify the key information types or entities the user is seeking.
        * Note any specific attributes, relationships, or constraints mentioned.

        2. **Map to JSON Schema**:
        * For each identified element in the user request, locate its corresponding field in the JSON schema.
        * Explain how the schema structure represents the requested information.
        * Highlight any relevant schema elements not explicitly mentioned in the user's request.

        3. **Data Transformation Guidance**:
        * Provide guidance on any necessary transformations to align extracted data with the JSON schema requirements.
- - This analysis will be used to instruct an LLM that has the HTML content in its context. The LLM will use this guidance to extract the information and return it directly in the specified JSON format. - - **Reasoning Output**: - [Your detailed analysis based on the above instructions] - """ - - TEMPLATE_REASONING_WITH_CONTEXT = """ - **Task**: Analyze the user's request, provided JSON schema, and additional context to guide an LLM in extracting information directly from HTML. - - **User's Request**: - {user_input} - - **Target JSON Schema**: - ```json - {json_schema} - ``` - - **Additional Context**: - {additional_context} - - **Analysis Instructions**: - 1. **Interpret User Request and Context:** - * Identify the key information types or entities the user is seeking. - * Note any specific attributes, relationships, or constraints mentioned. - * Incorporate insights from the additional context to refine understanding of the task. - - 2. **Map to JSON Schema**: - * For each identified element in the user request, locate its corresponding field in the JSON schema. - * Explain how the schema structure represents the requested information. - * Highlight any relevant schema elements not explicitly mentioned in the user's request. - - 3. **Extraction Strategy**: - * Based on the additional context, suggest specific strategies for locating and extracting the required information from the HTML. - * Highlight any potential challenges or special considerations mentioned in the context. - - 4. **Data Transformation Guidance**: - * Provide guidance on any necessary transformations to align extracted data with the JSON schema requirements. - * Note any special formatting, validation, or business logic considerations from the additional context. - - This analysis will be used to instruct an LLM that has the HTML content in its context. The LLM will use this guidance to extract the information and return it directly in the specified JSON format. - - **Reasoning Output**: - [Your detailed analysis based on the above instructions, incorporating insights from the additional context] - """ user_prompt = state['user_prompt'] diff --git a/scrapegraphai/prompts/__init__.py b/scrapegraphai/prompts/__init__.py index f7be89c1..ab34580b 100644 --- a/scrapegraphai/prompts/__init__.py +++ b/scrapegraphai/prompts/__init__.py @@ -18,4 +18,5 @@ TEMPLATE_EXECUTION_ANALYSIS, TEMPLATE_EXECUTION_CODE_GENERATION, TEMPLATE_VALIDATION_ANALYSIS, TEMPLATE_VALIDATION_CODE_GENERATION, TEMPLATE_SEMANTIC_COMPARISON, TEMPLATE_SEMANTIC_ANALYSIS, - TEMPLATE_SEMANTIC_CODE_GENERATION) \ No newline at end of file + TEMPLATE_SEMANTIC_CODE_GENERATION) +from .reasoning_node_prompts import TEMPLATE_REASONING, TEMPLATE_REASONING_WITH_CONTEXT \ No newline at end of file diff --git a/scrapegraphai/prompts/reasoning_node_prompts.py b/scrapegraphai/prompts/reasoning_node_prompts.py new file mode 100644 index 00000000..2ecd96e3 --- /dev/null +++ b/scrapegraphai/prompts/reasoning_node_prompts.py @@ -0,0 +1,72 @@ +""" +Reasoning prompts helper +""" + +TEMPLATE_REASONING = """ +**Task**: Analyze the user's request and the provided JSON schema to guide an LLM in extracting information directly from HTML. + +**User's Request**: +{user_input} + +**Target JSON Schema**: +```json +{json_schema} +``` + +**Analysis Instructions**: +1. **Interpret User Request:** +* Identify the key information types or entities the user is seeking. +* Note any specific attributes, relationships, or constraints mentioned. + +2. 
**Map to JSON Schema**: +* For each identified element in the user request, locate its corresponding field in the JSON schema. +* Explain how the schema structure represents the requested information. +* Highlight any relevant schema elements not explicitly mentioned in the user's request. + +3. **Data Transformation Guidance**: +* Provide guidance on any necessary transformations to align extracted data with the JSON schema requirements. + +This analysis will be used to instruct an LLM that has the HTML content in its context. The LLM will use this guidance to extract the information and return it directly in the specified JSON format. + +**Reasoning Output**: +[Your detailed analysis based on the above instructions] +""" + +TEMPLATE_REASONING_WITH_CONTEXT = """ +**Task**: Analyze the user's request, provided JSON schema, and additional context to guide an LLM in extracting information directly from HTML. + +**User's Request**: +{user_input} + +**Target JSON Schema**: +```json +{json_schema} +``` + +**Additional Context**: +{additional_context} + +**Analysis Instructions**: +1. **Interpret User Request and Context:** +* Identify the key information types or entities the user is seeking. +* Note any specific attributes, relationships, or constraints mentioned. +* Incorporate insights from the additional context to refine understanding of the task. + +2. **Map to JSON Schema**: +* For each identified element in the user request, locate its corresponding field in the JSON schema. +* Explain how the schema structure represents the requested information. +* Highlight any relevant schema elements not explicitly mentioned in the user's request. + +3. **Extraction Strategy**: +* Based on the additional context, suggest specific strategies for locating and extracting the required information from the HTML. +* Highlight any potential challenges or special considerations mentioned in the context. + +4. **Data Transformation Guidance**: +* Provide guidance on any necessary transformations to align extracted data with the JSON schema requirements. +* Note any special formatting, validation, or business logic considerations from the additional context. + +This analysis will be used to instruct an LLM that has the HTML content in its context. The LLM will use this guidance to extract the information and return it directly in the specified JSON format. 
+ +**Reasoning Output**: +[Your detailed analysis based on the above instructions, incorporating insights from the additional context] +""" \ No newline at end of file From afa9aa3fe78ffdf82c9faad82ae5a25375f1674d Mon Sep 17 00:00:00 2001 From: Matteo Vedovati Date: Thu, 26 Sep 2024 18:37:40 +0200 Subject: [PATCH 05/36] import refactoring --- scrapegraphai/nodes/generate_code_node.py | 9 +++------ scrapegraphai/nodes/html_analyzer_node.py | 3 --- scrapegraphai/nodes/prompt_refiner_node.py | 5 ----- scrapegraphai/nodes/reasoning_node.py | 5 ----- 4 files changed, 3 insertions(+), 19 deletions(-) diff --git a/scrapegraphai/nodes/generate_code_node.py b/scrapegraphai/nodes/generate_code_node.py index 1174a4aa..bcb7ea74 100644 --- a/scrapegraphai/nodes/generate_code_node.py +++ b/scrapegraphai/nodes/generate_code_node.py @@ -5,17 +5,16 @@ from langchain.prompts import PromptTemplate from langchain.output_parsers import ResponseSchema, StructuredOutputParser from langchain_core.output_parsers import StrOutputParser -from langchain_core.runnables import RunnableParallel -from langchain_core.utils.pydantic import is_basemodel_subclass from langchain_community.chat_models import ChatOllama import ast import sys from io import StringIO from bs4 import BeautifulSoup import re -from tqdm import tqdm -from .base_node import BaseNode +import json +from jsonschema import validate, ValidationError from pydantic import ValidationError +from .base_node import BaseNode from ..utils import (transform_schema, extract_code, syntax_focused_analysis, syntax_focused_code_generation, @@ -23,8 +22,6 @@ validation_focused_analysis, validation_focused_code_generation, semantic_focused_analysis, semantic_focused_code_generation, are_content_equal) -from jsonschema import validate, ValidationError -import json from ..prompts import ( TEMPLATE_INIT_CODE_GENERATION, TEMPLATE_SEMANTIC_COMPARISON ) diff --git a/scrapegraphai/nodes/html_analyzer_node.py b/scrapegraphai/nodes/html_analyzer_node.py index b07c4040..26304dcd 100644 --- a/scrapegraphai/nodes/html_analyzer_node.py +++ b/scrapegraphai/nodes/html_analyzer_node.py @@ -4,10 +4,7 @@ from typing import List, Optional from langchain.prompts import PromptTemplate from langchain_core.output_parsers import StrOutputParser -from langchain_core.runnables import RunnableParallel -from langchain_core.utils.pydantic import is_basemodel_subclass from langchain_community.chat_models import ChatOllama -from tqdm import tqdm from .base_node import BaseNode from ..utils import reduce_html from ..prompts import ( diff --git a/scrapegraphai/nodes/prompt_refiner_node.py b/scrapegraphai/nodes/prompt_refiner_node.py index dfb62eb6..7cc53020 100644 --- a/scrapegraphai/nodes/prompt_refiner_node.py +++ b/scrapegraphai/nodes/prompt_refiner_node.py @@ -4,12 +4,7 @@ from typing import List, Optional from langchain.prompts import PromptTemplate from langchain_core.output_parsers import StrOutputParser -from langchain_core.runnables import RunnableParallel -from langchain_core.utils.pydantic import is_basemodel_subclass -from langchain_openai import ChatOpenAI, AzureChatOpenAI -from langchain_mistralai import ChatMistralAI from langchain_community.chat_models import ChatOllama -from tqdm import tqdm from .base_node import BaseNode from ..utils import transform_schema from ..prompts import ( diff --git a/scrapegraphai/nodes/reasoning_node.py b/scrapegraphai/nodes/reasoning_node.py index 3c65bbc4..431d8ab1 100644 --- a/scrapegraphai/nodes/reasoning_node.py +++ 
b/scrapegraphai/nodes/reasoning_node.py
@@ -4,12 +4,7 @@
 from typing import List, Optional
 from langchain.prompts import PromptTemplate
 from langchain_core.output_parsers import StrOutputParser
-from langchain_core.runnables import RunnableParallel
-from langchain_core.utils.pydantic import is_basemodel_subclass
-from langchain_openai import ChatOpenAI, AzureChatOpenAI
-from langchain_mistralai import ChatMistralAI
 from langchain_community.chat_models import ChatOllama
-from tqdm import tqdm
 from .base_node import BaseNode
 from ..utils import transform_schema
 from ..prompts import (

From 9fa109453f9f6a6cc60e88a5f6d787e075d51f7f Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra <88108002+VinciGit00@users.noreply.github.com>
Date: Fri, 27 Sep 2024 14:09:35 +0200
Subject: [PATCH 06/36] Update reasoning_node_prompts.py

---
 scrapegraphai/prompts/reasoning_node_prompts.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scrapegraphai/prompts/reasoning_node_prompts.py b/scrapegraphai/prompts/reasoning_node_prompts.py
index 2ecd96e3..47ceaa41 100644
--- a/scrapegraphai/prompts/reasoning_node_prompts.py
+++ b/scrapegraphai/prompts/reasoning_node_prompts.py
@@ -3,7 +3,7 @@
 """
 
 TEMPLATE_REASONING = """
-**Task**: Analyze the user's request and the provided JSON schema to guide an LLM in extracting information directly from HTML.
+**Task**: Analyze the user's request and the provided JSON schema to guide an LLM in extracting information directly from a markdown file previously parsed from an HTML file.
 
 **User's Request**:
 {user_input}
@@ -33,7 +33,7 @@
 """
 
 TEMPLATE_REASONING_WITH_CONTEXT = """
-**Task**: Analyze the user's request, provided JSON schema, and additional context to guide an LLM in extracting information directly from HTML.
+**Task**: Analyze the user's request, the provided JSON schema, and the additional context to guide an LLM in extracting information directly from a markdown file previously parsed from an HTML file.
 
 **User's Request**:
 {user_input}
@@ -69,4 +69,4 @@
 
 **Reasoning Output**:
 [Your detailed analysis based on the above instructions, incorporating insights from the additional context]
-"""
\ No newline at end of file
+"""
From bdcffd6360237b27797546a198ceece55ce4bc81 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Fri, 27 Sep 2024 16:41:45 +0200
Subject: [PATCH 07/36] feat: add html_mode to smart_scraper

---
 examples/extras/html_mode.py | 48 +++++++++++++++++
 scrapegraphai/graphs/smart_scraper_graph.py | 57 +++++++++++++--------
 2 files changed, 85 insertions(+), 20 deletions(-)
 create mode 100644 examples/extras/html_mode.py

diff --git a/examples/extras/html_mode.py b/examples/extras/html_mode.py
new file mode 100644
index 00000000..c13ba694
--- /dev/null
+++ b/examples/extras/html_mode.py
@@ -0,0 +1,48 @@
+"""
+Basic example of scraping pipeline using SmartScraper
+By default smart scraper converts in md format the
+code.
+""" + +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + + +graph_config = { + "llm": { + "api_key": os.getenv("OPENAI_API_KEY"), + "model": "openai/gpt-4o", + }, + "html_mode": True, + "verbose": True, + "headless": False, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me what does the company do, the name and a contact email.", + source="https://scrapegraphai.com/", + config=graph_config +) + +result = smart_scraper_graph.run() +print(json.dumps(result, indent=4)) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index 0c025c3a..7792ed58 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ -69,14 +69,7 @@ def _create_graph(self) -> BaseGraph: "scrape_do": self.config.get("scrape_do") } ) - parse_node = ParseNode( - input="doc", - output=["parsed_doc"], - node_config={ - "llm_model": self.llm_model, - "chunk_size": self.model_token - } - ) + generate_answer_node = GenerateAnswerNode( input="user_prompt & (relevant_chunks | parsed_doc | doc)", @@ -88,19 +81,43 @@ def _create_graph(self) -> BaseGraph: } ) + if self.config.get("html_mode") is not True: + + parse_node = ParseNode( + input="doc", + output=["parsed_doc"], + node_config={ + "llm_model": self.llm_model, + "chunk_size": self.model_token + } + ) + + return BaseGraph( + nodes=[ + fetch_node, + parse_node, + generate_answer_node, + ], + edges=[ + (fetch_node, parse_node), + (parse_node, generate_answer_node) + ], + entry_point=fetch_node, + graph_name=self.__class__.__name__ + ) + return BaseGraph( - nodes=[ - fetch_node, - parse_node, - generate_answer_node, - ], - edges=[ - (fetch_node, parse_node), - (parse_node, generate_answer_node) - ], - entry_point=fetch_node, - graph_name=self.__class__.__name__ - ) + nodes=[ + fetch_node, + generate_answer_node, + ], + edges=[ + (fetch_node, generate_answer_node) + ], + entry_point=fetch_node, + graph_name=self.__class__.__name__ + ) + def run(self) -> str: """ From 1e4ee3abdf8dce321977bbc74f1976fba33877bc Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Fri, 27 Sep 2024 16:42:51 +0200 Subject: [PATCH 08/36] Update html_mode.py --- examples/extras/html_mode.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/extras/html_mode.py b/examples/extras/html_mode.py index c13ba694..6e2670a0 100644 --- a/examples/extras/html_mode.py +++ b/examples/extras/html_mode.py @@ -1,7 +1,8 @@ """ Basic example of scraping pipeline using SmartScraper By default smart scraper converts in md format the -code. +code. 
If you want to use the original code instead, you have
+to specify it in the config.
 """

From 4330179cb65674d65423c1763f90182e85c15a74 Mon Sep 17 00:00:00 2001
From: semantic-release-bot
Date: Fri, 27 Sep 2024 14:47:04 +0000
Subject: [PATCH 09/36] ci(release): 1.22.0-beta.4 [skip ci]

## [1.22.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.3...v1.22.0-beta.4) (2024-09-27)


### Features

* add html_mode to smart_scraper ([bdcffd6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bdcffd6360237b27797546a198ceece55ce4bc81))

---
 CHANGELOG.md   | 7 +++++++
 pyproject.toml | 2 +-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 70bcbbde..bcc66ecd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,10 @@
+## [1.22.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.3...v1.22.0-beta.4) (2024-09-27)
+
+
+### Features
+
+* add html_mode to smart_scraper ([bdcffd6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bdcffd6360237b27797546a198ceece55ce4bc81))
+
 ## [1.22.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.2...v1.22.0-beta.3) (2024-09-25)

diff --git a/pyproject.toml b/pyproject.toml
index b7e0b1cc..fc61a859 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
 [project]
 name = "scrapegraphai"
 
-version = "1.22.0b3"
+version = "1.22.0b4"
 
 description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
 authors = [

From b2822f620a610e61d295cbf4b670aa08fde9de24 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Fri, 27 Sep 2024 17:45:44 +0200
Subject: [PATCH 10/36] feat: add reasoning integration

---
 examples/extras/reasoning.py                | 46 +++++++++++++++++
 scrapegraphai/graphs/smart_scraper_graph.py | 28 +++++++++++
 scrapegraphai/nodes/__init__.py             |  2 +-
 scrapegraphai/nodes/reasoning_node.py       |  9 ++--
 .../prompts/reasoning_node_prompts.py       |  2 +-
 5 files changed, 81 insertions(+), 6 deletions(-)
 create mode 100644 examples/extras/reasoning.py

diff --git a/examples/extras/reasoning.py b/examples/extras/reasoning.py
new file mode 100644
index 00000000..80e57faa
--- /dev/null
+++ b/examples/extras/reasoning.py
@@ -0,0 +1,46 @@
+"""
+Basic example of scraping pipeline using SmartScraper
+"""
+
+import os
+import json
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SmartScraperGraph
+from scrapegraphai.utils import prettify_exec_info
+
+load_dotenv()
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+
+graph_config = {
+    "llm": {
+        "api_key": os.getenv("OPENAI_API_KEY"),
+        "model": "openai/gpt-4o",
+    },
+    "reasoning": True,
+    "verbose": True,
+    "headless": False,
+}
+
+# ************************************************
+# Create the SmartScraperGraph instance and run it
+# ************************************************
+
+smart_scraper_graph = SmartScraperGraph(
+    prompt="List me what does the company do, the name and a contact email.",
+    source="https://scrapegraphai.com/",
+    config=graph_config
+)
+
+result = smart_scraper_graph.run()
+print(json.dumps(result, indent=4))
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = smart_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))

diff --git a/scrapegraphai/graphs/smart_scraper_graph.py
b/scrapegraphai/graphs/smart_scraper_graph.py
index 0c025c3a..95c2b460 100644
--- a/scrapegraphai/graphs/smart_scraper_graph.py
+++ b/scrapegraphai/graphs/smart_scraper_graph.py
@@ -9,6 +9,7 @@
 from ..nodes import (
     FetchNode,
     ParseNode,
+    ReasoningNode,
     GenerateAnswerNode
 )
 
@@ -88,6 +89,33 @@ def _create_graph(self) -> BaseGraph:
             }
         )
 
+        if self.config.get("reasoning"):
+            reasoning_node = ReasoningNode(
+                input="user_prompt & (relevant_chunks | parsed_doc | doc)",
+                output=["answer"],
+                node_config={
+                    "llm_model": self.llm_model,
+                    "additional_info": self.config.get("additional_info"),
+                    "schema": self.schema,
+                }
+            )
+
+            return BaseGraph(
+                nodes=[
+                    fetch_node,
+                    parse_node,
+                    reasoning_node,
+                    generate_answer_node,
+                ],
+                edges=[
+                    (fetch_node, parse_node),
+                    (parse_node, reasoning_node),
+                    (reasoning_node, generate_answer_node)
+                ],
+                entry_point=fetch_node,
+                graph_name=self.__class__.__name__
+            )
+
         return BaseGraph(
             nodes=[
                 fetch_node,

diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py
index 2a0f261a..7ed99808 100644
--- a/scrapegraphai/nodes/__init__.py
+++ b/scrapegraphai/nodes/__init__.py
@@ -26,4 +26,4 @@
 from .prompt_refiner_node import PromptRefinerNode
 from .html_analyzer_node import HtmlAnalyzerNode
 from .generate_code_node import GenerateCodeNode
-from .reasoning_node import ReasoningNode \ No newline at end of file
+from .reasoning_node import ReasoningNode

diff --git a/scrapegraphai/nodes/reasoning_node.py b/scrapegraphai/nodes/reasoning_node.py
index 431d8ab1..6b91155c 100644
--- a/scrapegraphai/nodes/reasoning_node.py
+++ b/scrapegraphai/nodes/reasoning_node.py
@@ -50,12 +50,13 @@ def __init__(
         )
 
         self.additional_info = node_config.get("additional_info", None)
-
+
         self.output_schema = node_config.get("schema")
 
     def execute(self, state: dict) -> dict:
         """
-        Generate a refined prompt for the reasoning task based on the user's input and the JSON schema.
+        Generate a refined prompt for the reasoning task based
+        on the user's input and the JSON schema.
 
         Args:
             state (dict): The current state of the graph. The input keys will be used
@@ -70,11 +71,11 @@ def execute(self, state: dict) -> dict:
         """
 
         self.logger.info(f"--- Executing {self.node_name} Node ---")
-
+
         user_prompt = state['user_prompt']
 
         self.simplefied_schema = transform_schema(self.output_schema.schema())
-
+
         if self.additional_info is not None:
             prompt = PromptTemplate(
                 template=TEMPLATE_REASONING_WITH_CONTEXT,

diff --git a/scrapegraphai/prompts/reasoning_node_prompts.py b/scrapegraphai/prompts/reasoning_node_prompts.py
index 47ceaa41..d9caf937 100644
--- a/scrapegraphai/prompts/reasoning_node_prompts.py
+++ b/scrapegraphai/prompts/reasoning_node_prompts.py
@@ -31,7 +31,7 @@
 **Reasoning Output**:
 [Your detailed analysis based on the above instructions]
 """
-
+
 TEMPLATE_REASONING_WITH_CONTEXT = """
 **Task**: Analyze the user's request, the provided JSON schema, and the additional context to guide an LLM in extracting information directly from a markdown file previously parsed from an HTML file.
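Patches 07 and 10 above introduce two independent `SmartScraperGraph` switches, `html_mode` and `reasoning`. Since `ReasoningNode.execute` calls `transform_schema(self.output_schema.schema())` unconditionally, the `reasoning` flag is meant to be enabled together with a Pydantic output schema; turning it on without one would fail on the `None` schema. A minimal sketch of that intended usage (the `Company` model below is illustrative and not part of these patches):

```python
import os
import json

from pydantic import BaseModel, Field
from scrapegraphai.graphs import SmartScraperGraph

# Illustrative output schema: ReasoningNode runs it through
# transform_schema() to map the user's request onto the JSON schema.
class Company(BaseModel):
    name: str = Field(description="Name of the company")
    what_it_does: str = Field(description="What the company does")
    contact_email: str = Field(description="A contact email address")

graph_config = {
    "llm": {
        "api_key": os.getenv("OPENAI_API_KEY"),
        "model": "openai/gpt-4o",
    },
    # Inserts ReasoningNode between ParseNode and GenerateAnswerNode
    "reasoning": True,
    "verbose": True,
}

smart_scraper_graph = SmartScraperGraph(
    prompt="List me what does the company do, the name and a contact email.",
    source="https://scrapegraphai.com/",
    config=graph_config,
    schema=Company,
)

print(json.dumps(smart_scraper_graph.run(), indent=4))
```

As wired in this patch, `_create_graph` does not yet account for `html_mode`; patch 12 below adds the branches that let the two flags coexist.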
From 6d8f5435d1ecd2d90b06aade50abc064f75c9d78 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Fri, 27 Sep 2024 15:51:48 +0000 Subject: [PATCH 11/36] ci(release): 1.22.0-beta.5 [skip ci] ## [1.22.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.4...v1.22.0-beta.5) (2024-09-27) ### Features * add reasoning integration ([b2822f6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b2822f620a610e61d295cbf4b670aa08fde9de24)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bcc66ecd..689eeec3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.22.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.4...v1.22.0-beta.5) (2024-09-27) + + +### Features + +* add reasoning integration ([b2822f6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b2822f620a610e61d295cbf4b670aa08fde9de24)) + ## [1.22.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.3...v1.22.0-beta.4) (2024-09-27) diff --git a/pyproject.toml b/pyproject.toml index fc61a859..ef0b104a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.22.0b4" +version = "1.22.0b5" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [ From f87ffa1d8db32b38c47d9f5aa2ae88f1d7978a04 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Fri, 27 Sep 2024 18:31:42 +0200 Subject: [PATCH 12/36] fix: integration with html_mode --- scrapegraphai/graphs/smart_scraper_graph.py | 44 +++++++++++++++++---- 1 file changed, 37 insertions(+), 7 deletions(-) diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index 4ffc6bed..65f03a24 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ -70,7 +70,6 @@ def _create_graph(self) -> BaseGraph: "scrape_do": self.config.get("scrape_do") } ) - generate_answer_node = GenerateAnswerNode( input="user_prompt & (relevant_chunks | parsed_doc | doc)", @@ -82,14 +81,15 @@ def _create_graph(self) -> BaseGraph: } ) - if self.config.get("html_mode") is not True: - + if self.config.get("html_mode") is False: parse_node = ParseNode( input="doc", output=["parsed_doc"], node_config={ "llm_model": self.llm_model, "chunk_size": self.model_token + } + ) if self.config.get("reasoning"): reasoning_node = ReasoningNode( @@ -102,17 +102,17 @@ def _create_graph(self) -> BaseGraph: } ) + if self.config.get("html_mode") is False and self.config.get("reasoning") is True: + return BaseGraph( nodes=[ fetch_node, parse_node, - reasoning_node, generate_answer_node, ], edges=[ (fetch_node, parse_node), - (parse_node, generate_answer_node) (parse_node, reasoning_node), (reasoning_node, generate_answer_node) ], @@ -120,18 +120,48 @@ def _create_graph(self) -> BaseGraph: graph_name=self.__class__.__name__ ) - return BaseGraph( + elif self.config.get("html_mode") is True and self.config.get("reasoning") is True: + + return BaseGraph( nodes=[ fetch_node, + reasoning_node, generate_answer_node, ], edges=[ - (fetch_node, generate_answer_node) + (fetch_node, reasoning_node), + (reasoning_node, generate_answer_node) ], entry_point=fetch_node, graph_name=self.__class__.__name__ ) + elif self.config.get("html_mode") is True and self.config.get("reasoning") is False: + return BaseGraph( + nodes=[ + fetch_node, + 
generate_answer_node,
+            ],
+            edges=[
+                (fetch_node, generate_answer_node)
+            ],
+            entry_point=fetch_node,
+            graph_name=self.__class__.__name__
+        )
+
+        return BaseGraph(
+            nodes=[
+                fetch_node,
+                parse_node,
+                generate_answer_node,
+            ],
+            edges=[
+                (fetch_node, parse_node),
+                (parse_node, generate_answer_node)
+            ],
+            entry_point=fetch_node,
+            graph_name=self.__class__.__name__
+        )
 
     def run(self) -> str:
         """

From 39f78154a6f1123fa8aca5e169c803111c175473 Mon Sep 17 00:00:00 2001
From: semantic-release-bot
Date: Sat, 28 Sep 2024 10:42:13 +0000
Subject: [PATCH 13/36] ci(release): 1.22.0-beta.6 [skip ci]

## [1.22.0-beta.6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.5...v1.22.0-beta.6) (2024-09-28)


### Bug Fixes

* integration with html_mode ([f87ffa1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f87ffa1d8db32b38c47d9f5aa2ae88f1d7978a04))

---
 CHANGELOG.md   | 7 +++++++
 pyproject.toml | 2 +-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 689eeec3..bf3a3bc9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,10 @@
+## [1.22.0-beta.6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.5...v1.22.0-beta.6) (2024-09-28)
+
+
+### Bug Fixes
+
+* integration with html_mode ([f87ffa1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f87ffa1d8db32b38c47d9f5aa2ae88f1d7978a04))
+
 ## [1.22.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.4...v1.22.0-beta.5) (2024-09-27)

diff --git a/pyproject.toml b/pyproject.toml
index ef0b104a..da9fdc9c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
 [project]
 name = "scrapegraphai"
 
-version = "1.22.0b5"
+version = "1.22.0b6"
 
 description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
 authors = [

From ea27b2499ef5dccc46aab8bc7cdc987cfc6e6c20 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Mon, 30 Sep 2024 11:52:14 +0200
Subject: [PATCH 14/36] add empty nodes

---
 README.md                                     |  19 +-
 pyproject.toml                                |   5 +-
 scrapegraphai/nodes/__init__.py               |   3 +
 scrapegraphai/nodes/description_node.py       |  42 +++++
 scrapegraphai/nodes/fetch_node_level_k.py     |  42 +++++
 .../nodes/generate_answer_node_k_level.py     |  50 ++++++
 scrapegraphai/nodes/generate_code_node.py     |  19 +-
 scrapegraphai/nodes/rag_node.py               | 163 +++--------------
 8 files changed, 178 insertions(+), 165 deletions(-)
 create mode 100644 scrapegraphai/nodes/description_node.py
 create mode 100644 scrapegraphai/nodes/fetch_node_level_k.py
 create mode 100644 scrapegraphai/nodes/generate_answer_node_k_level.py

diff --git a/README.md b/README.md
index cf437203..51bc3fa9 100644
--- a/README.md
+++ b/README.md
@@ -38,10 +38,9 @@ Additional dependecies can be added while installing the library:
 
 - More Language Models: additional language models are installed, such as Fireworks, Groq, Anthropic, Hugging Face, and Nvidia AI Endpoints.
 
-
-This group allows you to use additional language models like Fireworks, Groq, Anthropic, Together AI, Hugging Face, and Nvidia AI Endpoints.
-```bash
-pip install scrapegraphai[other-language-models]
+  This group allows you to use additional language models like Fireworks, Groq, Anthropic, Together AI, Hugging Face, and Nvidia AI Endpoints.
+  ```bash
+  pip install scrapegraphai[other-language-models]
 ```
 
 - Semantic Options: this group includes tools for advanced semantic processing, such as Graphviz.
@@ -55,23 +54,15 @@ pip install scrapegraphai[other-language-models]
   pip install scrapegraphai[more-browser-options]
   ```
 
-- faiss Options: this group includes faiss integration
+- Qdrant Options: this group includes the Qdrant integration for RAGNode and DeepScraperGraph.
   ```bash
-  pip install scrapegraphai[faiss-cpu]
+  pip install scrapegraphai[qdrant]
   ```
-
-
-### Installing "More Browser Options"
-
-This group includes an ocr scraper for websites
-```bash
-pip install scrapegraphai[screenshot_scraper]
-```
-
 ## 💻 Usage
 
 There are multiple standard scraping pipelines that can be used to extract information from a website (or local file).

diff --git a/pyproject.toml b/pyproject.toml
index 26b1fdb7..dde97395 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -100,8 +100,9 @@ screenshot_scraper = [
 ]
 
 # Group 5: Faiss CPU
-faiss-cpu = [
-    "faiss-cpu>=1.8.0",
+qdrant = [
+    "qdrant-client>=1.11.3",
+    "fastembed>=0.3.6"
 ]
 
 [build-system]

diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py
index ec16c48e..e5fafb87 100644
--- a/scrapegraphai/nodes/__init__.py
+++ b/scrapegraphai/nodes/__init__.py
@@ -28,3 +28,6 @@
 from .generate_code_node import GenerateCodeNode
 from .search_node_with_context import SearchLinksWithContext
 from .reasoning_node import ReasoningNode
+from .fetch_node_level_k import FetchNodelevelK
+from .generate_answer_node_k_level import GenerateAnswerNodeKLevel
+from .description_node import DescriptionNode

diff --git a/scrapegraphai/nodes/description_node.py b/scrapegraphai/nodes/description_node.py
new file mode 100644
index 00000000..49ab941f
--- /dev/null
+++ b/scrapegraphai/nodes/description_node.py
@@ -0,0 +1,42 @@
+"""
+DescriptionNode Module
+"""
+from typing import List, Optional
+from .base_node import BaseNode
+
+class DescriptionNode(BaseNode):
+    """
+    A node responsible for compressing the input tokens and storing the document
+    in a vector database for retrieval. Relevant chunks are stored in the state.
+
+    It allows scraping of big documents without exceeding the token limit of the language model.
+
+    Attributes:
+        llm_model: An instance of a language model client, configured for generating answers.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+
+    Args:
+        input (str): Boolean expression defining the input keys needed from the state.
+        output (List[str]): List of output keys to be updated in the state.
+        node_config (dict): Additional configuration for the node.
+        node_name (str): The unique identifier name for the node, defaulting to "Parse".
+ """ + + def __init__( + self, + input: str, + output: List[str], + node_config: Optional[dict] = None, + node_name: str = "RAG", + ): + super().__init__(node_name, "node", input, output, 2, node_config) + + self.llm_model = node_config["llm_model"] + self.embedder_model = node_config.get("embedder_model", None) + self.verbose = ( + False if node_config is None else node_config.get("verbose", False) + ) + self.cache_path = node_config.get("cache_path", False) + + def execute(self, state: dict) -> dict: + pass diff --git a/scrapegraphai/nodes/fetch_node_level_k.py b/scrapegraphai/nodes/fetch_node_level_k.py new file mode 100644 index 00000000..18a0d435 --- /dev/null +++ b/scrapegraphai/nodes/fetch_node_level_k.py @@ -0,0 +1,42 @@ +""" +FetchNodelevelK Module +""" +from typing import List, Optional +from .base_node import BaseNode + +class FetchNodelevelK(BaseNode): + """ + A node responsible for compressing the input tokens and storing the document + in a vector database for retrieval. Relevant chunks are stored in the state. + + It allows scraping of big documents without exceeding the token limit of the language model. + + Attributes: + llm_model: An instance of a language model client, configured for generating answers. + verbose (bool): A flag indicating whether to show print statements during execution. + + Args: + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. + node_name (str): The unique identifier name for the node, defaulting to "Parse". + """ + + def __init__( + self, + input: str, + output: List[str], + node_config: Optional[dict] = None, + node_name: str = "RAG", + ): + super().__init__(node_name, "node", input, output, 2, node_config) + + self.llm_model = node_config["llm_model"] + self.embedder_model = node_config.get("embedder_model", None) + self.verbose = ( + False if node_config is None else node_config.get("verbose", False) + ) + self.cache_path = node_config.get("cache_path", False) + + def execute(self, state: dict) -> dict: + pass diff --git a/scrapegraphai/nodes/generate_answer_node_k_level.py b/scrapegraphai/nodes/generate_answer_node_k_level.py new file mode 100644 index 00000000..1d4cdb4d --- /dev/null +++ b/scrapegraphai/nodes/generate_answer_node_k_level.py @@ -0,0 +1,50 @@ +""" +GenerateAnswerNodeKLevel Module +""" +from typing import List, Optional +from .base_node import BaseNode + +class GenerateAnswerNodeKLevel(BaseNode): + """ + A node responsible for compressing the input tokens and storing the document + in a vector database for retrieval. Relevant chunks are stored in the state. + + It allows scraping of big documents without exceeding the token limit of the language model. + + Attributes: + llm_model: An instance of a language model client, configured for generating answers. + verbose (bool): A flag indicating whether to show print statements during execution. + + Args: + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. + node_name (str): The unique identifier name for the node, defaulting to "Parse". 
+ """ + + def __init__( + self, + input: str, + output: List[str], + node_config: Optional[dict] = None, + node_name: str = "GANLK", + ): + super().__init__(node_name, "node", input, output, 2, node_config) + + self.llm_model = node_config["llm_model"] + self.embedder_model = node_config.get("embedder_model", None) + self.verbose = ( + False if node_config is None else node_config.get("verbose", False) + ) + + def execute(self, state: dict) -> dict: + client = state["vectorial_db"] + + answer = client.query( + collection_name="demo_collection", + query_text="This is a query document" + ) + + state["answer"] = answer + + return state diff --git a/scrapegraphai/nodes/generate_code_node.py b/scrapegraphai/nodes/generate_code_node.py index cc72aaf4..746b10a5 100644 --- a/scrapegraphai/nodes/generate_code_node.py +++ b/scrapegraphai/nodes/generate_code_node.py @@ -26,7 +26,6 @@ from .base_node import BaseNode from jsonschema import validate, ValidationError - class GenerateCodeNode(BaseNode): """ A node that generates Python code for a function that extracts data @@ -96,7 +95,7 @@ def execute(self, state: dict) -> dict: Raises: KeyError: If the input keys are not found in the state, indicating that the necessary information for generating an answer is missing. - RuntimeError: If the maximum number of iterations is + RuntimeError: If the maximum number of iterations is reached without obtaining the desired code. """ @@ -170,7 +169,7 @@ def overall_reasoning_loop(self, state: dict) -> dict: self.logger.info(f"--- (Checking if the informations exctrcated are the ones Requested) ---") state = self.semantic_comparison_loop(state) if state["errors"]["semantic"]: - continue + continue break if state["iteration"] == self.max_iterations["overall"] and \ @@ -195,9 +194,9 @@ def syntax_reasoning_loop(self, state: dict) -> dict: state["errors"]["syntax"] = [syntax_message] self.logger.info(f"--- (Synax Error Found: {syntax_message}) ---") analysis = syntax_focused_analysis(state, self.llm_model) - self.logger.info(f"""--- (Regenerating Code + self.logger.info(f"""--- (Regenerating Code to fix the Error) ---""") - state["generated_code"] = syntax_focused_code_generation(state, + state["generated_code"] = syntax_focused_code_generation(state, analysis, self.llm_model) state["generated_code"] = extract_code(state["generated_code"]) return state @@ -217,14 +216,14 @@ def execution_reasoning_loop(self, state: dict) -> dict: self.logger.info(f"--- (Code Execution Error: {execution_result}) ---") analysis = execution_focused_analysis(state, self.llm_model) self.logger.info(f"--- (Regenerating Code to fix the Error) ---") - state["generated_code"] = execution_focused_code_generation(state, + state["generated_code"] = execution_focused_code_generation(state, analysis, self.llm_model) state["generated_code"] = extract_code(state["generated_code"]) return state def validation_reasoning_loop(self, state: dict) -> dict: for _ in range(self.max_iterations["validation"]): - validation, errors = self.validate_dict(state["execution_result"], + validation, errors = self.validate_dict(state["execution_result"], self.output_schema.schema()) if validation: state["errors"]["validation"] = [] @@ -240,7 +239,7 @@ def validation_reasoning_loop(self, state: dict) -> dict: def semantic_comparison_loop(self, state: dict) -> dict: for _ in range(self.max_iterations["semantic"]): - comparison_result = self.semantic_comparison(state["execution_result"], + comparison_result = self.semantic_comparison(state["execution_result"], 
state["reference_answer"]) if comparison_result["are_semantically_equivalent"]: state["errors"]["semantic"] = [] @@ -342,7 +341,7 @@ def create_sandbox_and_execute(self, function_code): if not extract_data: raise NameError("Function 'extract_data' not found in the generated code.") - result = extract_data(self.raw_html) + result = extract_data(self.raw_html) return True, result except Exception as e: return False, f"Error during execution: {str(e)}" @@ -357,5 +356,5 @@ def validate_dict(self, data: dict, schema): validate(instance=data, schema=schema) return True, None except ValidationError as e: - errors = e.errors() + errors = [e.message] return False, errors diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py index 1174beee..c92e40f0 100644 --- a/scrapegraphai/nodes/rag_node.py +++ b/scrapegraphai/nodes/rag_node.py @@ -1,29 +1,9 @@ """ RAGNode Module """ -import os -import sys from typing import List, Optional -from langchain.docstore.document import Document -from langchain.retrievers import ContextualCompressionRetriever -from langchain.retrievers.document_compressors import ( - DocumentCompressorPipeline, - EmbeddingsFilter, -) -from langchain_community.document_transformers import EmbeddingsRedundantFilter -from langchain_community.vectorstores import FAISS -from langchain_community.chat_models import ChatOllama -from langchain_community.embeddings import OllamaEmbeddings -from langchain_aws import BedrockEmbeddings, ChatBedrock -from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI -from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings, ChatOpenAI, AzureChatOpenAI -from ..utils.logging import get_logger from .base_node import BaseNode -from ..helpers import models_tokens -from ..models import DeepSeek - -optional_modules = {"langchain_anthropic", "langchain_fireworks", - "langchain_groq", "langchain_google_vertexai"} +from qdrant_client import QdrantClient class RAGNode(BaseNode): """ @@ -34,7 +14,6 @@ class RAGNode(BaseNode): Attributes: llm_model: An instance of a language model client, configured for generating answers. - embedder_model: An instance of an embedding model client, configured for generating embeddings. verbose (bool): A flag indicating whether to show print statements during execution. Args: @@ -58,125 +37,31 @@ def __init__( self.verbose = ( False if node_config is None else node_config.get("verbose", False) ) - self.cache_path = node_config.get("cache_path", False) def execute(self, state: dict) -> dict: - # Execution logic - pass - - def _create_default_embedder(self, llm_config=None) -> object: - """ - Create an embedding model instance based on the chosen llm model. - - Returns: - object: An instance of the embedding model client. - Raises: - ValueError: If the model is not supported. 
- """ - - if isinstance(self.llm_model, ChatGoogleGenerativeAI): - return GoogleGenerativeAIEmbeddings( - google_api_key=llm_config["api_key"], model="models/embedding-001" - ) - if isinstance(self.llm_model, ChatOpenAI): - return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key, - base_url=self.llm_model.openai_api_base) - elif isinstance(self.llm_model, DeepSeek): - return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key) - elif isinstance(self.llm_model, AzureOpenAIEmbeddings): - return self.llm_model - elif isinstance(self.llm_model, AzureChatOpenAI): - return AzureOpenAIEmbeddings() - elif isinstance(self.llm_model, ChatOllama): - params = self.llm_model._lc_kwargs - params.pop("streaming", None) - params.pop("temperature", None) - return OllamaEmbeddings(**params) - elif isinstance(self.llm_model, ChatBedrock): - return BedrockEmbeddings(client=None, model_id=self.llm_model.model_id) - elif all(key in sys.modules for key in optional_modules): - if isinstance(self.llm_model, ChatFireworks): - from langchain_fireworks import FireworksEmbeddings - return FireworksEmbeddings(model=self.llm_model.model_name) - if isinstance(self.llm_model, ChatNVIDIA): - from langchain_nvidia import NVIDIAEmbeddings - return NVIDIAEmbeddings(model=self.llm_model.model_name) - if isinstance(self.llm_model, ChatHuggingFace): - from langchain_huggingface import HuggingFaceEmbeddings - return HuggingFaceEmbeddings(model=self.llm_model.model) - if isinstance(self.llm_model, ChatVertexAI): - from langchain_vertexai import VertexAIEmbeddings - return VertexAIEmbeddings() + if self.node_config.get("client_type") == "memory": + client = QdrantClient(":memory:") + elif self.node_config.get("client_type") == "local_db": + client = QdrantClient(path="path/to/db") + elif self.node_config.get("client_type") == "image": + client = QdrantClient(url="http://localhost:6333") else: - raise ValueError("Embedding Model missing or not supported") - - def _create_embedder(self, embedder_config: dict) -> object: - """ - Create an embedding model instance based on the configuration provided. - - Args: - embedder_config (dict): Configuration parameters for the embedding model. - - Returns: - object: An instance of the embedding model client. - - Raises: - KeyError: If the model is not supported. 
- """ - embedder_params = {**embedder_config} - if "model_instance" in embedder_config: - return embedder_params["model_instance"] - if "openai" in embedder_params["model"]: - return OpenAIEmbeddings(api_key=embedder_params["api_key"]) - if "azure" in embedder_params["model"]: - return AzureOpenAIEmbeddings() - if "ollama" in embedder_params["model"]: - embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) - try: - models_tokens["ollama"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return OllamaEmbeddings(**embedder_params) - if "gemini" in embedder_params["model"]: - try: - models_tokens["gemini"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return GoogleGenerativeAIEmbeddings(model=embedder_params["model"]) - if "bedrock" in embedder_params["model"]: - embedder_params["model"] = embedder_params["model"].split("/")[-1] - client = embedder_params.get("client", None) - try: - models_tokens["bedrock"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return BedrockEmbeddings(client=client, model_id=embedder_params["model"]) - if all(key in sys.modules for key in optional_modules): - if "hugging_face" in embedder_params["model"]: - from langchain_huggingface import HuggingFaceEmbeddings - embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) - try: - models_tokens["hugging_face"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return HuggingFaceEmbeddings(model=embedder_params["model"]) - elif "fireworks" in embedder_params["model"]: - from langchain_fireworks import FireworksEmbeddings - embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) - try: - models_tokens["fireworks"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return FireworksEmbeddings(model=embedder_params["model"]) - elif "nvidia" in embedder_params["model"]: - from langchain_nvidia import NVIDIAEmbeddings - embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) - try: - models_tokens["nvidia"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return NVIDIAEmbeddings(model=embedder_params["model"], - nvidia_api_key=embedder_params["api_key"]) + raise ValueError("client_type provided not correct") + + docs = ["Qdrant has Langchain integrations", "Qdrant also has Llama Index integrations"] + metadata = [ + {"source": "Langchain-docs"}, + {"source": "Linkedin-docs"}, + ] + ids = [42, 2] + + client.add( + collection_name="demo_collection", + documents=docs, + metadata=metadata, + ids=ids + ) - raise ValueError("Model provided by the configuration not supported") + state["vectorial_db"] = client + return state From 89de5b6cba988421e3f12581707cdbc98a03e289 Mon Sep 17 00:00:00 2001 From: Matteo Vedovati Date: Mon, 30 Sep 2024 12:10:40 +0200 Subject: [PATCH 15/36] Stating anew --- scrapegraphai/nodes/description_node.py | 42 ----------------------- scrapegraphai/nodes/fetch_node_level_k.py | 42 ----------------------- 2 files changed, 84 deletions(-) delete mode 100644 scrapegraphai/nodes/description_node.py delete mode 100644 scrapegraphai/nodes/fetch_node_level_k.py diff --git a/scrapegraphai/nodes/description_node.py b/scrapegraphai/nodes/description_node.py deleted file mode 100644 index 49ab941f..00000000 --- 
a/scrapegraphai/nodes/description_node.py +++ /dev/null @@ -1,42 +0,0 @@ -""" -DescriptionNode Module -""" -from typing import List, Optional -from .base_node import BaseNode - -class DescriptionNode(BaseNode): - """ - A node responsible for compressing the input tokens and storing the document - in a vector database for retrieval. Relevant chunks are stored in the state. - - It allows scraping of big documents without exceeding the token limit of the language model. - - Attributes: - llm_model: An instance of a language model client, configured for generating answers. - verbose (bool): A flag indicating whether to show print statements during execution. - - Args: - input (str): Boolean expression defining the input keys needed from the state. - output (List[str]): List of output keys to be updated in the state. - node_config (dict): Additional configuration for the node. - node_name (str): The unique identifier name for the node, defaulting to "Parse". - """ - - def __init__( - self, - input: str, - output: List[str], - node_config: Optional[dict] = None, - node_name: str = "RAG", - ): - super().__init__(node_name, "node", input, output, 2, node_config) - - self.llm_model = node_config["llm_model"] - self.embedder_model = node_config.get("embedder_model", None) - self.verbose = ( - False if node_config is None else node_config.get("verbose", False) - ) - self.cache_path = node_config.get("cache_path", False) - - def execute(self, state: dict) -> dict: - pass diff --git a/scrapegraphai/nodes/fetch_node_level_k.py b/scrapegraphai/nodes/fetch_node_level_k.py deleted file mode 100644 index 18a0d435..00000000 --- a/scrapegraphai/nodes/fetch_node_level_k.py +++ /dev/null @@ -1,42 +0,0 @@ -""" -FetchNodelevelK Module -""" -from typing import List, Optional -from .base_node import BaseNode - -class FetchNodelevelK(BaseNode): - """ - A node responsible for compressing the input tokens and storing the document - in a vector database for retrieval. Relevant chunks are stored in the state. - - It allows scraping of big documents without exceeding the token limit of the language model. - - Attributes: - llm_model: An instance of a language model client, configured for generating answers. - verbose (bool): A flag indicating whether to show print statements during execution. - - Args: - input (str): Boolean expression defining the input keys needed from the state. - output (List[str]): List of output keys to be updated in the state. - node_config (dict): Additional configuration for the node. - node_name (str): The unique identifier name for the node, defaulting to "Parse". 
- """ - - def __init__( - self, - input: str, - output: List[str], - node_config: Optional[dict] = None, - node_name: str = "RAG", - ): - super().__init__(node_name, "node", input, output, 2, node_config) - - self.llm_model = node_config["llm_model"] - self.embedder_model = node_config.get("embedder_model", None) - self.verbose = ( - False if node_config is None else node_config.get("verbose", False) - ) - self.cache_path = node_config.get("cache_path", False) - - def execute(self, state: dict) -> dict: - pass From 336bf705ec6f8200987b9a10f1210d732a35c7b0 Mon Sep 17 00:00:00 2001 From: Matteo Vedovati Date: Mon, 30 Sep 2024 12:18:30 +0200 Subject: [PATCH 16/36] initial creation of FetchNodeLevelK and DescriptionNode --- scrapegraphai/nodes/description_node.py | 42 +++++++++++++++++++++++ scrapegraphai/nodes/fetch_node_level_K.py | 39 +++++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 scrapegraphai/nodes/description_node.py create mode 100644 scrapegraphai/nodes/fetch_node_level_K.py diff --git a/scrapegraphai/nodes/description_node.py b/scrapegraphai/nodes/description_node.py new file mode 100644 index 00000000..200d7032 --- /dev/null +++ b/scrapegraphai/nodes/description_node.py @@ -0,0 +1,42 @@ +""" +DescriptionNode Module +""" +from typing import List, Optional +from .base_node import BaseNode + +class DescriptionNode(BaseNode): + """ + A node responsible for generating a description of a given document. This description is + generated using a language model and is used for retrieving the right documents. + + It allows scraping of big documents without exceeding the token limit of the language model. + + Attributes: + llm_model: An instance of a language model client, configured for generating answers. + verbose (bool): A flag indicating whether to show print statements during execution. + + Args: + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. + node_name (str): The unique identifier name for the node, defaulting to "Parse". + """ + + def __init__( + self, + input: str, + output: List[str], + node_config: Optional[dict] = None, + node_name: str = "Description", + ): + super().__init__(node_name, "node", input, output, 2, node_config) + + self.llm_model = node_config["llm_model"] + self.embedder_model = node_config.get("embedder_model", None) + self.verbose = ( + False if node_config is None else node_config.get("verbose", False) + ) + self.cache_path = node_config.get("cache_path", False) + + def execute(self, state: dict) -> dict: + pass diff --git a/scrapegraphai/nodes/fetch_node_level_K.py b/scrapegraphai/nodes/fetch_node_level_K.py new file mode 100644 index 00000000..2fd3aa8b --- /dev/null +++ b/scrapegraphai/nodes/fetch_node_level_K.py @@ -0,0 +1,39 @@ +""" +FetchNodeLevelK Module +""" +from typing import List, Optional +from .base_node import BaseNode + +class FetchNodeLevelK(BaseNode): + """ + A node responsible for fetching all the pages at a certain level of hyperlink the graph. + + Attributes: + llm_model: An instance of a language model client, configured for generating answers. + verbose (bool): A flag indicating whether to show print statements during execution. + + Args: + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. 
+        node_name (str): The unique identifier name for the node, defaulting to "Parse".
+    """
+
+    def __init__(
+        self,
+        input: str,
+        output: List[str],
+        node_config: Optional[dict] = None,
+        node_name: str = "FetchLevelK",
+    ):
+        super().__init__(node_name, "node", input, output, 2, node_config)
+
+        self.llm_model = node_config["llm_model"]
+        self.embedder_model = node_config.get("embedder_model", None)
+        self.verbose = (
+            False if node_config is None else node_config.get("verbose", False)
+        )
+        self.cache_path = node_config.get("cache_path", False)
+
+    def execute(self, state: dict) -> dict:
+        pass

From 7411ff061c9ea74ddcd043574da1d968f6abaf99 Mon Sep 17 00:00:00 2001
From: Matteo Vedovati
Date: Mon, 30 Sep 2024 12:21:26 +0200
Subject: [PATCH 17/36] Revert "initial creation of FetchNodeLevelK and
 DescriptionNode"

This reverts commit 336bf705ec6f8200987b9a10f1210d732a35c7b0.

---
 scrapegraphai/nodes/description_node.py   | 42 -----------------------
 scrapegraphai/nodes/fetch_node_level_K.py | 39 ---------------------
 2 files changed, 81 deletions(-)
 delete mode 100644 scrapegraphai/nodes/description_node.py
 delete mode 100644 scrapegraphai/nodes/fetch_node_level_K.py

diff --git a/scrapegraphai/nodes/description_node.py b/scrapegraphai/nodes/description_node.py
deleted file mode 100644
index 200d7032..00000000
--- a/scrapegraphai/nodes/description_node.py
+++ /dev/null
@@ -1,42 +0,0 @@
-"""
-DescriptionNode Module
-"""
-from typing import List, Optional
-from .base_node import BaseNode
-
-class DescriptionNode(BaseNode):
-    """
-    A node responsible for generating a description of a given document. This description is
-    generated using a language model and is used for retrieving the right documents.
-
-    It allows scraping of big documents without exceeding the token limit of the language model.
-
-    Attributes:
-        llm_model: An instance of a language model client, configured for generating answers.
-        verbose (bool): A flag indicating whether to show print statements during execution.
-
-    Args:
-        input (str): Boolean expression defining the input keys needed from the state.
-        output (List[str]): List of output keys to be updated in the state.
-        node_config (dict): Additional configuration for the node.
-        node_name (str): The unique identifier name for the node, defaulting to "Parse".
-    """
-
-    def __init__(
-        self,
-        input: str,
-        output: List[str],
-        node_config: Optional[dict] = None,
-        node_name: str = "Description",
-    ):
-        super().__init__(node_name, "node", input, output, 2, node_config)
-
-        self.llm_model = node_config["llm_model"]
-        self.embedder_model = node_config.get("embedder_model", None)
-        self.verbose = (
-            False if node_config is None else node_config.get("verbose", False)
-        )
-        self.cache_path = node_config.get("cache_path", False)
-
-    def execute(self, state: dict) -> dict:
-        pass

diff --git a/scrapegraphai/nodes/fetch_node_level_K.py b/scrapegraphai/nodes/fetch_node_level_K.py
deleted file mode 100644
index 2fd3aa8b..00000000
--- a/scrapegraphai/nodes/fetch_node_level_K.py
+++ /dev/null
@@ -1,39 +0,0 @@
-"""
-FetchNodeLevelK Module
-"""
-from typing import List, Optional
-from .base_node import BaseNode
-
-class FetchNodeLevelK(BaseNode):
-    """
-    A node responsible for fetching all the pages at a certain level of hyperlinks in the graph.
-
-    Attributes:
-        llm_model: An instance of a language model client, configured for generating answers.
-        verbose (bool): A flag indicating whether to show print statements during execution.
-
-    Args:
-        input (str): Boolean expression defining the input keys needed from the state.
-        output (List[str]): List of output keys to be updated in the state.
-        node_config (dict): Additional configuration for the node.
-        node_name (str): The unique identifier name for the node, defaulting to "Parse".
-    """
-
-    def __init__(
-        self,
-        input: str,
-        output: List[str],
-        node_config: Optional[dict] = None,
-        node_name: str = "FetchLevelK",
-    ):
-        super().__init__(node_name, "node", input, output, 2, node_config)
-
-        self.llm_model = node_config["llm_model"]
-        self.embedder_model = node_config.get("embedder_model", None)
-        self.verbose = (
-            False if node_config is None else node_config.get("verbose", False)
-        )
-        self.cache_path = node_config.get("cache_path", False)
-
-    def execute(self, state: dict) -> dict:
-        pass

From 462b27bc1d7ac29d0f668fe478867a4b357cb656 Mon Sep 17 00:00:00 2001
From: Matteo Vedovati
Date: Mon, 30 Sep 2024 12:21:33 +0200
Subject: [PATCH 18/36] Revert "Starting anew"

This reverts commit 89de5b6cba988421e3f12581707cdbc98a03e289.

---
 scrapegraphai/nodes/description_node.py   | 42 +++++++++++++++++
 scrapegraphai/nodes/fetch_node_level_k.py | 42 +++++++++++++++++
 2 files changed, 84 insertions(+)
 create mode 100644 scrapegraphai/nodes/description_node.py
 create mode 100644 scrapegraphai/nodes/fetch_node_level_k.py

diff --git a/scrapegraphai/nodes/description_node.py b/scrapegraphai/nodes/description_node.py
new file mode 100644
index 00000000..49ab941f
--- /dev/null
+++ b/scrapegraphai/nodes/description_node.py
@@ -0,0 +1,42 @@
+"""
+DescriptionNode Module
+"""
+from typing import List, Optional
+from .base_node import BaseNode
+
+class DescriptionNode(BaseNode):
+    """
+    A node responsible for compressing the input tokens and storing the document
+    in a vector database for retrieval. Relevant chunks are stored in the state.
+
+    It allows scraping of big documents without exceeding the token limit of the language model.
+
+    Attributes:
+        llm_model: An instance of a language model client, configured for generating answers.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+
+    Args:
+        input (str): Boolean expression defining the input keys needed from the state.
+        output (List[str]): List of output keys to be updated in the state.
+        node_config (dict): Additional configuration for the node.
+        node_name (str): The unique identifier name for the node, defaulting to "Parse".
+    """
+
+    def __init__(
+        self,
+        input: str,
+        output: List[str],
+        node_config: Optional[dict] = None,
+        node_name: str = "RAG",
+    ):
+        super().__init__(node_name, "node", input, output, 2, node_config)
+
+        self.llm_model = node_config["llm_model"]
+        self.embedder_model = node_config.get("embedder_model", None)
+        self.verbose = (
+            False if node_config is None else node_config.get("verbose", False)
+        )
+        self.cache_path = node_config.get("cache_path", False)
+
+    def execute(self, state: dict) -> dict:
+        pass

diff --git a/scrapegraphai/nodes/fetch_node_level_k.py b/scrapegraphai/nodes/fetch_node_level_k.py
new file mode 100644
index 00000000..18a0d435
--- /dev/null
+++ b/scrapegraphai/nodes/fetch_node_level_k.py
@@ -0,0 +1,42 @@
+"""
+FetchNodelevelK Module
+"""
+from typing import List, Optional
+from .base_node import BaseNode
+
+class FetchNodelevelK(BaseNode):
+    """
+    A node responsible for compressing the input tokens and storing the document
+    in a vector database for retrieval. Relevant chunks are stored in the state.
+
+    It allows scraping of big documents without exceeding the token limit of the language model.
+
+    Attributes:
+        llm_model: An instance of a language model client, configured for generating answers.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+
+    Args:
+        input (str): Boolean expression defining the input keys needed from the state.
+        output (List[str]): List of output keys to be updated in the state.
+        node_config (dict): Additional configuration for the node.
+        node_name (str): The unique identifier name for the node, defaulting to "Parse".
+    """
+
+    def __init__(
+        self,
+        input: str,
+        output: List[str],
+        node_config: Optional[dict] = None,
+        node_name: str = "RAG",
+    ):
+        super().__init__(node_name, "node", input, output, 2, node_config)
+
+        self.llm_model = node_config["llm_model"]
+        self.embedder_model = node_config.get("embedder_model", None)
+        self.verbose = (
+            False if node_config is None else node_config.get("verbose", False)
+        )
+        self.cache_path = node_config.get("cache_path", False)
+
+    def execute(self, state: dict) -> dict:
+        pass

From 6915f3edfd3e18d9c3fdedb677decb14f30afb49 Mon Sep 17 00:00:00 2001
From: Matteo Vedovati
Date: Mon, 30 Sep 2024 12:22:09 +0200
Subject: [PATCH 19/36] start from scratch

---
 scrapegraphai/nodes/fetch_node_level_k.py | 42 -----------------------
 1 file changed, 42 deletions(-)
 delete mode 100644 scrapegraphai/nodes/fetch_node_level_k.py

diff --git a/scrapegraphai/nodes/fetch_node_level_k.py b/scrapegraphai/nodes/fetch_node_level_k.py
deleted file mode 100644
index 18a0d435..00000000
--- a/scrapegraphai/nodes/fetch_node_level_k.py
+++ /dev/null
@@ -1,42 +0,0 @@
-"""
-FetchNodelevelK Module
-"""
-from typing import List, Optional
-from .base_node import BaseNode
-
-class FetchNodelevelK(BaseNode):
-    """
-    A node responsible for compressing the input tokens and storing the document
-    in a vector database for retrieval. Relevant chunks are stored in the state.
-
-    It allows scraping of big documents without exceeding the token limit of the language model.
-
-    Attributes:
-        llm_model: An instance of a language model client, configured for generating answers.
-        verbose (bool): A flag indicating whether to show print statements during execution.
-
-    Args:
-        input (str): Boolean expression defining the input keys needed from the state.
-        output (List[str]): List of output keys to be updated in the state.
-        node_config (dict): Additional configuration for the node.
-        node_name (str): The unique identifier name for the node, defaulting to "Parse".
- """ - - def __init__( - self, - input: str, - output: List[str], - node_config: Optional[dict] = None, - node_name: str = "RAG", - ): - super().__init__(node_name, "node", input, output, 2, node_config) - - self.llm_model = node_config["llm_model"] - self.embedder_model = node_config.get("embedder_model", None) - self.verbose = ( - False if node_config is None else node_config.get("verbose", False) - ) - self.cache_path = node_config.get("cache_path", False) - - def execute(self, state: dict) -> dict: - pass From 57bf572ab4a243a6d79155218bcc0d9d00dc3753 Mon Sep 17 00:00:00 2001 From: Matteo Vedovati Date: Mon, 30 Sep 2024 12:23:11 +0200 Subject: [PATCH 20/36] initial code for fetch nodel level K --- scrapegraphai/nodes/fetch_node_level_k.py | 39 +++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 scrapegraphai/nodes/fetch_node_level_k.py diff --git a/scrapegraphai/nodes/fetch_node_level_k.py b/scrapegraphai/nodes/fetch_node_level_k.py new file mode 100644 index 00000000..2fd3aa8b --- /dev/null +++ b/scrapegraphai/nodes/fetch_node_level_k.py @@ -0,0 +1,39 @@ +""" +FetchNodeLevelK Module +""" +from typing import List, Optional +from .base_node import BaseNode + +class FetchNodeLevelK(BaseNode): + """ + A node responsible for fetching all the pages at a certain level of hyperlink the graph. + + Attributes: + llm_model: An instance of a language model client, configured for generating answers. + verbose (bool): A flag indicating whether to show print statements during execution. + + Args: + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. + node_name (str): The unique identifier name for the node, defaulting to "Parse". + """ + + def __init__( + self, + input: str, + output: List[str], + node_config: Optional[dict] = None, + node_name: str = "FetchLevelK", + ): + super().__init__(node_name, "node", input, output, 2, node_config) + + self.llm_model = node_config["llm_model"] + self.embedder_model = node_config.get("embedder_model", None) + self.verbose = ( + False if node_config is None else node_config.get("verbose", False) + ) + self.cache_path = node_config.get("cache_path", False) + + def execute(self, state: dict) -> dict: + pass From d80b792e1529af8d87bb4534b777693e09b62feb Mon Sep 17 00:00:00 2001 From: Matteo Vedovati Date: Mon, 30 Sep 2024 12:42:26 +0200 Subject: [PATCH 21/36] fetching first level --- scrapegraphai/nodes/fetch_node_level_k.py | 80 +++++++++++++++++++- scrapegraphai/utils/1_manual.py | 92 +++++++++++++++++++++++ 2 files changed, 170 insertions(+), 2 deletions(-) create mode 100644 scrapegraphai/utils/1_manual.py diff --git a/scrapegraphai/nodes/fetch_node_level_k.py b/scrapegraphai/nodes/fetch_node_level_k.py index 2fd3aa8b..bbaafded 100644 --- a/scrapegraphai/nodes/fetch_node_level_k.py +++ b/scrapegraphai/nodes/fetch_node_level_k.py @@ -3,10 +3,17 @@ """ from typing import List, Optional from .base_node import BaseNode +from ..docloaders import ChromiumLoader +from ..utils.cleanup_html import cleanup_html +from ..utils.convert_to_md import convert_to_md +from langchain_core.documents import Document class FetchNodeLevelK(BaseNode): """ - A node responsible for fetching all the pages at a certain level of hyperlink the graph. + A node responsible for fetching the HTML content of a specified URL and all its sub-links + recursively up to a certain level of hyperlink the graph. 
This content is then used to update + the graph's state. It uses ChromiumLoader to fetch the content from a web page asynchronously + (with proxy protection). Attributes: llm_model: An instance of a language model client, configured for generating answers. @@ -29,11 +36,80 @@ def __init__( super().__init__(node_name, "node", input, output, 2, node_config) self.llm_model = node_config["llm_model"] + self.embedder_model = node_config.get("embedder_model", None) + self.verbose = ( False if node_config is None else node_config.get("verbose", False) ) + self.cache_path = node_config.get("cache_path", False) + + self.headless = ( + True if node_config is None else node_config.get("headless", True) + ) + + self.loader_kwargs = ( + {} if node_config is None else node_config.get("loader_kwargs", {}) + ) + + self.browser_base = ( + None if node_config is None else node_config.get("browser_base", None) + ) def execute(self, state: dict) -> dict: - pass + """ + Executes the node's logic to fetch the HTML content of a specified URL and all its sub-links + and update the graph's state with the content. + + Args: + state (dict): The current state of the graph. The input keys will be used + to fetch the correct data types from the state. + + Returns: + dict: The updated state with a new output key containing the fetched HTML content. + + Raises: + KeyError: If the input key is not found in the state, indicating that the + necessary information to perform the operation is missing. + """ + + self.logger.info(f"--- Executing {self.node_name} Node ---") + + # Interpret input keys based on the provided input expression + input_keys = self.get_input_keys(state) + # Fetching data from the state based on the input keys + input_data = [state[key] for key in input_keys] + + source = input_data[0] + + self.logger.info(f"--- (Fetching HTML from: {source}) ---") + + loader_kwargs = {} + + if self.node_config is not None: + loader_kwargs = self.node_config.get("loader_kwargs", {}) + + if self.browser_base is not None: + try: + from ..docloaders.browser_base import browser_base_fetch + except ImportError: + raise ImportError("""The browserbase module is not installed. 
+ Please install it using `pip install browserbase`.""") + + data = browser_base_fetch(self.browser_base.get("api_key"), + self.browser_base.get("project_id"), [source]) + + document = [Document(page_content=content, + metadata={"source": source}) for content in data] + + else: + loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs) + + document = loader.load() + + if not document or not document[0].page_content.strip(): + raise ValueError("""No HTML body content found in + the document fetched by ChromiumLoader.""") + + parsed_content = document[0].page_content \ No newline at end of file diff --git a/scrapegraphai/utils/1_manual.py b/scrapegraphai/utils/1_manual.py new file mode 100644 index 00000000..21703b7b --- /dev/null +++ b/scrapegraphai/utils/1_manual.py @@ -0,0 +1,92 @@ +import requests +import logging +import time +from urllib.parse import quote, urljoin +from typing import Optional +from bs4 import BeautifulSoup +from dotenv import load_dotenv +import os +import json +import markdownify + +load_dotenv() + +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + +def fetch_content(token: str, target_url: str, max_retries: int = 5, retry_delay: int = 3) -> Optional[str]: + encoded_url = quote(target_url) + url = f"http://api.scrape.do?url={encoded_url}&token={token}&render=true&waitUntil=networkidle0" + + for attempt in range(max_retries): + try: + response = requests.get(url) + if response.status_code == 200: + logging.info(f"Successfully fetched content from {target_url}") + return response.text + logging.warning(f"Failed with status {response.status_code}. Retrying in {retry_delay}s...") + except requests.RequestException as e: + logging.error(f"Error fetching {target_url}: {e}. 
Retrying in {retry_delay}s...") + time.sleep(retry_delay) + + logging.error(f"Failed to fetch {target_url} after {max_retries} attempts.") + return None + +def extract_links(html_content: str) -> list: + soup = BeautifulSoup(html_content, 'html.parser') + links = [link['href'] for link in soup.find_all('a', href=True)] + logging.info(f"Extracted {len(links)} links.") + return links + +def process_links(token: str, base_url: str, links: list, depth: int, current_depth: int = 1) -> dict: + content_dict = {} + for idx, link in enumerate(links, start=1): + full_link = link if link.startswith("http") else urljoin(base_url, link) + logging.info(f"Processing link {idx}: {full_link}") + link_content = fetch_content(token, full_link) + if link_content: + markdown_content = markdownify.markdownify(link_content, heading_style="ATX") + content_dict[full_link] = markdown_content + save_content_to_json(content_dict, idx) + + if current_depth < depth: + new_links = extract_links(link_content) + content_dict.update(process_links(token, full_link, new_links, depth, current_depth + 1)) + else: + logging.warning(f"Failed to fetch content for {full_link}") + return content_dict + +def save_content_to_json(content_dict: dict, idx: int): + if not os.path.exists("downloaded_pages"): + os.makedirs("downloaded_pages") + + file_name = f"scraped_content_{idx}.json" + file_path = os.path.join("downloaded_pages", file_name) + + with open(file_path, "w", encoding="utf-8") as json_file: + json.dump(content_dict, json_file, ensure_ascii=False, indent=4) + + logging.info(f"Content saved to {file_path}") + +if __name__ == "__main__": + token = os.getenv("TOKEN") + target_url = "https://www.wired.com" + depth = 2 + + if not token or not target_url: + logging.error("Please set the TOKEN and TARGET_URL environment variables.") + exit(1) + + html_content = fetch_content(token, target_url) + + if html_content: + links = extract_links(html_content) + logging.info("Links found:") + for link in links: + logging.info(link) + + content_dict = process_links(token, target_url, links, depth) + for link, content in content_dict.items(): + logging.info(f"Link: {link}") + logging.info(f"Content: {content[:500]}...") + else: + logging.error("Failed to fetch the content.") From 55199e8307721325a2a7e542b0e4938c5885929a Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Mon, 30 Sep 2024 14:23:46 +0200 Subject: [PATCH 22/36] add first iterations of the nodes --- scrapegraphai/nodes/description_node.py | 34 ++++++- scrapegraphai/nodes/generate_answer_node.py | 27 ++++- .../nodes/generate_answer_node_k_level.py | 98 +++++++++++++++++-- scrapegraphai/nodes/rag_node.py | 11 +-- .../prompts/description_node_prompts.py | 10 ++ 5 files changed, 164 insertions(+), 16 deletions(-) create mode 100644 scrapegraphai/prompts/description_node_prompts.py diff --git a/scrapegraphai/nodes/description_node.py b/scrapegraphai/nodes/description_node.py index 49ab941f..683aabe1 100644 --- a/scrapegraphai/nodes/description_node.py +++ b/scrapegraphai/nodes/description_node.py @@ -2,7 +2,11 @@ DescriptionNode Module """ from typing import List, Optional +from tqdm import tqdm +from langchain.prompts import PromptTemplate +from langchain_core.runnables import RunnableParallel from .base_node import BaseNode +from ..prompts.description_node_prompts import DESCRIPTION_NODE_PROMPT class DescriptionNode(BaseNode): """ @@ -39,4 +43,32 @@ def __init__( self.cache_path = node_config.get("cache_path", False) def execute(self, state: dict) -> dict: - pass + 
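`process_links` above recurses into every extracted link without remembering what it has already fetched, so pages that link back to each other can be downloaded repeatedly. A sketch of the same depth-limited walk with a `visited` set; it reuses the `extract_links` helper defined earlier in this script, and `fetch` is a stand-in for a `fetch_content`-style callable:

```python
from typing import Callable, Optional
from urllib.parse import urljoin

def crawl(fetch: Callable[[str], Optional[str]], base_url: str, links: list,
          depth: int, visited: Optional[set] = None) -> dict:
    """Depth-limited crawl that tracks visited URLs so cyclic links are fetched once."""
    visited = set() if visited is None else visited
    content = {}
    if depth < 1:
        return content
    for link in links:
        full_link = link if link.startswith("http") else urljoin(base_url, link)
        if full_link in visited:
            continue
        visited.add(full_link)
        html = fetch(full_link)
        if not html:
            continue
        content[full_link] = html
        if depth > 1:
            # extract_links is the BeautifulSoup helper defined earlier in this script
            content.update(crawl(fetch, full_link, extract_links(html), depth - 1, visited))
    return content
```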
self.logger.info(f"--- Executing {self.node_name} Node ---") + + input_keys = self.get_input_keys(state) + input_data = [state[key] for key in input_keys] + docs = input_data[1] + + chains_dict = {} + + for i, chunk in enumerate(tqdm(docs, desc="Processing chunks", disable=not self.verbose)): + prompt = PromptTemplate( + template=DESCRIPTION_NODE_PROMPT, + partial_variables={"context": chunk, + "chunk_id": i + 1 + } + ) + chain_name = f"chunk{i+1}" + chains_dict[chain_name] = prompt | self.llm_model + + async_runner = RunnableParallel(**chains_dict) + batch_results = async_runner.invoke() + + temp_res = {} + + for i, (summary, document) in enumerate(zip(batch_results, docs)): + temp_res[summary] = document + + state["descriptions"] = temp_res + + return state diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index 15686ec1..d5034a1e 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -1,3 +1,6 @@ +""" +generate_answer_node module +""" from typing import List, Optional from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser @@ -15,6 +18,26 @@ ) class GenerateAnswerNode(BaseNode): + """ + Initializes the GenerateAnswerNode class. + + Args: + input (str): The input data type for the node. + output (List[str]): The output data type(s) for the node. + node_config (Optional[dict]): Configuration dictionary for the node, + which includes the LLM model, verbosity, schema, and other settings. + Defaults to None. + node_name (str): The name of the node. Defaults to "GenerateAnswer". + + Attributes: + llm_model: The language model specified in the node configuration. + verbose (bool): Whether verbose mode is enabled. + force (bool): Whether to force certain behaviors, overriding defaults. + script_creator (bool): Whether the node is in script creation mode. + is_md_scraper (bool): Whether the node is scraping markdown data. + additional_info (Optional[str]): Any additional information to be + included in the prompt templates. 
+ """ def __init__( self, input: str, @@ -100,7 +123,9 @@ def execute(self, state: dict) -> dict: prompt = PromptTemplate( template=template_chunks_prompt, input_variables=["question"], - partial_variables={"context": chunk, "chunk_id": i + 1, "format_instructions": format_instructions} + partial_variables={"context": chunk, + "chunk_id": i + 1, + "format_instructions": format_instructions} ) chain_name = f"chunk{i+1}" chains_dict[chain_name] = prompt | self.llm_model diff --git a/scrapegraphai/nodes/generate_answer_node_k_level.py b/scrapegraphai/nodes/generate_answer_node_k_level.py index 1d4cdb4d..1733a380 100644 --- a/scrapegraphai/nodes/generate_answer_node_k_level.py +++ b/scrapegraphai/nodes/generate_answer_node_k_level.py @@ -2,7 +2,19 @@ GenerateAnswerNodeKLevel Module """ from typing import List, Optional +from langchain.prompts import PromptTemplate +from tqdm import tqdm +from langchain_core.output_parsers import JsonOutputParser +from langchain_core.runnables import RunnableParallel +from langchain_openai import ChatOpenAI, AzureChatOpenAI +from langchain_mistralai import ChatMistralAI +from langchain_aws import ChatBedrock +from ..utils.output_parser import get_structured_output_parser, get_pydantic_output_parser from .base_node import BaseNode +from ..prompts import ( + TEMPLATE_CHUNKS, TEMPLATE_NO_CHUNKS, TEMPLATE_MERGE, + TEMPLATE_CHUNKS_MD, TEMPLATE_NO_CHUNKS_MD, TEMPLATE_MERGE_MD +) class GenerateAnswerNodeKLevel(BaseNode): """ @@ -33,18 +45,92 @@ def __init__( self.llm_model = node_config["llm_model"] self.embedder_model = node_config.get("embedder_model", None) - self.verbose = ( - False if node_config is None else node_config.get("verbose", False) - ) + self.verbose = node_config.get("verbose", False) + self.force = node_config.get("force", False) + self.script_creator = node_config.get("script_creator", False) + self.is_md_scraper = node_config.get("is_md_scraper", False) + self.additional_info = node_config.get("additional_info") def execute(self, state: dict) -> dict: + input_keys = self.get_input_keys(state) + input_data = [state[key] for key in input_keys] + user_prompt = input_data[0] + + if self.node_config.get("schema", None) is not None: + if isinstance(self.llm_model, (ChatOpenAI, ChatMistralAI)): + self.llm_model = self.llm_model.with_structured_output( + schema=self.node_config["schema"] + ) + output_parser = get_structured_output_parser(self.node_config["schema"]) + format_instructions = "NA" + else: + if not isinstance(self.llm_model, ChatBedrock): + output_parser = get_pydantic_output_parser(self.node_config["schema"]) + format_instructions = output_parser.get_format_instructions() + else: + output_parser = None + format_instructions = "" + else: + if not isinstance(self.llm_model, ChatBedrock): + output_parser = JsonOutputParser() + format_instructions = output_parser.get_format_instructions() + else: + output_parser = None + format_instructions = "" + + if isinstance(self.llm_model, (ChatOpenAI, AzureChatOpenAI)) \ + and not self.script_creator \ + or self.force \ + and not self.script_creator or self.is_md_scraper: + template_no_chunks_prompt = TEMPLATE_NO_CHUNKS_MD + template_chunks_prompt = TEMPLATE_CHUNKS_MD + template_merge_prompt = TEMPLATE_MERGE_MD + else: + template_no_chunks_prompt = TEMPLATE_NO_CHUNKS + template_chunks_prompt = TEMPLATE_CHUNKS + template_merge_prompt = TEMPLATE_MERGE + + if self.additional_info is not None: + template_no_chunks_prompt = self.additional_info + template_no_chunks_prompt + template_chunks_prompt = 
self.additional_info + template_chunks_prompt + template_merge_prompt = self.additional_info + template_merge_prompt + client = state["vectorial_db"] - answer = client.query( - collection_name="demo_collection", - query_text="This is a query document" + answer_db = client.query( + collection_name="vectorial_collection", + query_text= state["question"] ) + results_db = [elem for elem in state[answer_db]] + + chains_dict = {} + for i, chunk in enumerate(tqdm(results_db, + desc="Processing chunks", disable=not self.verbose)): + prompt = PromptTemplate( + template=template_chunks_prompt, + input_variables=["question"], + partial_variables={"context": chunk, + "chunk_id": i + 1, + } + ) + chain_name = f"chunk{i+1}" + chains_dict[chain_name] = prompt | self.llm_model + + async_runner = RunnableParallel(**chains_dict) + batch_results = async_runner.invoke({"question": user_prompt}) + + merge_prompt = PromptTemplate( + template=template_merge_prompt, + input_variables=["context", "question"], + partial_variables={"format_instructions": format_instructions} + ) + + merge_chain = merge_prompt | self.llm_model + if output_parser: + merge_chain = merge_chain | output_parser + answer = merge_chain.invoke({"context": batch_results, "question": user_prompt}) + state["answer"] = answer return state diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py index c92e40f0..c137b987 100644 --- a/scrapegraphai/nodes/rag_node.py +++ b/scrapegraphai/nodes/rag_node.py @@ -49,18 +49,13 @@ def execute(self, state: dict) -> dict: else: raise ValueError("client_type provided not correct") - docs = ["Qdrant has Langchain integrations", "Qdrant also has Llama Index integrations"] - metadata = [ - {"source": "Langchain-docs"}, - {"source": "Linkedin-docs"}, - ] - ids = [42, 2] + docs = [elem for elem in state.get("descriptions").keys()] + metadata = [] client.add( - collection_name="demo_collection", + collection_name="vectorial_collection", documents=docs, metadata=metadata, - ids=ids ) state["vectorial_db"] = client diff --git a/scrapegraphai/prompts/description_node_prompts.py b/scrapegraphai/prompts/description_node_prompts.py new file mode 100644 index 00000000..5cd78d7f --- /dev/null +++ b/scrapegraphai/prompts/description_node_prompts.py @@ -0,0 +1,10 @@ +""" +description node prompts +""" + +DESCRIPTION_NODE_PROMPT = """ +You are a scraper and you have just scraped the +following content from a website. 
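The RAG node stores summaries with `client.add` and the answer node retrieves them with `client.query`; both come from qdrant-client's fastembed integration, which embeds the text locally. A self-contained sketch against an in-memory instance (collection name and texts are placeholders, and `fastembed` must be installed):

```python
from qdrant_client import QdrantClient

client = QdrantClient(":memory:")  # throwaway in-process instance

# add() embeds the documents with fastembed and stores them under the given ids
client.add(
    collection_name="vectorial_collection",
    documents=["Summary of the projects page", "Summary of the about page"],
    ids=[0, 1],
)

# query() embeds the query text the same way and returns scored hits
hits = client.query(
    collection_name="vectorial_collection",
    query_text="Which page lists the projects?",
    limit=2,
)
for hit in hits:
    print(hit.id, hit.score, hit.document)
```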
\n +Please provide a description summary of maximum of 10 words +Content of the website: {content} +""" \ No newline at end of file From e88fee9a2bb5c3ad8d791560c45d0c1a8f4b73bb Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Mon, 30 Sep 2024 15:10:55 +0200 Subject: [PATCH 23/36] Update generate_answer_node_k_level.py --- scrapegraphai/nodes/generate_answer_node_k_level.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapegraphai/nodes/generate_answer_node_k_level.py b/scrapegraphai/nodes/generate_answer_node_k_level.py index 1733a380..24235e71 100644 --- a/scrapegraphai/nodes/generate_answer_node_k_level.py +++ b/scrapegraphai/nodes/generate_answer_node_k_level.py @@ -99,7 +99,7 @@ def execute(self, state: dict) -> dict: answer_db = client.query( collection_name="vectorial_collection", - query_text= state["question"] + query_text=state["question"] ) results_db = [elem for elem in state[answer_db]] From 45f02cd4e2606a768fb6c147b28eaf1fda5a7ee8 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Tue, 1 Oct 2024 11:13:06 +0200 Subject: [PATCH 24/36] refactoring of the format --- scrapegraphai/nodes/description_node.py | 6 +++++- scrapegraphai/nodes/generate_answer_node_k_level.py | 1 + scrapegraphai/nodes/rag_node.py | 6 +++--- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/scrapegraphai/nodes/description_node.py b/scrapegraphai/nodes/description_node.py index 683aabe1..6175133a 100644 --- a/scrapegraphai/nodes/description_node.py +++ b/scrapegraphai/nodes/description_node.py @@ -67,7 +67,11 @@ def execute(self, state: dict) -> dict: temp_res = {} for i, (summary, document) in enumerate(zip(batch_results, docs)): - temp_res[summary] = document + temp_res[summary] = { + "id": i, + "summary": summary, + "document": document + } state["descriptions"] = temp_res diff --git a/scrapegraphai/nodes/generate_answer_node_k_level.py b/scrapegraphai/nodes/generate_answer_node_k_level.py index 24235e71..10977617 100644 --- a/scrapegraphai/nodes/generate_answer_node_k_level.py +++ b/scrapegraphai/nodes/generate_answer_node_k_level.py @@ -102,6 +102,7 @@ def execute(self, state: dict) -> dict: query_text=state["question"] ) + ## TODO: from the id get the data results_db = [elem for elem in state[answer_db]] chains_dict = {} diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py index c137b987..cac41a99 100644 --- a/scrapegraphai/nodes/rag_node.py +++ b/scrapegraphai/nodes/rag_node.py @@ -49,13 +49,13 @@ def execute(self, state: dict) -> dict: else: raise ValueError("client_type provided not correct") - docs = [elem for elem in state.get("descriptions").keys()] - metadata = [] + docs = [elem.get("summary") for elem in state.get("descriptions", {})] + ids = [elem.get("id") for elem in state.get("descriptions", {})] client.add( collection_name="vectorial_collection", documents=docs, - metadata=metadata, + ids=ids ) state["vectorial_db"] = client From 4cb621feab7e014cba13798c0dd7d4f42b9938db Mon Sep 17 00:00:00 2001 From: Matteo Vedovati Date: Wed, 2 Oct 2024 10:22:21 +0200 Subject: [PATCH 25/36] fetch node level k implementation --- examples/openai/fetch_multiple_links.py | 21 +++++ scrapegraphai/graphs/__init__.py | 1 + scrapegraphai/graphs/depth_search_graph.py | 96 ++++++++++++++++++++++ scrapegraphai/nodes/__init__.py | 2 +- scrapegraphai/nodes/fetch_node_level_k.py | 80 ++++++++++++++++-- 5 files changed, 193 insertions(+), 7 deletions(-) create mode 100644 examples/openai/fetch_multiple_links.py create mode 100644 
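The `## TODO: from the id get the data` left in the answer node points at the missing step: mapping query hits back to the stored documents. One possible sketch, assuming the `descriptions` state shape produced by the refactor above (`{summary: {"id": ..., "summary": ..., "document": ...}}`) and Qdrant hits exposing an `id` field:

```python
def documents_for_hits(hits, descriptions: dict) -> list:
    """Map Qdrant hit ids back to the original documents, preserving hit order.

    `descriptions` is assumed to be {summary: {"id": int, "summary": str, "document": ...}}.
    """
    by_id = {meta["id"]: meta["document"] for meta in descriptions.values()}
    return [by_id[hit.id] for hit in hits if hit.id in by_id]
```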
scrapegraphai/graphs/depth_search_graph.py diff --git a/examples/openai/fetch_multiple_links.py b/examples/openai/fetch_multiple_links.py new file mode 100644 index 00000000..53e246de --- /dev/null +++ b/examples/openai/fetch_multiple_links.py @@ -0,0 +1,21 @@ + +from scrapegraphai.graphs import DepthSearchGraph + +graph_config = { + "llm": { + "api_key":"YOUR_API_KEY", + "model": "openai/gpt-4o-mini", + }, + "verbose": True, + "headless": False, + "depth": 2, +} + +search_graph = DepthSearchGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io/projects/", + config=graph_config +) + +result = search_graph.run() +print(result) \ No newline at end of file diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py index efd6bd7e..b5ffcc47 100644 --- a/scrapegraphai/graphs/__init__.py +++ b/scrapegraphai/graphs/__init__.py @@ -26,3 +26,4 @@ from .screenshot_scraper_graph import ScreenshotScraperGraph from .smart_scraper_multi_concat_graph import SmartScraperMultiConcatGraph from .code_generator_graph import CodeGeneratorGraph +from .depth_search_graph import DepthSearchGraph diff --git a/scrapegraphai/graphs/depth_search_graph.py b/scrapegraphai/graphs/depth_search_graph.py new file mode 100644 index 00000000..a96d96a7 --- /dev/null +++ b/scrapegraphai/graphs/depth_search_graph.py @@ -0,0 +1,96 @@ +""" +... Module +""" +from typing import Optional +import logging +from pydantic import BaseModel +from .base_graph import BaseGraph +from .abstract_graph import AbstractGraph +from ..utils.save_code_to_file import save_code_to_file +from ..nodes import ( + FetchNodeLevelK +) + +class DepthSearchGraph(AbstractGraph): + """ + CodeGeneratorGraph is a script generator pipeline that generates the function extract_data(html: str) -> dict() for + extracting the wanted information from a HTML page. The code generated is in Python and uses the library BeautifulSoup. + It requires a user prompt, a source URL, and an output schema. + + Attributes: + prompt (str): The prompt for the graph. + source (str): The source of the graph. + config (dict): Configuration parameters for the graph. + schema (BaseModel): The schema for the graph output. + llm_model: An instance of a language model client, configured for generating answers. + embedder_model: An instance of an embedding model client, + configured for generating embeddings. + verbose (bool): A flag indicating whether to show print statements during execution. + headless (bool): A flag indicating whether to run the graph in headless mode. + library (str): The library used for web scraping (beautiful soup). + + Args: + prompt (str): The prompt for the graph. + source (str): The source of the graph. + config (dict): Configuration parameters for the graph. + schema (BaseModel): The schema for the graph output. + + Example: + >>> code_gen = CodeGeneratorGraph( + ... "List me all the attractions in Chioggia.", + ... "https://en.wikipedia.org/wiki/Chioggia", + ... {"llm": {"model": "openai/gpt-3.5-turbo"}} + ... ) + >>> result = code_gen.run() + ) + """ + + def __init__(self, prompt: str, source: str, config: dict, schema: Optional[BaseModel] = None): + + super().__init__(prompt, config, source, schema) + + self.input_key = "url" if source.startswith("http") else "local_dir" + + def _create_graph(self) -> BaseGraph: + """ + Creates the graph of nodes representing the workflow for web scraping. + + Returns: + BaseGraph: A graph instance representing the web scraping workflow. 
+ """ + + fetch_node = FetchNodeLevelK( + input="url| local_dir", + output=["docs"], + node_config={ + "loader_kwargs": self.config.get("loader_kwargs", {}), + "force": self.config.get("force", False), + "cut": self.config.get("cut", True), + "browser_base": self.config.get("browser_base"), + "depth": self.config.get("depth", 1) + } + ) + + return BaseGraph( + nodes=[ + fetch_node + ], + edges=[], + entry_point=fetch_node, + graph_name=self.__class__.__name__ + ) + + def run(self) -> str: + """ + Executes the scraping process and returns the generated code. + + Returns: + str: The generated code. + """ + + inputs = {"user_prompt": self.prompt, self.input_key: self.source} + self.final_state, self.execution_info = self.graph.execute(inputs) + + docs = self.final_state.get("docs", "No docs") + + return docs \ No newline at end of file diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py index e5fafb87..7b994746 100644 --- a/scrapegraphai/nodes/__init__.py +++ b/scrapegraphai/nodes/__init__.py @@ -28,6 +28,6 @@ from .generate_code_node import GenerateCodeNode from .search_node_with_context import SearchLinksWithContext from .reasoning_node import ReasoningNode -from .fetch_node_level_k import FetchNodelevelK +from .fetch_node_level_k import FetchNodeLevelK from .generate_answer_node_k_level import GenerateAnswerNodeKLevel from .description_node import DescriptionNode diff --git a/scrapegraphai/nodes/fetch_node_level_k.py b/scrapegraphai/nodes/fetch_node_level_k.py index bbaafded..f9fd57a8 100644 --- a/scrapegraphai/nodes/fetch_node_level_k.py +++ b/scrapegraphai/nodes/fetch_node_level_k.py @@ -7,6 +7,8 @@ from ..utils.cleanup_html import cleanup_html from ..utils.convert_to_md import convert_to_md from langchain_core.documents import Document +from bs4 import BeautifulSoup +from urllib.parse import quote, urljoin class FetchNodeLevelK(BaseNode): """ @@ -34,8 +36,6 @@ def __init__( node_name: str = "FetchLevelK", ): super().__init__(node_name, "node", input, output, 2, node_config) - - self.llm_model = node_config["llm_model"] self.embedder_model = node_config.get("embedder_model", None) @@ -56,6 +56,16 @@ def __init__( self.browser_base = ( None if node_config is None else node_config.get("browser_base", None) ) + + self.depth = ( + 1 if node_config is None else node_config.get("depth", 1) + ) + + self.only_inside_links = ( + False if node_config is None else node_config.get("only_inside_links", False) + ) + + self.min_input_len = 1 def execute(self, state: dict) -> dict: """ @@ -83,6 +93,8 @@ def execute(self, state: dict) -> dict: source = input_data[0] + documents = [{"source": source}] + self.logger.info(f"--- (Fetching HTML from: {source}) ---") loader_kwargs = {} @@ -90,6 +102,12 @@ def execute(self, state: dict) -> dict: if self.node_config is not None: loader_kwargs = self.node_config.get("loader_kwargs", {}) + for _ in range(self.depth): + documents = self.obtain_content(documents, loader_kwargs) + + return {self.output_keys[0]: documents} + + def fetch_content(self, source: str, loader_kwargs) -> Optional[str]: if self.browser_base is not None: try: from ..docloaders.browser_base import browser_base_fetch @@ -108,8 +126,58 @@ def execute(self, state: dict) -> dict: document = loader.load() - if not document or not document[0].page_content.strip(): - raise ValueError("""No HTML body content found in - the document fetched by ChromiumLoader.""") + return document + + def extract_links(self, html_content: str) -> list: + soup = BeautifulSoup(html_content, 
'html.parser') + links = [link['href'] for link in soup.find_all('a', href=True)] + self.logger.info(f"Extracted {len(links)} links.") + return links + + def get_full_links(self, base_url: str, links: list) -> list: + full_links = [] + for link in links: + if self.only_inside_links and link.startswith("http"): + continue + full_link = link if link.startswith("http") else urljoin(base_url, link) + full_links.append(full_link) + return full_links + + def obtain_content(self, documents: List, loader_kwargs) -> List: + for doc in documents: + source = doc['source'] + if 'document' not in doc: + document = self.fetch_content(source, loader_kwargs) + + if not document or not document[0].page_content.strip(): + self.logger.warning(f"Failed to fetch content for {source}") + documents.remove(doc) + continue + + doc['document'] = document[0].page_content - parsed_content = document[0].page_content \ No newline at end of file + links = self.extract_links(doc['document']) + full_links = self.get_full_links(source, links) + + # Check if the links are already present in other documents + for link in full_links: + # Check if any document is from the same link + if not any(d.get('source', '') == link for d in documents): + # Add the document + documents.append({"source": link}) + + return documents + + def process_links(self, base_url: str, links: list, loader_kwargs, depth: int, current_depth: int = 1) -> dict: + content_dict = {} + for idx, link in enumerate(links, start=1): + full_link = link if link.startswith("http") else urljoin(base_url, link) + self.logger.info(f"Processing link {idx}: {full_link}") + link_content = self.fetch_content(full_link, loader_kwargs) + + if current_depth < depth: + new_links = self.extract_links(link_content) + content_dict.update(self.process_links(full_link, new_links, depth, current_depth + 1)) + else: + self.logger.warning(f"Failed to fetch content for {full_link}") + return content_dict \ No newline at end of file From ea3ae1fd6d2406a0b1b4c3337eab24cea44c9656 Mon Sep 17 00:00:00 2001 From: Matteo Vedovati Date: Wed, 2 Oct 2024 11:01:23 +0200 Subject: [PATCH 26/36] fetch multiple links fix --- examples/openai/fetch_multiple_links.py | 1 + scrapegraphai/graphs/depth_search_graph.py | 3 ++- scrapegraphai/nodes/fetch_node_level_k.py | 12 +++++++++--- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/examples/openai/fetch_multiple_links.py b/examples/openai/fetch_multiple_links.py index 53e246de..c9c07877 100644 --- a/examples/openai/fetch_multiple_links.py +++ b/examples/openai/fetch_multiple_links.py @@ -9,6 +9,7 @@ "verbose": True, "headless": False, "depth": 2, + "only_inside_links": True, } search_graph = DepthSearchGraph( diff --git a/scrapegraphai/graphs/depth_search_graph.py b/scrapegraphai/graphs/depth_search_graph.py index a96d96a7..fa6294a0 100644 --- a/scrapegraphai/graphs/depth_search_graph.py +++ b/scrapegraphai/graphs/depth_search_graph.py @@ -67,7 +67,8 @@ def _create_graph(self) -> BaseGraph: "force": self.config.get("force", False), "cut": self.config.get("cut", True), "browser_base": self.config.get("browser_base"), - "depth": self.config.get("depth", 1) + "depth": self.config.get("depth", 1), + "only_inside_links": self.config.get("only_inside_links", False) } ) diff --git a/scrapegraphai/nodes/fetch_node_level_k.py b/scrapegraphai/nodes/fetch_node_level_k.py index f9fd57a8..ff329a39 100644 --- a/scrapegraphai/nodes/fetch_node_level_k.py +++ b/scrapegraphai/nodes/fetch_node_level_k.py @@ -105,7 +105,11 @@ def execute(self, state: dict) -> 
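`extract_links` and `get_full_links` above resolve relative hrefs against the page URL, and `only_inside_links` keeps the crawl on-site by skipping absolute links outright. A slightly stricter standalone variant that compares hostnames instead of discarding every absolute URL (a judgment call, not what the node itself does):

```python
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup

def normalized_links(base_url: str, html: str, only_inside: bool = False) -> list:
    """Resolve every <a href> against base_url; optionally keep same-host links only."""
    soup = BeautifulSoup(html, "html.parser")
    base_host = urlparse(base_url).netloc
    links = []
    for a in soup.find_all("a", href=True):
        full = urljoin(base_url, a["href"])  # absolute URLs pass through unchanged
        if only_inside and urlparse(full).netloc != base_host:
            continue
        links.append(full)
    return links

print(normalized_links("https://example.com/docs/", "<a href='../about'>About</a>"))
# ['https://example.com/about']
```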
dict: for _ in range(self.depth): documents = self.obtain_content(documents, loader_kwargs) - return {self.output_keys[0]: documents} + filtered_documents = [doc for doc in documents if 'document' in doc] + + state.update({self.output[0]: filtered_documents}) + + return state def fetch_content(self, source: str, loader_kwargs) -> Optional[str]: if self.browser_base is not None: @@ -144,6 +148,7 @@ def get_full_links(self, base_url: str, links: list) -> list: return full_links def obtain_content(self, documents: List, loader_kwargs) -> List: + new_documents = [] for doc in documents: source = doc['source'] if 'document' not in doc: @@ -162,10 +167,11 @@ def obtain_content(self, documents: List, loader_kwargs) -> List: # Check if the links are already present in other documents for link in full_links: # Check if any document is from the same link - if not any(d.get('source', '') == link for d in documents): + if not any(d.get('source', '') == link for d in documents) and not any(d.get('source', '') == link for d in new_documents): # Add the document - documents.append({"source": link}) + new_documents.append({"source": link}) + documents.extend(new_documents) return documents def process_links(self, base_url: str, links: list, loader_kwargs, depth: int, current_depth: int = 1) -> dict: From 2bdb01b07a7011564c23f1117fe524f9238fae1b Mon Sep 17 00:00:00 2001 From: Matteo Vedovati Date: Wed, 2 Oct 2024 11:04:17 +0200 Subject: [PATCH 27/36] Create parse_node_depth_k.py --- scrapegraphai/nodes/parse_node_depth_k.py | 156 ++++++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 scrapegraphai/nodes/parse_node_depth_k.py diff --git a/scrapegraphai/nodes/parse_node_depth_k.py b/scrapegraphai/nodes/parse_node_depth_k.py new file mode 100644 index 00000000..fd2f3810 --- /dev/null +++ b/scrapegraphai/nodes/parse_node_depth_k.py @@ -0,0 +1,156 @@ +""" +ParseNode Module +""" +import re +from typing import List, Optional, Tuple +from urllib.parse import urljoin +from langchain_community.document_transformers import Html2TextTransformer +from langchain_core.documents import Document +from .base_node import BaseNode +from ..utils.split_text_into_chunks import split_text_into_chunks +from ..helpers import default_filters + +class ParseNode(BaseNode): + """ + A node responsible for parsing HTML content from a document. + The parsed content is split into chunks for further processing. + + This node enhances the scraping workflow by allowing for targeted extraction of + content, thereby optimizing the processing of large HTML documents. + + Attributes: + verbose (bool): A flag indicating whether to show print statements during execution. + + Args: + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. + node_name (str): The unique identifier name for the node, defaulting to "Parse". 
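The duplicate check in `obtain_content` scans both document lists with `any()` for every candidate link, which turns quadratic as the frontier grows. A sketch of the same bookkeeping using a set of seen sources:

```python
def queue_new_sources(documents: list, new_documents: list, candidate_links: list) -> None:
    """Same dedupe as the two any() scans above, but O(1) per link via a set."""
    seen = {d.get("source") for d in documents} | {d.get("source") for d in new_documents}
    for link in candidate_links:
        if link not in seen:
            new_documents.append({"source": link})
            seen.add(link)
```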
+ """ + + def __init__( + self, + input: str, + output: List[str], + node_config: Optional[dict] = None, + node_name: str = "ParseNode", + ): + super().__init__(node_name, "node", input, output, 1, node_config) + + self.verbose = ( + False if node_config is None else node_config.get("verbose", False) + ) + self.parse_html = ( + True if node_config is None else node_config.get("parse_html", True) + ) + self.parse_urls = ( + False if node_config is None else node_config.get("parse_urls", False) + ) + + self.llm_model = node_config.get("llm_model") + self.chunk_size = node_config.get("chunk_size") + + def execute(self, state: dict) -> dict: + """ + Executes the node's logic to parse the HTML document content and split it into chunks. + + Args: + state (dict): The current state of the graph. The input keys will be used to fetch the + correct data from the state. + + Returns: + dict: The updated state with the output key containing the parsed content chunks. + + Raises: + KeyError: If the input keys are not found in the state, indicating that the + necessary information for parsing the content is missing. + """ + + self.logger.info(f"--- Executing {self.node_name} Node ---") + + input_keys = self.get_input_keys(state) + + input_data = [state[key] for key in input_keys] + docs_transformed = input_data[0] + source = input_data[1] if self.parse_urls else None + + if self.parse_html: + docs_transformed = Html2TextTransformer(ignore_links=False).transform_documents(input_data[0]) + docs_transformed = docs_transformed[0] + + link_urls, img_urls = self._extract_urls(docs_transformed.page_content, source) + + chunks = split_text_into_chunks(text=docs_transformed.page_content, + chunk_size=self.chunk_size-250, model=self.llm_model) + else: + docs_transformed = docs_transformed[0] + + link_urls, img_urls = self._extract_urls(docs_transformed.page_content, source) + + chunk_size = self.chunk_size + chunk_size = min(chunk_size - 500, int(chunk_size * 0.75)) + + if isinstance(docs_transformed, Document): + chunks = split_text_into_chunks(text=docs_transformed.page_content, + chunk_size=chunk_size, + model=self.llm_model) + else: + chunks = split_text_into_chunks(text=docs_transformed, + chunk_size=chunk_size, + model=self.llm_model) + + state.update({self.output[0]: chunks}) + if self.parse_urls: + state.update({self.output[1]: link_urls}) + state.update({self.output[2]: img_urls}) + + return state + + def _extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]: + """ + Extracts URLs from the given text. + + Args: + text (str): The text to extract URLs from. + + Returns: + Tuple[List[str], List[str]]: A tuple containing the extracted link URLs and image URLs. + """ + if not self.parse_urls: + return [], [] + + image_extensions = default_filters.filter_dict["img_exts"] + image_extension_seq = '|'.join(image_extensions).replace('.','') + url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + image_extension_seq + '))') + + all_urls = url_pattern.findall(text) + all_urls = self._clean_urls(all_urls) + + if not source.startswith("http"): + all_urls = [url for url in all_urls if url.startswith("http")] + else: + all_urls = [urljoin(source, url) for url in all_urls] + + images = [url for url in all_urls if any(url.endswith(ext) for ext in image_extensions)] + links = [url for url in all_urls if url not in images] + + return links, images + + def _clean_urls(self, urls: List[str]) -> List[str]: + """ + Cleans the URLs extracted from the text. 
+ + Args: + urls (List[str]): The list of URLs to clean. + + Returns: + List[str]: The cleaned URLs. + """ + cleaned_urls = [] + for url in urls: + url = re.sub(r'.*?\]\(', '', url) + url = url.rstrip(').') + + cleaned_urls.append(url) + + return cleaned_urls From f755d56bb1e2406668c5114e649953adbbff6748 Mon Sep 17 00:00:00 2001 From: Matteo Vedovati Date: Wed, 2 Oct 2024 12:28:48 +0200 Subject: [PATCH 28/36] updated parse node --- scrapegraphai/graphs/depth_search_graph.py | 18 +++- scrapegraphai/nodes/__init__.py | 1 + scrapegraphai/nodes/parse_node_depth_k.py | 120 +++------------------ 3 files changed, 33 insertions(+), 106 deletions(-) diff --git a/scrapegraphai/graphs/depth_search_graph.py b/scrapegraphai/graphs/depth_search_graph.py index fa6294a0..6ad3b245 100644 --- a/scrapegraphai/graphs/depth_search_graph.py +++ b/scrapegraphai/graphs/depth_search_graph.py @@ -8,7 +8,8 @@ from .abstract_graph import AbstractGraph from ..utils.save_code_to_file import save_code_to_file from ..nodes import ( - FetchNodeLevelK + FetchNodeLevelK, + ParseNodeDepthK ) class DepthSearchGraph(AbstractGraph): @@ -71,12 +72,23 @@ def _create_graph(self) -> BaseGraph: "only_inside_links": self.config.get("only_inside_links", False) } ) + + parse_node = ParseNodeDepthK( + input="docs", + output=["docs"], + node_config={ + "verbose": self.config.get("verbose", False) + } + ) return BaseGraph( nodes=[ - fetch_node + fetch_node, + parse_node + ], + edges=[ + (fetch_node, parse_node), ], - edges=[], entry_point=fetch_node, graph_name=self.__class__.__name__ ) diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py index 7b994746..edb195a5 100644 --- a/scrapegraphai/nodes/__init__.py +++ b/scrapegraphai/nodes/__init__.py @@ -31,3 +31,4 @@ from .fetch_node_level_k import FetchNodeLevelK from .generate_answer_node_k_level import GenerateAnswerNodeKLevel from .description_node import DescriptionNode +from .parse_node_depth_k import ParseNodeDepthK diff --git a/scrapegraphai/nodes/parse_node_depth_k.py b/scrapegraphai/nodes/parse_node_depth_k.py index fd2f3810..30afa23c 100644 --- a/scrapegraphai/nodes/parse_node_depth_k.py +++ b/scrapegraphai/nodes/parse_node_depth_k.py @@ -1,19 +1,14 @@ """ -ParseNode Module +ParseNodeDepthK Module """ import re from typing import List, Optional, Tuple -from urllib.parse import urljoin -from langchain_community.document_transformers import Html2TextTransformer -from langchain_core.documents import Document from .base_node import BaseNode -from ..utils.split_text_into_chunks import split_text_into_chunks -from ..helpers import default_filters +from ..utils.convert_to_md import convert_to_md -class ParseNode(BaseNode): +class ParseNodeDepthK(BaseNode): """ - A node responsible for parsing HTML content from a document. - The parsed content is split into chunks for further processing. + A node responsible for parsing HTML content from a series of documents. This node enhances the scraping workflow by allowing for targeted extraction of content, thereby optimizing the processing of large HTML documents. 
@@ -33,26 +28,17 @@ def __init__( input: str, output: List[str], node_config: Optional[dict] = None, - node_name: str = "ParseNode", + node_name: str = "ParseNodeDepthK", ): super().__init__(node_name, "node", input, output, 1, node_config) self.verbose = ( False if node_config is None else node_config.get("verbose", False) ) - self.parse_html = ( - True if node_config is None else node_config.get("parse_html", True) - ) - self.parse_urls = ( - False if node_config is None else node_config.get("parse_urls", False) - ) - - self.llm_model = node_config.get("llm_model") - self.chunk_size = node_config.get("chunk_size") def execute(self, state: dict) -> dict: """ - Executes the node's logic to parse the HTML document content and split it into chunks. + Executes the node's logic to parse the HTML documents content. Args: state (dict): The current state of the graph. The input keys will be used to fetch the @@ -67,90 +53,18 @@ def execute(self, state: dict) -> dict: """ self.logger.info(f"--- Executing {self.node_name} Node ---") - + + # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) - + # Fetching data from the state based on the input keys input_data = [state[key] for key in input_keys] - docs_transformed = input_data[0] - source = input_data[1] if self.parse_urls else None - - if self.parse_html: - docs_transformed = Html2TextTransformer(ignore_links=False).transform_documents(input_data[0]) - docs_transformed = docs_transformed[0] - - link_urls, img_urls = self._extract_urls(docs_transformed.page_content, source) - - chunks = split_text_into_chunks(text=docs_transformed.page_content, - chunk_size=self.chunk_size-250, model=self.llm_model) - else: - docs_transformed = docs_transformed[0] - - link_urls, img_urls = self._extract_urls(docs_transformed.page_content, source) - - chunk_size = self.chunk_size - chunk_size = min(chunk_size - 500, int(chunk_size * 0.75)) - - if isinstance(docs_transformed, Document): - chunks = split_text_into_chunks(text=docs_transformed.page_content, - chunk_size=chunk_size, - model=self.llm_model) - else: - chunks = split_text_into_chunks(text=docs_transformed, - chunk_size=chunk_size, - model=self.llm_model) - - state.update({self.output[0]: chunks}) - if self.parse_urls: - state.update({self.output[1]: link_urls}) - state.update({self.output[2]: img_urls}) + documents = input_data[0] + + for doc in documents: + document_md = convert_to_md(doc["document"]) + doc["document_md"] = document_md + + state.update({self.output[0]: documents}) + return state - - def _extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]: - """ - Extracts URLs from the given text. - - Args: - text (str): The text to extract URLs from. - - Returns: - Tuple[List[str], List[str]]: A tuple containing the extracted link URLs and image URLs. 
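`ParseNodeDepthK` reduces each fetched page to markdown via the project's `convert_to_md` helper. An equivalent standalone conversion with `markdownify`, which the manual script earlier in this series already uses:

```python
import markdownify

html = "<h1>Projects</h1><p>A <a href='/rocket'>rocket</a> demo.</p>"
md = markdownify.markdownify(html, heading_style="ATX")
print(md)  # an ATX-style markdown rendering: "# Projects" followed by the paragraph
```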
- """ - if not self.parse_urls: - return [], [] - - image_extensions = default_filters.filter_dict["img_exts"] - image_extension_seq = '|'.join(image_extensions).replace('.','') - url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + image_extension_seq + '))') - - all_urls = url_pattern.findall(text) - all_urls = self._clean_urls(all_urls) - - if not source.startswith("http"): - all_urls = [url for url in all_urls if url.startswith("http")] - else: - all_urls = [urljoin(source, url) for url in all_urls] - - images = [url for url in all_urls if any(url.endswith(ext) for ext in image_extensions)] - links = [url for url in all_urls if url not in images] - - return links, images - - def _clean_urls(self, urls: List[str]) -> List[str]: - """ - Cleans the URLs extracted from the text. - - Args: - urls (List[str]): The list of URLs to clean. - - Returns: - List[str]: The cleaned URLs. - """ - cleaned_urls = [] - for url in urls: - url = re.sub(r'.*?\]\(', '', url) - url = url.rstrip(').') - - cleaned_urls.append(url) - - return cleaned_urls From 015c6fd90504b03981d6e259e2f1aa5b16fa2472 Mon Sep 17 00:00:00 2001 From: Matteo Vedovati Date: Wed, 2 Oct 2024 13:06:00 +0200 Subject: [PATCH 29/36] remove link from markdown --- scrapegraphai/nodes/fetch_node_level_k.py | 9 +++++---- scrapegraphai/nodes/parse_node_depth_k.py | 6 ++++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/scrapegraphai/nodes/fetch_node_level_k.py b/scrapegraphai/nodes/fetch_node_level_k.py index ff329a39..5cdd6571 100644 --- a/scrapegraphai/nodes/fetch_node_level_k.py +++ b/scrapegraphai/nodes/fetch_node_level_k.py @@ -95,8 +95,6 @@ def execute(self, state: dict) -> dict: documents = [{"source": source}] - self.logger.info(f"--- (Fetching HTML from: {source}) ---") - loader_kwargs = {} if self.node_config is not None: @@ -112,6 +110,8 @@ def execute(self, state: dict) -> dict: return state def fetch_content(self, source: str, loader_kwargs) -> Optional[str]: + self.logger.info(f"--- (Fetching HTML from: {source}) ---") + if self.browser_base is not None: try: from ..docloaders.browser_base import browser_base_fetch @@ -159,9 +159,10 @@ def obtain_content(self, documents: List, loader_kwargs) -> List: documents.remove(doc) continue - doc['document'] = document[0].page_content + #doc['document'] = document[0].page_content + doc['document'] = document - links = self.extract_links(doc['document']) + links = self.extract_links(doc['document'][0].page_content) full_links = self.get_full_links(source, links) # Check if the links are already present in other documents diff --git a/scrapegraphai/nodes/parse_node_depth_k.py b/scrapegraphai/nodes/parse_node_depth_k.py index 30afa23c..7b7ab194 100644 --- a/scrapegraphai/nodes/parse_node_depth_k.py +++ b/scrapegraphai/nodes/parse_node_depth_k.py @@ -5,6 +5,7 @@ from typing import List, Optional, Tuple from .base_node import BaseNode from ..utils.convert_to_md import convert_to_md +from langchain_community.document_transformers import Html2TextTransformer class ParseNodeDepthK(BaseNode): """ @@ -62,8 +63,9 @@ def execute(self, state: dict) -> dict: documents = input_data[0] for doc in documents: - document_md = convert_to_md(doc["document"]) - doc["document_md"] = document_md + document_md = Html2TextTransformer(ignore_links=True).transform_documents(doc["document"]) + #document_md = convert_to_md(doc["document"]) + doc["document"] = document_md[0].page_content state.update({self.output[0]: documents}) From 6124fbdfca9d5b20129e3737023d5e689f9dea7c Mon Sep 17 00:00:00 2001 
From: Marco Vinciguerra Date: Wed, 2 Oct 2024 13:41:15 +0200 Subject: [PATCH 30/36] add embeddings with openai --- .../nodes/generate_answer_node_k_level.py | 20 +++++++++-- scrapegraphai/nodes/rag_node.py | 36 +++++++++++++++++++ 2 files changed, 53 insertions(+), 3 deletions(-) diff --git a/scrapegraphai/nodes/generate_answer_node_k_level.py b/scrapegraphai/nodes/generate_answer_node_k_level.py index 10977617..8dea5c98 100644 --- a/scrapegraphai/nodes/generate_answer_node_k_level.py +++ b/scrapegraphai/nodes/generate_answer_node_k_level.py @@ -97,10 +97,24 @@ def execute(self, state: dict) -> dict: client = state["vectorial_db"] - answer_db = client.query( - collection_name="vectorial_collection", - query_text=state["question"] + if state.get("embeddings"): + import openai + openai_client = openai.Client() + + answer_db = client.search( + collection_name="collection", + query_vector=openai_client.embeddings.create( + input=["What is the best to use for vector search scaling?"], + model=state.get("embeddings").get("model"), + ) + .data[0] + .embedding, ) + else: + answer_db = client.query( + collection_name="vectorial_collection", + query_text=state["question"] + ) ## TODO: from the id get the data results_db = [elem for elem in state[answer_db]] diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py index cac41a99..3f861478 100644 --- a/scrapegraphai/nodes/rag_node.py +++ b/scrapegraphai/nodes/rag_node.py @@ -4,6 +4,7 @@ from typing import List, Optional from .base_node import BaseNode from qdrant_client import QdrantClient +from qdrant_client.models import PointStruct, VectorParams, Distance class RAGNode(BaseNode): """ @@ -52,6 +53,41 @@ def execute(self, state: dict) -> dict: docs = [elem.get("summary") for elem in state.get("descriptions", {})] ids = [elem.get("id") for elem in state.get("descriptions", {})] + if state.get("embeddings"): + import openai + openai_client = openai.Client() + + files = state.get("documents") + + array_of_embeddings = [] + i=0 + + for file in files: + embeddings = openai_client.embeddings.create(input=file, + model=state.get("embeddings").get("model")) + i+=1 + points = PointStruct( + id=i, + vector=embeddings, + payload={"text": file}, + ) + + array_of_embeddings.append(points) + + collection_name = "collection" + + client.create_collection( + collection_name, + vectors_config=VectorParams( + size=1536, + distance=Distance.COSINE, + ), + ) + client.upsert(collection_name, points) + + state["vectorial_db"] = client + return state + client.add( collection_name="vectorial_collection", documents=docs, From 4b371f4d94dae47986aad751508813d89ce87b93 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Thu, 3 Oct 2024 11:38:14 +0200 Subject: [PATCH 31/36] feat: add deep scraper implementation Co-Authored-By: Matteo Vedovati <68272450+vedovati-matteo@users.noreply.github.com> --- ..._links.py => depth_search_graph_openai.py} | 6 +- scrapegraphai/graphs/depth_search_graph.py | 62 ++++++++++++++++--- scrapegraphai/nodes/description_node.py | 3 +- 3 files changed, 57 insertions(+), 14 deletions(-) rename examples/openai/{fetch_multiple_links.py => depth_search_graph_openai.py} (89%) diff --git a/examples/openai/fetch_multiple_links.py b/examples/openai/depth_search_graph_openai.py similarity index 89% rename from examples/openai/fetch_multiple_links.py rename to examples/openai/depth_search_graph_openai.py index c9c07877..7cde7865 100644 --- a/examples/openai/fetch_multiple_links.py +++ b/examples/openai/depth_search_graph_openai.py @@ -1,4 
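This commit wires OpenAI embeddings into the RAG and answer nodes. An end-to-end sketch of that flow — embed each document, upsert the points as a batch, then search with the query's own embedding vector — assuming `text-embedding-3-small` (1536 dimensions), an in-memory Qdrant, and an API key in the environment:

```python
import openai
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance

oa = openai.OpenAI()  # assumes OPENAI_API_KEY is set
qc = QdrantClient(":memory:")

def embed(text: str) -> list:
    """Return the raw embedding vector for one string."""
    return oa.embeddings.create(input=[text], model="text-embedding-3-small").data[0].embedding

docs = ["Qdrant has LangChain integrations", "Qdrant also has LlamaIndex integrations"]

qc.create_collection(
    "collection",
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),  # 1536 dims for this model
)
qc.upsert(
    "collection",
    [PointStruct(id=i, vector=embed(d), payload={"text": d}) for i, d in enumerate(docs)],
)

hits = qc.search(collection_name="collection",
                 query_vector=embed("vector search integrations"), limit=1)
print(hits[0].payload["text"])
```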
+1,6 @@ - +""" +depth_search_graph_opeani example +""" from scrapegraphai.graphs import DepthSearchGraph graph_config = { @@ -19,4 +21,4 @@ ) result = search_graph.run() -print(result) \ No newline at end of file +print(result) diff --git a/scrapegraphai/graphs/depth_search_graph.py b/scrapegraphai/graphs/depth_search_graph.py index 6ad3b245..a93d8fcf 100644 --- a/scrapegraphai/graphs/depth_search_graph.py +++ b/scrapegraphai/graphs/depth_search_graph.py @@ -9,13 +9,18 @@ from ..utils.save_code_to_file import save_code_to_file from ..nodes import ( FetchNodeLevelK, - ParseNodeDepthK + ParseNodeDepthK, + DescriptionNode, + RAGNode, + GenerateAnswerNodeKLevel ) class DepthSearchGraph(AbstractGraph): """ - CodeGeneratorGraph is a script generator pipeline that generates the function extract_data(html: str) -> dict() for - extracting the wanted information from a HTML page. The code generated is in Python and uses the library BeautifulSoup. + CodeGeneratorGraph is a script generator pipeline that generates + the function extract_data(html: str) -> dict() for + extracting the wanted information from a HTML page. The + code generated is in Python and uses the library BeautifulSoup. It requires a user prompt, a source URL, and an output schema. Attributes: @@ -60,7 +65,7 @@ def _create_graph(self) -> BaseGraph: BaseGraph: A graph instance representing the web scraping workflow. """ - fetch_node = FetchNodeLevelK( + fetch_node_k = FetchNodeLevelK( input="url| local_dir", output=["docs"], node_config={ @@ -72,8 +77,8 @@ def _create_graph(self) -> BaseGraph: "only_inside_links": self.config.get("only_inside_links", False) } ) - - parse_node = ParseNodeDepthK( + + parse_node_k = ParseNodeDepthK( input="docs", output=["docs"], node_config={ @@ -81,15 +86,52 @@ def _create_graph(self) -> BaseGraph: } ) + description_node = DescriptionNode( + input="docs", + output=["docs"], + node_config={ + "llm_model": self.llm_model, + "verbose": self.config.get("verbose", False), + "cache_path": self.config.get("cache_path", False) + } + ) + + rag_node = RAGNode ( + input="docs", + output=["vectorial_db"], + node_config={ + "llm_model": self.llm_model, + "embedder_model": self.config.get("embedder_model", False), + "verbose": self.config.get("verbose", False), + } + ) + + generate_answer_k = GenerateAnswerNodeKLevel( + input="vectorial_db", + output=["answer"], + node_config={ + "llm_model": self.llm_model, + "embedder_model": self.config.get("embedder_model", False), + "verbose": self.config.get("verbose", False), + } + + ) + return BaseGraph( nodes=[ - fetch_node, - parse_node + fetch_node_k, + parse_node_k, + description_node, + rag_node, + generate_answer_k ], edges=[ - (fetch_node, parse_node), + (fetch_node_k, parse_node_k), + (parse_node_k, description_node), + (description_node, rag_node), + (rag_node, generate_answer_k) ], - entry_point=fetch_node, + entry_point=fetch_node_k, graph_name=self.__class__.__name__ ) diff --git a/scrapegraphai/nodes/description_node.py b/scrapegraphai/nodes/description_node.py index 6175133a..97ef2e8f 100644 --- a/scrapegraphai/nodes/description_node.py +++ b/scrapegraphai/nodes/description_node.py @@ -31,12 +31,11 @@ def __init__( input: str, output: List[str], node_config: Optional[dict] = None, - node_name: str = "RAG", + node_name: str = "DESCRIPTION", ): super().__init__(node_name, "node", input, output, 2, node_config) self.llm_model = node_config["llm_model"] - self.embedder_model = node_config.get("embedder_model", None) self.verbose = ( False if node_config is None 
else node_config.get("verbose", False) ) From 85cb9572971719f9f7c66171f5e2246376b6aed2 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Thu, 3 Oct 2024 13:13:04 +0200 Subject: [PATCH 32/36] feat: finished basic version of deep scraper Co-Authored-By: Matteo Vedovati <68272450+vedovati-matteo@users.noreply.github.com> --- examples/openai/depth_search_graph_openai.py | 12 +++- pyproject.toml | 6 +- requirements-dev.lock | 54 ++++++++++++++++++ requirements.lock | 56 +++++++++++++++++++ scrapegraphai/graphs/depth_search_graph.py | 4 +- scrapegraphai/nodes/description_node.py | 21 ++----- .../nodes/generate_answer_node_k_level.py | 21 ++++--- scrapegraphai/nodes/rag_node.py | 9 +-- .../prompts/description_node_prompts.py | 2 +- .../prompts/generate_answer_node_prompts.py | 2 + 10 files changed, 149 insertions(+), 38 deletions(-) diff --git a/examples/openai/depth_search_graph_openai.py b/examples/openai/depth_search_graph_openai.py index 7cde7865..dff07ad4 100644 --- a/examples/openai/depth_search_graph_openai.py +++ b/examples/openai/depth_search_graph_openai.py @@ -1,22 +1,28 @@ """ depth_search_graph_opeani example """ +import os +from dotenv import load_dotenv from scrapegraphai.graphs import DepthSearchGraph +load_dotenv() + +openai_key = os.getenv("OPENAI_APIKEY") + graph_config = { "llm": { - "api_key":"YOUR_API_KEY", + "api_key": openai_key, "model": "openai/gpt-4o-mini", }, "verbose": True, "headless": False, "depth": 2, - "only_inside_links": True, + "only_inside_links": False, } search_graph = DepthSearchGraph( prompt="List me all the projects with their description", - source="https://perinim.github.io/projects/", + source="https://perinim.github.io", config=graph_config ) diff --git a/pyproject.toml b/pyproject.toml index dde97395..deacd437 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,9 @@ dependencies = [ "google>=3.0.0", "langchain-ollama>=0.1.3", "semchunk==2.2.0", - "transformers==4.44.2" + "transformers==4.44.2", + "qdrant-client>=1.11.3", + "fastembed>=0.3.6" ] license = "MIT" @@ -99,7 +101,7 @@ screenshot_scraper = [ "pillow>=10.4.0", ] -# Group 5: Faiss CPU +# Group 5: qdrant qdrant = [ "qdrant-client>=1.11.3", "fastembed>=0.3.6" diff --git a/requirements-dev.lock b/requirements-dev.lock index 1d9d469a..3423cef0 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -64,6 +64,8 @@ click==8.1.7 # via burr # via streamlit # via uvicorn +coloredlogs==15.0.1 + # via onnxruntime contourpy==1.2.1 # via matplotlib cycler==0.12.1 @@ -84,9 +86,13 @@ fastapi==0.112.0 # via burr fastapi-pagination==0.12.26 # via burr +fastembed==0.3.6 + # via scrapegraphai filelock==3.15.4 # via huggingface-hub # via transformers +flatbuffers==24.3.25 + # via onnxruntime fonttools==4.53.1 # via matplotlib free-proxy==1.1.1 @@ -132,11 +138,19 @@ greenlet==3.0.3 grpcio==1.65.4 # via google-api-core # via grpcio-status + # via grpcio-tools + # via qdrant-client grpcio-status==1.62.3 # via google-api-core +grpcio-tools==1.62.3 + # via qdrant-client h11==0.14.0 # via httpcore # via uvicorn +h2==4.1.0 + # via httpx +hpack==4.0.0 + # via h2 html2text==2024.2.26 # via scrapegraphai httpcore==1.0.5 @@ -149,11 +163,17 @@ httpx==0.27.0 # via langsmith # via ollama # via openai + # via qdrant-client httpx-sse==0.4.0 # via langchain-mistralai huggingface-hub==0.24.5 + # via fastembed # via tokenizers # via transformers +humanfriendly==10.0 + # via coloredlogs +hyperframe==6.0.1 + # via h2 idna==3.7 # via anyio # via httpx @@ -218,6 +238,7 @@ langsmith==0.1.121 # via 
langchain-core loguru==0.7.2 # via burr + # via fastembed lxml==5.3.0 # via free-proxy markdown-it-py==3.0.0 @@ -236,8 +257,12 @@ minify-html==0.15.0 # via scrapegraphai mistral-common==1.4.1 # via scrapegraphai +mmh3==4.1.0 + # via fastembed mpire==2.10.2 # via semchunk +mpmath==1.3.0 + # via sympy multidict==6.0.5 # via aiohttp # via yarl @@ -249,19 +274,27 @@ narwhals==1.3.0 # via altair numpy==1.26.4 # via contourpy + # via fastembed # via langchain # via langchain-aws # via langchain-community # via matplotlib + # via onnx + # via onnxruntime # via opencv-python-headless # via pandas # via pyarrow # via pydeck + # via qdrant-client # via sf-hamilton # via streamlit # via transformers ollama==0.3.2 # via langchain-ollama +onnx==1.17.0 + # via fastembed +onnxruntime==1.19.2 + # via fastembed openai==1.40.3 # via burr # via langchain-openai @@ -275,6 +308,7 @@ packaging==24.1 # via langchain-core # via marshmallow # via matplotlib + # via onnxruntime # via pytest # via sphinx # via streamlit @@ -284,6 +318,7 @@ pandas==2.2.2 # via sf-hamilton # via streamlit pillow==10.4.0 + # via fastembed # via matplotlib # via mistral-common # via streamlit @@ -294,6 +329,8 @@ playwright==1.45.1 # via undetected-playwright pluggy==1.5.0 # via pytest +portalocker==2.10.1 + # via qdrant-client proto-plus==1.24.0 # via google-ai-generativelanguage # via google-api-core @@ -303,6 +340,9 @@ protobuf==4.25.4 # via google-generativeai # via googleapis-common-protos # via grpcio-status + # via grpcio-tools + # via onnx + # via onnxruntime # via proto-plus # via streamlit pyarrow==17.0.0 @@ -326,6 +366,7 @@ pydantic==2.8.2 # via mistral-common # via openai # via pydantic-settings + # via qdrant-client pydantic-core==2.20.1 # via pydantic pydantic-settings==2.5.2 @@ -343,6 +384,8 @@ pylint==3.2.6 pyparsing==3.1.2 # via httplib2 # via matplotlib +pystemmer==2.2.0.1 + # via fastembed pytest==8.0.0 # via pytest-mock pytest-mock==3.14.0 @@ -361,6 +404,8 @@ pyyaml==6.0.2 # via langchain-community # via langchain-core # via transformers +qdrant-client==1.11.3 + # via scrapegraphai referencing==0.35.1 # via jsonschema # via jsonschema-specifications @@ -369,6 +414,7 @@ regex==2024.7.24 # via transformers requests==2.32.3 # via burr + # via fastembed # via free-proxy # via google-api-core # via huggingface-hub @@ -395,6 +441,8 @@ semchunk==2.2.0 # via scrapegraphai sentencepiece==0.2.0 # via mistral-common +setuptools==75.1.0 + # via grpcio-tools sf-hamilton==1.73.1 # via burr six==1.16.0 @@ -406,6 +454,7 @@ sniffio==1.3.1 # via httpx # via openai snowballstemmer==2.2.0 + # via fastembed # via sphinx soupsieve==2.5 # via beautifulsoup4 @@ -434,6 +483,8 @@ starlette==0.37.2 # via fastapi streamlit==1.37.1 # via burr +sympy==1.13.3 + # via onnxruntime tenacity==8.5.0 # via langchain # via langchain-community @@ -444,6 +495,7 @@ tiktoken==0.7.0 # via mistral-common # via scrapegraphai tokenizers==0.19.1 + # via fastembed # via langchain-mistralai # via transformers toml==0.10.2 @@ -456,6 +508,7 @@ tomlkit==0.13.0 tornado==6.4.1 # via streamlit tqdm==4.66.5 + # via fastembed # via google-generativeai # via huggingface-hub # via mpire @@ -495,6 +548,7 @@ uritemplate==4.1.1 # via google-api-python-client urllib3==1.26.19 # via botocore + # via qdrant-client # via requests uvicorn==0.30.5 # via burr diff --git a/requirements.lock b/requirements.lock index 84e25a0f..8949648a 100644 --- a/requirements.lock +++ b/requirements.lock @@ -41,6 +41,8 @@ certifi==2024.7.4 # via requests charset-normalizer==3.3.2 # via requests 
+coloredlogs==15.0.1 + # via onnxruntime dataclasses-json==0.6.7 # via langchain-community dill==0.3.8 @@ -49,9 +51,13 @@ distro==1.9.0 # via openai exceptiongroup==1.2.2 # via anyio +fastembed==0.3.6 + # via scrapegraphai filelock==3.15.4 # via huggingface-hub # via transformers +flatbuffers==24.3.25 + # via onnxruntime free-proxy==1.1.1 # via scrapegraphai frozenlist==1.4.1 @@ -87,10 +93,18 @@ greenlet==3.0.3 grpcio==1.65.1 # via google-api-core # via grpcio-status + # via grpcio-tools + # via qdrant-client grpcio-status==1.62.2 # via google-api-core +grpcio-tools==1.62.3 + # via qdrant-client h11==0.14.0 # via httpcore +h2==4.1.0 + # via httpx +hpack==4.0.0 + # via h2 html2text==2024.2.26 # via scrapegraphai httpcore==1.0.5 @@ -103,11 +117,17 @@ httpx==0.27.0 # via langsmith # via ollama # via openai + # via qdrant-client httpx-sse==0.4.0 # via langchain-mistralai huggingface-hub==0.24.1 + # via fastembed # via tokenizers # via transformers +humanfriendly==10.0 + # via coloredlogs +hyperframe==6.0.1 + # via h2 idna==3.7 # via anyio # via httpx @@ -156,6 +176,8 @@ langsmith==0.1.121 # via langchain # via langchain-community # via langchain-core +loguru==0.7.2 + # via fastembed lxml==5.2.2 # via free-proxy marshmallow==3.21.3 @@ -164,8 +186,12 @@ minify-html==0.15.0 # via scrapegraphai mistral-common==1.4.1 # via scrapegraphai +mmh3==4.1.0 + # via fastembed mpire==2.10.2 # via semchunk +mpmath==1.3.0 + # via sympy multidict==6.0.5 # via aiohttp # via yarl @@ -174,14 +200,22 @@ multiprocess==0.70.16 mypy-extensions==1.0.0 # via typing-inspect numpy==1.26.4 + # via fastembed # via langchain # via langchain-aws # via langchain-community + # via onnx + # via onnxruntime # via opencv-python-headless # via pandas + # via qdrant-client # via transformers ollama==0.3.2 # via langchain-ollama +onnx==1.17.0 + # via fastembed +onnxruntime==1.19.2 + # via fastembed openai==1.41.0 # via langchain-openai opencv-python-headless==4.10.0.84 @@ -192,14 +226,18 @@ packaging==24.1 # via huggingface-hub # via langchain-core # via marshmallow + # via onnxruntime # via transformers pandas==2.2.2 # via scrapegraphai pillow==10.4.0 + # via fastembed # via mistral-common playwright==1.45.1 # via scrapegraphai # via undetected-playwright +portalocker==2.10.1 + # via qdrant-client proto-plus==1.24.0 # via google-ai-generativelanguage # via google-api-core @@ -209,6 +247,9 @@ protobuf==4.25.3 # via google-generativeai # via googleapis-common-protos # via grpcio-status + # via grpcio-tools + # via onnx + # via onnxruntime # via proto-plus pyasn1==0.6.0 # via pyasn1-modules @@ -226,6 +267,7 @@ pydantic==2.8.2 # via mistral-common # via openai # via pydantic-settings + # via qdrant-client pydantic-core==2.20.1 # via pydantic pydantic-settings==2.5.2 @@ -236,6 +278,8 @@ pygments==2.18.0 # via mpire pyparsing==3.1.2 # via httplib2 +pystemmer==2.2.0.1 + # via fastembed python-dateutil==2.9.0.post0 # via botocore # via pandas @@ -250,6 +294,8 @@ pyyaml==6.0.1 # via langchain-community # via langchain-core # via transformers +qdrant-client==1.11.3 + # via scrapegraphai referencing==0.35.1 # via jsonschema # via jsonschema-specifications @@ -257,6 +303,7 @@ regex==2024.5.15 # via tiktoken # via transformers requests==2.32.3 + # via fastembed # via free-proxy # via google-api-core # via huggingface-hub @@ -279,17 +326,23 @@ semchunk==2.2.0 # via scrapegraphai sentencepiece==0.2.0 # via mistral-common +setuptools==75.1.0 + # via grpcio-tools six==1.16.0 # via python-dateutil sniffio==1.3.1 # via anyio # via httpx # via openai 
+snowballstemmer==2.2.0 + # via fastembed soupsieve==2.5 # via beautifulsoup4 sqlalchemy==2.0.31 # via langchain # via langchain-community +sympy==1.13.3 + # via onnxruntime tenacity==8.5.0 # via langchain # via langchain-community @@ -299,9 +352,11 @@ tiktoken==0.7.0 # via mistral-common # via scrapegraphai tokenizers==0.19.1 + # via fastembed # via langchain-mistralai # via transformers tqdm==4.66.4 + # via fastembed # via google-generativeai # via huggingface-hub # via mpire @@ -333,6 +388,7 @@ uritemplate==4.1.1 # via google-api-python-client urllib3==1.26.19 # via botocore + # via qdrant-client # via requests yarl==1.9.4 # via aiohttp diff --git a/scrapegraphai/graphs/depth_search_graph.py b/scrapegraphai/graphs/depth_search_graph.py index a93d8fcf..13b39129 100644 --- a/scrapegraphai/graphs/depth_search_graph.py +++ b/scrapegraphai/graphs/depth_search_graph.py @@ -146,6 +146,6 @@ def run(self) -> str: inputs = {"user_prompt": self.prompt, self.input_key: self.source} self.final_state, self.execution_info = self.graph.execute(inputs) - docs = self.final_state.get("docs", "No docs") + docs = self.final_state.get("answer", "No answer") - return docs \ No newline at end of file + return docs diff --git a/scrapegraphai/nodes/description_node.py b/scrapegraphai/nodes/description_node.py index 97ef2e8f..60c56cec 100644 --- a/scrapegraphai/nodes/description_node.py +++ b/scrapegraphai/nodes/description_node.py @@ -44,34 +44,25 @@ def __init__( def execute(self, state: dict) -> dict: self.logger.info(f"--- Executing {self.node_name} Node ---") - input_keys = self.get_input_keys(state) - input_data = [state[key] for key in input_keys] - docs = input_data[1] + docs = [elem for elem in state.get("docs")] chains_dict = {} for i, chunk in enumerate(tqdm(docs, desc="Processing chunks", disable=not self.verbose)): prompt = PromptTemplate( template=DESCRIPTION_NODE_PROMPT, - partial_variables={"context": chunk, - "chunk_id": i + 1 - } + partial_variables={"content": chunk.get("document")} ) chain_name = f"chunk{i+1}" chains_dict[chain_name] = prompt | self.llm_model async_runner = RunnableParallel(**chains_dict) - batch_results = async_runner.invoke() + batch_results = async_runner.invoke({}) - temp_res = {} - for i, (summary, document) in enumerate(zip(batch_results, docs)): - temp_res[summary] = { - "id": i, - "summary": summary, - "document": document - } + for i in range(1, len(docs)+1): + docs[i-1]["summary"] = batch_results.get(f"chunk{i}").content - state["descriptions"] = temp_res + state.update({self.output[0]: docs}) return state diff --git a/scrapegraphai/nodes/generate_answer_node_k_level.py b/scrapegraphai/nodes/generate_answer_node_k_level.py index 8dea5c98..291109f2 100644 --- a/scrapegraphai/nodes/generate_answer_node_k_level.py +++ b/scrapegraphai/nodes/generate_answer_node_k_level.py @@ -52,9 +52,9 @@ def __init__( self.additional_info = node_config.get("additional_info") def execute(self, state: dict) -> dict: - input_keys = self.get_input_keys(state) - input_data = [state[key] for key in input_keys] - user_prompt = input_data[0] + self.logger.info(f"--- Executing {self.node_name} Node ---") + + user_prompt = state.get("user_prompt") if self.node_config.get("schema", None) is not None: if isinstance(self.llm_model, (ChatOpenAI, ChatMistralAI)): @@ -113,19 +113,18 @@ def execute(self, state: dict) -> dict: else: answer_db = client.query( collection_name="vectorial_collection", - query_text=state["question"] + query_text=user_prompt ) - ## TODO: from the id get the data - results_db 
= [elem for elem in state[answer_db]] - chains_dict = {} - for i, chunk in enumerate(tqdm(results_db, + elems =[state.get("docs")[elem.id-1] for elem in answer_db if elem.score>0.5] + + for i, chunk in enumerate(tqdm(elems, desc="Processing chunks", disable=not self.verbose)): prompt = PromptTemplate( template=template_chunks_prompt, - input_variables=["question"], - partial_variables={"context": chunk, + input_variables=["format_instructions"], + partial_variables={"context": chunk.get("document"), "chunk_id": i + 1, } ) @@ -133,7 +132,7 @@ def execute(self, state: dict) -> dict: chains_dict[chain_name] = prompt | self.llm_model async_runner = RunnableParallel(**chains_dict) - batch_results = async_runner.invoke({"question": user_prompt}) + batch_results = async_runner.invoke({"format_instructions": user_prompt}) merge_prompt = PromptTemplate( template=template_merge_prompt, diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py index 3f861478..b67c50e9 100644 --- a/scrapegraphai/nodes/rag_node.py +++ b/scrapegraphai/nodes/rag_node.py @@ -40,8 +40,9 @@ def __init__( ) def execute(self, state: dict) -> dict: - - if self.node_config.get("client_type") == "memory": + self.logger.info(f"--- Executing {self.node_name} Node ---") + + if self.node_config.get("client_type") in ["memory", None]: client = QdrantClient(":memory:") elif self.node_config.get("client_type") == "local_db": client = QdrantClient(path="path/to/db") @@ -50,8 +51,8 @@ def execute(self, state: dict) -> dict: else: raise ValueError("client_type provided not correct") - docs = [elem.get("summary") for elem in state.get("descriptions", {})] - ids = [elem.get("id") for elem in state.get("descriptions", {})] + docs = [elem.get("summary") for elem in state.get("docs")] + ids = [i for i in range(1, len(state.get("docs"))+1)] if state.get("embeddings"): import openai diff --git a/scrapegraphai/prompts/description_node_prompts.py b/scrapegraphai/prompts/description_node_prompts.py index 5cd78d7f..20df481a 100644 --- a/scrapegraphai/prompts/description_node_prompts.py +++ b/scrapegraphai/prompts/description_node_prompts.py @@ -5,6 +5,6 @@ DESCRIPTION_NODE_PROMPT = """ You are a scraper and you have just scraped the following content from a website. \n -Please provide a description summary of maximum of 10 words +Please provide a description summary of maximum of 20 words Content of the website: {content} """ \ No newline at end of file diff --git a/scrapegraphai/prompts/generate_answer_node_prompts.py b/scrapegraphai/prompts/generate_answer_node_prompts.py index 7c098fe2..1b336fb4 100644 --- a/scrapegraphai/prompts/generate_answer_node_prompts.py +++ b/scrapegraphai/prompts/generate_answer_node_prompts.py @@ -2,6 +2,7 @@ Generate answer node prompts """ + TEMPLATE_CHUNKS_MD = """ You are a website scraper and you have just scraped the following content from a website converted in markdown format. @@ -32,6 +33,7 @@ You are now asked to answer a user question about the content you have scraped.\n You have scraped many chunks since the website is big and now you are asked to merge them into a single answer without repetitions (if there are any).\n Make sure that if a maximum number of items is specified in the instructions that you get that maximum number and do not exceed it. \n +The structure should be coherent. \n Make sure the output format is a valid JSON and does not contain errors. 
\n
OUTPUT INSTRUCTIONS: {format_instructions}\n
USER QUESTION: {question}\n

From cb46efbe4622597ca6ecbdaa8f750eb7ccc74d14 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Thu, 3 Oct 2024 14:33:52 +0200
Subject: [PATCH 33/36] changed dependencies

---
 README.md | 6 ------
 pyproject.toml | 6 ------
 2 files changed, 12 deletions(-)

diff --git a/README.md b/README.md
index 51bc3fa9..5d79bf55 100644
--- a/README.md
+++ b/README.md
@@ -54,12 +54,6 @@ Additional dependencies can be added while installing the library:
 pip install scrapegraphai[more-browser-options]
 ```

-- qdrants Options: this group includes qdrant integration for RAGnode and DeepScraperGraph.
-
-  ```bash
-  pip install scrapegraphai[qdrant]
-  ```
-
diff --git a/pyproject.toml b/pyproject.toml
index deacd437..4c5e5117 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -101,12 +101,6 @@ screenshot_scraper = [
     "pillow>=10.4.0",
 ]

-# Group 5: qdrant
-qdrant = [
-    "qdrant-client>=1.11.3",
-    "fastembed>=0.3.6"
-]
-
 [build-system]
 requires = ["hatchling"]
 build-backend = "hatchling.build"

From c91975e0c81fd2f77007039503c0f1a02685c969 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Thu, 3 Oct 2024 15:32:29 +0200
Subject: [PATCH 34/36] update examples

---
 ...aper_haiku.py => csv_scraper_anthropic.py} | 0
 ...y => csv_scraper_graph_multi_anthropic.py} | 0
 ...aph_haiku.py => custom_graph_anthropic.py} | 0
 .../anthropic/depth_search_graph_anthropic.py | 28 ++++++++++++++
 ...per_haiku.py => json_scraper_anthropic.py} | 0
 ...iku.py => json_scraper_multi_anthropic.py} | 0
 ...aiku.py => pdf_scraper_graph_anthropic.py} | 0
 ...aiku.py => pdf_scraper_multi_anthropic.py} | 0
 ...limit_haiku.py => rate_limit_anthropic.py} | 0
 ...aiku.py => scrape_plain_text_anthropic.py} | 0
 ...haiku.py => script_generator_anthropic.py} | 0
 ...py => script_multi_generator_anthropic.py} | 0
 ...aph_haiku.py => search_graph_anthropic.py} | 0
 ...ku.py => search_graph_schema_anthropic.py} | 0
 ...aiku.py => search_link_graph_anthropic.py} | 0
 ...er_haiku.py => smart_scraper_anthropic.py} | 0
 ...ku.py => smart_scraper_multi_anthropic.py} | 0
 ...> smart_scraper_multi_concat_anthropic.py} | 0
 ...u.py => smart_scraper_schema_anthropic.py} | 0
 ...aper_haiku.py => xml_scraper_anthropic.py} | 0
 ...y => xml_scraper_graph_multi_anthropic.py} | 0
 examples/azure/code_generator_graph_azure.py | 2 +-
 examples/azure/csv_scraper_azure.py | 2 +-
 .../azure/csv_scraper_graph_multi_azure.py | 2 +-
 examples/azure/depth_search_graph_azure.py | 30 +++++++++++++++
 examples/azure/json_scraper_azure.py | 2 +-
 examples/azure/json_scraper_multi_azure.py | 2 +-
 examples/azure/pdf_scraper_azure.py | 2 +-
 examples/azure/rate_limit_azure.py | 2 +-
 examples/azure/scrape_plain_text_azure.py | 2 +-
 examples/azure/script_generator_azure.py | 2 +-
 .../azure/script_multi_generator_azure.py | 2 +-
 examples/azure/search_graph_azure.py | 2 +-
 examples/azure/search_graph_schema_azure.py | 2 +-
 examples/azure/search_link_graph_azure.py | 2 +-
 examples/azure/smart_scraper_azure.py | 2 +-
 examples/azure/smart_scraper_multi_azure.py | 2 +-
 .../azure/smart_scraper_multi_concat_azure.py | 2 +-
 examples/azure/smart_scraper_schema_azure.py | 2 +-
 examples/azure/xml_scraper_azure.py | 2 +-
 .../azure/xml_scraper_graph_multi_azure.py | 2 +-
 .../bedrock/depth_search_graph_bedrock.py | 31 +++++++++++++++
 .../deepseek/depth_search_graph_deepseek.py | 30 +++++++++++++++
 examples/ernie/custom_graph_ernie.py | 2 +-
 examples/ernie/depth_search_graph_ernie.py | 26 +++++++++++++
 .../fireworks/depth_search_graph_fireworks.py
| 30 +++++++++++++++ .../google_genai/depth_search_graph_gemini.py | 30 +++++++++++++++ .../depth_search_graph_gemini.py | 30 +++++++++++++++ examples/groq/depth_search_graph_groq.py | 31 +++++++++++++++ .../custom_graph_huggingfacehub.py | 1 - .../depth_search_graph_huggingfacehub.py | 38 +++++++++++++++++++ .../local_models/depth_search_graph_ollama.py | 32 ++++++++++++++++ .../local_models/json_scraper_multi_ollama.py | 1 + .../smart_scraper_schema_ollama.py | 1 - .../mistral/depth_search_graph_mistral.py | 30 +++++++++++++++ .../nemotron/depth_search_graph_nemotron.py | 30 +++++++++++++++ examples/oneapi/depth_search_graph_onenapi.py | 31 +++++++++++++++ .../together/depth_search_graph_together.py | 31 +++++++++++++++ 58 files changed, 479 insertions(+), 22 deletions(-) rename examples/anthropic/{csv_scraper_haiku.py => csv_scraper_anthropic.py} (100%) rename examples/anthropic/{csv_scraper_graph_multi_haiku.py => csv_scraper_graph_multi_anthropic.py} (100%) rename examples/anthropic/{custom_graph_haiku.py => custom_graph_anthropic.py} (100%) create mode 100644 examples/anthropic/depth_search_graph_anthropic.py rename examples/anthropic/{json_scraper_haiku.py => json_scraper_anthropic.py} (100%) rename examples/anthropic/{json_scraper_multi_haiku.py => json_scraper_multi_anthropic.py} (100%) rename examples/anthropic/{pdf_scraper_graph_haiku.py => pdf_scraper_graph_anthropic.py} (100%) rename examples/anthropic/{pdf_scraper_multi_haiku.py => pdf_scraper_multi_anthropic.py} (100%) rename examples/anthropic/{rate_limit_haiku.py => rate_limit_anthropic.py} (100%) rename examples/anthropic/{scrape_plain_text_haiku.py => scrape_plain_text_anthropic.py} (100%) rename examples/anthropic/{script_generator_haiku.py => script_generator_anthropic.py} (100%) rename examples/anthropic/{script_multi_generator_haiku.py => script_multi_generator_anthropic.py} (100%) rename examples/anthropic/{search_graph_haiku.py => search_graph_anthropic.py} (100%) rename examples/anthropic/{search_graph_schema_haiku.py => search_graph_schema_anthropic.py} (100%) rename examples/anthropic/{search_link_graph_haiku.py => search_link_graph_anthropic.py} (100%) rename examples/anthropic/{smart_scraper_haiku.py => smart_scraper_anthropic.py} (100%) rename examples/anthropic/{smart_scraper_multi_haiku.py => smart_scraper_multi_anthropic.py} (100%) rename examples/anthropic/{smart_scraper_multi_concat_haiku.py => smart_scraper_multi_concat_anthropic.py} (100%) rename examples/anthropic/{smart_scraper_schema_haiku.py => smart_scraper_schema_anthropic.py} (100%) rename examples/anthropic/{xml_scraper_haiku.py => xml_scraper_anthropic.py} (100%) rename examples/anthropic/{xml_scraper_graph_multi_haiku.py => xml_scraper_graph_multi_anthropic.py} (100%) create mode 100644 examples/azure/depth_search_graph_azure.py create mode 100644 examples/bedrock/depth_search_graph_bedrock.py create mode 100644 examples/deepseek/depth_search_graph_deepseek.py create mode 100644 examples/ernie/depth_search_graph_ernie.py create mode 100644 examples/fireworks/depth_search_graph_fireworks.py create mode 100644 examples/google_genai/depth_search_graph_gemini.py create mode 100644 examples/google_vertexai/depth_search_graph_gemini.py create mode 100644 examples/groq/depth_search_graph_groq.py create mode 100644 examples/huggingfacehub/depth_search_graph_huggingfacehub.py create mode 100644 examples/local_models/depth_search_graph_ollama.py create mode 100644 examples/mistral/depth_search_graph_mistral.py create mode 100644 
examples/nemotron/depth_search_graph_nemotron.py
 create mode 100644 examples/oneapi/depth_search_graph_onenapi.py
 create mode 100644 examples/together/depth_search_graph_together.py

diff --git a/examples/anthropic/csv_scraper_haiku.py b/examples/anthropic/csv_scraper_anthropic.py
similarity index 100%
rename from examples/anthropic/csv_scraper_haiku.py
rename to examples/anthropic/csv_scraper_anthropic.py
diff --git a/examples/anthropic/csv_scraper_graph_multi_haiku.py b/examples/anthropic/csv_scraper_graph_multi_anthropic.py
similarity index 100%
rename from examples/anthropic/csv_scraper_graph_multi_haiku.py
rename to examples/anthropic/csv_scraper_graph_multi_anthropic.py
diff --git a/examples/anthropic/custom_graph_haiku.py b/examples/anthropic/custom_graph_anthropic.py
similarity index 100%
rename from examples/anthropic/custom_graph_haiku.py
rename to examples/anthropic/custom_graph_anthropic.py
diff --git a/examples/anthropic/depth_search_graph_anthropic.py b/examples/anthropic/depth_search_graph_anthropic.py
new file mode 100644
index 00000000..8cac7bea
--- /dev/null
+++ b/examples/anthropic/depth_search_graph_anthropic.py
@@ -0,0 +1,28 @@
+"""
+depth_search_graph_anthropic example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+graph_config = {
+    "llm": {
+        "api_key": os.getenv("ANTHROPIC_API_KEY"),
+        "model": "anthropic/claude-3-haiku-20240307",  # assumed Anthropic model id, matching the renamed haiku examples
+    },
+    "verbose": True,
+    "headless": False,
+    "depth": 2,
+    "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/anthropic/json_scraper_haiku.py b/examples/anthropic/json_scraper_anthropic.py
similarity index 100%
rename from examples/anthropic/json_scraper_haiku.py
rename to examples/anthropic/json_scraper_anthropic.py
diff --git a/examples/anthropic/json_scraper_multi_haiku.py b/examples/anthropic/json_scraper_multi_anthropic.py
similarity index 100%
rename from examples/anthropic/json_scraper_multi_haiku.py
rename to examples/anthropic/json_scraper_multi_anthropic.py
diff --git a/examples/anthropic/pdf_scraper_graph_haiku.py b/examples/anthropic/pdf_scraper_graph_anthropic.py
similarity index 100%
rename from examples/anthropic/pdf_scraper_graph_haiku.py
rename to examples/anthropic/pdf_scraper_graph_anthropic.py
diff --git a/examples/anthropic/pdf_scraper_multi_haiku.py b/examples/anthropic/pdf_scraper_multi_anthropic.py
similarity index 100%
rename from examples/anthropic/pdf_scraper_multi_haiku.py
rename to examples/anthropic/pdf_scraper_multi_anthropic.py
diff --git a/examples/anthropic/rate_limit_haiku.py b/examples/anthropic/rate_limit_anthropic.py
similarity index 100%
rename from examples/anthropic/rate_limit_haiku.py
rename to examples/anthropic/rate_limit_anthropic.py
diff --git a/examples/anthropic/scrape_plain_text_haiku.py b/examples/anthropic/scrape_plain_text_anthropic.py
similarity index 100%
rename from examples/anthropic/scrape_plain_text_haiku.py
rename to examples/anthropic/scrape_plain_text_anthropic.py
diff --git a/examples/anthropic/script_generator_haiku.py b/examples/anthropic/script_generator_anthropic.py
similarity index 100%
rename from examples/anthropic/script_generator_haiku.py
rename to examples/anthropic/script_generator_anthropic.py
diff --git a/examples/anthropic/script_multi_generator_haiku.py
b/examples/anthropic/script_multi_generator_anthropic.py similarity index 100% rename from examples/anthropic/script_multi_generator_haiku.py rename to examples/anthropic/script_multi_generator_anthropic.py diff --git a/examples/anthropic/search_graph_haiku.py b/examples/anthropic/search_graph_anthropic.py similarity index 100% rename from examples/anthropic/search_graph_haiku.py rename to examples/anthropic/search_graph_anthropic.py diff --git a/examples/anthropic/search_graph_schema_haiku.py b/examples/anthropic/search_graph_schema_anthropic.py similarity index 100% rename from examples/anthropic/search_graph_schema_haiku.py rename to examples/anthropic/search_graph_schema_anthropic.py diff --git a/examples/anthropic/search_link_graph_haiku.py b/examples/anthropic/search_link_graph_anthropic.py similarity index 100% rename from examples/anthropic/search_link_graph_haiku.py rename to examples/anthropic/search_link_graph_anthropic.py diff --git a/examples/anthropic/smart_scraper_haiku.py b/examples/anthropic/smart_scraper_anthropic.py similarity index 100% rename from examples/anthropic/smart_scraper_haiku.py rename to examples/anthropic/smart_scraper_anthropic.py diff --git a/examples/anthropic/smart_scraper_multi_haiku.py b/examples/anthropic/smart_scraper_multi_anthropic.py similarity index 100% rename from examples/anthropic/smart_scraper_multi_haiku.py rename to examples/anthropic/smart_scraper_multi_anthropic.py diff --git a/examples/anthropic/smart_scraper_multi_concat_haiku.py b/examples/anthropic/smart_scraper_multi_concat_anthropic.py similarity index 100% rename from examples/anthropic/smart_scraper_multi_concat_haiku.py rename to examples/anthropic/smart_scraper_multi_concat_anthropic.py diff --git a/examples/anthropic/smart_scraper_schema_haiku.py b/examples/anthropic/smart_scraper_schema_anthropic.py similarity index 100% rename from examples/anthropic/smart_scraper_schema_haiku.py rename to examples/anthropic/smart_scraper_schema_anthropic.py diff --git a/examples/anthropic/xml_scraper_haiku.py b/examples/anthropic/xml_scraper_anthropic.py similarity index 100% rename from examples/anthropic/xml_scraper_haiku.py rename to examples/anthropic/xml_scraper_anthropic.py diff --git a/examples/anthropic/xml_scraper_graph_multi_haiku.py b/examples/anthropic/xml_scraper_graph_multi_anthropic.py similarity index 100% rename from examples/anthropic/xml_scraper_graph_multi_haiku.py rename to examples/anthropic/xml_scraper_graph_multi_anthropic.py diff --git a/examples/azure/code_generator_graph_azure.py b/examples/azure/code_generator_graph_azure.py index ad48933f..4bad1b0d 100644 --- a/examples/azure/code_generator_graph_azure.py +++ b/examples/azure/code_generator_graph_azure.py @@ -28,7 +28,7 @@ class Projects(BaseModel): graph_config = { "llm": { "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-3.5-turbo", + "model": "azure_openai/gpt-4o" }, "verbose": True, "headless": False, diff --git a/examples/azure/csv_scraper_azure.py b/examples/azure/csv_scraper_azure.py index efc99758..272527b3 100644 --- a/examples/azure/csv_scraper_azure.py +++ b/examples/azure/csv_scraper_azure.py @@ -25,7 +25,7 @@ graph_config = { "llm": { "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-3.5-turbo", + "model": "azure_openai/gpt-4o" }, "verbose": True, "headless": False diff --git a/examples/azure/csv_scraper_graph_multi_azure.py b/examples/azure/csv_scraper_graph_multi_azure.py index d9160c40..cccbf88e 100644 --- 
a/examples/azure/csv_scraper_graph_multi_azure.py
+++ b/examples/azure/csv_scraper_graph_multi_azure.py
@@ -25,7 +25,7 @@
 graph_config = {
     "llm": {
         "api_key": os.environ["AZURE_OPENAI_KEY"],
-        "model": "azure_openai/gpt-3.5-turbo",
+        "model": "azure_openai/gpt-4o"
     },
     "verbose": True,
     "headless": False
diff --git a/examples/azure/depth_search_graph_azure.py b/examples/azure/depth_search_graph_azure.py
new file mode 100644
index 00000000..88b2cd1b
--- /dev/null
+++ b/examples/azure/depth_search_graph_azure.py
@@ -0,0 +1,30 @@
+"""
+depth_search_graph_azure example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+openai_key = os.getenv("OPENAI_APIKEY")
+
+graph_config = {
+    "llm": {
+        "api_key": os.environ["AZURE_OPENAI_KEY"],
+        "model": "azure_openai/gpt-4o",
+    },
+    "verbose": True,
+    "headless": False,
+    "depth": 2,
+    "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/azure/json_scraper_azure.py b/examples/azure/json_scraper_azure.py
index 483544fe..5ba54f7b 100644
--- a/examples/azure/json_scraper_azure.py
+++ b/examples/azure/json_scraper_azure.py
@@ -23,7 +23,7 @@
 graph_config = {
     "llm": {
         "api_key": os.environ["AZURE_OPENAI_KEY"],
-        "model": "azure_openai/gpt-3.5-turbo",
+        "model": "azure_openai/gpt-4o"
     },
     "verbose": True,
     "headless": False
diff --git a/examples/azure/json_scraper_multi_azure.py b/examples/azure/json_scraper_multi_azure.py
index ecf97280..befc4e84 100644
--- a/examples/azure/json_scraper_multi_azure.py
+++ b/examples/azure/json_scraper_multi_azure.py
@@ -12,7 +12,7 @@
 graph_config = {
     "llm": {
         "api_key": os.environ["AZURE_OPENAI_KEY"],
-        "model": "azure_openai/gpt-3.5-turbo",
+        "model": "azure_openai/gpt-4o"
     },
     "verbose": True,
     "headless": False
diff --git a/examples/azure/pdf_scraper_azure.py b/examples/azure/pdf_scraper_azure.py
index f8926489..02b3b7e6 100644
--- a/examples/azure/pdf_scraper_azure.py
+++ b/examples/azure/pdf_scraper_azure.py
@@ -10,7 +10,7 @@
 graph_config = {
     "llm": {
         "api_key": os.environ["AZURE_OPENAI_KEY"],
-        "model": "azure_openai/gpt-3.5-turbo",
+        "model": "azure_openai/gpt-4o"
     },
     "verbose": True,
     "headless": False
diff --git a/examples/azure/rate_limit_azure.py b/examples/azure/rate_limit_azure.py
index cfd05f1f..892996c7 100644
--- a/examples/azure/rate_limit_azure.py
+++ b/examples/azure/rate_limit_azure.py
@@ -26,7 +26,7 @@
 graph_config = {
     "llm": {
         "api_key": os.environ["AZURE_OPENAI_KEY"],
-        "model": "azure_openai/gpt-3.5-turbo",
+        "model": "azure_openai/gpt-4o",
         "rate_limit": {
             "requests_per_second": 1
         },
diff --git a/examples/azure/scrape_plain_text_azure.py b/examples/azure/scrape_plain_text_azure.py
index ef0d7d1c..9ea18d07 100644
--- a/examples/azure/scrape_plain_text_azure.py
+++ b/examples/azure/scrape_plain_text_azure.py
@@ -28,7 +28,7 @@
 graph_config = {
     "llm": {
         "api_key": os.environ["AZURE_OPENAI_KEY"],
-        "model": "azure_openai/gpt-3.5-turbo",
+        "model": "azure_openai/gpt-4o"
     },
     "verbose": True,
     "headless": False
diff --git a/examples/azure/script_generator_azure.py b/examples/azure/script_generator_azure.py
index 12f5d6be..b2bbb220 100644
--- a/examples/azure/script_generator_azure.py
+++ b/examples/azure/script_generator_azure.py
@@ -15,7 +15,7 @@
 graph_config = {
     "llm": {
         "api_key": os.environ["AZURE_OPENAI_KEY"],
-        "model":
"azure_openai/gpt-3.5-turbo", + "model": "azure_openai/gpt-4o" }, "verbose": True, "headless": False diff --git a/examples/azure/script_multi_generator_azure.py b/examples/azure/script_multi_generator_azure.py index a1bb8dbd..8c52cb95 100644 --- a/examples/azure/script_multi_generator_azure.py +++ b/examples/azure/script_multi_generator_azure.py @@ -16,7 +16,7 @@ graph_config = { "llm": { "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-3.5-turbo", + "model": "azure_openai/gpt-4o" }, "verbose": True, "headless": False diff --git a/examples/azure/search_graph_azure.py b/examples/azure/search_graph_azure.py index 13547e06..949f134c 100644 --- a/examples/azure/search_graph_azure.py +++ b/examples/azure/search_graph_azure.py @@ -22,7 +22,7 @@ graph_config = { "llm": { "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-3.5-turbo", + "model": "azure_openai/gpt-4o" }, "verbose": True, "headless": False diff --git a/examples/azure/search_graph_schema_azure.py b/examples/azure/search_graph_schema_azure.py index 629c92ab..e8c10093 100644 --- a/examples/azure/search_graph_schema_azure.py +++ b/examples/azure/search_graph_schema_azure.py @@ -30,7 +30,7 @@ class Dishes(BaseModel): graph_config = { "llm": { "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-3.5-turbo", + "model": "azure_openai/gpt-4o" }, "verbose": True, "headless": False diff --git a/examples/azure/search_link_graph_azure.py b/examples/azure/search_link_graph_azure.py index aec2297b..42ed07ad 100644 --- a/examples/azure/search_link_graph_azure.py +++ b/examples/azure/search_link_graph_azure.py @@ -15,7 +15,7 @@ graph_config = { "llm": { "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-3.5-turbo", + "model": "azure_openai/gpt-4o" }, "verbose": True, "headless": False diff --git a/examples/azure/smart_scraper_azure.py b/examples/azure/smart_scraper_azure.py index bf3bc8d7..933dc5b0 100644 --- a/examples/azure/smart_scraper_azure.py +++ b/examples/azure/smart_scraper_azure.py @@ -26,7 +26,7 @@ graph_config = { "llm": { "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-3.5-turbo", + "model": "azure_openai/gpt-4o" }, "verbose": True, "headless": False diff --git a/examples/azure/smart_scraper_multi_azure.py b/examples/azure/smart_scraper_multi_azure.py index f1f3451e..e066eaf1 100644 --- a/examples/azure/smart_scraper_multi_azure.py +++ b/examples/azure/smart_scraper_multi_azure.py @@ -14,7 +14,7 @@ graph_config = { "llm": { "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-3.5-turbo", + "model": "azure_openai/gpt-4o" }, "verbose": True, "headless": False diff --git a/examples/azure/smart_scraper_multi_concat_azure.py b/examples/azure/smart_scraper_multi_concat_azure.py index e3870a4c..06d08b9a 100644 --- a/examples/azure/smart_scraper_multi_concat_azure.py +++ b/examples/azure/smart_scraper_multi_concat_azure.py @@ -15,7 +15,7 @@ graph_config = { "llm": { "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-3.5-turbo", + "model": "azure_openai/gpt-4o" }, "verbose": True, "headless": False diff --git a/examples/azure/smart_scraper_schema_azure.py b/examples/azure/smart_scraper_schema_azure.py index d0816bf5..d2766ecb 100644 --- a/examples/azure/smart_scraper_schema_azure.py +++ b/examples/azure/smart_scraper_schema_azure.py @@ -29,7 +29,7 @@ class Projects(BaseModel): graph_config = { "llm": { "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-3.5-turbo", + "model": 
"azure_openai/gpt-4o" }, "verbose": True, "headless": False diff --git a/examples/azure/xml_scraper_azure.py b/examples/azure/xml_scraper_azure.py index ecfb8743..1c40f3e7 100644 --- a/examples/azure/xml_scraper_azure.py +++ b/examples/azure/xml_scraper_azure.py @@ -24,7 +24,7 @@ graph_config = { "llm": { "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-3.5-turbo", + "model": "azure_openai/gpt-4o" }, "verbose": True, "headless": False diff --git a/examples/azure/xml_scraper_graph_multi_azure.py b/examples/azure/xml_scraper_graph_multi_azure.py index db4db108..972eb823 100644 --- a/examples/azure/xml_scraper_graph_multi_azure.py +++ b/examples/azure/xml_scraper_graph_multi_azure.py @@ -25,7 +25,7 @@ graph_config = { "llm": { "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-3.5-turbo", + "model": "azure_openai/gpt-4o", }, "verbose": True, "headless": False diff --git a/examples/bedrock/depth_search_graph_bedrock.py b/examples/bedrock/depth_search_graph_bedrock.py new file mode 100644 index 00000000..2ab88291 --- /dev/null +++ b/examples/bedrock/depth_search_graph_bedrock.py @@ -0,0 +1,31 @@ +""" +depth_search_graph_opeani example +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import DepthSearchGraph + +load_dotenv() + +openai_key = os.getenv("OPENAI_APIKEY") + +graph_config = { + "llm": { + "client": "client_name", + "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + "temperature": 0.0 + }, + "verbose": True, + "headless": False, + "depth": 2, + "only_inside_links": False, +} + +search_graph = DepthSearchGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io", + config=graph_config +) + +result = search_graph.run() +print(result) diff --git a/examples/deepseek/depth_search_graph_deepseek.py b/examples/deepseek/depth_search_graph_deepseek.py new file mode 100644 index 00000000..064690a5 --- /dev/null +++ b/examples/deepseek/depth_search_graph_deepseek.py @@ -0,0 +1,30 @@ +""" +depth_search_graph_opeani example +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import DepthSearchGraph + +load_dotenv() + +deepseek_key = os.getenv("DEEPSEEK_APIKEY") + +graph_config = { + "llm": { + "model": "deepseek/deepseek-chat", + "api_key": deepseek_key, + }, + "verbose": True, + "headless": False, + "depth": 2, + "only_inside_links": False, +} + +search_graph = DepthSearchGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io", + config=graph_config +) + +result = search_graph.run() +print(result) diff --git a/examples/ernie/custom_graph_ernie.py b/examples/ernie/custom_graph_ernie.py index 57d422e5..a3082cf7 100644 --- a/examples/ernie/custom_graph_ernie.py +++ b/examples/ernie/custom_graph_ernie.py @@ -14,7 +14,7 @@ # Define the configuration for the graph # ************************************************ -graph_config = { +graph_config = { "llm": { "model": "ernie/ernie-bot-turbo", "ernie_client_id": "", diff --git a/examples/ernie/depth_search_graph_ernie.py b/examples/ernie/depth_search_graph_ernie.py new file mode 100644 index 00000000..99470d8d --- /dev/null +++ b/examples/ernie/depth_search_graph_ernie.py @@ -0,0 +1,26 @@ +""" +depth_search_graph_opeani example +""" +from scrapegraphai.graphs import DepthSearchGraph + +graph_config = { + "llm": { + "model": "ernie/ernie-bot-turbo", + "ernie_client_id": "", + "ernie_client_secret": "", + "temperature": 0.1 + }, + "verbose": True, + "headless": 
False, + "depth": 2, + "only_inside_links": False, +} + +search_graph = DepthSearchGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io", + config=graph_config +) + +result = search_graph.run() +print(result) diff --git a/examples/fireworks/depth_search_graph_fireworks.py b/examples/fireworks/depth_search_graph_fireworks.py new file mode 100644 index 00000000..f467be9f --- /dev/null +++ b/examples/fireworks/depth_search_graph_fireworks.py @@ -0,0 +1,30 @@ +""" +depth_search_graph_opeani example +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import DepthSearchGraph + +load_dotenv() + +fireworks_api_key = os.getenv("FIREWORKS_APIKEY") + +graph_config = { + "llm": { + "api_key": fireworks_api_key, + "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" + }, + "verbose": True, + "headless": False, + "depth": 2, + "only_inside_links": False, +} + +search_graph = DepthSearchGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io", + config=graph_config +) + +result = search_graph.run() +print(result) diff --git a/examples/google_genai/depth_search_graph_gemini.py b/examples/google_genai/depth_search_graph_gemini.py new file mode 100644 index 00000000..956341f4 --- /dev/null +++ b/examples/google_genai/depth_search_graph_gemini.py @@ -0,0 +1,30 @@ +""" +depth_search_graph_opeani example +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import DepthSearchGraph + +load_dotenv() + +gemini_key = os.getenv("GOOGLE_APIKEY") + +graph_config = { + "llm": { + "api_key": gemini_key, + "model": "google_genai/gemini-pro", + }, + "verbose": True, + "headless": False, + "depth": 2, + "only_inside_links": False, +} + +search_graph = DepthSearchGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io", + config=graph_config +) + +result = search_graph.run() +print(result) diff --git a/examples/google_vertexai/depth_search_graph_gemini.py b/examples/google_vertexai/depth_search_graph_gemini.py new file mode 100644 index 00000000..13bba630 --- /dev/null +++ b/examples/google_vertexai/depth_search_graph_gemini.py @@ -0,0 +1,30 @@ +""" +depth_search_graph_opeani example +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import DepthSearchGraph + +load_dotenv() + +gemini_key = os.getenv("GOOGLE_APIKEY") + +graph_config = { + "llm": { + "api_key": gemini_key, + "model": "google_vertexai/gemini-1.5-pro", + }, + "verbose": True, + "headless": False, + "depth": 2, + "only_inside_links": False, +} + +search_graph = DepthSearchGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io", + config=graph_config +) + +result = search_graph.run() +print(result) diff --git a/examples/groq/depth_search_graph_groq.py b/examples/groq/depth_search_graph_groq.py new file mode 100644 index 00000000..2d1ed8b1 --- /dev/null +++ b/examples/groq/depth_search_graph_groq.py @@ -0,0 +1,31 @@ +""" +depth_search_graph_opeani example +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import DepthSearchGraph + +load_dotenv() + +groq_key = os.getenv("GROQ_APIKEY") + +graph_config = { + "llm": { + "model": "groq/gemma-7b-it", + "api_key": groq_key, + "temperature": 0 + }, + "verbose": True, + "headless": False, + "depth": 2, + "only_inside_links": False, +} + +search_graph = DepthSearchGraph( + prompt="List me all the projects with their 
description", + source="https://perinim.github.io", + config=graph_config +) + +result = search_graph.run() +print(result) diff --git a/examples/huggingfacehub/custom_graph_huggingfacehub.py b/examples/huggingfacehub/custom_graph_huggingfacehub.py index cec007b7..06b2f089 100644 --- a/examples/huggingfacehub/custom_graph_huggingfacehub.py +++ b/examples/huggingfacehub/custom_graph_huggingfacehub.py @@ -4,7 +4,6 @@ import os from dotenv import load_dotenv - from langchain_openai import OpenAIEmbeddings from langchain_openai import ChatOpenAI from scrapegraphai.graphs import BaseGraph diff --git a/examples/huggingfacehub/depth_search_graph_huggingfacehub.py b/examples/huggingfacehub/depth_search_graph_huggingfacehub.py new file mode 100644 index 00000000..48df3e37 --- /dev/null +++ b/examples/huggingfacehub/depth_search_graph_huggingfacehub.py @@ -0,0 +1,38 @@ +""" +depth_search_graph_opeani example +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import DepthSearchGraph +from langchain_community.llms import HuggingFaceEndpoint +from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings + +load_dotenv() +HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN') + +repo_id = "mistralai/Mistral-7B-Instruct-v0.2" + +llm_model_instance = HuggingFaceEndpoint( + repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN +) + +embedder_model_instance = HuggingFaceInferenceAPIEmbeddings( + api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2" +) + +graph_config = { + "llm": {"model_instance": llm_model_instance}, + "verbose": True, + "headless": False, + "depth": 2, + "only_inside_links": False, +} + +search_graph = DepthSearchGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io", + config=graph_config +) + +result = search_graph.run() +print(result) diff --git a/examples/local_models/depth_search_graph_ollama.py b/examples/local_models/depth_search_graph_ollama.py new file mode 100644 index 00000000..d0f960b5 --- /dev/null +++ b/examples/local_models/depth_search_graph_ollama.py @@ -0,0 +1,32 @@ +""" +depth_search_graph_opeani example +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import DepthSearchGraph + +load_dotenv() + +openai_key = os.getenv("OPENAI_APIKEY") + +graph_config = { + "llm": { + "model": "ollama/llama3.1", + "temperature": 0, + "format": "json", # Ollama needs the format to be specified explicitly + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "verbose": True, + "headless": False, + "depth": 2, + "only_inside_links": False, +} + +search_graph = DepthSearchGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io", + config=graph_config +) + +result = search_graph.run() +print(result) diff --git a/examples/local_models/json_scraper_multi_ollama.py b/examples/local_models/json_scraper_multi_ollama.py index 6e9c3da3..e80bf5ec 100644 --- a/examples/local_models/json_scraper_multi_ollama.py +++ b/examples/local_models/json_scraper_multi_ollama.py @@ -15,6 +15,7 @@ "verbose": True, "headless": False, } + FILE_NAME = "inputs/example.json" curr_dir = os.path.dirname(os.path.realpath(__file__)) file_path = os.path.join(curr_dir, FILE_NAME) diff --git a/examples/local_models/smart_scraper_schema_ollama.py b/examples/local_models/smart_scraper_schema_ollama.py index 35503bd7..5a5b3cea 100644 --- 
a/examples/local_models/smart_scraper_schema_ollama.py
+++ b/examples/local_models/smart_scraper_schema_ollama.py
@@ -24,7 +24,6 @@ class Projects(BaseModel):
         "format": "json",  # Ollama needs the format to be specified explicitly
         # "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
     },
-    "verbose": True,
     "headless": False
 }
diff --git a/examples/mistral/depth_search_graph_mistral.py b/examples/mistral/depth_search_graph_mistral.py
new file mode 100644
index 00000000..ae18ffba
--- /dev/null
+++ b/examples/mistral/depth_search_graph_mistral.py
@@ -0,0 +1,30 @@
+"""
+depth_search_graph_mistral example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+mistral_key = os.getenv("MISTRAL_API_KEY")
+
+graph_config = {
+    "llm": {
+        "api_key": mistral_key,
+        "model": "mistralai/open-mistral-nemo",
+    },
+    "verbose": True,
+    "headless": False,
+    "depth": 2,
+    "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/nemotron/depth_search_graph_nemotron.py b/examples/nemotron/depth_search_graph_nemotron.py
new file mode 100644
index 00000000..edd80463
--- /dev/null
+++ b/examples/nemotron/depth_search_graph_nemotron.py
@@ -0,0 +1,30 @@
+"""
+depth_search_graph_nemotron example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+openai_key = os.getenv("OPENAI_APIKEY")
+
+graph_config = {
+    "llm": {
+        "api_key": os.getenv("NEMOTRON_KEY"),
+        "model": "claude-3-haiku-20240307",
+    },
+    "verbose": True,
+    "headless": False,
+    "depth": 2,
+    "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/oneapi/depth_search_graph_onenapi.py b/examples/oneapi/depth_search_graph_onenapi.py
new file mode 100644
index 00000000..7a2e7f3e
--- /dev/null
+++ b/examples/oneapi/depth_search_graph_onenapi.py
@@ -0,0 +1,31 @@
+"""
+depth_search_graph_oneapi example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+openai_key = os.getenv("OPENAI_APIKEY")
+
+graph_config = {
+    "llm": {
+        "api_key": "***************************",
+        "model": "oneapi/qwen-turbo",
+        "base_url": "http://127.0.0.1:3000/v1",  # set the OneAPI URL
+    },
+    "verbose": True,
+    "headless": False,
+    "depth": 2,
+    "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/together/depth_search_graph_together.py b/examples/together/depth_search_graph_together.py
new file mode 100644
index 00000000..7a2e7f3e
--- /dev/null
+++ b/examples/together/depth_search_graph_together.py
@@ -0,0 +1,31 @@
+"""
+depth_search_graph_together example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+together_key = os.getenv("TOGETHER_KEY")
+
+graph_config = {
+    "llm": {
+        "api_key": together_key,
+        "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+        # key and model mirror code_generator_graph_togehter.py below
+    },
+    "verbose":
True, + "headless": False, + "depth": 2, + "only_inside_links": False, +} + +search_graph = DepthSearchGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io", + config=graph_config +) + +result = search_graph.run() +print(result) From db54d694334209f047c950e2f6ac2c02e2da1d39 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Fri, 4 Oct 2024 09:54:54 +0200 Subject: [PATCH 35/36] refactoring of code for pylint integration --- scrapegraphai/nodes/description_node.py | 1 - scrapegraphai/nodes/fetch_node_level_k.py | 178 ++++++++++++---------- scrapegraphai/nodes/parse_node_depth_k.py | 17 +-- 3 files changed, 107 insertions(+), 89 deletions(-) diff --git a/scrapegraphai/nodes/description_node.py b/scrapegraphai/nodes/description_node.py index 60c56cec..4201a61d 100644 --- a/scrapegraphai/nodes/description_node.py +++ b/scrapegraphai/nodes/description_node.py @@ -34,7 +34,6 @@ def __init__( node_name: str = "DESCRIPTION", ): super().__init__(node_name, "node", input, output, 2, node_config) - self.llm_model = node_config["llm_model"] self.verbose = ( False if node_config is None else node_config.get("verbose", False) diff --git a/scrapegraphai/nodes/fetch_node_level_k.py b/scrapegraphai/nodes/fetch_node_level_k.py index 5cdd6571..d321b33c 100644 --- a/scrapegraphai/nodes/fetch_node_level_k.py +++ b/scrapegraphai/nodes/fetch_node_level_k.py @@ -1,6 +1,3 @@ -""" -FetchNodeLevelK Module -""" from typing import List, Optional from .base_node import BaseNode from ..docloaders import ChromiumLoader @@ -18,14 +15,21 @@ class FetchNodeLevelK(BaseNode): (with proxy protection). Attributes: - llm_model: An instance of a language model client, configured for generating answers. + embedder_model: An optional model for embedding the fetched content. verbose (bool): A flag indicating whether to show print statements during execution. + cache_path (str): Path to cache fetched content. + headless (bool): Whether to run the Chromium browser in headless mode. + loader_kwargs (dict): Additional arguments for the content loader. + browser_base (dict): Optional configuration for the browser base API. + depth (int): Maximum depth of hyperlink graph traversal. + only_inside_links (bool): Whether to fetch only internal links. + min_input_len (int): Minimum required length of input data. Args: input (str): Boolean expression defining the input keys needed from the state. output (List[str]): List of output keys to be updated in the state. node_config (dict): Additional configuration for the node. - node_name (str): The unique identifier name for the node, defaulting to "Parse". + node_name (str): The unique identifier name for the node, defaulting to "FetchLevelK". """ def __init__( @@ -35,81 +39,68 @@ def __init__( node_config: Optional[dict] = None, node_name: str = "FetchLevelK", ): + """ + Initializes the FetchNodeLevelK instance. + + Args: + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (Optional[dict]): Additional configuration for the node. + node_name (str): The name of the node (default is "FetchLevelK"). 
+ """ super().__init__(node_name, "node", input, output, 2, node_config) - + self.embedder_model = node_config.get("embedder_model", None) - - self.verbose = ( - False if node_config is None else node_config.get("verbose", False) - ) - + self.verbose = node_config.get("verbose", False) if node_config else False self.cache_path = node_config.get("cache_path", False) - - self.headless = ( - True if node_config is None else node_config.get("headless", True) - ) - - self.loader_kwargs = ( - {} if node_config is None else node_config.get("loader_kwargs", {}) - ) - - self.browser_base = ( - None if node_config is None else node_config.get("browser_base", None) - ) - - self.depth = ( - 1 if node_config is None else node_config.get("depth", 1) - ) - - self.only_inside_links = ( - False if node_config is None else node_config.get("only_inside_links", False) - ) - + self.headless = node_config.get("headless", True) if node_config else True + self.loader_kwargs = node_config.get("loader_kwargs", {}) if node_config else {} + self.browser_base = node_config.get("browser_base", None) + self.depth = node_config.get("depth", 1) if node_config else 1 + self.only_inside_links = node_config.get("only_inside_links", False) if node_config else False self.min_input_len = 1 def execute(self, state: dict) -> dict: """ - Executes the node's logic to fetch the HTML content of a specified URL and all its sub-links - and update the graph's state with the content. + Executes the node's logic to fetch the HTML content of a specified URL and its sub-links + recursively, then updates the graph's state with the fetched content. Args: - state (dict): The current state of the graph. The input keys will be used - to fetch the correct data types from the state. + state (dict): The current state of the graph. Returns: dict: The updated state with a new output key containing the fetched HTML content. Raises: - KeyError: If the input key is not found in the state, indicating that the - necessary information to perform the operation is missing. + KeyError: If the input key is not found in the state. """ - self.logger.info(f"--- Executing {self.node_name} Node ---") - - # Interpret input keys based on the provided input expression + input_keys = self.get_input_keys(state) - # Fetching data from the state based on the input keys input_data = [state[key] for key in input_keys] - source = input_data[0] - + documents = [{"source": source}] - - loader_kwargs = {} + loader_kwargs = self.node_config.get("loader_kwargs", {}) if self.node_config else {} - if self.node_config is not None: - loader_kwargs = self.node_config.get("loader_kwargs", {}) - for _ in range(self.depth): documents = self.obtain_content(documents, loader_kwargs) - + filtered_documents = [doc for doc in documents if 'document' in doc] - state.update({self.output[0]: filtered_documents}) - return state - + def fetch_content(self, source: str, loader_kwargs) -> Optional[str]: + """ + Fetches the HTML content of a given source URL. + + Args: + source (str): The URL to fetch content from. + loader_kwargs (dict): Additional arguments for the content loader. + + Returns: + Optional[str]: The fetched HTML content or None if fetching failed. + """ self.logger.info(f"--- (Fetching HTML from: {source}) ---") if self.browser_base is not None: @@ -119,26 +110,40 @@ def fetch_content(self, source: str, loader_kwargs) -> Optional[str]: raise ImportError("""The browserbase module is not installed. 
Please install it using `pip install browserbase`.""") - data = browser_base_fetch(self.browser_base.get("api_key"), - self.browser_base.get("project_id"), [source]) - - document = [Document(page_content=content, - metadata={"source": source}) for content in data] - + data = browser_base_fetch(self.browser_base.get("api_key"), + self.browser_base.get("project_id"), [source]) + document = [Document(page_content=content, metadata={"source": source}) for content in data] else: loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs) - document = loader.load() - return document - + def extract_links(self, html_content: str) -> list: + """ + Extracts all hyperlinks from the HTML content. + + Args: + html_content (str): The HTML content to extract links from. + + Returns: + list: A list of extracted hyperlinks. + """ soup = BeautifulSoup(html_content, 'html.parser') links = [link['href'] for link in soup.find_all('a', href=True)] self.logger.info(f"Extracted {len(links)} links.") return links - + def get_full_links(self, base_url: str, links: list) -> list: + """ + Converts relative URLs to full URLs based on the base URL. + + Args: + base_url (str): The base URL for resolving relative links. + links (list): A list of links to convert. + + Returns: + list: A list of full URLs. + """ full_links = [] for link in links: if self.only_inside_links and link.startswith("http"): @@ -146,36 +151,55 @@ def get_full_links(self, base_url: str, links: list) -> list: full_link = link if link.startswith("http") else urljoin(base_url, link) full_links.append(full_link) return full_links - + def obtain_content(self, documents: List, loader_kwargs) -> List: + """ + Iterates through documents, fetching and updating content recursively. + + Args: + documents (List): A list of documents containing the source URLs. + loader_kwargs (dict): Additional arguments for the content loader. + + Returns: + List: The updated list of documents with fetched content. + """ new_documents = [] for doc in documents: source = doc['source'] if 'document' not in doc: document = self.fetch_content(source, loader_kwargs) - + if not document or not document[0].page_content.strip(): self.logger.warning(f"Failed to fetch content for {source}") documents.remove(doc) continue - - #doc['document'] = document[0].page_content + doc['document'] = document - links = self.extract_links(doc['document'][0].page_content) full_links = self.get_full_links(source, links) - - # Check if the links are already present in other documents + for link in full_links: - # Check if any document is from the same link if not any(d.get('source', '') == link for d in documents) and not any(d.get('source', '') == link for d in new_documents): - # Add the document new_documents.append({"source": link}) - + documents.extend(new_documents) return documents - - def process_links(self, base_url: str, links: list, loader_kwargs, depth: int, current_depth: int = 1) -> dict: + + def process_links(self, base_url: str, links: list, + loader_kwargs, depth: int, current_depth: int = 1) -> dict: + """ + Processes a list of links recursively up to a given depth. + + Args: + base_url (str): The base URL for resolving relative links. + links (list): A list of links to process. + loader_kwargs (dict): Additional arguments for the content loader. + depth (int): The maximum depth for recursion. + current_depth (int): The current depth of recursion (default is 1). + + Returns: + dict: A dictionary containing processed link content. 
+ """ content_dict = {} for idx, link in enumerate(links, start=1): full_link = link if link.startswith("http") else urljoin(base_url, link) @@ -184,7 +208,7 @@ def process_links(self, base_url: str, links: list, loader_kwargs, depth: int, c if current_depth < depth: new_links = self.extract_links(link_content) - content_dict.update(self.process_links(full_link, new_links, depth, current_depth + 1)) + content_dict.update(self.process_links(full_link, new_links, loader_kwargs, depth, current_depth + 1)) else: self.logger.warning(f"Failed to fetch content for {full_link}") - return content_dict \ No newline at end of file + return content_dict diff --git a/scrapegraphai/nodes/parse_node_depth_k.py b/scrapegraphai/nodes/parse_node_depth_k.py index 7b7ab194..6427b051 100644 --- a/scrapegraphai/nodes/parse_node_depth_k.py +++ b/scrapegraphai/nodes/parse_node_depth_k.py @@ -1,11 +1,9 @@ """ ParseNodeDepthK Module """ -import re -from typing import List, Optional, Tuple -from .base_node import BaseNode -from ..utils.convert_to_md import convert_to_md +from typing import List, Optional from langchain_community.document_transformers import Html2TextTransformer +from .base_node import BaseNode class ParseNodeDepthK(BaseNode): """ @@ -54,19 +52,16 @@ def execute(self, state: dict) -> dict: """ self.logger.info(f"--- Executing {self.node_name} Node ---") - - # Interpret input keys based on the provided input expression + input_keys = self.get_input_keys(state) - # Fetching data from the state based on the input keys input_data = [state[key] for key in input_keys] documents = input_data[0] - + for doc in documents: document_md = Html2TextTransformer(ignore_links=True).transform_documents(doc["document"]) - #document_md = convert_to_md(doc["document"]) doc["document"] = document_md[0].page_content - + state.update({self.output[0]: documents}) - + return state From d056c439cd4582b4c6b4bf6efc5ebd057cd5a3a1 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Fri, 4 Oct 2024 14:16:13 +0200 Subject: [PATCH 36/36] Create code_generator_graph_togehter.py --- .../together/code_generator_graph_togehter.py | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 examples/together/code_generator_graph_togehter.py diff --git a/examples/together/code_generator_graph_togehter.py b/examples/together/code_generator_graph_togehter.py new file mode 100644 index 00000000..aefbeba4 --- /dev/null +++ b/examples/together/code_generator_graph_togehter.py @@ -0,0 +1,60 @@ +""" +Basic example of scraping pipeline using Code Generator with schema +""" + +import os, json +from typing import List +from dotenv import load_dotenv +from pydantic import BaseModel, Field +from scrapegraphai.graphs import CodeGeneratorGraph + +load_dotenv() + +# ************************************************ +# Define the output schema for the graph +# ************************************************ + +class Project(BaseModel): + title: str = Field(description="The title of the project") + description: str = Field(description="The description of the project") + +class Projects(BaseModel): + projects: List[Project] + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +together_key = os.getenv("TOGETHER_KEY") + +graph_config = { + "llm": { + "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", + "api_key": together_key, + }, + "verbose": True, + "headless": False, + "reduction": 2, + "max_iterations": { + "overall": 10, 
+ "syntax": 3, + "execution": 3, + "validation": 3, + "semantic": 3 + }, + "output_file_name": "extracted_data.py" +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +code_generator_graph = CodeGeneratorGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io/projects/", + schema=Projects, + config=graph_config +) + +result = code_generator_graph.run() +print(result)