From e2fe39c093009254d5849aea8a49fad0aea450fa Mon Sep 17 00:00:00 2001
From: Matteo Vedovati
Date: Thu, 26 Sep 2024 17:59:50 +0200
Subject: [PATCH 01/36] Reasoning node created

---
 scrapegraphai/nodes/__init__.py | 3 +-
 scrapegraphai/nodes/reasoning_node.py | 482 ++++++++++++++++++++++++++
 2 files changed, 484 insertions(+), 1 deletion(-)
 create mode 100644 scrapegraphai/nodes/reasoning_node.py

diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py
index e5427044..2a0f261a 100644
--- a/scrapegraphai/nodes/__init__.py
+++ b/scrapegraphai/nodes/__init__.py
@@ -25,4 +25,5 @@
 from .concat_answers_node import ConcatAnswersNode
 from .prompt_refiner_node import PromptRefinerNode
 from .html_analyzer_node import HtmlAnalyzerNode
-from .generate_code_node import GenerateCodeNode
\ No newline at end of file
+from .generate_code_node import GenerateCodeNode
+from .reasoning_node import ReasoningNode
\ No newline at end of file
diff --git a/scrapegraphai/nodes/reasoning_node.py b/scrapegraphai/nodes/reasoning_node.py
new file mode 100644
index 00000000..4d9b29da
--- /dev/null
+++ b/scrapegraphai/nodes/reasoning_node.py
@@ -0,0 +1,482 @@
+"""
+ReasoningNode Module
+"""
+from typing import List, Optional
+from langchain.prompts import PromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.runnables import RunnableParallel
+from langchain_core.utils.pydantic import is_basemodel_subclass
+from langchain_openai import ChatOpenAI, AzureChatOpenAI
+from langchain_mistralai import ChatMistralAI
+from langchain_community.chat_models import ChatOllama
+from tqdm import tqdm
+from .base_node import BaseNode
+from ..utils import transform_schema
+
+class ReasoningNode(BaseNode):
+    """
+    A node that refines the user prompt using the schema and any additional context, and
+    creates a precise prompt in subsequent steps that explicitly links elements in the user's
+    original input to their corresponding representations in the JSON schema.
+
+    Attributes:
+        llm_model: An instance of a language model client, configured for generating answers.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+
+    Args:
+        input (str): Boolean expression defining the input keys needed from the state.
+        output (List[str]): List of output keys to be updated in the state.
+        node_config (dict): Additional configuration for the node.
+        node_name (str): The unique identifier name for the node, defaulting to "PromptRefiner".
+    """
+
+    def __init__(
+        self,
+        input: str,
+        output: List[str],
+        node_config: Optional[dict] = None,
+        node_name: str = "PromptRefiner",
+    ):
+        super().__init__(node_name, "node", input, output, 2, node_config)
+
+        self.llm_model = node_config["llm_model"]
+
+        if isinstance(node_config["llm_model"], ChatOllama):
+            self.llm_model.format = "json"
+
+        self.verbose = (
+            True if node_config is None else node_config.get("verbose", False)
+        )
+        self.force = (
+            False if node_config is None else node_config.get("force", False)
+        )
+        self.script_creator = (
+            False if node_config is None else node_config.get("script_creator", False)
+        )
+        self.is_md_scraper = (
+            False if node_config is None else node_config.get("is_md_scraper", False)
+        )
+
+        self.additional_info = node_config.get("additional_info")
+
+        self.output_schema = node_config.get("schema")
+
+    def execute(self, state: dict) -> dict:
+        """
+        Generate a refined prompt using the user's prompt, the schema, and additional context.
+ + Args: + state (dict): The current state of the graph. The input keys will be used + to fetch the correct data from the state. + + Returns: + dict: The updated state with the output key containing the generated answer. + + Raises: + KeyError: If the input keys are not found in the state, indicating + that the necessary information for generating an answer is missing. + """ + + self.logger.info(f"--- Executing {self.node_name} Node ---") + + user_prompt = state['user_prompt'] + + self.simplefied_schema = transform_schema(self.output_schema.schema()) + + if self.additional_info is not None: + prompt = PromptTemplate( + template=TEMPLATE_REFINER_WITH_CONTEXT, + partial_variables={"user_input": user_prompt, + "json_schema": str(self.simplefied_schema), + "additional_context": self.additional_info}) + else: + prompt = PromptTemplate( + template=TEMPLATE_REFINER, + partial_variables={"user_input": user_prompt, + "json_schema": str(self.simplefied_schema)}) + + output_parser = StrOutputParser() + + chain = prompt | self.llm_model | output_parser + refined_prompt = chain.invoke({}) + + state.update({self.output[0]: refined_prompt}) + return state + + +TEMPLATE_REASONING = """ +**Task**: Analyze the user's request and the provided JSON schema to clearly map the desired data extraction.\n +Break down the user's request into key components, and then explicitly connect these components to the +corresponding elements within the JSON schema. + +**User's Request**: +{user_input} + +**Desired JSON Output Schema**: +```json +{json_schema} +``` + +**Analysis Instructions**: +1. **Break Down User Request:** +* Clearly identify the core entities or data types the user is asking for.\n +* Highlight any specific attributes or relationships mentioned in the request.\n + +2. **Map to JSON Schema**: +* For each identified element in the user request, pinpoint its exact counterpart in the JSON schema.\n +* Explain how the schema structure accommodates the user's needs. +* If applicable, mention any schema elements that are not directly addressed in the user's request.\n + +This analysis will be used to guide the HTML structure examination and ultimately inform the code generation process.\n +Please generate only the analysis and no other text. + +**Response**: +""" + +TEMPLATE_REASONING_WITH_CONTEXT = """ +**Task**: Analyze the user's request, the provided JSON schema, and the additional context the user provided to clearly map the desired data extraction.\n +Break down the user's request into key components, and then explicitly connect these components to the corresponding elements within the JSON schema.\n + +**User's Request**: +{user_input} + +**Desired JSON Output Schema**: +```json +{json_schema} +``` + +**Additional Context**: +{additional_context} + +**Analysis Instructions**: +1. **Break Down User Request:** +* Clearly identify the core entities or data types the user is asking for.\n +* Highlight any specific attributes or relationships mentioned in the request.\n + +2. **Map to JSON Schema**: +* For each identified element in the user request, pinpoint its exact counterpart in the JSON schema.\n +* Explain how the schema structure accommodates the user's needs.\n +* If applicable, mention any schema elements that are not directly addressed in the user's request.\n + +This analysis will be used to guide the HTML structure examination and ultimately inform the code generation process.\n +Please generate only the analysis and no other text. 
+ +**Response**: +""" + +# TEMPLATE_REASONING_v1 (Emphasis on Clarity) +TEMPLATE_REASONING_v1 = """ +**Task:** Meticulously analyze the user's request and the provided JSON schema to create a crystal-clear mapping for data extraction. + +**User's Request:** +{user_input} + +**Desired JSON Output Schema:** +```json +{json_schema} +``` + +**Analysis Steps:** + +1. **Deconstruct User Request:** + * Pinpoint the core data the user needs (e.g., specific entities, attributes, relationships). + * Highlight any filtering or sorting criteria mentioned in the request. + +2. **Connect to JSON Schema:** + * For each element the user wants, locate its precise match in the schema. + * Explain how the schema's structure fulfills the user's needs (e.g., nested objects, arrays). + * If any schema parts aren't relevant to the request, point them out. + +**Remember:** +* This analysis is crucial for building the HTML structure and generating code. +* Be thorough and explicit in your explanations. +* Focus solely on the analysis; avoid extraneous text. + +**Response:** +""" + +# TEMPLATE_REASONING_v2 (Focus on Data Transformation) +TEMPLATE_REASONING_v2 = """ +**Task:** Analyze the user's request and the JSON schema to determine the necessary data transformations for extraction. + +**User's Request:** +{user_input} + +**Desired JSON Output Schema:** +```json +{json_schema} +``` + +**Analysis Steps:** + +1. **Understand User's Needs:** + * Identify the specific data the user wants and how they want it presented. + * Note any calculations, formatting, or restructuring required. + +2. **Schema Mapping and Transformations:** + * Match user's needs to schema elements, noting any data type conversions needed. + * Outline the steps to transform the schema data into the user's desired format. + * If the schema lacks necessary data, clearly state this. + +**Key Points:** +* This analysis guides how we'll manipulate the schema data to match the user's request. +* Be explicit about the transformations needed (e.g., filtering, renaming, calculations). +* Focus on the analysis; no additional text is required. + +**Response:** +""" + +# TEMPLATE_REASONING_v3 (Highlighting Potential Challenges) +TEMPLATE_REASONING_v3 = """ +**Task:** Analyze the user's request and JSON schema, identifying potential challenges in data extraction. + +**User's Request:** +{user_input} + +**Desired JSON Output Schema:** +```json +{json_schema} +``` + +**Analysis Steps:** + +1. **Thorough Request Understanding:** + * Clearly identify all data elements the user wants. + * Note any ambiguities or complexities in the request. + +2. **Schema Mapping and Challenges:** + * Match user needs to schema elements, flagging any mismatches or missing data. + * Highlight any complex schema structures that might complicate extraction. + * If the request is vague, suggest clarifications needed from the user. + +**Important Notes:** +* This analysis helps us anticipate and address potential roadblocks in code generation. +* Be proactive in identifying challenges, not just mapping data. +* If the request is unclear, ask specific questions for clarification. +* Focus on the analysis; avoid any unnecessary text. + +**Response:** +""" + +# TEMPLATE_REASONING_v4 (Concise and Actionable) +TEMPLATE_REASONING_v4 = """ +**Task:** Map user request to JSON schema, providing actionable insights for data extraction. 
+ +**User's Request:** +{user_input} + +**Desired JSON Output Schema:** +```json +{json_schema} +``` + +**Analysis:** + +* **Key Data:** [List the specific data elements the user wants] +* **Schema Mapping:** [Concisely map each desired element to its schema counterpart] +* **Transformations:** [Briefly list any data manipulations needed] +* **Challenges:** [Highlight any potential issues or ambiguities] + +**Response:** +""" + +# TEMPLATE_REASONING_v5 (Schema-Centric Approach) +TEMPLATE_REASONING_v5 = """ +**Task:** Analyze the JSON schema to determine how it can fulfill the user's data request. + +**User's Request:** +{user_input} + +**Desired JSON Output Schema:** +```json +{json_schema} +``` + +**Analysis:** + +1. **Schema Structure Breakdown:** + * Describe the key entities, relationships, and nesting in the schema. + * Highlight any relevant data types or formatting within the schema. + +2. **Fulfilling User's Needs:** + * Explain how the schema's structure can provide the data the user wants. + * Point out any schema elements that directly address the user's request. + * Identify any potential gaps or challenges in fulfilling the request. + +**Remember:** +* This analysis prioritizes understanding the schema's capabilities. +* Focus on how the schema's structure can be leveraged for data extraction. +* If the schema is insufficient, clearly state this and suggest potential solutions. +* Provide only the analysis; avoid any additional text. + +**Response:** +""" + +# TEMPLATE_REASONING_WITH_CONTEXT_v1 (Clarity with Context Integration) +TEMPLATE_REASONING_WITH_CONTEXT_v1 = """ +**Task:** Carefully analyze the user's request, the provided JSON schema, and the additional context to create a precise mapping for data extraction. + +**User's Request:** +{user_input} + +**Desired JSON Output Schema:** +```json +{json_schema} +``` + +**Additional Context:** +{additional_context} + +**Analysis Steps:** + +1. **Integrate Context into Request Understanding:** + * Combine the user's explicit request with the additional context to gain a deeper understanding of their needs. + * Identify any implicit requirements or preferences hinted at in the context + +2. **Deconstruct Enhanced Request:** + * Pinpoint the core data the user needs (e.g., specific entities, attributes, relationships). + * Highlight any filtering or sorting criteria mentioned in the request or implied by the context + +3. **Connect to JSON Schema:** + * For each element the user wants, locate its precise match in the schema + * Explain how the schema's structure fulfills the user's needs (e.g., nested objects, arrays) + * If any schema parts aren't relevant to the request, point them out. + +**Remember:** +* The additional context is crucial for refining the analysis and ensuring accurate data extraction +* Be thorough and explicit in your explanations. +* Focus solely on the analysis; avoid extraneous text. + +**Response:** +""" + +# TEMPLATE_REASONING_WITH_CONTEXT_v2 (Context-Driven Data Transformation) +TEMPLATE_REASONING_WITH_CONTEXT_v2 = """ +**Task:** Analyze the user's request, JSON schema, and context to determine the data transformations needed for extraction. + +**User's Request:** +{user_input} + +**Desired JSON Output Schema:** +```json +{json_schema} +``` + +**Additional Context:** +{additional_context} + +**Analysis Steps:** + +1. 
**Contextual Understanding of User's Needs:** + * Combine the request and context to fully grasp the desired data and its presentation + * Note any calculations, formatting, or restructuring implied by the context. + +2. **Schema Mapping and Contextual Transformations:** + * Match user's needs to schema elements, considering context for data type conversions + * Outline the steps to transform schema data into the user's desired format, as informed by the context + * If the schema lacks necessary data, clearly state this + +**Key Points:** +* The context is vital for tailoring data transformations to the user's specific situation. +* Be explicit about the transformations needed, referencing the context where relevant +* Focus on the analysis; no additional text is required + +**Response:** +""" + +# TEMPLATE_REASONING_WITH_CONTEXT_v3 (Contextual Challenge Identification) +TEMPLATE_REASONING_WITH_CONTEXT_v3 = """ +**Task:** Analyze the user's request, JSON schema, and context, identifying potential challenges in data extraction + +**User's Request:** +{user_input} + +**Desired JSON Output Schema:** +```json +{json_schema} +``` + +**Additional Context:** +{additional_context} + +**Analysis Steps:** + +1. **Context-Enhanced Request Understanding:** + * Use the context to clarify any ambiguities or complexities in the request + * Identify any implicit requirements or potential conflicts highlighted by the context + +2. **Schema Mapping and Contextual Challenges:** + * Match user needs to schema elements, flagging any mismatches or missing data, considering the context + * Highlight any complex schema structures or contextual factors that might complicate extraction + * If the request remains unclear even with context, suggest specific clarifications needed from the user + +**Important Notes:** +* The context is key for anticipating and addressing potential roadblocks in code generation +* Be proactive in identifying challenges, especially those arising from the context +* If further clarification is needed, ask +specific questions tailored to the context + +* Focus on the analysis; avoid any unnecessary text + +**Response:** +""" + +# TEMPLATE_REASONING_WITH_CONTEXT_v4 (Concise and Actionable, with Context) +TEMPLATE_REASONING_WITH_CONTEXT_v4 = """ +**Task:** Map user request to JSON schema, incorporating context for actionable insights. + +**User's Request:** +{user_input} + +**Desired JSON Output Schema:** +```json +{json_schema} +``` + +**Additional Context:** +{additional_context} + +**Analysis:** + +* **Key Data (Contextualized):** [List the specific data elements the user wants, considering the context] +* **Schema Mapping (Context-Aware):** [Concisely map each desired element to its schema counterpart, noting any context-driven adjustments] +* **Transformations (Context-Informed):** [Briefly list any data manipulations needed, taking the context into account] +* **Challenges (Contextual):** [Highlight any potential issues or ambiguities arising from the request or context] + +**Response:** +""" + +# TEMPLATE_REASONING_WITH_CONTEXT_v5 (Schema-Centric with Contextual Lens) +TEMPLATE_REASONING_WITH_CONTEXT_v5 = """ +**Task:** Analyze the JSON schema through the lens of the user's request and context, determining how it can fulfill their needs + +**User's Request:** +{user_input} + +**Desired JSON Output Schema:** +```json +{json_schema} +``` + +**Additional Context:** +{additional_context} + +**Analysis:** + +1. 
**Schema Structure Breakdown (Contextualized):**
    * Describe the key entities, relationships, and nesting in the schema, highlighting those most relevant to the context
    * Point out any relevant data types or formatting within the schema that align with the context

2. **Fulfilling User's Needs (Context-Driven):**
    * Explain how the schema's structure, combined with the context, can provide the data the user wants
    * Identify any schema elements that directly or indirectly address the user's request, considering the context
    * Address any potential gaps or challenges in fulfilling the request, taking the context into account

**Remember:**
* This analysis prioritizes understanding the schema's capabilities in relation to the specific context
* Focus on how the schema's structure, combined with the context, can be leveraged for data extraction
* If the schema is insufficient even with context, clearly state this and suggest potential solutions
* Provide only the analysis; avoid any additional text

**Response:**
"""

From 3228f7dafbcde757d4dd8a27a7727c7a6f50561d Mon Sep 17 00:00:00 2001
From: Matteo Vedovati
Date: Thu, 26 Sep 2024 18:10:37 +0200
Subject: [PATCH 02/36] Update reasoning_node.py

---
 scrapegraphai/nodes/reasoning_node.py | 456 ++++----------------------
 1 file changed, 65 insertions(+), 391 deletions(-)

diff --git a/scrapegraphai/nodes/reasoning_node.py b/scrapegraphai/nodes/reasoning_node.py
index 4d9b29da..295b2d28 100644
--- a/scrapegraphai/nodes/reasoning_node.py
+++ b/scrapegraphai/nodes/reasoning_node.py
@@ -15,9 +15,7 @@

 class ReasoningNode(BaseNode):
     """
-    A node that refines the user prompt using the schema and any additional context, and
-    creates a precise prompt in subsequent steps that explicitly links elements in the user's
-    original input to their corresponding representations in the JSON schema.
+    ...

     Attributes:
         llm_model: An instance of a language model client, configured for generating answers.
@@ -50,20 +48,14 @@ def __init__(
         self.force = (
             False if node_config is None else node_config.get("force", False)
         )
-        self.script_creator = (
-            False if node_config is None else node_config.get("script_creator", False)
-        )
-        self.is_md_scraper = (
-            False if node_config is None else node_config.get("is_md_scraper", False)
-        )

-        self.additional_info = node_config.get("additional_info")
+        self.additional_info = node_config.get("additional_info", None)

         self.output_schema = node_config.get("schema")

     def execute(self, state: dict) -> dict:
         """
-        Generate a refined prompt using the user's prompt, the schema, and additional context.
+        ...

         Args:
             state (dict): The current state of the graph. The input keys will be used
             to fetch the correct data from the state.

         Returns:
             dict: The updated state with the output key containing the generated answer.

         Raises:
             KeyError: If the input keys are not found in the state, indicating
                that the necessary information for generating an answer is missing.
         """

         self.logger.info(f"--- Executing {self.node_name} Node ---")

        TEMPLATE_REASONING = """
        **Task**: Analyze the user's request and the provided JSON schema to clearly map the desired data extraction.\n
        Break down the user's request into key components, and then explicitly connect these components to the
        corresponding elements within the JSON schema.

        **User's Request**:
        {user_input}

        **Desired JSON Output Schema**:
        ```json
        {json_schema}
        ```

        **Analysis Instructions**:
        1. **Break Down User Request:**
        * Clearly identify the core entities or data types the user is asking for.\n
        * Highlight any specific attributes or relationships mentioned in the request.\n

        2. 
**Map to JSON Schema**: + * For each identified element in the user request, pinpoint its exact counterpart in the JSON schema.\n + * Explain how the schema structure accommodates the user's needs. + * If applicable, mention any schema elements that are not directly addressed in the user's request.\n + + This analysis will be used to guide the HTML structure examination and ultimately inform the code generation process.\n + Please generate only the analysis and no other text. + + **Response**: + """ + + TEMPLATE_REASONING_WITH_CONTEXT = """ + **Task**: Analyze the user's request, the provided JSON schema, and the additional context the user provided to clearly map the desired data extraction.\n + Break down the user's request into key components, and then explicitly connect these components to the corresponding elements within the JSON schema.\n + + **User's Request**: + {user_input} + + **Desired JSON Output Schema**: + ```json + {json_schema} + ``` + + **Additional Context**: + {additional_context} + + **Analysis Instructions**: + 1. **Break Down User Request:** + * Clearly identify the core entities or data types the user is asking for.\n + * Highlight any specific attributes or relationships mentioned in the request.\n + + 2. **Map to JSON Schema**: + * For each identified element in the user request, pinpoint its exact counterpart in the JSON schema.\n + * Explain how the schema structure accommodates the user's needs.\n + * If applicable, mention any schema elements that are not directly addressed in the user's request.\n + + This analysis will be used to guide the HTML structure examination and ultimately inform the code generation process.\n + Please generate only the analysis and no other text. + + **Response**: + """ + user_prompt = state['user_prompt'] self.simplefied_schema = transform_schema(self.output_schema.schema()) if self.additional_info is not None: prompt = PromptTemplate( - template=TEMPLATE_REFINER_WITH_CONTEXT, + template=TEMPLATE_REASONING_WITH_CONTEXT, partial_variables={"user_input": user_prompt, "json_schema": str(self.simplefied_schema), "additional_context": self.additional_info}) else: prompt = PromptTemplate( - template=TEMPLATE_REFINER, + template=TEMPLATE_REASONING, partial_variables={"user_input": user_prompt, "json_schema": str(self.simplefied_schema)}) @@ -102,381 +154,3 @@ def execute(self, state: dict) -> dict: state.update({self.output[0]: refined_prompt}) return state - - -TEMPLATE_REASONING = """ -**Task**: Analyze the user's request and the provided JSON schema to clearly map the desired data extraction.\n -Break down the user's request into key components, and then explicitly connect these components to the -corresponding elements within the JSON schema. - -**User's Request**: -{user_input} - -**Desired JSON Output Schema**: -```json -{json_schema} -``` - -**Analysis Instructions**: -1. **Break Down User Request:** -* Clearly identify the core entities or data types the user is asking for.\n -* Highlight any specific attributes or relationships mentioned in the request.\n - -2. **Map to JSON Schema**: -* For each identified element in the user request, pinpoint its exact counterpart in the JSON schema.\n -* Explain how the schema structure accommodates the user's needs. -* If applicable, mention any schema elements that are not directly addressed in the user's request.\n - -This analysis will be used to guide the HTML structure examination and ultimately inform the code generation process.\n -Please generate only the analysis and no other text. 
- -**Response**: -""" - -TEMPLATE_REASONING_WITH_CONTEXT = """ -**Task**: Analyze the user's request, the provided JSON schema, and the additional context the user provided to clearly map the desired data extraction.\n -Break down the user's request into key components, and then explicitly connect these components to the corresponding elements within the JSON schema.\n - -**User's Request**: -{user_input} - -**Desired JSON Output Schema**: -```json -{json_schema} -``` - -**Additional Context**: -{additional_context} - -**Analysis Instructions**: -1. **Break Down User Request:** -* Clearly identify the core entities or data types the user is asking for.\n -* Highlight any specific attributes or relationships mentioned in the request.\n - -2. **Map to JSON Schema**: -* For each identified element in the user request, pinpoint its exact counterpart in the JSON schema.\n -* Explain how the schema structure accommodates the user's needs.\n -* If applicable, mention any schema elements that are not directly addressed in the user's request.\n - -This analysis will be used to guide the HTML structure examination and ultimately inform the code generation process.\n -Please generate only the analysis and no other text. - -**Response**: -""" - -# TEMPLATE_REASONING_v1 (Emphasis on Clarity) -TEMPLATE_REASONING_v1 = """ -**Task:** Meticulously analyze the user's request and the provided JSON schema to create a crystal-clear mapping for data extraction. - -**User's Request:** -{user_input} - -**Desired JSON Output Schema:** -```json -{json_schema} -``` - -**Analysis Steps:** - -1. **Deconstruct User Request:** - * Pinpoint the core data the user needs (e.g., specific entities, attributes, relationships). - * Highlight any filtering or sorting criteria mentioned in the request. - -2. **Connect to JSON Schema:** - * For each element the user wants, locate its precise match in the schema. - * Explain how the schema's structure fulfills the user's needs (e.g., nested objects, arrays). - * If any schema parts aren't relevant to the request, point them out. - -**Remember:** -* This analysis is crucial for building the HTML structure and generating code. -* Be thorough and explicit in your explanations. -* Focus solely on the analysis; avoid extraneous text. - -**Response:** -""" - -# TEMPLATE_REASONING_v2 (Focus on Data Transformation) -TEMPLATE_REASONING_v2 = """ -**Task:** Analyze the user's request and the JSON schema to determine the necessary data transformations for extraction. - -**User's Request:** -{user_input} - -**Desired JSON Output Schema:** -```json -{json_schema} -``` - -**Analysis Steps:** - -1. **Understand User's Needs:** - * Identify the specific data the user wants and how they want it presented. - * Note any calculations, formatting, or restructuring required. - -2. **Schema Mapping and Transformations:** - * Match user's needs to schema elements, noting any data type conversions needed. - * Outline the steps to transform the schema data into the user's desired format. - * If the schema lacks necessary data, clearly state this. - -**Key Points:** -* This analysis guides how we'll manipulate the schema data to match the user's request. -* Be explicit about the transformations needed (e.g., filtering, renaming, calculations). -* Focus on the analysis; no additional text is required. 
- -**Response:** -""" - -# TEMPLATE_REASONING_v3 (Highlighting Potential Challenges) -TEMPLATE_REASONING_v3 = """ -**Task:** Analyze the user's request and JSON schema, identifying potential challenges in data extraction. - -**User's Request:** -{user_input} - -**Desired JSON Output Schema:** -```json -{json_schema} -``` - -**Analysis Steps:** - -1. **Thorough Request Understanding:** - * Clearly identify all data elements the user wants. - * Note any ambiguities or complexities in the request. - -2. **Schema Mapping and Challenges:** - * Match user needs to schema elements, flagging any mismatches or missing data. - * Highlight any complex schema structures that might complicate extraction. - * If the request is vague, suggest clarifications needed from the user. - -**Important Notes:** -* This analysis helps us anticipate and address potential roadblocks in code generation. -* Be proactive in identifying challenges, not just mapping data. -* If the request is unclear, ask specific questions for clarification. -* Focus on the analysis; avoid any unnecessary text. - -**Response:** -""" - -# TEMPLATE_REASONING_v4 (Concise and Actionable) -TEMPLATE_REASONING_v4 = """ -**Task:** Map user request to JSON schema, providing actionable insights for data extraction. - -**User's Request:** -{user_input} - -**Desired JSON Output Schema:** -```json -{json_schema} -``` - -**Analysis:** - -* **Key Data:** [List the specific data elements the user wants] -* **Schema Mapping:** [Concisely map each desired element to its schema counterpart] -* **Transformations:** [Briefly list any data manipulations needed] -* **Challenges:** [Highlight any potential issues or ambiguities] - -**Response:** -""" - -# TEMPLATE_REASONING_v5 (Schema-Centric Approach) -TEMPLATE_REASONING_v5 = """ -**Task:** Analyze the JSON schema to determine how it can fulfill the user's data request. - -**User's Request:** -{user_input} - -**Desired JSON Output Schema:** -```json -{json_schema} -``` - -**Analysis:** - -1. **Schema Structure Breakdown:** - * Describe the key entities, relationships, and nesting in the schema. - * Highlight any relevant data types or formatting within the schema. - -2. **Fulfilling User's Needs:** - * Explain how the schema's structure can provide the data the user wants. - * Point out any schema elements that directly address the user's request. - * Identify any potential gaps or challenges in fulfilling the request. - -**Remember:** -* This analysis prioritizes understanding the schema's capabilities. -* Focus on how the schema's structure can be leveraged for data extraction. -* If the schema is insufficient, clearly state this and suggest potential solutions. -* Provide only the analysis; avoid any additional text. - -**Response:** -""" - -# TEMPLATE_REASONING_WITH_CONTEXT_v1 (Clarity with Context Integration) -TEMPLATE_REASONING_WITH_CONTEXT_v1 = """ -**Task:** Carefully analyze the user's request, the provided JSON schema, and the additional context to create a precise mapping for data extraction. - -**User's Request:** -{user_input} - -**Desired JSON Output Schema:** -```json -{json_schema} -``` - -**Additional Context:** -{additional_context} - -**Analysis Steps:** - -1. **Integrate Context into Request Understanding:** - * Combine the user's explicit request with the additional context to gain a deeper understanding of their needs. - * Identify any implicit requirements or preferences hinted at in the context - -2. 
**Deconstruct Enhanced Request:** - * Pinpoint the core data the user needs (e.g., specific entities, attributes, relationships). - * Highlight any filtering or sorting criteria mentioned in the request or implied by the context - -3. **Connect to JSON Schema:** - * For each element the user wants, locate its precise match in the schema - * Explain how the schema's structure fulfills the user's needs (e.g., nested objects, arrays) - * If any schema parts aren't relevant to the request, point them out. - -**Remember:** -* The additional context is crucial for refining the analysis and ensuring accurate data extraction -* Be thorough and explicit in your explanations. -* Focus solely on the analysis; avoid extraneous text. - -**Response:** -""" - -# TEMPLATE_REASONING_WITH_CONTEXT_v2 (Context-Driven Data Transformation) -TEMPLATE_REASONING_WITH_CONTEXT_v2 = """ -**Task:** Analyze the user's request, JSON schema, and context to determine the data transformations needed for extraction. - -**User's Request:** -{user_input} - -**Desired JSON Output Schema:** -```json -{json_schema} -``` - -**Additional Context:** -{additional_context} - -**Analysis Steps:** - -1. **Contextual Understanding of User's Needs:** - * Combine the request and context to fully grasp the desired data and its presentation - * Note any calculations, formatting, or restructuring implied by the context. - -2. **Schema Mapping and Contextual Transformations:** - * Match user's needs to schema elements, considering context for data type conversions - * Outline the steps to transform schema data into the user's desired format, as informed by the context - * If the schema lacks necessary data, clearly state this - -**Key Points:** -* The context is vital for tailoring data transformations to the user's specific situation. -* Be explicit about the transformations needed, referencing the context where relevant -* Focus on the analysis; no additional text is required - -**Response:** -""" - -# TEMPLATE_REASONING_WITH_CONTEXT_v3 (Contextual Challenge Identification) -TEMPLATE_REASONING_WITH_CONTEXT_v3 = """ -**Task:** Analyze the user's request, JSON schema, and context, identifying potential challenges in data extraction - -**User's Request:** -{user_input} - -**Desired JSON Output Schema:** -```json -{json_schema} -``` - -**Additional Context:** -{additional_context} - -**Analysis Steps:** - -1. **Context-Enhanced Request Understanding:** - * Use the context to clarify any ambiguities or complexities in the request - * Identify any implicit requirements or potential conflicts highlighted by the context - -2. **Schema Mapping and Contextual Challenges:** - * Match user needs to schema elements, flagging any mismatches or missing data, considering the context - * Highlight any complex schema structures or contextual factors that might complicate extraction - * If the request remains unclear even with context, suggest specific clarifications needed from the user - -**Important Notes:** -* The context is key for anticipating and addressing potential roadblocks in code generation -* Be proactive in identifying challenges, especially those arising from the context -* If further clarification is needed, ask -specific questions tailored to the context - -* Focus on the analysis; avoid any unnecessary text - -**Response:** -""" - -# TEMPLATE_REASONING_WITH_CONTEXT_v4 (Concise and Actionable, with Context) -TEMPLATE_REASONING_WITH_CONTEXT_v4 = """ -**Task:** Map user request to JSON schema, incorporating context for actionable insights. 
- -**User's Request:** -{user_input} - -**Desired JSON Output Schema:** -```json -{json_schema} -``` - -**Additional Context:** -{additional_context} - -**Analysis:** - -* **Key Data (Contextualized):** [List the specific data elements the user wants, considering the context] -* **Schema Mapping (Context-Aware):** [Concisely map each desired element to its schema counterpart, noting any context-driven adjustments] -* **Transformations (Context-Informed):** [Briefly list any data manipulations needed, taking the context into account] -* **Challenges (Contextual):** [Highlight any potential issues or ambiguities arising from the request or context] - -**Response:** -""" - -# TEMPLATE_REASONING_WITH_CONTEXT_v5 (Schema-Centric with Contextual Lens) -TEMPLATE_REASONING_WITH_CONTEXT_v5 = """ -**Task:** Analyze the JSON schema through the lens of the user's request and context, determining how it can fulfill their needs - -**User's Request:** -{user_input} - -**Desired JSON Output Schema:** -```json -{json_schema} -``` - -**Additional Context:** -{additional_context} - -**Analysis:** - -1. **Schema Structure Breakdown (Contextualized):** - * Describe the key entities, relationships, and nesting in the schema, highlighting those most relevant to the context - * Point out any relevant data types or formatting within the schema that align with the context - -2. **Fulfilling User's Needs (Context-Driven):** - * Explain how the schema's structure, combined with the context, can provide the data the user wants - * Identify any schema elements that directly or indirectly address the user's request, considering the context - * Address any potential gaps or challenges in fulfilling the request, taking the context into account - -**Remember:** -* This analysis prioritizes understanding the schema's capabilities in relation to the specific context -* Focus on how the schema's structure, combined with the context, can be leveraged for data extraction -* If the schema is insufficient even with context, clearly state this and suggest potential solutions -* Provide only the analysis; avoid any additional text - -**Response:** -""" From 0b125896641b8da06fa987ffb11159a46916f90c Mon Sep 17 00:00:00 2001 From: Matteo Vedovati Date: Thu, 26 Sep 2024 18:28:26 +0200 Subject: [PATCH 03/36] reasoning node prompt refinement --- scrapegraphai/nodes/reasoning_node.py | 59 +++++++++++++++------------ 1 file changed, 34 insertions(+), 25 deletions(-) diff --git a/scrapegraphai/nodes/reasoning_node.py b/scrapegraphai/nodes/reasoning_node.py index 295b2d28..e054eeca 100644 --- a/scrapegraphai/nodes/reasoning_node.py +++ b/scrapegraphai/nodes/reasoning_node.py @@ -72,42 +72,42 @@ def execute(self, state: dict) -> dict: self.logger.info(f"--- Executing {self.node_name} Node ---") TEMPLATE_REASONING = """ - **Task**: Analyze the user's request and the provided JSON schema to clearly map the desired data extraction.\n - Break down the user's request into key components, and then explicitly connect these components to the - corresponding elements within the JSON schema. + **Task**: Analyze the user's request and the provided JSON schema to guide an LLM in extracting information directly from HTML. **User's Request**: {user_input} - **Desired JSON Output Schema**: + **Target JSON Schema**: ```json {json_schema} ``` **Analysis Instructions**: - 1. 
**Break Down User Request:** - * Clearly identify the core entities or data types the user is asking for.\n - * Highlight any specific attributes or relationships mentioned in the request.\n + 1. **Interpret User Request:** + * Identify the key information types or entities the user is seeking. + * Note any specific attributes, relationships, or constraints mentioned. 2. **Map to JSON Schema**: - * For each identified element in the user request, pinpoint its exact counterpart in the JSON schema.\n - * Explain how the schema structure accommodates the user's needs. - * If applicable, mention any schema elements that are not directly addressed in the user's request.\n + * For each identified element in the user request, locate its corresponding field in the JSON schema. + * Explain how the schema structure represents the requested information. + * Highlight any relevant schema elements not explicitly mentioned in the user's request. - This analysis will be used to guide the HTML structure examination and ultimately inform the code generation process.\n - Please generate only the analysis and no other text. + 3. **Data Transformation Guidance**: + * Provide guidance on any necessary transformations to align extracted data with the JSON schema requirements. - **Response**: + This analysis will be used to instruct an LLM that has the HTML content in its context. The LLM will use this guidance to extract the information and return it directly in the specified JSON format. + + **Reasoning Output**: + [Your detailed analysis based on the above instructions] """ TEMPLATE_REASONING_WITH_CONTEXT = """ - **Task**: Analyze the user's request, the provided JSON schema, and the additional context the user provided to clearly map the desired data extraction.\n - Break down the user's request into key components, and then explicitly connect these components to the corresponding elements within the JSON schema.\n + **Task**: Analyze the user's request, provided JSON schema, and additional context to guide an LLM in extracting information directly from HTML. **User's Request**: {user_input} - **Desired JSON Output Schema**: + **Target JSON Schema**: ```json {json_schema} ``` @@ -116,19 +116,28 @@ def execute(self, state: dict) -> dict: {additional_context} **Analysis Instructions**: - 1. **Break Down User Request:** - * Clearly identify the core entities or data types the user is asking for.\n - * Highlight any specific attributes or relationships mentioned in the request.\n + 1. **Interpret User Request and Context:** + * Identify the key information types or entities the user is seeking. + * Note any specific attributes, relationships, or constraints mentioned. + * Incorporate insights from the additional context to refine understanding of the task. 2. **Map to JSON Schema**: - * For each identified element in the user request, pinpoint its exact counterpart in the JSON schema.\n - * Explain how the schema structure accommodates the user's needs.\n - * If applicable, mention any schema elements that are not directly addressed in the user's request.\n + * For each identified element in the user request, locate its corresponding field in the JSON schema. + * Explain how the schema structure represents the requested information. + * Highlight any relevant schema elements not explicitly mentioned in the user's request. + + 3. **Extraction Strategy**: + * Based on the additional context, suggest specific strategies for locating and extracting the required information from the HTML. 
* Highlight any potential challenges or special considerations mentioned in the context.

        4. **Data Transformation Guidance**:
        * Provide guidance on any necessary transformations to align extracted data with the JSON schema requirements.
        * Note any special formatting, validation, or business logic considerations from the additional context.

        This analysis will be used to instruct an LLM that has the HTML content in its context. The LLM will use this guidance to extract the information and return it directly in the specified JSON format.

        **Reasoning Output**:
        [Your detailed analysis based on the above instructions, incorporating insights from the additional context]
        """

        user_prompt = state['user_prompt']

From b7b3e9660f02b346f1159b7cf7a52be6ce37b4f7 Mon Sep 17 00:00:00 2001
From: Matteo Vedovati
Date: Thu, 26 Sep 2024 18:33:22 +0200
Subject: [PATCH 04/36] reasoning node refactoring

---
 scrapegraphai/nodes/reasoning_node.py | 78 ++-----------------
 scrapegraphai/prompts/__init__.py | 3 +-
 .../prompts/reasoning_node_prompts.py | 72 +++++++++++++++++
 3 files changed, 81 insertions(+), 72 deletions(-)
 create mode 100644 scrapegraphai/prompts/reasoning_node_prompts.py

diff --git a/scrapegraphai/nodes/reasoning_node.py b/scrapegraphai/nodes/reasoning_node.py
index e054eeca..3c65bbc4 100644
--- a/scrapegraphai/nodes/reasoning_node.py
+++ b/scrapegraphai/nodes/reasoning_node.py
@@ -12,10 +12,15 @@
 from tqdm import tqdm
 from .base_node import BaseNode
 from ..utils import transform_schema
+from ..prompts import (
+    TEMPLATE_REASONING, TEMPLATE_REASONING_WITH_CONTEXT
+)

 class ReasoningNode(BaseNode):
     """
-    ...
+    A node that refines the user prompt using the schema and any additional context, and
+    creates a precise prompt in subsequent steps that explicitly links elements in the user's
+    original input to their corresponding representations in the JSON schema.

     Attributes:
         llm_model: An instance of a language model client, configured for generating answers.
@@ -55,7 +60,7 @@ def __init__(

     def execute(self, state: dict) -> dict:
         """
-        ...
+        Generate a refined prompt for the reasoning task based on the user's input and the JSON schema.

         Args:
             state (dict): The current state of the graph. The input keys will be used
             to fetch the correct data from the state.

         self.logger.info(f"--- Executing {self.node_name} Node ---")

        TEMPLATE_REASONING = """
        **Task**: Analyze the user's request and the provided JSON schema to guide an LLM in extracting information directly from HTML.

        **User's Request**:
        {user_input}

        **Target JSON Schema**:
        ```json
        {json_schema}
        ```

        **Analysis Instructions**:
        1. **Interpret User Request:**
        * Identify the key information types or entities the user is seeking.
        * Note any specific attributes, relationships, or constraints mentioned.

        2. **Map to JSON Schema**:
        * For each identified element in the user request, locate its corresponding field in the JSON schema.
        * Explain how the schema structure represents the requested information.
        * Highlight any relevant schema elements not explicitly mentioned in the user's request.

        3. **Data Transformation Guidance**:
        * Provide guidance on any necessary transformations to align extracted data with the JSON schema requirements.
- - This analysis will be used to instruct an LLM that has the HTML content in its context. The LLM will use this guidance to extract the information and return it directly in the specified JSON format. - - **Reasoning Output**: - [Your detailed analysis based on the above instructions] - """ - - TEMPLATE_REASONING_WITH_CONTEXT = """ - **Task**: Analyze the user's request, provided JSON schema, and additional context to guide an LLM in extracting information directly from HTML. - - **User's Request**: - {user_input} - - **Target JSON Schema**: - ```json - {json_schema} - ``` - - **Additional Context**: - {additional_context} - - **Analysis Instructions**: - 1. **Interpret User Request and Context:** - * Identify the key information types or entities the user is seeking. - * Note any specific attributes, relationships, or constraints mentioned. - * Incorporate insights from the additional context to refine understanding of the task. - - 2. **Map to JSON Schema**: - * For each identified element in the user request, locate its corresponding field in the JSON schema. - * Explain how the schema structure represents the requested information. - * Highlight any relevant schema elements not explicitly mentioned in the user's request. - - 3. **Extraction Strategy**: - * Based on the additional context, suggest specific strategies for locating and extracting the required information from the HTML. - * Highlight any potential challenges or special considerations mentioned in the context. - - 4. **Data Transformation Guidance**: - * Provide guidance on any necessary transformations to align extracted data with the JSON schema requirements. - * Note any special formatting, validation, or business logic considerations from the additional context. - - This analysis will be used to instruct an LLM that has the HTML content in its context. The LLM will use this guidance to extract the information and return it directly in the specified JSON format. - - **Reasoning Output**: - [Your detailed analysis based on the above instructions, incorporating insights from the additional context] - """ user_prompt = state['user_prompt'] diff --git a/scrapegraphai/prompts/__init__.py b/scrapegraphai/prompts/__init__.py index f7be89c1..ab34580b 100644 --- a/scrapegraphai/prompts/__init__.py +++ b/scrapegraphai/prompts/__init__.py @@ -18,4 +18,5 @@ TEMPLATE_EXECUTION_ANALYSIS, TEMPLATE_EXECUTION_CODE_GENERATION, TEMPLATE_VALIDATION_ANALYSIS, TEMPLATE_VALIDATION_CODE_GENERATION, TEMPLATE_SEMANTIC_COMPARISON, TEMPLATE_SEMANTIC_ANALYSIS, - TEMPLATE_SEMANTIC_CODE_GENERATION) \ No newline at end of file + TEMPLATE_SEMANTIC_CODE_GENERATION) +from .reasoning_node_prompts import TEMPLATE_REASONING, TEMPLATE_REASONING_WITH_CONTEXT \ No newline at end of file diff --git a/scrapegraphai/prompts/reasoning_node_prompts.py b/scrapegraphai/prompts/reasoning_node_prompts.py new file mode 100644 index 00000000..2ecd96e3 --- /dev/null +++ b/scrapegraphai/prompts/reasoning_node_prompts.py @@ -0,0 +1,72 @@ +""" +Reasoning prompts helper +""" + +TEMPLATE_REASONING = """ +**Task**: Analyze the user's request and the provided JSON schema to guide an LLM in extracting information directly from HTML. + +**User's Request**: +{user_input} + +**Target JSON Schema**: +```json +{json_schema} +``` + +**Analysis Instructions**: +1. **Interpret User Request:** +* Identify the key information types or entities the user is seeking. +* Note any specific attributes, relationships, or constraints mentioned. + +2. 
**Map to JSON Schema**: +* For each identified element in the user request, locate its corresponding field in the JSON schema. +* Explain how the schema structure represents the requested information. +* Highlight any relevant schema elements not explicitly mentioned in the user's request. + +3. **Data Transformation Guidance**: +* Provide guidance on any necessary transformations to align extracted data with the JSON schema requirements. + +This analysis will be used to instruct an LLM that has the HTML content in its context. The LLM will use this guidance to extract the information and return it directly in the specified JSON format. + +**Reasoning Output**: +[Your detailed analysis based on the above instructions] +""" + +TEMPLATE_REASONING_WITH_CONTEXT = """ +**Task**: Analyze the user's request, provided JSON schema, and additional context to guide an LLM in extracting information directly from HTML. + +**User's Request**: +{user_input} + +**Target JSON Schema**: +```json +{json_schema} +``` + +**Additional Context**: +{additional_context} + +**Analysis Instructions**: +1. **Interpret User Request and Context:** +* Identify the key information types or entities the user is seeking. +* Note any specific attributes, relationships, or constraints mentioned. +* Incorporate insights from the additional context to refine understanding of the task. + +2. **Map to JSON Schema**: +* For each identified element in the user request, locate its corresponding field in the JSON schema. +* Explain how the schema structure represents the requested information. +* Highlight any relevant schema elements not explicitly mentioned in the user's request. + +3. **Extraction Strategy**: +* Based on the additional context, suggest specific strategies for locating and extracting the required information from the HTML. +* Highlight any potential challenges or special considerations mentioned in the context. + +4. **Data Transformation Guidance**: +* Provide guidance on any necessary transformations to align extracted data with the JSON schema requirements. +* Note any special formatting, validation, or business logic considerations from the additional context. + +This analysis will be used to instruct an LLM that has the HTML content in its context. The LLM will use this guidance to extract the information and return it directly in the specified JSON format. 
+ +**Reasoning Output**: +[Your detailed analysis based on the above instructions, incorporating insights from the additional context] +""" \ No newline at end of file From afa9aa3fe78ffdf82c9faad82ae5a25375f1674d Mon Sep 17 00:00:00 2001 From: Matteo Vedovati Date: Thu, 26 Sep 2024 18:37:40 +0200 Subject: [PATCH 05/36] import refactoring --- scrapegraphai/nodes/generate_code_node.py | 9 +++------ scrapegraphai/nodes/html_analyzer_node.py | 3 --- scrapegraphai/nodes/prompt_refiner_node.py | 5 ----- scrapegraphai/nodes/reasoning_node.py | 5 ----- 4 files changed, 3 insertions(+), 19 deletions(-) diff --git a/scrapegraphai/nodes/generate_code_node.py b/scrapegraphai/nodes/generate_code_node.py index 1174a4aa..bcb7ea74 100644 --- a/scrapegraphai/nodes/generate_code_node.py +++ b/scrapegraphai/nodes/generate_code_node.py @@ -5,17 +5,16 @@ from langchain.prompts import PromptTemplate from langchain.output_parsers import ResponseSchema, StructuredOutputParser from langchain_core.output_parsers import StrOutputParser -from langchain_core.runnables import RunnableParallel -from langchain_core.utils.pydantic import is_basemodel_subclass from langchain_community.chat_models import ChatOllama import ast import sys from io import StringIO from bs4 import BeautifulSoup import re -from tqdm import tqdm -from .base_node import BaseNode +import json +from jsonschema import validate, ValidationError from pydantic import ValidationError +from .base_node import BaseNode from ..utils import (transform_schema, extract_code, syntax_focused_analysis, syntax_focused_code_generation, @@ -23,8 +22,6 @@ validation_focused_analysis, validation_focused_code_generation, semantic_focused_analysis, semantic_focused_code_generation, are_content_equal) -from jsonschema import validate, ValidationError -import json from ..prompts import ( TEMPLATE_INIT_CODE_GENERATION, TEMPLATE_SEMANTIC_COMPARISON ) diff --git a/scrapegraphai/nodes/html_analyzer_node.py b/scrapegraphai/nodes/html_analyzer_node.py index b07c4040..26304dcd 100644 --- a/scrapegraphai/nodes/html_analyzer_node.py +++ b/scrapegraphai/nodes/html_analyzer_node.py @@ -4,10 +4,7 @@ from typing import List, Optional from langchain.prompts import PromptTemplate from langchain_core.output_parsers import StrOutputParser -from langchain_core.runnables import RunnableParallel -from langchain_core.utils.pydantic import is_basemodel_subclass from langchain_community.chat_models import ChatOllama -from tqdm import tqdm from .base_node import BaseNode from ..utils import reduce_html from ..prompts import ( diff --git a/scrapegraphai/nodes/prompt_refiner_node.py b/scrapegraphai/nodes/prompt_refiner_node.py index dfb62eb6..7cc53020 100644 --- a/scrapegraphai/nodes/prompt_refiner_node.py +++ b/scrapegraphai/nodes/prompt_refiner_node.py @@ -4,12 +4,7 @@ from typing import List, Optional from langchain.prompts import PromptTemplate from langchain_core.output_parsers import StrOutputParser -from langchain_core.runnables import RunnableParallel -from langchain_core.utils.pydantic import is_basemodel_subclass -from langchain_openai import ChatOpenAI, AzureChatOpenAI -from langchain_mistralai import ChatMistralAI from langchain_community.chat_models import ChatOllama -from tqdm import tqdm from .base_node import BaseNode from ..utils import transform_schema from ..prompts import ( diff --git a/scrapegraphai/nodes/reasoning_node.py b/scrapegraphai/nodes/reasoning_node.py index 3c65bbc4..431d8ab1 100644 --- a/scrapegraphai/nodes/reasoning_node.py +++ 
b/scrapegraphai/nodes/reasoning_node.py
@@ -4,12 +4,7 @@
 from typing import List, Optional
 from langchain.prompts import PromptTemplate
 from langchain_core.output_parsers import StrOutputParser
-from langchain_core.runnables import RunnableParallel
-from langchain_core.utils.pydantic import is_basemodel_subclass
-from langchain_openai import ChatOpenAI, AzureChatOpenAI
-from langchain_mistralai import ChatMistralAI
 from langchain_community.chat_models import ChatOllama
-from tqdm import tqdm
 from .base_node import BaseNode
 from ..utils import transform_schema
 from ..prompts import (

From 9fa109453f9f6a6cc60e88a5f6d787e075d51f7f Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra <88108002+VinciGit00@users.noreply.github.com>
Date: Fri, 27 Sep 2024 14:09:35 +0200
Subject: [PATCH 06/36] Update reasoning_node_prompts.py

---
 scrapegraphai/prompts/reasoning_node_prompts.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scrapegraphai/prompts/reasoning_node_prompts.py b/scrapegraphai/prompts/reasoning_node_prompts.py
index 2ecd96e3..47ceaa41 100644
--- a/scrapegraphai/prompts/reasoning_node_prompts.py
+++ b/scrapegraphai/prompts/reasoning_node_prompts.py
@@ -3,7 +3,7 @@
 """
 
 TEMPLATE_REASONING = """
-**Task**: Analyze the user's request and the provided JSON schema to guide an LLM in extracting information directly from HTML.
+**Task**: Analyze the user's request and the provided JSON schema to guide an LLM in extracting information directly from a markdown file previously parsed from an HTML file.
 
 **User's Request**:
 {user_input}
@@ -33,7 +33,7 @@
 """
 
 TEMPLATE_REASONING_WITH_CONTEXT = """
-**Task**: Analyze the user's request, provided JSON schema, and additional context to guide an LLM in extracting information directly from HTML.
+**Task**: Analyze the user's request, the provided JSON schema, and the additional context to guide an LLM in extracting information directly from a markdown file previously parsed from an HTML file.
 
 **User's Request**:
 {user_input}
@@ -69,4 +69,4 @@
 
 **Reasoning Output**:
 [Your detailed analysis based on the above instructions, incorporating insights from the additional context]
-"""
\ No newline at end of file
+"""
From bdcffd6360237b27797546a198ceece55ce4bc81 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Fri, 27 Sep 2024 16:41:45 +0200
Subject: [PATCH 07/36] feat: add html_mode to smart_scraper

---
 examples/extras/html_mode.py | 48 +++++++++++++++++
 scrapegraphai/graphs/smart_scraper_graph.py | 57 +++++++++++++--------
 2 files changed, 85 insertions(+), 20 deletions(-)
 create mode 100644 examples/extras/html_mode.py

diff --git a/examples/extras/html_mode.py b/examples/extras/html_mode.py
new file mode 100644
index 00000000..c13ba694
--- /dev/null
+++ b/examples/extras/html_mode.py
@@ -0,0 +1,48 @@
+"""
+Basic example of scraping pipeline using SmartScraper
+By default smart scraper converts in md format the
+code.
+""" + +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + + +graph_config = { + "llm": { + "api_key": os.getenv("OPENAI_API_KEY"), + "model": "openai/gpt-4o", + }, + "html_mode": True, + "verbose": True, + "headless": False, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me what does the company do, the name and a contact email.", + source="https://scrapegraphai.com/", + config=graph_config +) + +result = smart_scraper_graph.run() +print(json.dumps(result, indent=4)) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index 0c025c3a..7792ed58 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ -69,14 +69,7 @@ def _create_graph(self) -> BaseGraph: "scrape_do": self.config.get("scrape_do") } ) - parse_node = ParseNode( - input="doc", - output=["parsed_doc"], - node_config={ - "llm_model": self.llm_model, - "chunk_size": self.model_token - } - ) + generate_answer_node = GenerateAnswerNode( input="user_prompt & (relevant_chunks | parsed_doc | doc)", @@ -88,19 +81,43 @@ def _create_graph(self) -> BaseGraph: } ) + if self.config.get("html_mode") is not True: + + parse_node = ParseNode( + input="doc", + output=["parsed_doc"], + node_config={ + "llm_model": self.llm_model, + "chunk_size": self.model_token + } + ) + + return BaseGraph( + nodes=[ + fetch_node, + parse_node, + generate_answer_node, + ], + edges=[ + (fetch_node, parse_node), + (parse_node, generate_answer_node) + ], + entry_point=fetch_node, + graph_name=self.__class__.__name__ + ) + return BaseGraph( - nodes=[ - fetch_node, - parse_node, - generate_answer_node, - ], - edges=[ - (fetch_node, parse_node), - (parse_node, generate_answer_node) - ], - entry_point=fetch_node, - graph_name=self.__class__.__name__ - ) + nodes=[ + fetch_node, + generate_answer_node, + ], + edges=[ + (fetch_node, generate_answer_node) + ], + entry_point=fetch_node, + graph_name=self.__class__.__name__ + ) + def run(self) -> str: """ From 1e4ee3abdf8dce321977bbc74f1976fba33877bc Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Fri, 27 Sep 2024 16:42:51 +0200 Subject: [PATCH 08/36] Update html_mode.py --- examples/extras/html_mode.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/extras/html_mode.py b/examples/extras/html_mode.py index c13ba694..6e2670a0 100644 --- a/examples/extras/html_mode.py +++ b/examples/extras/html_mode.py @@ -1,7 +1,8 @@ """ Basic example of scraping pipeline using SmartScraper By default smart scraper converts in md format the -code. +code. 
If you want to use the original code instead, you have
+to specify it in the config.
 """

From 4330179cb65674d65423c1763f90182e85c15a74 Mon Sep 17 00:00:00 2001
From: semantic-release-bot
Date: Fri, 27 Sep 2024 14:47:04 +0000
Subject: [PATCH 09/36] ci(release): 1.22.0-beta.4 [skip ci]

## [1.22.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.3...v1.22.0-beta.4) (2024-09-27)


### Features

* add html_mode to smart_scraper ([bdcffd6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bdcffd6360237b27797546a198ceece55ce4bc81))

---
 CHANGELOG.md   | 7 +++++++
 pyproject.toml | 2 +-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 70bcbbde..bcc66ecd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,10 @@
+## [1.22.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.3...v1.22.0-beta.4) (2024-09-27)
+
+
+### Features
+
+* add html_mode to smart_scraper ([bdcffd6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bdcffd6360237b27797546a198ceece55ce4bc81))
+
 ## [1.22.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.2...v1.22.0-beta.3) (2024-09-25)

diff --git a/pyproject.toml b/pyproject.toml
index b7e0b1cc..fc61a859 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
 [project]
 name = "scrapegraphai"
 
-version = "1.22.0b3"
+version = "1.22.0b4"
 
 description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
 authors = [

From b2822f620a610e61d295cbf4b670aa08fde9de24 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Fri, 27 Sep 2024 17:45:44 +0200
Subject: [PATCH 10/36] feat: add reasoning integration

---
 examples/extras/reasoning.py                | 46 +++++++++++++++++
 scrapegraphai/graphs/smart_scraper_graph.py | 28 +++++++++++
 scrapegraphai/nodes/__init__.py             |  2 +-
 scrapegraphai/nodes/reasoning_node.py       |  9 ++--
 .../prompts/reasoning_node_prompts.py       |  2 +-
 5 files changed, 81 insertions(+), 6 deletions(-)
 create mode 100644 examples/extras/reasoning.py

diff --git a/examples/extras/reasoning.py b/examples/extras/reasoning.py
new file mode 100644
index 00000000..80e57faa
--- /dev/null
+++ b/examples/extras/reasoning.py
@@ -0,0 +1,46 @@
+"""
+Basic example of scraping pipeline using SmartScraper
+"""
+
+import os
+import json
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SmartScraperGraph
+from scrapegraphai.utils import prettify_exec_info
+
+load_dotenv()
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+
+graph_config = {
+    "llm": {
+        "api_key": os.getenv("OPENAI_API_KEY"),
+        "model": "openai/gpt-4o",
+    },
+    "reasoning": True,
+    "verbose": True,
+    "headless": False,
+}
+
+# ************************************************
+# Create the SmartScraperGraph instance and run it
+# ************************************************
+
+smart_scraper_graph = SmartScraperGraph(
+    prompt="List me what does the company do, the name and a contact email.",
+    source="https://scrapegraphai.com/",
+    config=graph_config
+)
+
+result = smart_scraper_graph.run()
+print(json.dumps(result, indent=4))
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = smart_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))

diff --git a/scrapegraphai/graphs/smart_scraper_graph.py
b/scrapegraphai/graphs/smart_scraper_graph.py
index 0c025c3a..95c2b460 100644
--- a/scrapegraphai/graphs/smart_scraper_graph.py
+++ b/scrapegraphai/graphs/smart_scraper_graph.py
@@ -9,6 +9,7 @@
 from ..nodes import (
     FetchNode,
     ParseNode,
+    ReasoningNode,
     GenerateAnswerNode
 )
 
@@ -88,6 +89,33 @@ def _create_graph(self) -> BaseGraph:
             }
         )
 
+        if self.config.get("reasoning"):
+            reasoning_node = ReasoningNode(
+                input="user_prompt & (relevant_chunks | parsed_doc | doc)",
+                output=["answer"],
+                node_config={
+                    "llm_model": self.llm_model,
+                    "additional_info": self.config.get("additional_info"),
+                    "schema": self.schema,
+                }
+            )
+
+            return BaseGraph(
+                nodes=[
+                    fetch_node,
+                    parse_node,
+                    reasoning_node,
+                    generate_answer_node,
+                ],
+                edges=[
+                    (fetch_node, parse_node),
+                    (parse_node, reasoning_node),
+                    (reasoning_node, generate_answer_node)
+                ],
+                entry_point=fetch_node,
+                graph_name=self.__class__.__name__
+            )
+
         return BaseGraph(
             nodes=[
                 fetch_node,

diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py
index 2a0f261a..7ed99808 100644
--- a/scrapegraphai/nodes/__init__.py
+++ b/scrapegraphai/nodes/__init__.py
@@ -26,4 +26,4 @@
 from .prompt_refiner_node import PromptRefinerNode
 from .html_analyzer_node import HtmlAnalyzerNode
 from .generate_code_node import GenerateCodeNode
-from .reasoning_node import ReasoningNode \ No newline at end of file
+from .reasoning_node import ReasoningNode

diff --git a/scrapegraphai/nodes/reasoning_node.py b/scrapegraphai/nodes/reasoning_node.py
index 431d8ab1..6b91155c 100644
--- a/scrapegraphai/nodes/reasoning_node.py
+++ b/scrapegraphai/nodes/reasoning_node.py
@@ -50,12 +50,13 @@ def __init__(
         )
 
         self.additional_info = node_config.get("additional_info", None)
-
+
         self.output_schema = node_config.get("schema")
 
     def execute(self, state: dict) -> dict:
         """
-        Generate a refined prompt for the reasoning task based on the user's input and the JSON schema.
+        Generate a refined prompt for the reasoning task based
+        on the user's input and the JSON schema.
 
         Args:
             state (dict): The current state of the graph. The input keys will be used
@@ -70,11 +71,11 @@ def execute(self, state: dict) -> dict:
         """
 
         self.logger.info(f"--- Executing {self.node_name} Node ---")
-
+
         user_prompt = state['user_prompt']
 
         self.simplefied_schema = transform_schema(self.output_schema.schema())
-
+
         if self.additional_info is not None:
             prompt = PromptTemplate(
                 template=TEMPLATE_REASONING_WITH_CONTEXT,

diff --git a/scrapegraphai/prompts/reasoning_node_prompts.py b/scrapegraphai/prompts/reasoning_node_prompts.py
index 47ceaa41..d9caf937 100644
--- a/scrapegraphai/prompts/reasoning_node_prompts.py
+++ b/scrapegraphai/prompts/reasoning_node_prompts.py
@@ -31,7 +31,7 @@
 **Reasoning Output**:
 [Your detailed analysis based on the above instructions]
 """
-
+
 TEMPLATE_REASONING_WITH_CONTEXT = """
 **Task**: Analyze the user's request, the provided JSON schema, and the additional context to guide an LLM in extracting information directly from a markdown file previously parsed from an HTML file.
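Patches 07 and 10 above introduce two independent `SmartScraperGraph` switches, `html_mode` and `reasoning`. Since `ReasoningNode.execute` calls `transform_schema(self.output_schema.schema())` unconditionally, the `reasoning` flag is meant to be enabled together with a Pydantic output schema; turning it on without one would fail on the `None` schema. A minimal sketch of that intended usage (the `Company` model below is illustrative and not part of these patches):

```python
import os
import json

from pydantic import BaseModel, Field
from scrapegraphai.graphs import SmartScraperGraph

# Illustrative output schema: ReasoningNode runs it through
# transform_schema() to map the user's request onto the JSON schema.
class Company(BaseModel):
    name: str = Field(description="Name of the company")
    what_it_does: str = Field(description="What the company does")
    contact_email: str = Field(description="A contact email address")

graph_config = {
    "llm": {
        "api_key": os.getenv("OPENAI_API_KEY"),
        "model": "openai/gpt-4o",
    },
    # Inserts ReasoningNode between ParseNode and GenerateAnswerNode
    "reasoning": True,
    "verbose": True,
}

smart_scraper_graph = SmartScraperGraph(
    prompt="List me what does the company do, the name and a contact email.",
    source="https://scrapegraphai.com/",
    config=graph_config,
    schema=Company,
)

print(json.dumps(smart_scraper_graph.run(), indent=4))
```

As wired in this patch, `_create_graph` does not yet account for `html_mode`; patch 12 below adds the branches that let the two flags coexist.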
From 6d8f5435d1ecd2d90b06aade50abc064f75c9d78 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Fri, 27 Sep 2024 15:51:48 +0000 Subject: [PATCH 11/36] ci(release): 1.22.0-beta.5 [skip ci] ## [1.22.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.4...v1.22.0-beta.5) (2024-09-27) ### Features * add reasoning integration ([b2822f6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b2822f620a610e61d295cbf4b670aa08fde9de24)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bcc66ecd..689eeec3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.22.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.4...v1.22.0-beta.5) (2024-09-27) + + +### Features + +* add reasoning integration ([b2822f6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b2822f620a610e61d295cbf4b670aa08fde9de24)) + ## [1.22.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.3...v1.22.0-beta.4) (2024-09-27) diff --git a/pyproject.toml b/pyproject.toml index fc61a859..ef0b104a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.22.0b4" +version = "1.22.0b5" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [ From f87ffa1d8db32b38c47d9f5aa2ae88f1d7978a04 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Fri, 27 Sep 2024 18:31:42 +0200 Subject: [PATCH 12/36] fix: integration with html_mode --- scrapegraphai/graphs/smart_scraper_graph.py | 44 +++++++++++++++++---- 1 file changed, 37 insertions(+), 7 deletions(-) diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index 4ffc6bed..65f03a24 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ -70,7 +70,6 @@ def _create_graph(self) -> BaseGraph: "scrape_do": self.config.get("scrape_do") } ) - generate_answer_node = GenerateAnswerNode( input="user_prompt & (relevant_chunks | parsed_doc | doc)", @@ -82,14 +81,15 @@ def _create_graph(self) -> BaseGraph: } ) - if self.config.get("html_mode") is not True: - + if self.config.get("html_mode") is False: parse_node = ParseNode( input="doc", output=["parsed_doc"], node_config={ "llm_model": self.llm_model, "chunk_size": self.model_token + } + ) if self.config.get("reasoning"): reasoning_node = ReasoningNode( @@ -102,17 +102,17 @@ def _create_graph(self) -> BaseGraph: } ) + if self.config.get("html_mode") is False and self.config.get("reasoning") is True: + return BaseGraph( nodes=[ fetch_node, parse_node, - reasoning_node, generate_answer_node, ], edges=[ (fetch_node, parse_node), - (parse_node, generate_answer_node) (parse_node, reasoning_node), (reasoning_node, generate_answer_node) ], @@ -120,18 +120,48 @@ def _create_graph(self) -> BaseGraph: graph_name=self.__class__.__name__ ) - return BaseGraph( + elif self.config.get("html_mode") is True and self.config.get("reasoning") is True: + + return BaseGraph( nodes=[ fetch_node, + reasoning_node, generate_answer_node, ], edges=[ - (fetch_node, generate_answer_node) + (fetch_node, reasoning_node), + (reasoning_node, generate_answer_node) ], entry_point=fetch_node, graph_name=self.__class__.__name__ ) + elif self.config.get("html_mode") is True and self.config.get("reasoning") is False: + return BaseGraph( + nodes=[ + fetch_node, + 
generate_answer_node,
+            ],
+            edges=[
+                (fetch_node, generate_answer_node)
+            ],
+            entry_point=fetch_node,
+            graph_name=self.__class__.__name__
+        )
+
+        return BaseGraph(
+            nodes=[
+                fetch_node,
+                parse_node,
+                generate_answer_node,
+            ],
+            edges=[
+                (fetch_node, parse_node),
+                (parse_node, generate_answer_node)
+            ],
+            entry_point=fetch_node,
+            graph_name=self.__class__.__name__
+        )
 
     def run(self) -> str:
         """

From 39f78154a6f1123fa8aca5e169c803111c175473 Mon Sep 17 00:00:00 2001
From: semantic-release-bot
Date: Sat, 28 Sep 2024 10:42:13 +0000
Subject: [PATCH 13/36] ci(release): 1.22.0-beta.6 [skip ci]

## [1.22.0-beta.6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.5...v1.22.0-beta.6) (2024-09-28)


### Bug Fixes

* integration with html_mode ([f87ffa1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f87ffa1d8db32b38c47d9f5aa2ae88f1d7978a04))

---
 CHANGELOG.md   | 7 +++++++
 pyproject.toml | 2 +-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 689eeec3..bf3a3bc9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,10 @@
+## [1.22.0-beta.6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.5...v1.22.0-beta.6) (2024-09-28)
+
+
+### Bug Fixes
+
+* integration with html_mode ([f87ffa1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f87ffa1d8db32b38c47d9f5aa2ae88f1d7978a04))
+
 ## [1.22.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.4...v1.22.0-beta.5) (2024-09-27)

diff --git a/pyproject.toml b/pyproject.toml
index ef0b104a..da9fdc9c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
 [project]
 name = "scrapegraphai"
 
-version = "1.22.0b5"
+version = "1.22.0b6"
 
 description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
 authors = [

From ea27b2499ef5dccc46aab8bc7cdc987cfc6e6c20 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Mon, 30 Sep 2024 11:52:14 +0200
Subject: [PATCH 14/36] add empty nodes

---
 README.md                                     |  19 +-
 pyproject.toml                                |   5 +-
 scrapegraphai/nodes/__init__.py               |   3 +
 scrapegraphai/nodes/description_node.py       |  42 +++++
 scrapegraphai/nodes/fetch_node_level_k.py     |  42 +++++
 .../nodes/generate_answer_node_k_level.py     |  50 ++++++
 scrapegraphai/nodes/generate_code_node.py     |  19 +-
 scrapegraphai/nodes/rag_node.py               | 163 +++--------------
 8 files changed, 178 insertions(+), 165 deletions(-)
 create mode 100644 scrapegraphai/nodes/description_node.py
 create mode 100644 scrapegraphai/nodes/fetch_node_level_k.py
 create mode 100644 scrapegraphai/nodes/generate_answer_node_k_level.py

diff --git a/README.md b/README.md
index cf437203..51bc3fa9 100644
--- a/README.md
+++ b/README.md
@@ -38,10 +38,9 @@ Additional dependecies can be added while installing the library:
 
 - More Language Models: additional language models are installed, such as Fireworks, Groq, Anthropic, Hugging Face, and Nvidia AI Endpoints.
 
-
-This group allows you to use additional language models like Fireworks, Groq, Anthropic, Together AI, Hugging Face, and Nvidia AI Endpoints.
-```bash
-pip install scrapegraphai[other-language-models]
+  This group allows you to use additional language models like Fireworks, Groq, Anthropic, Together AI, Hugging Face, and Nvidia AI Endpoints.
+  ```bash
+  pip install scrapegraphai[other-language-models]
 ```
 
 - Semantic Options: this group includes tools for advanced semantic processing, such as Graphviz.
@@ -55,23 +54,15 @@ pip install scrapegraphai[other-language-models]
   pip install scrapegraphai[more-browser-options]
   ```
 
-- faiss Options: this group includes faiss integration
+- Qdrant Options: this group includes the Qdrant integration for RAGNode and DeepScraperGraph.
   ```bash
-  pip install scrapegraphai[faiss-cpu]
+  pip install scrapegraphai[qdrant]
   ```
-
-
-### Installing "More Browser Options"
-
-This group includes an ocr scraper for websites
-```bash
-pip install scrapegraphai[screenshot_scraper]
-```
-
 ## 💻 Usage
 
 There are multiple standard scraping pipelines that can be used to extract information from a website (or local file).

diff --git a/pyproject.toml b/pyproject.toml
index 26b1fdb7..dde97395 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -100,8 +100,9 @@ screenshot_scraper = [
 ]
 
 # Group 5: Faiss CPU
-faiss-cpu = [
-    "faiss-cpu>=1.8.0",
+qdrant = [
+    "qdrant-client>=1.11.3",
+    "fastembed>=0.3.6"
 ]
 
 [build-system]

diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py
index ec16c48e..e5fafb87 100644
--- a/scrapegraphai/nodes/__init__.py
+++ b/scrapegraphai/nodes/__init__.py
@@ -28,3 +28,6 @@
 from .generate_code_node import GenerateCodeNode
 from .search_node_with_context import SearchLinksWithContext
 from .reasoning_node import ReasoningNode
+from .fetch_node_level_k import FetchNodelevelK
+from .generate_answer_node_k_level import GenerateAnswerNodeKLevel
+from .description_node import DescriptionNode

diff --git a/scrapegraphai/nodes/description_node.py b/scrapegraphai/nodes/description_node.py
new file mode 100644
index 00000000..49ab941f
--- /dev/null
+++ b/scrapegraphai/nodes/description_node.py
@@ -0,0 +1,42 @@
+"""
+DescriptionNode Module
+"""
+from typing import List, Optional
+from .base_node import BaseNode
+
+class DescriptionNode(BaseNode):
+    """
+    A node responsible for compressing the input tokens and storing the document
+    in a vector database for retrieval. Relevant chunks are stored in the state.
+
+    It allows scraping of big documents without exceeding the token limit of the language model.
+
+    Attributes:
+        llm_model: An instance of a language model client, configured for generating answers.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+
+    Args:
+        input (str): Boolean expression defining the input keys needed from the state.
+        output (List[str]): List of output keys to be updated in the state.
+        node_config (dict): Additional configuration for the node.
+        node_name (str): The unique identifier name for the node, defaulting to "Parse".
+ """ + + def __init__( + self, + input: str, + output: List[str], + node_config: Optional[dict] = None, + node_name: str = "RAG", + ): + super().__init__(node_name, "node", input, output, 2, node_config) + + self.llm_model = node_config["llm_model"] + self.embedder_model = node_config.get("embedder_model", None) + self.verbose = ( + False if node_config is None else node_config.get("verbose", False) + ) + self.cache_path = node_config.get("cache_path", False) + + def execute(self, state: dict) -> dict: + pass diff --git a/scrapegraphai/nodes/fetch_node_level_k.py b/scrapegraphai/nodes/fetch_node_level_k.py new file mode 100644 index 00000000..18a0d435 --- /dev/null +++ b/scrapegraphai/nodes/fetch_node_level_k.py @@ -0,0 +1,42 @@ +""" +FetchNodelevelK Module +""" +from typing import List, Optional +from .base_node import BaseNode + +class FetchNodelevelK(BaseNode): + """ + A node responsible for compressing the input tokens and storing the document + in a vector database for retrieval. Relevant chunks are stored in the state. + + It allows scraping of big documents without exceeding the token limit of the language model. + + Attributes: + llm_model: An instance of a language model client, configured for generating answers. + verbose (bool): A flag indicating whether to show print statements during execution. + + Args: + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. + node_name (str): The unique identifier name for the node, defaulting to "Parse". + """ + + def __init__( + self, + input: str, + output: List[str], + node_config: Optional[dict] = None, + node_name: str = "RAG", + ): + super().__init__(node_name, "node", input, output, 2, node_config) + + self.llm_model = node_config["llm_model"] + self.embedder_model = node_config.get("embedder_model", None) + self.verbose = ( + False if node_config is None else node_config.get("verbose", False) + ) + self.cache_path = node_config.get("cache_path", False) + + def execute(self, state: dict) -> dict: + pass diff --git a/scrapegraphai/nodes/generate_answer_node_k_level.py b/scrapegraphai/nodes/generate_answer_node_k_level.py new file mode 100644 index 00000000..1d4cdb4d --- /dev/null +++ b/scrapegraphai/nodes/generate_answer_node_k_level.py @@ -0,0 +1,50 @@ +""" +GenerateAnswerNodeKLevel Module +""" +from typing import List, Optional +from .base_node import BaseNode + +class GenerateAnswerNodeKLevel(BaseNode): + """ + A node responsible for compressing the input tokens and storing the document + in a vector database for retrieval. Relevant chunks are stored in the state. + + It allows scraping of big documents without exceeding the token limit of the language model. + + Attributes: + llm_model: An instance of a language model client, configured for generating answers. + verbose (bool): A flag indicating whether to show print statements during execution. + + Args: + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. + node_name (str): The unique identifier name for the node, defaulting to "Parse". 
+ """ + + def __init__( + self, + input: str, + output: List[str], + node_config: Optional[dict] = None, + node_name: str = "GANLK", + ): + super().__init__(node_name, "node", input, output, 2, node_config) + + self.llm_model = node_config["llm_model"] + self.embedder_model = node_config.get("embedder_model", None) + self.verbose = ( + False if node_config is None else node_config.get("verbose", False) + ) + + def execute(self, state: dict) -> dict: + client = state["vectorial_db"] + + answer = client.query( + collection_name="demo_collection", + query_text="This is a query document" + ) + + state["answer"] = answer + + return state diff --git a/scrapegraphai/nodes/generate_code_node.py b/scrapegraphai/nodes/generate_code_node.py index cc72aaf4..746b10a5 100644 --- a/scrapegraphai/nodes/generate_code_node.py +++ b/scrapegraphai/nodes/generate_code_node.py @@ -26,7 +26,6 @@ from .base_node import BaseNode from jsonschema import validate, ValidationError - class GenerateCodeNode(BaseNode): """ A node that generates Python code for a function that extracts data @@ -96,7 +95,7 @@ def execute(self, state: dict) -> dict: Raises: KeyError: If the input keys are not found in the state, indicating that the necessary information for generating an answer is missing. - RuntimeError: If the maximum number of iterations is + RuntimeError: If the maximum number of iterations is reached without obtaining the desired code. """ @@ -170,7 +169,7 @@ def overall_reasoning_loop(self, state: dict) -> dict: self.logger.info(f"--- (Checking if the informations exctrcated are the ones Requested) ---") state = self.semantic_comparison_loop(state) if state["errors"]["semantic"]: - continue + continue break if state["iteration"] == self.max_iterations["overall"] and \ @@ -195,9 +194,9 @@ def syntax_reasoning_loop(self, state: dict) -> dict: state["errors"]["syntax"] = [syntax_message] self.logger.info(f"--- (Synax Error Found: {syntax_message}) ---") analysis = syntax_focused_analysis(state, self.llm_model) - self.logger.info(f"""--- (Regenerating Code + self.logger.info(f"""--- (Regenerating Code to fix the Error) ---""") - state["generated_code"] = syntax_focused_code_generation(state, + state["generated_code"] = syntax_focused_code_generation(state, analysis, self.llm_model) state["generated_code"] = extract_code(state["generated_code"]) return state @@ -217,14 +216,14 @@ def execution_reasoning_loop(self, state: dict) -> dict: self.logger.info(f"--- (Code Execution Error: {execution_result}) ---") analysis = execution_focused_analysis(state, self.llm_model) self.logger.info(f"--- (Regenerating Code to fix the Error) ---") - state["generated_code"] = execution_focused_code_generation(state, + state["generated_code"] = execution_focused_code_generation(state, analysis, self.llm_model) state["generated_code"] = extract_code(state["generated_code"]) return state def validation_reasoning_loop(self, state: dict) -> dict: for _ in range(self.max_iterations["validation"]): - validation, errors = self.validate_dict(state["execution_result"], + validation, errors = self.validate_dict(state["execution_result"], self.output_schema.schema()) if validation: state["errors"]["validation"] = [] @@ -240,7 +239,7 @@ def validation_reasoning_loop(self, state: dict) -> dict: def semantic_comparison_loop(self, state: dict) -> dict: for _ in range(self.max_iterations["semantic"]): - comparison_result = self.semantic_comparison(state["execution_result"], + comparison_result = self.semantic_comparison(state["execution_result"], 
state["reference_answer"]) if comparison_result["are_semantically_equivalent"]: state["errors"]["semantic"] = [] @@ -342,7 +341,7 @@ def create_sandbox_and_execute(self, function_code): if not extract_data: raise NameError("Function 'extract_data' not found in the generated code.") - result = extract_data(self.raw_html) + result = extract_data(self.raw_html) return True, result except Exception as e: return False, f"Error during execution: {str(e)}" @@ -357,5 +356,5 @@ def validate_dict(self, data: dict, schema): validate(instance=data, schema=schema) return True, None except ValidationError as e: - errors = e.errors() + errors = [e.message] return False, errors diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py index 1174beee..c92e40f0 100644 --- a/scrapegraphai/nodes/rag_node.py +++ b/scrapegraphai/nodes/rag_node.py @@ -1,29 +1,9 @@ """ RAGNode Module """ -import os -import sys from typing import List, Optional -from langchain.docstore.document import Document -from langchain.retrievers import ContextualCompressionRetriever -from langchain.retrievers.document_compressors import ( - DocumentCompressorPipeline, - EmbeddingsFilter, -) -from langchain_community.document_transformers import EmbeddingsRedundantFilter -from langchain_community.vectorstores import FAISS -from langchain_community.chat_models import ChatOllama -from langchain_community.embeddings import OllamaEmbeddings -from langchain_aws import BedrockEmbeddings, ChatBedrock -from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI -from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings, ChatOpenAI, AzureChatOpenAI -from ..utils.logging import get_logger from .base_node import BaseNode -from ..helpers import models_tokens -from ..models import DeepSeek - -optional_modules = {"langchain_anthropic", "langchain_fireworks", - "langchain_groq", "langchain_google_vertexai"} +from qdrant_client import QdrantClient class RAGNode(BaseNode): """ @@ -34,7 +14,6 @@ class RAGNode(BaseNode): Attributes: llm_model: An instance of a language model client, configured for generating answers. - embedder_model: An instance of an embedding model client, configured for generating embeddings. verbose (bool): A flag indicating whether to show print statements during execution. Args: @@ -58,125 +37,31 @@ def __init__( self.verbose = ( False if node_config is None else node_config.get("verbose", False) ) - self.cache_path = node_config.get("cache_path", False) def execute(self, state: dict) -> dict: - # Execution logic - pass - - def _create_default_embedder(self, llm_config=None) -> object: - """ - Create an embedding model instance based on the chosen llm model. - - Returns: - object: An instance of the embedding model client. - Raises: - ValueError: If the model is not supported. 
- """ - - if isinstance(self.llm_model, ChatGoogleGenerativeAI): - return GoogleGenerativeAIEmbeddings( - google_api_key=llm_config["api_key"], model="models/embedding-001" - ) - if isinstance(self.llm_model, ChatOpenAI): - return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key, - base_url=self.llm_model.openai_api_base) - elif isinstance(self.llm_model, DeepSeek): - return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key) - elif isinstance(self.llm_model, AzureOpenAIEmbeddings): - return self.llm_model - elif isinstance(self.llm_model, AzureChatOpenAI): - return AzureOpenAIEmbeddings() - elif isinstance(self.llm_model, ChatOllama): - params = self.llm_model._lc_kwargs - params.pop("streaming", None) - params.pop("temperature", None) - return OllamaEmbeddings(**params) - elif isinstance(self.llm_model, ChatBedrock): - return BedrockEmbeddings(client=None, model_id=self.llm_model.model_id) - elif all(key in sys.modules for key in optional_modules): - if isinstance(self.llm_model, ChatFireworks): - from langchain_fireworks import FireworksEmbeddings - return FireworksEmbeddings(model=self.llm_model.model_name) - if isinstance(self.llm_model, ChatNVIDIA): - from langchain_nvidia import NVIDIAEmbeddings - return NVIDIAEmbeddings(model=self.llm_model.model_name) - if isinstance(self.llm_model, ChatHuggingFace): - from langchain_huggingface import HuggingFaceEmbeddings - return HuggingFaceEmbeddings(model=self.llm_model.model) - if isinstance(self.llm_model, ChatVertexAI): - from langchain_vertexai import VertexAIEmbeddings - return VertexAIEmbeddings() + if self.node_config.get("client_type") == "memory": + client = QdrantClient(":memory:") + elif self.node_config.get("client_type") == "local_db": + client = QdrantClient(path="path/to/db") + elif self.node_config.get("client_type") == "image": + client = QdrantClient(url="http://localhost:6333") else: - raise ValueError("Embedding Model missing or not supported") - - def _create_embedder(self, embedder_config: dict) -> object: - """ - Create an embedding model instance based on the configuration provided. - - Args: - embedder_config (dict): Configuration parameters for the embedding model. - - Returns: - object: An instance of the embedding model client. - - Raises: - KeyError: If the model is not supported. 
- """ - embedder_params = {**embedder_config} - if "model_instance" in embedder_config: - return embedder_params["model_instance"] - if "openai" in embedder_params["model"]: - return OpenAIEmbeddings(api_key=embedder_params["api_key"]) - if "azure" in embedder_params["model"]: - return AzureOpenAIEmbeddings() - if "ollama" in embedder_params["model"]: - embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) - try: - models_tokens["ollama"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return OllamaEmbeddings(**embedder_params) - if "gemini" in embedder_params["model"]: - try: - models_tokens["gemini"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return GoogleGenerativeAIEmbeddings(model=embedder_params["model"]) - if "bedrock" in embedder_params["model"]: - embedder_params["model"] = embedder_params["model"].split("/")[-1] - client = embedder_params.get("client", None) - try: - models_tokens["bedrock"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return BedrockEmbeddings(client=client, model_id=embedder_params["model"]) - if all(key in sys.modules for key in optional_modules): - if "hugging_face" in embedder_params["model"]: - from langchain_huggingface import HuggingFaceEmbeddings - embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) - try: - models_tokens["hugging_face"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return HuggingFaceEmbeddings(model=embedder_params["model"]) - elif "fireworks" in embedder_params["model"]: - from langchain_fireworks import FireworksEmbeddings - embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) - try: - models_tokens["fireworks"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return FireworksEmbeddings(model=embedder_params["model"]) - elif "nvidia" in embedder_params["model"]: - from langchain_nvidia import NVIDIAEmbeddings - embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) - try: - models_tokens["nvidia"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return NVIDIAEmbeddings(model=embedder_params["model"], - nvidia_api_key=embedder_params["api_key"]) + raise ValueError("client_type provided not correct") + + docs = ["Qdrant has Langchain integrations", "Qdrant also has Llama Index integrations"] + metadata = [ + {"source": "Langchain-docs"}, + {"source": "Linkedin-docs"}, + ] + ids = [42, 2] + + client.add( + collection_name="demo_collection", + documents=docs, + metadata=metadata, + ids=ids + ) - raise ValueError("Model provided by the configuration not supported") + state["vectorial_db"] = client + return state From 89de5b6cba988421e3f12581707cdbc98a03e289 Mon Sep 17 00:00:00 2001 From: Matteo Vedovati Date: Mon, 30 Sep 2024 12:10:40 +0200 Subject: [PATCH 15/36] Stating anew --- scrapegraphai/nodes/description_node.py | 42 ----------------------- scrapegraphai/nodes/fetch_node_level_k.py | 42 ----------------------- 2 files changed, 84 deletions(-) delete mode 100644 scrapegraphai/nodes/description_node.py delete mode 100644 scrapegraphai/nodes/fetch_node_level_k.py diff --git a/scrapegraphai/nodes/description_node.py b/scrapegraphai/nodes/description_node.py deleted file mode 100644 index 49ab941f..00000000 --- 
a/scrapegraphai/nodes/description_node.py +++ /dev/null @@ -1,42 +0,0 @@ -""" -DescriptionNode Module -""" -from typing import List, Optional -from .base_node import BaseNode - -class DescriptionNode(BaseNode): - """ - A node responsible for compressing the input tokens and storing the document - in a vector database for retrieval. Relevant chunks are stored in the state. - - It allows scraping of big documents without exceeding the token limit of the language model. - - Attributes: - llm_model: An instance of a language model client, configured for generating answers. - verbose (bool): A flag indicating whether to show print statements during execution. - - Args: - input (str): Boolean expression defining the input keys needed from the state. - output (List[str]): List of output keys to be updated in the state. - node_config (dict): Additional configuration for the node. - node_name (str): The unique identifier name for the node, defaulting to "Parse". - """ - - def __init__( - self, - input: str, - output: List[str], - node_config: Optional[dict] = None, - node_name: str = "RAG", - ): - super().__init__(node_name, "node", input, output, 2, node_config) - - self.llm_model = node_config["llm_model"] - self.embedder_model = node_config.get("embedder_model", None) - self.verbose = ( - False if node_config is None else node_config.get("verbose", False) - ) - self.cache_path = node_config.get("cache_path", False) - - def execute(self, state: dict) -> dict: - pass diff --git a/scrapegraphai/nodes/fetch_node_level_k.py b/scrapegraphai/nodes/fetch_node_level_k.py deleted file mode 100644 index 18a0d435..00000000 --- a/scrapegraphai/nodes/fetch_node_level_k.py +++ /dev/null @@ -1,42 +0,0 @@ -""" -FetchNodelevelK Module -""" -from typing import List, Optional -from .base_node import BaseNode - -class FetchNodelevelK(BaseNode): - """ - A node responsible for compressing the input tokens and storing the document - in a vector database for retrieval. Relevant chunks are stored in the state. - - It allows scraping of big documents without exceeding the token limit of the language model. - - Attributes: - llm_model: An instance of a language model client, configured for generating answers. - verbose (bool): A flag indicating whether to show print statements during execution. - - Args: - input (str): Boolean expression defining the input keys needed from the state. - output (List[str]): List of output keys to be updated in the state. - node_config (dict): Additional configuration for the node. - node_name (str): The unique identifier name for the node, defaulting to "Parse". 
- """ - - def __init__( - self, - input: str, - output: List[str], - node_config: Optional[dict] = None, - node_name: str = "RAG", - ): - super().__init__(node_name, "node", input, output, 2, node_config) - - self.llm_model = node_config["llm_model"] - self.embedder_model = node_config.get("embedder_model", None) - self.verbose = ( - False if node_config is None else node_config.get("verbose", False) - ) - self.cache_path = node_config.get("cache_path", False) - - def execute(self, state: dict) -> dict: - pass From 336bf705ec6f8200987b9a10f1210d732a35c7b0 Mon Sep 17 00:00:00 2001 From: Matteo Vedovati Date: Mon, 30 Sep 2024 12:18:30 +0200 Subject: [PATCH 16/36] initial creation of FetchNodeLevelK and DescriptionNode --- scrapegraphai/nodes/description_node.py | 42 +++++++++++++++++++++++ scrapegraphai/nodes/fetch_node_level_K.py | 39 +++++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 scrapegraphai/nodes/description_node.py create mode 100644 scrapegraphai/nodes/fetch_node_level_K.py diff --git a/scrapegraphai/nodes/description_node.py b/scrapegraphai/nodes/description_node.py new file mode 100644 index 00000000..200d7032 --- /dev/null +++ b/scrapegraphai/nodes/description_node.py @@ -0,0 +1,42 @@ +""" +DescriptionNode Module +""" +from typing import List, Optional +from .base_node import BaseNode + +class DescriptionNode(BaseNode): + """ + A node responsible for generating a description of a given document. This description is + generated using a language model and is used for retrieving the right documents. + + It allows scraping of big documents without exceeding the token limit of the language model. + + Attributes: + llm_model: An instance of a language model client, configured for generating answers. + verbose (bool): A flag indicating whether to show print statements during execution. + + Args: + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. + node_name (str): The unique identifier name for the node, defaulting to "Parse". + """ + + def __init__( + self, + input: str, + output: List[str], + node_config: Optional[dict] = None, + node_name: str = "Description", + ): + super().__init__(node_name, "node", input, output, 2, node_config) + + self.llm_model = node_config["llm_model"] + self.embedder_model = node_config.get("embedder_model", None) + self.verbose = ( + False if node_config is None else node_config.get("verbose", False) + ) + self.cache_path = node_config.get("cache_path", False) + + def execute(self, state: dict) -> dict: + pass diff --git a/scrapegraphai/nodes/fetch_node_level_K.py b/scrapegraphai/nodes/fetch_node_level_K.py new file mode 100644 index 00000000..2fd3aa8b --- /dev/null +++ b/scrapegraphai/nodes/fetch_node_level_K.py @@ -0,0 +1,39 @@ +""" +FetchNodeLevelK Module +""" +from typing import List, Optional +from .base_node import BaseNode + +class FetchNodeLevelK(BaseNode): + """ + A node responsible for fetching all the pages at a certain level of hyperlink the graph. + + Attributes: + llm_model: An instance of a language model client, configured for generating answers. + verbose (bool): A flag indicating whether to show print statements during execution. + + Args: + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. 
+        node_name (str): The unique identifier name for the node, defaulting to "Parse".
+    """
+
+    def __init__(
+        self,
+        input: str,
+        output: List[str],
+        node_config: Optional[dict] = None,
+        node_name: str = "FetchLevelK",
+    ):
+        super().__init__(node_name, "node", input, output, 2, node_config)
+
+        self.llm_model = node_config["llm_model"]
+        self.embedder_model = node_config.get("embedder_model", None)
+        self.verbose = (
+            False if node_config is None else node_config.get("verbose", False)
+        )
+        self.cache_path = node_config.get("cache_path", False)
+
+    def execute(self, state: dict) -> dict:
+        pass

From 7411ff061c9ea74ddcd043574da1d968f6abaf99 Mon Sep 17 00:00:00 2001
From: Matteo Vedovati
Date: Mon, 30 Sep 2024 12:21:26 +0200
Subject: [PATCH 17/36] Revert "initial creation of FetchNodeLevelK and
 DescriptionNode"

This reverts commit 336bf705ec6f8200987b9a10f1210d732a35c7b0.

---
 scrapegraphai/nodes/description_node.py   | 42 -----------------------
 scrapegraphai/nodes/fetch_node_level_K.py | 39 ---------------------
 2 files changed, 81 deletions(-)
 delete mode 100644 scrapegraphai/nodes/description_node.py
 delete mode 100644 scrapegraphai/nodes/fetch_node_level_K.py

diff --git a/scrapegraphai/nodes/description_node.py b/scrapegraphai/nodes/description_node.py
deleted file mode 100644
index 200d7032..00000000
--- a/scrapegraphai/nodes/description_node.py
+++ /dev/null
@@ -1,42 +0,0 @@
-"""
-DescriptionNode Module
-"""
-from typing import List, Optional
-from .base_node import BaseNode
-
-class DescriptionNode(BaseNode):
-    """
-    A node responsible for generating a description of a given document. This description is
-    generated using a language model and is used for retrieving the right documents.
-
-    It allows scraping of big documents without exceeding the token limit of the language model.
-
-    Attributes:
-        llm_model: An instance of a language model client, configured for generating answers.
-        verbose (bool): A flag indicating whether to show print statements during execution.
-
-    Args:
-        input (str): Boolean expression defining the input keys needed from the state.
-        output (List[str]): List of output keys to be updated in the state.
-        node_config (dict): Additional configuration for the node.
-        node_name (str): The unique identifier name for the node, defaulting to "Parse".
-    """
-
-    def __init__(
-        self,
-        input: str,
-        output: List[str],
-        node_config: Optional[dict] = None,
-        node_name: str = "Description",
-    ):
-        super().__init__(node_name, "node", input, output, 2, node_config)
-
-        self.llm_model = node_config["llm_model"]
-        self.embedder_model = node_config.get("embedder_model", None)
-        self.verbose = (
-            False if node_config is None else node_config.get("verbose", False)
-        )
-        self.cache_path = node_config.get("cache_path", False)
-
-    def execute(self, state: dict) -> dict:
-        pass

diff --git a/scrapegraphai/nodes/fetch_node_level_K.py b/scrapegraphai/nodes/fetch_node_level_K.py
deleted file mode 100644
index 2fd3aa8b..00000000
--- a/scrapegraphai/nodes/fetch_node_level_K.py
+++ /dev/null
@@ -1,39 +0,0 @@
-"""
-FetchNodeLevelK Module
-"""
-from typing import List, Optional
-from .base_node import BaseNode
-
-class FetchNodeLevelK(BaseNode):
-    """
-    A node responsible for fetching all the pages at a certain level of hyperlinks in the graph.
-
-    Attributes:
-        llm_model: An instance of a language model client, configured for generating answers.
-        verbose (bool): A flag indicating whether to show print statements during execution.
-
-    Args:
-        input (str): Boolean expression defining the input keys needed from the state.
-        output (List[str]): List of output keys to be updated in the state.
-        node_config (dict): Additional configuration for the node.
-        node_name (str): The unique identifier name for the node, defaulting to "Parse".
-    """
-
-    def __init__(
-        self,
-        input: str,
-        output: List[str],
-        node_config: Optional[dict] = None,
-        node_name: str = "FetchLevelK",
-    ):
-        super().__init__(node_name, "node", input, output, 2, node_config)
-
-        self.llm_model = node_config["llm_model"]
-        self.embedder_model = node_config.get("embedder_model", None)
-        self.verbose = (
-            False if node_config is None else node_config.get("verbose", False)
-        )
-        self.cache_path = node_config.get("cache_path", False)
-
-    def execute(self, state: dict) -> dict:
-        pass

From 462b27bc1d7ac29d0f668fe478867a4b357cb656 Mon Sep 17 00:00:00 2001
From: Matteo Vedovati
Date: Mon, 30 Sep 2024 12:21:33 +0200
Subject: [PATCH 18/36] Revert "Starting anew"

This reverts commit 89de5b6cba988421e3f12581707cdbc98a03e289.

---
 scrapegraphai/nodes/description_node.py   | 42 +++++++++++++++++
 scrapegraphai/nodes/fetch_node_level_k.py | 42 +++++++++++++++++
 2 files changed, 84 insertions(+)
 create mode 100644 scrapegraphai/nodes/description_node.py
 create mode 100644 scrapegraphai/nodes/fetch_node_level_k.py

diff --git a/scrapegraphai/nodes/description_node.py b/scrapegraphai/nodes/description_node.py
new file mode 100644
index 00000000..49ab941f
--- /dev/null
+++ b/scrapegraphai/nodes/description_node.py
@@ -0,0 +1,42 @@
+"""
+DescriptionNode Module
+"""
+from typing import List, Optional
+from .base_node import BaseNode
+
+class DescriptionNode(BaseNode):
+    """
+    A node responsible for compressing the input tokens and storing the document
+    in a vector database for retrieval. Relevant chunks are stored in the state.
+
+    It allows scraping of big documents without exceeding the token limit of the language model.
+
+    Attributes:
+        llm_model: An instance of a language model client, configured for generating answers.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+
+    Args:
+        input (str): Boolean expression defining the input keys needed from the state.
+        output (List[str]): List of output keys to be updated in the state.
+        node_config (dict): Additional configuration for the node.
+        node_name (str): The unique identifier name for the node, defaulting to "Parse".
+    """
+
+    def __init__(
+        self,
+        input: str,
+        output: List[str],
+        node_config: Optional[dict] = None,
+        node_name: str = "RAG",
+    ):
+        super().__init__(node_name, "node", input, output, 2, node_config)
+
+        self.llm_model = node_config["llm_model"]
+        self.embedder_model = node_config.get("embedder_model", None)
+        self.verbose = (
+            False if node_config is None else node_config.get("verbose", False)
+        )
+        self.cache_path = node_config.get("cache_path", False)
+
+    def execute(self, state: dict) -> dict:
+        pass

diff --git a/scrapegraphai/nodes/fetch_node_level_k.py b/scrapegraphai/nodes/fetch_node_level_k.py
new file mode 100644
index 00000000..18a0d435
--- /dev/null
+++ b/scrapegraphai/nodes/fetch_node_level_k.py
@@ -0,0 +1,42 @@
+"""
+FetchNodelevelK Module
+"""
+from typing import List, Optional
+from .base_node import BaseNode
+
+class FetchNodelevelK(BaseNode):
+    """
+    A node responsible for compressing the input tokens and storing the document
+    in a vector database for retrieval. Relevant chunks are stored in the state.
+
+    It allows scraping of big documents without exceeding the token limit of the language model.
+
+    Attributes:
+        llm_model: An instance of a language model client, configured for generating answers.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+
+    Args:
+        input (str): Boolean expression defining the input keys needed from the state.
+        output (List[str]): List of output keys to be updated in the state.
+        node_config (dict): Additional configuration for the node.
+        node_name (str): The unique identifier name for the node, defaulting to "Parse".
+    """
+
+    def __init__(
+        self,
+        input: str,
+        output: List[str],
+        node_config: Optional[dict] = None,
+        node_name: str = "RAG",
+    ):
+        super().__init__(node_name, "node", input, output, 2, node_config)
+
+        self.llm_model = node_config["llm_model"]
+        self.embedder_model = node_config.get("embedder_model", None)
+        self.verbose = (
+            False if node_config is None else node_config.get("verbose", False)
+        )
+        self.cache_path = node_config.get("cache_path", False)
+
+    def execute(self, state: dict) -> dict:
+        pass

From 6915f3edfd3e18d9c3fdedb677decb14f30afb49 Mon Sep 17 00:00:00 2001
From: Matteo Vedovati
Date: Mon, 30 Sep 2024 12:22:09 +0200
Subject: [PATCH 19/36] start from scratch

---
 scrapegraphai/nodes/fetch_node_level_k.py | 42 -----------------------
 1 file changed, 42 deletions(-)
 delete mode 100644 scrapegraphai/nodes/fetch_node_level_k.py

diff --git a/scrapegraphai/nodes/fetch_node_level_k.py b/scrapegraphai/nodes/fetch_node_level_k.py
deleted file mode 100644
index 18a0d435..00000000
--- a/scrapegraphai/nodes/fetch_node_level_k.py
+++ /dev/null
@@ -1,42 +0,0 @@
-"""
-FetchNodelevelK Module
-"""
-from typing import List, Optional
-from .base_node import BaseNode
-
-class FetchNodelevelK(BaseNode):
-    """
-    A node responsible for compressing the input tokens and storing the document
-    in a vector database for retrieval. Relevant chunks are stored in the state.
-
-    It allows scraping of big documents without exceeding the token limit of the language model.
-
-    Attributes:
-        llm_model: An instance of a language model client, configured for generating answers.
-        verbose (bool): A flag indicating whether to show print statements during execution.
-
-    Args:
-        input (str): Boolean expression defining the input keys needed from the state.
-        output (List[str]): List of output keys to be updated in the state.
-        node_config (dict): Additional configuration for the node.
-        node_name (str): The unique identifier name for the node, defaulting to "Parse".
- """ - - def __init__( - self, - input: str, - output: List[str], - node_config: Optional[dict] = None, - node_name: str = "RAG", - ): - super().__init__(node_name, "node", input, output, 2, node_config) - - self.llm_model = node_config["llm_model"] - self.embedder_model = node_config.get("embedder_model", None) - self.verbose = ( - False if node_config is None else node_config.get("verbose", False) - ) - self.cache_path = node_config.get("cache_path", False) - - def execute(self, state: dict) -> dict: - pass From 57bf572ab4a243a6d79155218bcc0d9d00dc3753 Mon Sep 17 00:00:00 2001 From: Matteo Vedovati Date: Mon, 30 Sep 2024 12:23:11 +0200 Subject: [PATCH 20/36] initial code for fetch nodel level K --- scrapegraphai/nodes/fetch_node_level_k.py | 39 +++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 scrapegraphai/nodes/fetch_node_level_k.py diff --git a/scrapegraphai/nodes/fetch_node_level_k.py b/scrapegraphai/nodes/fetch_node_level_k.py new file mode 100644 index 00000000..2fd3aa8b --- /dev/null +++ b/scrapegraphai/nodes/fetch_node_level_k.py @@ -0,0 +1,39 @@ +""" +FetchNodeLevelK Module +""" +from typing import List, Optional +from .base_node import BaseNode + +class FetchNodeLevelK(BaseNode): + """ + A node responsible for fetching all the pages at a certain level of hyperlink the graph. + + Attributes: + llm_model: An instance of a language model client, configured for generating answers. + verbose (bool): A flag indicating whether to show print statements during execution. + + Args: + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. + node_name (str): The unique identifier name for the node, defaulting to "Parse". + """ + + def __init__( + self, + input: str, + output: List[str], + node_config: Optional[dict] = None, + node_name: str = "FetchLevelK", + ): + super().__init__(node_name, "node", input, output, 2, node_config) + + self.llm_model = node_config["llm_model"] + self.embedder_model = node_config.get("embedder_model", None) + self.verbose = ( + False if node_config is None else node_config.get("verbose", False) + ) + self.cache_path = node_config.get("cache_path", False) + + def execute(self, state: dict) -> dict: + pass From d80b792e1529af8d87bb4534b777693e09b62feb Mon Sep 17 00:00:00 2001 From: Matteo Vedovati Date: Mon, 30 Sep 2024 12:42:26 +0200 Subject: [PATCH 21/36] fetching first level --- scrapegraphai/nodes/fetch_node_level_k.py | 80 +++++++++++++++++++- scrapegraphai/utils/1_manual.py | 92 +++++++++++++++++++++++ 2 files changed, 170 insertions(+), 2 deletions(-) create mode 100644 scrapegraphai/utils/1_manual.py diff --git a/scrapegraphai/nodes/fetch_node_level_k.py b/scrapegraphai/nodes/fetch_node_level_k.py index 2fd3aa8b..bbaafded 100644 --- a/scrapegraphai/nodes/fetch_node_level_k.py +++ b/scrapegraphai/nodes/fetch_node_level_k.py @@ -3,10 +3,17 @@ """ from typing import List, Optional from .base_node import BaseNode +from ..docloaders import ChromiumLoader +from ..utils.cleanup_html import cleanup_html +from ..utils.convert_to_md import convert_to_md +from langchain_core.documents import Document class FetchNodeLevelK(BaseNode): """ - A node responsible for fetching all the pages at a certain level of hyperlink the graph. + A node responsible for fetching the HTML content of a specified URL and all its sub-links + recursively up to a certain level of hyperlink the graph. 
This content is then used to update + the graph's state. It uses ChromiumLoader to fetch the content from a web page asynchronously + (with proxy protection). Attributes: llm_model: An instance of a language model client, configured for generating answers. @@ -29,11 +36,80 @@ def __init__( super().__init__(node_name, "node", input, output, 2, node_config) self.llm_model = node_config["llm_model"] + self.embedder_model = node_config.get("embedder_model", None) + self.verbose = ( False if node_config is None else node_config.get("verbose", False) ) + self.cache_path = node_config.get("cache_path", False) + + self.headless = ( + True if node_config is None else node_config.get("headless", True) + ) + + self.loader_kwargs = ( + {} if node_config is None else node_config.get("loader_kwargs", {}) + ) + + self.browser_base = ( + None if node_config is None else node_config.get("browser_base", None) + ) def execute(self, state: dict) -> dict: - pass + """ + Executes the node's logic to fetch the HTML content of a specified URL and all its sub-links + and update the graph's state with the content. + + Args: + state (dict): The current state of the graph. The input keys will be used + to fetch the correct data types from the state. + + Returns: + dict: The updated state with a new output key containing the fetched HTML content. + + Raises: + KeyError: If the input key is not found in the state, indicating that the + necessary information to perform the operation is missing. + """ + + self.logger.info(f"--- Executing {self.node_name} Node ---") + + # Interpret input keys based on the provided input expression + input_keys = self.get_input_keys(state) + # Fetching data from the state based on the input keys + input_data = [state[key] for key in input_keys] + + source = input_data[0] + + self.logger.info(f"--- (Fetching HTML from: {source}) ---") + + loader_kwargs = {} + + if self.node_config is not None: + loader_kwargs = self.node_config.get("loader_kwargs", {}) + + if self.browser_base is not None: + try: + from ..docloaders.browser_base import browser_base_fetch + except ImportError: + raise ImportError("""The browserbase module is not installed. 
+ Please install it using `pip install browserbase`.""") + + data = browser_base_fetch(self.browser_base.get("api_key"), + self.browser_base.get("project_id"), [source]) + + document = [Document(page_content=content, + metadata={"source": source}) for content in data] + + else: + loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs) + + document = loader.load() + + if not document or not document[0].page_content.strip(): + raise ValueError("""No HTML body content found in + the document fetched by ChromiumLoader.""") + + parsed_content = document[0].page_content \ No newline at end of file diff --git a/scrapegraphai/utils/1_manual.py b/scrapegraphai/utils/1_manual.py new file mode 100644 index 00000000..21703b7b --- /dev/null +++ b/scrapegraphai/utils/1_manual.py @@ -0,0 +1,92 @@ +import requests +import logging +import time +from urllib.parse import quote, urljoin +from typing import Optional +from bs4 import BeautifulSoup +from dotenv import load_dotenv +import os +import json +import markdownify + +load_dotenv() + +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + +def fetch_content(token: str, target_url: str, max_retries: int = 5, retry_delay: int = 3) -> Optional[str]: + encoded_url = quote(target_url) + url = f"http://api.scrape.do?url={encoded_url}&token={token}&render=true&waitUntil=networkidle0" + + for attempt in range(max_retries): + try: + response = requests.get(url) + if response.status_code == 200: + logging.info(f"Successfully fetched content from {target_url}") + return response.text + logging.warning(f"Failed with status {response.status_code}. Retrying in {retry_delay}s...") + except requests.RequestException as e: + logging.error(f"Error fetching {target_url}: {e}. 
Retrying in {retry_delay}s...") + time.sleep(retry_delay) + + logging.error(f"Failed to fetch {target_url} after {max_retries} attempts.") + return None + +def extract_links(html_content: str) -> list: + soup = BeautifulSoup(html_content, 'html.parser') + links = [link['href'] for link in soup.find_all('a', href=True)] + logging.info(f"Extracted {len(links)} links.") + return links + +def process_links(token: str, base_url: str, links: list, depth: int, current_depth: int = 1) -> dict: + content_dict = {} + for idx, link in enumerate(links, start=1): + full_link = link if link.startswith("http") else urljoin(base_url, link) + logging.info(f"Processing link {idx}: {full_link}") + link_content = fetch_content(token, full_link) + if link_content: + markdown_content = markdownify.markdownify(link_content, heading_style="ATX") + content_dict[full_link] = markdown_content + save_content_to_json(content_dict, idx) + + if current_depth < depth: + new_links = extract_links(link_content) + content_dict.update(process_links(token, full_link, new_links, depth, current_depth + 1)) + else: + logging.warning(f"Failed to fetch content for {full_link}") + return content_dict + +def save_content_to_json(content_dict: dict, idx: int): + if not os.path.exists("downloaded_pages"): + os.makedirs("downloaded_pages") + + file_name = f"scraped_content_{idx}.json" + file_path = os.path.join("downloaded_pages", file_name) + + with open(file_path, "w", encoding="utf-8") as json_file: + json.dump(content_dict, json_file, ensure_ascii=False, indent=4) + + logging.info(f"Content saved to {file_path}") + +if __name__ == "__main__": + token = os.getenv("TOKEN") + target_url = "https://www.wired.com" + depth = 2 + + if not token or not target_url: + logging.error("Please set the TOKEN and TARGET_URL environment variables.") + exit(1) + + html_content = fetch_content(token, target_url) + + if html_content: + links = extract_links(html_content) + logging.info("Links found:") + for link in links: + logging.info(link) + + content_dict = process_links(token, target_url, links, depth) + for link, content in content_dict.items(): + logging.info(f"Link: {link}") + logging.info(f"Content: {content[:500]}...") + else: + logging.error("Failed to fetch the content.") From 55199e8307721325a2a7e542b0e4938c5885929a Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Mon, 30 Sep 2024 14:23:46 +0200 Subject: [PATCH 22/36] add first iterations of the nodes --- scrapegraphai/nodes/description_node.py | 34 ++++++- scrapegraphai/nodes/generate_answer_node.py | 27 ++++- .../nodes/generate_answer_node_k_level.py | 98 +++++++++++++++++-- scrapegraphai/nodes/rag_node.py | 11 +-- .../prompts/description_node_prompts.py | 10 ++ 5 files changed, 164 insertions(+), 16 deletions(-) create mode 100644 scrapegraphai/prompts/description_node_prompts.py diff --git a/scrapegraphai/nodes/description_node.py b/scrapegraphai/nodes/description_node.py index 49ab941f..683aabe1 100644 --- a/scrapegraphai/nodes/description_node.py +++ b/scrapegraphai/nodes/description_node.py @@ -2,7 +2,11 @@ DescriptionNode Module """ from typing import List, Optional +from tqdm import tqdm +from langchain.prompts import PromptTemplate +from langchain_core.runnables import RunnableParallel from .base_node import BaseNode +from ..prompts.description_node_prompts import DESCRIPTION_NODE_PROMPT class DescriptionNode(BaseNode): """ @@ -39,4 +43,32 @@ def __init__( self.cache_path = node_config.get("cache_path", False) def execute(self, state: dict) -> dict: - pass + 
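`process_links` above recurses into every extracted link without remembering what it has already fetched, so pages that link back to each other can be downloaded repeatedly. A sketch of the same depth-limited walk with a `visited` set; it reuses the `extract_links` helper defined earlier in this script, and `fetch` is a stand-in for a `fetch_content`-style callable:

```python
from typing import Callable, Optional
from urllib.parse import urljoin

def crawl(fetch: Callable[[str], Optional[str]], base_url: str, links: list,
          depth: int, visited: Optional[set] = None) -> dict:
    """Depth-limited crawl that tracks visited URLs so cyclic links are fetched once."""
    visited = set() if visited is None else visited
    content = {}
    if depth < 1:
        return content
    for link in links:
        full_link = link if link.startswith("http") else urljoin(base_url, link)
        if full_link in visited:
            continue
        visited.add(full_link)
        html = fetch(full_link)
        if not html:
            continue
        content[full_link] = html
        if depth > 1:
            # extract_links is the BeautifulSoup helper defined earlier in this script
            content.update(crawl(fetch, full_link, extract_links(html), depth - 1, visited))
    return content
```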
self.logger.info(f"--- Executing {self.node_name} Node ---") + + input_keys = self.get_input_keys(state) + input_data = [state[key] for key in input_keys] + docs = input_data[1] + + chains_dict = {} + + for i, chunk in enumerate(tqdm(docs, desc="Processing chunks", disable=not self.verbose)): + prompt = PromptTemplate( + template=DESCRIPTION_NODE_PROMPT, + partial_variables={"context": chunk, + "chunk_id": i + 1 + } + ) + chain_name = f"chunk{i+1}" + chains_dict[chain_name] = prompt | self.llm_model + + async_runner = RunnableParallel(**chains_dict) + batch_results = async_runner.invoke() + + temp_res = {} + + for i, (summary, document) in enumerate(zip(batch_results, docs)): + temp_res[summary] = document + + state["descriptions"] = temp_res + + return state diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index 15686ec1..d5034a1e 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -1,3 +1,6 @@ +""" +generate_answer_node module +""" from typing import List, Optional from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser @@ -15,6 +18,26 @@ ) class GenerateAnswerNode(BaseNode): + """ + Initializes the GenerateAnswerNode class. + + Args: + input (str): The input data type for the node. + output (List[str]): The output data type(s) for the node. + node_config (Optional[dict]): Configuration dictionary for the node, + which includes the LLM model, verbosity, schema, and other settings. + Defaults to None. + node_name (str): The name of the node. Defaults to "GenerateAnswer". + + Attributes: + llm_model: The language model specified in the node configuration. + verbose (bool): Whether verbose mode is enabled. + force (bool): Whether to force certain behaviors, overriding defaults. + script_creator (bool): Whether the node is in script creation mode. + is_md_scraper (bool): Whether the node is scraping markdown data. + additional_info (Optional[str]): Any additional information to be + included in the prompt templates. 
+ """ def __init__( self, input: str, @@ -100,7 +123,9 @@ def execute(self, state: dict) -> dict: prompt = PromptTemplate( template=template_chunks_prompt, input_variables=["question"], - partial_variables={"context": chunk, "chunk_id": i + 1, "format_instructions": format_instructions} + partial_variables={"context": chunk, + "chunk_id": i + 1, + "format_instructions": format_instructions} ) chain_name = f"chunk{i+1}" chains_dict[chain_name] = prompt | self.llm_model diff --git a/scrapegraphai/nodes/generate_answer_node_k_level.py b/scrapegraphai/nodes/generate_answer_node_k_level.py index 1d4cdb4d..1733a380 100644 --- a/scrapegraphai/nodes/generate_answer_node_k_level.py +++ b/scrapegraphai/nodes/generate_answer_node_k_level.py @@ -2,7 +2,19 @@ GenerateAnswerNodeKLevel Module """ from typing import List, Optional +from langchain.prompts import PromptTemplate +from tqdm import tqdm +from langchain_core.output_parsers import JsonOutputParser +from langchain_core.runnables import RunnableParallel +from langchain_openai import ChatOpenAI, AzureChatOpenAI +from langchain_mistralai import ChatMistralAI +from langchain_aws import ChatBedrock +from ..utils.output_parser import get_structured_output_parser, get_pydantic_output_parser from .base_node import BaseNode +from ..prompts import ( + TEMPLATE_CHUNKS, TEMPLATE_NO_CHUNKS, TEMPLATE_MERGE, + TEMPLATE_CHUNKS_MD, TEMPLATE_NO_CHUNKS_MD, TEMPLATE_MERGE_MD +) class GenerateAnswerNodeKLevel(BaseNode): """ @@ -33,18 +45,92 @@ def __init__( self.llm_model = node_config["llm_model"] self.embedder_model = node_config.get("embedder_model", None) - self.verbose = ( - False if node_config is None else node_config.get("verbose", False) - ) + self.verbose = node_config.get("verbose", False) + self.force = node_config.get("force", False) + self.script_creator = node_config.get("script_creator", False) + self.is_md_scraper = node_config.get("is_md_scraper", False) + self.additional_info = node_config.get("additional_info") def execute(self, state: dict) -> dict: + input_keys = self.get_input_keys(state) + input_data = [state[key] for key in input_keys] + user_prompt = input_data[0] + + if self.node_config.get("schema", None) is not None: + if isinstance(self.llm_model, (ChatOpenAI, ChatMistralAI)): + self.llm_model = self.llm_model.with_structured_output( + schema=self.node_config["schema"] + ) + output_parser = get_structured_output_parser(self.node_config["schema"]) + format_instructions = "NA" + else: + if not isinstance(self.llm_model, ChatBedrock): + output_parser = get_pydantic_output_parser(self.node_config["schema"]) + format_instructions = output_parser.get_format_instructions() + else: + output_parser = None + format_instructions = "" + else: + if not isinstance(self.llm_model, ChatBedrock): + output_parser = JsonOutputParser() + format_instructions = output_parser.get_format_instructions() + else: + output_parser = None + format_instructions = "" + + if isinstance(self.llm_model, (ChatOpenAI, AzureChatOpenAI)) \ + and not self.script_creator \ + or self.force \ + and not self.script_creator or self.is_md_scraper: + template_no_chunks_prompt = TEMPLATE_NO_CHUNKS_MD + template_chunks_prompt = TEMPLATE_CHUNKS_MD + template_merge_prompt = TEMPLATE_MERGE_MD + else: + template_no_chunks_prompt = TEMPLATE_NO_CHUNKS + template_chunks_prompt = TEMPLATE_CHUNKS + template_merge_prompt = TEMPLATE_MERGE + + if self.additional_info is not None: + template_no_chunks_prompt = self.additional_info + template_no_chunks_prompt + template_chunks_prompt = 
self.additional_info + template_chunks_prompt + template_merge_prompt = self.additional_info + template_merge_prompt + client = state["vectorial_db"] - answer = client.query( - collection_name="demo_collection", - query_text="This is a query document" + answer_db = client.query( + collection_name="vectorial_collection", + query_text= state["question"] ) + results_db = [elem for elem in state[answer_db]] + + chains_dict = {} + for i, chunk in enumerate(tqdm(results_db, + desc="Processing chunks", disable=not self.verbose)): + prompt = PromptTemplate( + template=template_chunks_prompt, + input_variables=["question"], + partial_variables={"context": chunk, + "chunk_id": i + 1, + } + ) + chain_name = f"chunk{i+1}" + chains_dict[chain_name] = prompt | self.llm_model + + async_runner = RunnableParallel(**chains_dict) + batch_results = async_runner.invoke({"question": user_prompt}) + + merge_prompt = PromptTemplate( + template=template_merge_prompt, + input_variables=["context", "question"], + partial_variables={"format_instructions": format_instructions} + ) + + merge_chain = merge_prompt | self.llm_model + if output_parser: + merge_chain = merge_chain | output_parser + answer = merge_chain.invoke({"context": batch_results, "question": user_prompt}) + state["answer"] = answer return state diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py index c92e40f0..c137b987 100644 --- a/scrapegraphai/nodes/rag_node.py +++ b/scrapegraphai/nodes/rag_node.py @@ -49,18 +49,13 @@ def execute(self, state: dict) -> dict: else: raise ValueError("client_type provided not correct") - docs = ["Qdrant has Langchain integrations", "Qdrant also has Llama Index integrations"] - metadata = [ - {"source": "Langchain-docs"}, - {"source": "Linkedin-docs"}, - ] - ids = [42, 2] + docs = [elem for elem in state.get("descriptions").keys()] + metadata = [] client.add( - collection_name="demo_collection", + collection_name="vectorial_collection", documents=docs, metadata=metadata, - ids=ids ) state["vectorial_db"] = client diff --git a/scrapegraphai/prompts/description_node_prompts.py b/scrapegraphai/prompts/description_node_prompts.py new file mode 100644 index 00000000..5cd78d7f --- /dev/null +++ b/scrapegraphai/prompts/description_node_prompts.py @@ -0,0 +1,10 @@ +""" +description node prompts +""" + +DESCRIPTION_NODE_PROMPT = """ +You are a scraper and you have just scraped the +following content from a website. 
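The RAG node stores summaries with `client.add` and the answer node retrieves them with `client.query`; both come from qdrant-client's fastembed integration, which embeds the text locally. A self-contained sketch against an in-memory instance (collection name and texts are placeholders, and `fastembed` must be installed):

```python
from qdrant_client import QdrantClient

client = QdrantClient(":memory:")  # throwaway in-process instance

# add() embeds the documents with fastembed and stores them under the given ids
client.add(
    collection_name="vectorial_collection",
    documents=["Summary of the projects page", "Summary of the about page"],
    ids=[0, 1],
)

# query() embeds the query text the same way and returns scored hits
hits = client.query(
    collection_name="vectorial_collection",
    query_text="Which page lists the projects?",
    limit=2,
)
for hit in hits:
    print(hit.id, hit.score, hit.document)
```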
\n +Please provide a description summary of maximum of 10 words +Content of the website: {content} +""" \ No newline at end of file From e88fee9a2bb5c3ad8d791560c45d0c1a8f4b73bb Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Mon, 30 Sep 2024 15:10:55 +0200 Subject: [PATCH 23/36] Update generate_answer_node_k_level.py --- scrapegraphai/nodes/generate_answer_node_k_level.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapegraphai/nodes/generate_answer_node_k_level.py b/scrapegraphai/nodes/generate_answer_node_k_level.py index 1733a380..24235e71 100644 --- a/scrapegraphai/nodes/generate_answer_node_k_level.py +++ b/scrapegraphai/nodes/generate_answer_node_k_level.py @@ -99,7 +99,7 @@ def execute(self, state: dict) -> dict: answer_db = client.query( collection_name="vectorial_collection", - query_text= state["question"] + query_text=state["question"] ) results_db = [elem for elem in state[answer_db]] From 45f02cd4e2606a768fb6c147b28eaf1fda5a7ee8 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Tue, 1 Oct 2024 11:13:06 +0200 Subject: [PATCH 24/36] refactoring of the format --- scrapegraphai/nodes/description_node.py | 6 +++++- scrapegraphai/nodes/generate_answer_node_k_level.py | 1 + scrapegraphai/nodes/rag_node.py | 6 +++--- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/scrapegraphai/nodes/description_node.py b/scrapegraphai/nodes/description_node.py index 683aabe1..6175133a 100644 --- a/scrapegraphai/nodes/description_node.py +++ b/scrapegraphai/nodes/description_node.py @@ -67,7 +67,11 @@ def execute(self, state: dict) -> dict: temp_res = {} for i, (summary, document) in enumerate(zip(batch_results, docs)): - temp_res[summary] = document + temp_res[summary] = { + "id": i, + "summary": summary, + "document": document + } state["descriptions"] = temp_res diff --git a/scrapegraphai/nodes/generate_answer_node_k_level.py b/scrapegraphai/nodes/generate_answer_node_k_level.py index 24235e71..10977617 100644 --- a/scrapegraphai/nodes/generate_answer_node_k_level.py +++ b/scrapegraphai/nodes/generate_answer_node_k_level.py @@ -102,6 +102,7 @@ def execute(self, state: dict) -> dict: query_text=state["question"] ) + ## TODO: from the id get the data results_db = [elem for elem in state[answer_db]] chains_dict = {} diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py index c137b987..cac41a99 100644 --- a/scrapegraphai/nodes/rag_node.py +++ b/scrapegraphai/nodes/rag_node.py @@ -49,13 +49,13 @@ def execute(self, state: dict) -> dict: else: raise ValueError("client_type provided not correct") - docs = [elem for elem in state.get("descriptions").keys()] - metadata = [] + docs = [elem.get("summary") for elem in state.get("descriptions", {})] + ids = [elem.get("id") for elem in state.get("descriptions", {})] client.add( collection_name="vectorial_collection", documents=docs, - metadata=metadata, + ids=ids ) state["vectorial_db"] = client From 4cb621feab7e014cba13798c0dd7d4f42b9938db Mon Sep 17 00:00:00 2001 From: Matteo Vedovati Date: Wed, 2 Oct 2024 10:22:21 +0200 Subject: [PATCH 25/36] fetch node level k implementation --- examples/openai/fetch_multiple_links.py | 21 +++++ scrapegraphai/graphs/__init__.py | 1 + scrapegraphai/graphs/depth_search_graph.py | 96 ++++++++++++++++++++++ scrapegraphai/nodes/__init__.py | 2 +- scrapegraphai/nodes/fetch_node_level_k.py | 80 ++++++++++++++++-- 5 files changed, 193 insertions(+), 7 deletions(-) create mode 100644 examples/openai/fetch_multiple_links.py create mode 100644 
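The `## TODO: from the id get the data` left in the answer node points at the missing step: mapping query hits back to the stored documents. One possible sketch, assuming the `descriptions` state shape produced by the refactor above (`{summary: {"id": ..., "summary": ..., "document": ...}}`) and Qdrant hits exposing an `id` field:

```python
def documents_for_hits(hits, descriptions: dict) -> list:
    """Map Qdrant hit ids back to the original documents, preserving hit order.

    `descriptions` is assumed to be {summary: {"id": int, "summary": str, "document": ...}}.
    """
    by_id = {meta["id"]: meta["document"] for meta in descriptions.values()}
    return [by_id[hit.id] for hit in hits if hit.id in by_id]
```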
scrapegraphai/graphs/depth_search_graph.py diff --git a/examples/openai/fetch_multiple_links.py b/examples/openai/fetch_multiple_links.py new file mode 100644 index 00000000..53e246de --- /dev/null +++ b/examples/openai/fetch_multiple_links.py @@ -0,0 +1,21 @@ + +from scrapegraphai.graphs import DepthSearchGraph + +graph_config = { + "llm": { + "api_key":"YOUR_API_KEY", + "model": "openai/gpt-4o-mini", + }, + "verbose": True, + "headless": False, + "depth": 2, +} + +search_graph = DepthSearchGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io/projects/", + config=graph_config +) + +result = search_graph.run() +print(result) \ No newline at end of file diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py index efd6bd7e..b5ffcc47 100644 --- a/scrapegraphai/graphs/__init__.py +++ b/scrapegraphai/graphs/__init__.py @@ -26,3 +26,4 @@ from .screenshot_scraper_graph import ScreenshotScraperGraph from .smart_scraper_multi_concat_graph import SmartScraperMultiConcatGraph from .code_generator_graph import CodeGeneratorGraph +from .depth_search_graph import DepthSearchGraph diff --git a/scrapegraphai/graphs/depth_search_graph.py b/scrapegraphai/graphs/depth_search_graph.py new file mode 100644 index 00000000..a96d96a7 --- /dev/null +++ b/scrapegraphai/graphs/depth_search_graph.py @@ -0,0 +1,96 @@ +""" +... Module +""" +from typing import Optional +import logging +from pydantic import BaseModel +from .base_graph import BaseGraph +from .abstract_graph import AbstractGraph +from ..utils.save_code_to_file import save_code_to_file +from ..nodes import ( + FetchNodeLevelK +) + +class DepthSearchGraph(AbstractGraph): + """ + CodeGeneratorGraph is a script generator pipeline that generates the function extract_data(html: str) -> dict() for + extracting the wanted information from a HTML page. The code generated is in Python and uses the library BeautifulSoup. + It requires a user prompt, a source URL, and an output schema. + + Attributes: + prompt (str): The prompt for the graph. + source (str): The source of the graph. + config (dict): Configuration parameters for the graph. + schema (BaseModel): The schema for the graph output. + llm_model: An instance of a language model client, configured for generating answers. + embedder_model: An instance of an embedding model client, + configured for generating embeddings. + verbose (bool): A flag indicating whether to show print statements during execution. + headless (bool): A flag indicating whether to run the graph in headless mode. + library (str): The library used for web scraping (beautiful soup). + + Args: + prompt (str): The prompt for the graph. + source (str): The source of the graph. + config (dict): Configuration parameters for the graph. + schema (BaseModel): The schema for the graph output. + + Example: + >>> code_gen = CodeGeneratorGraph( + ... "List me all the attractions in Chioggia.", + ... "https://en.wikipedia.org/wiki/Chioggia", + ... {"llm": {"model": "openai/gpt-3.5-turbo"}} + ... ) + >>> result = code_gen.run() + ) + """ + + def __init__(self, prompt: str, source: str, config: dict, schema: Optional[BaseModel] = None): + + super().__init__(prompt, config, source, schema) + + self.input_key = "url" if source.startswith("http") else "local_dir" + + def _create_graph(self) -> BaseGraph: + """ + Creates the graph of nodes representing the workflow for web scraping. + + Returns: + BaseGraph: A graph instance representing the web scraping workflow. 
+ """ + + fetch_node = FetchNodeLevelK( + input="url| local_dir", + output=["docs"], + node_config={ + "loader_kwargs": self.config.get("loader_kwargs", {}), + "force": self.config.get("force", False), + "cut": self.config.get("cut", True), + "browser_base": self.config.get("browser_base"), + "depth": self.config.get("depth", 1) + } + ) + + return BaseGraph( + nodes=[ + fetch_node + ], + edges=[], + entry_point=fetch_node, + graph_name=self.__class__.__name__ + ) + + def run(self) -> str: + """ + Executes the scraping process and returns the generated code. + + Returns: + str: The generated code. + """ + + inputs = {"user_prompt": self.prompt, self.input_key: self.source} + self.final_state, self.execution_info = self.graph.execute(inputs) + + docs = self.final_state.get("docs", "No docs") + + return docs \ No newline at end of file diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py index e5fafb87..7b994746 100644 --- a/scrapegraphai/nodes/__init__.py +++ b/scrapegraphai/nodes/__init__.py @@ -28,6 +28,6 @@ from .generate_code_node import GenerateCodeNode from .search_node_with_context import SearchLinksWithContext from .reasoning_node import ReasoningNode -from .fetch_node_level_k import FetchNodelevelK +from .fetch_node_level_k import FetchNodeLevelK from .generate_answer_node_k_level import GenerateAnswerNodeKLevel from .description_node import DescriptionNode diff --git a/scrapegraphai/nodes/fetch_node_level_k.py b/scrapegraphai/nodes/fetch_node_level_k.py index bbaafded..f9fd57a8 100644 --- a/scrapegraphai/nodes/fetch_node_level_k.py +++ b/scrapegraphai/nodes/fetch_node_level_k.py @@ -7,6 +7,8 @@ from ..utils.cleanup_html import cleanup_html from ..utils.convert_to_md import convert_to_md from langchain_core.documents import Document +from bs4 import BeautifulSoup +from urllib.parse import quote, urljoin class FetchNodeLevelK(BaseNode): """ @@ -34,8 +36,6 @@ def __init__( node_name: str = "FetchLevelK", ): super().__init__(node_name, "node", input, output, 2, node_config) - - self.llm_model = node_config["llm_model"] self.embedder_model = node_config.get("embedder_model", None) @@ -56,6 +56,16 @@ def __init__( self.browser_base = ( None if node_config is None else node_config.get("browser_base", None) ) + + self.depth = ( + 1 if node_config is None else node_config.get("depth", 1) + ) + + self.only_inside_links = ( + False if node_config is None else node_config.get("only_inside_links", False) + ) + + self.min_input_len = 1 def execute(self, state: dict) -> dict: """ @@ -83,6 +93,8 @@ def execute(self, state: dict) -> dict: source = input_data[0] + documents = [{"source": source}] + self.logger.info(f"--- (Fetching HTML from: {source}) ---") loader_kwargs = {} @@ -90,6 +102,12 @@ def execute(self, state: dict) -> dict: if self.node_config is not None: loader_kwargs = self.node_config.get("loader_kwargs", {}) + for _ in range(self.depth): + documents = self.obtain_content(documents, loader_kwargs) + + return {self.output_keys[0]: documents} + + def fetch_content(self, source: str, loader_kwargs) -> Optional[str]: if self.browser_base is not None: try: from ..docloaders.browser_base import browser_base_fetch @@ -108,8 +126,58 @@ def execute(self, state: dict) -> dict: document = loader.load() - if not document or not document[0].page_content.strip(): - raise ValueError("""No HTML body content found in - the document fetched by ChromiumLoader.""") + return document + + def extract_links(self, html_content: str) -> list: + soup = BeautifulSoup(html_content, 
'html.parser') + links = [link['href'] for link in soup.find_all('a', href=True)] + self.logger.info(f"Extracted {len(links)} links.") + return links + + def get_full_links(self, base_url: str, links: list) -> list: + full_links = [] + for link in links: + if self.only_inside_links and link.startswith("http"): + continue + full_link = link if link.startswith("http") else urljoin(base_url, link) + full_links.append(full_link) + return full_links + + def obtain_content(self, documents: List, loader_kwargs) -> List: + for doc in documents: + source = doc['source'] + if 'document' not in doc: + document = self.fetch_content(source, loader_kwargs) + + if not document or not document[0].page_content.strip(): + self.logger.warning(f"Failed to fetch content for {source}") + documents.remove(doc) + continue + + doc['document'] = document[0].page_content - parsed_content = document[0].page_content \ No newline at end of file + links = self.extract_links(doc['document']) + full_links = self.get_full_links(source, links) + + # Check if the links are already present in other documents + for link in full_links: + # Check if any document is from the same link + if not any(d.get('source', '') == link for d in documents): + # Add the document + documents.append({"source": link}) + + return documents + + def process_links(self, base_url: str, links: list, loader_kwargs, depth: int, current_depth: int = 1) -> dict: + content_dict = {} + for idx, link in enumerate(links, start=1): + full_link = link if link.startswith("http") else urljoin(base_url, link) + self.logger.info(f"Processing link {idx}: {full_link}") + link_content = self.fetch_content(full_link, loader_kwargs) + + if current_depth < depth: + new_links = self.extract_links(link_content) + content_dict.update(self.process_links(full_link, new_links, depth, current_depth + 1)) + else: + self.logger.warning(f"Failed to fetch content for {full_link}") + return content_dict \ No newline at end of file From ea3ae1fd6d2406a0b1b4c3337eab24cea44c9656 Mon Sep 17 00:00:00 2001 From: Matteo Vedovati Date: Wed, 2 Oct 2024 11:01:23 +0200 Subject: [PATCH 26/36] fetch multiple links fix --- examples/openai/fetch_multiple_links.py | 1 + scrapegraphai/graphs/depth_search_graph.py | 3 ++- scrapegraphai/nodes/fetch_node_level_k.py | 12 +++++++++--- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/examples/openai/fetch_multiple_links.py b/examples/openai/fetch_multiple_links.py index 53e246de..c9c07877 100644 --- a/examples/openai/fetch_multiple_links.py +++ b/examples/openai/fetch_multiple_links.py @@ -9,6 +9,7 @@ "verbose": True, "headless": False, "depth": 2, + "only_inside_links": True, } search_graph = DepthSearchGraph( diff --git a/scrapegraphai/graphs/depth_search_graph.py b/scrapegraphai/graphs/depth_search_graph.py index a96d96a7..fa6294a0 100644 --- a/scrapegraphai/graphs/depth_search_graph.py +++ b/scrapegraphai/graphs/depth_search_graph.py @@ -67,7 +67,8 @@ def _create_graph(self) -> BaseGraph: "force": self.config.get("force", False), "cut": self.config.get("cut", True), "browser_base": self.config.get("browser_base"), - "depth": self.config.get("depth", 1) + "depth": self.config.get("depth", 1), + "only_inside_links": self.config.get("only_inside_links", False) } ) diff --git a/scrapegraphai/nodes/fetch_node_level_k.py b/scrapegraphai/nodes/fetch_node_level_k.py index f9fd57a8..ff329a39 100644 --- a/scrapegraphai/nodes/fetch_node_level_k.py +++ b/scrapegraphai/nodes/fetch_node_level_k.py @@ -105,7 +105,11 @@ def execute(self, state: dict) -> 
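`extract_links` and `get_full_links` above resolve relative hrefs against the page URL, and `only_inside_links` keeps the crawl on-site by skipping absolute links outright. A slightly stricter standalone variant that compares hostnames instead of discarding every absolute URL (a judgment call, not what the node itself does):

```python
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup

def normalized_links(base_url: str, html: str, only_inside: bool = False) -> list:
    """Resolve every <a href> against base_url; optionally keep same-host links only."""
    soup = BeautifulSoup(html, "html.parser")
    base_host = urlparse(base_url).netloc
    links = []
    for a in soup.find_all("a", href=True):
        full = urljoin(base_url, a["href"])  # absolute URLs pass through unchanged
        if only_inside and urlparse(full).netloc != base_host:
            continue
        links.append(full)
    return links

print(normalized_links("https://example.com/docs/", "<a href='../about'>About</a>"))
# ['https://example.com/about']
```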
dict: for _ in range(self.depth): documents = self.obtain_content(documents, loader_kwargs) - return {self.output_keys[0]: documents} + filtered_documents = [doc for doc in documents if 'document' in doc] + + state.update({self.output[0]: filtered_documents}) + + return state def fetch_content(self, source: str, loader_kwargs) -> Optional[str]: if self.browser_base is not None: @@ -144,6 +148,7 @@ def get_full_links(self, base_url: str, links: list) -> list: return full_links def obtain_content(self, documents: List, loader_kwargs) -> List: + new_documents = [] for doc in documents: source = doc['source'] if 'document' not in doc: @@ -162,10 +167,11 @@ def obtain_content(self, documents: List, loader_kwargs) -> List: # Check if the links are already present in other documents for link in full_links: # Check if any document is from the same link - if not any(d.get('source', '') == link for d in documents): + if not any(d.get('source', '') == link for d in documents) and not any(d.get('source', '') == link for d in new_documents): # Add the document - documents.append({"source": link}) + new_documents.append({"source": link}) + documents.extend(new_documents) return documents def process_links(self, base_url: str, links: list, loader_kwargs, depth: int, current_depth: int = 1) -> dict: From 2bdb01b07a7011564c23f1117fe524f9238fae1b Mon Sep 17 00:00:00 2001 From: Matteo Vedovati Date: Wed, 2 Oct 2024 11:04:17 +0200 Subject: [PATCH 27/36] Create parse_node_depth_k.py --- scrapegraphai/nodes/parse_node_depth_k.py | 156 ++++++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 scrapegraphai/nodes/parse_node_depth_k.py diff --git a/scrapegraphai/nodes/parse_node_depth_k.py b/scrapegraphai/nodes/parse_node_depth_k.py new file mode 100644 index 00000000..fd2f3810 --- /dev/null +++ b/scrapegraphai/nodes/parse_node_depth_k.py @@ -0,0 +1,156 @@ +""" +ParseNode Module +""" +import re +from typing import List, Optional, Tuple +from urllib.parse import urljoin +from langchain_community.document_transformers import Html2TextTransformer +from langchain_core.documents import Document +from .base_node import BaseNode +from ..utils.split_text_into_chunks import split_text_into_chunks +from ..helpers import default_filters + +class ParseNode(BaseNode): + """ + A node responsible for parsing HTML content from a document. + The parsed content is split into chunks for further processing. + + This node enhances the scraping workflow by allowing for targeted extraction of + content, thereby optimizing the processing of large HTML documents. + + Attributes: + verbose (bool): A flag indicating whether to show print statements during execution. + + Args: + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. + node_name (str): The unique identifier name for the node, defaulting to "Parse". 
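The duplicate check in `obtain_content` scans both document lists with `any()` for every candidate link, which turns quadratic as the frontier grows. A sketch of the same bookkeeping using a set of seen sources:

```python
def queue_new_sources(documents: list, new_documents: list, candidate_links: list) -> None:
    """Same dedupe as the two any() scans above, but O(1) per link via a set."""
    seen = {d.get("source") for d in documents} | {d.get("source") for d in new_documents}
    for link in candidate_links:
        if link not in seen:
            new_documents.append({"source": link})
            seen.add(link)
```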
+ """ + + def __init__( + self, + input: str, + output: List[str], + node_config: Optional[dict] = None, + node_name: str = "ParseNode", + ): + super().__init__(node_name, "node", input, output, 1, node_config) + + self.verbose = ( + False if node_config is None else node_config.get("verbose", False) + ) + self.parse_html = ( + True if node_config is None else node_config.get("parse_html", True) + ) + self.parse_urls = ( + False if node_config is None else node_config.get("parse_urls", False) + ) + + self.llm_model = node_config.get("llm_model") + self.chunk_size = node_config.get("chunk_size") + + def execute(self, state: dict) -> dict: + """ + Executes the node's logic to parse the HTML document content and split it into chunks. + + Args: + state (dict): The current state of the graph. The input keys will be used to fetch the + correct data from the state. + + Returns: + dict: The updated state with the output key containing the parsed content chunks. + + Raises: + KeyError: If the input keys are not found in the state, indicating that the + necessary information for parsing the content is missing. + """ + + self.logger.info(f"--- Executing {self.node_name} Node ---") + + input_keys = self.get_input_keys(state) + + input_data = [state[key] for key in input_keys] + docs_transformed = input_data[0] + source = input_data[1] if self.parse_urls else None + + if self.parse_html: + docs_transformed = Html2TextTransformer(ignore_links=False).transform_documents(input_data[0]) + docs_transformed = docs_transformed[0] + + link_urls, img_urls = self._extract_urls(docs_transformed.page_content, source) + + chunks = split_text_into_chunks(text=docs_transformed.page_content, + chunk_size=self.chunk_size-250, model=self.llm_model) + else: + docs_transformed = docs_transformed[0] + + link_urls, img_urls = self._extract_urls(docs_transformed.page_content, source) + + chunk_size = self.chunk_size + chunk_size = min(chunk_size - 500, int(chunk_size * 0.75)) + + if isinstance(docs_transformed, Document): + chunks = split_text_into_chunks(text=docs_transformed.page_content, + chunk_size=chunk_size, + model=self.llm_model) + else: + chunks = split_text_into_chunks(text=docs_transformed, + chunk_size=chunk_size, + model=self.llm_model) + + state.update({self.output[0]: chunks}) + if self.parse_urls: + state.update({self.output[1]: link_urls}) + state.update({self.output[2]: img_urls}) + + return state + + def _extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]: + """ + Extracts URLs from the given text. + + Args: + text (str): The text to extract URLs from. + + Returns: + Tuple[List[str], List[str]]: A tuple containing the extracted link URLs and image URLs. + """ + if not self.parse_urls: + return [], [] + + image_extensions = default_filters.filter_dict["img_exts"] + image_extension_seq = '|'.join(image_extensions).replace('.','') + url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + image_extension_seq + '))') + + all_urls = url_pattern.findall(text) + all_urls = self._clean_urls(all_urls) + + if not source.startswith("http"): + all_urls = [url for url in all_urls if url.startswith("http")] + else: + all_urls = [urljoin(source, url) for url in all_urls] + + images = [url for url in all_urls if any(url.endswith(ext) for ext in image_extensions)] + links = [url for url in all_urls if url not in images] + + return links, images + + def _clean_urls(self, urls: List[str]) -> List[str]: + """ + Cleans the URLs extracted from the text. 
+ + Args: + urls (List[str]): The list of URLs to clean. + + Returns: + List[str]: The cleaned URLs. + """ + cleaned_urls = [] + for url in urls: + url = re.sub(r'.*?\]\(', '', url) + url = url.rstrip(').') + + cleaned_urls.append(url) + + return cleaned_urls From f755d56bb1e2406668c5114e649953adbbff6748 Mon Sep 17 00:00:00 2001 From: Matteo Vedovati Date: Wed, 2 Oct 2024 12:28:48 +0200 Subject: [PATCH 28/36] updated parse node --- scrapegraphai/graphs/depth_search_graph.py | 18 +++- scrapegraphai/nodes/__init__.py | 1 + scrapegraphai/nodes/parse_node_depth_k.py | 120 +++------------------ 3 files changed, 33 insertions(+), 106 deletions(-) diff --git a/scrapegraphai/graphs/depth_search_graph.py b/scrapegraphai/graphs/depth_search_graph.py index fa6294a0..6ad3b245 100644 --- a/scrapegraphai/graphs/depth_search_graph.py +++ b/scrapegraphai/graphs/depth_search_graph.py @@ -8,7 +8,8 @@ from .abstract_graph import AbstractGraph from ..utils.save_code_to_file import save_code_to_file from ..nodes import ( - FetchNodeLevelK + FetchNodeLevelK, + ParseNodeDepthK ) class DepthSearchGraph(AbstractGraph): @@ -71,12 +72,23 @@ def _create_graph(self) -> BaseGraph: "only_inside_links": self.config.get("only_inside_links", False) } ) + + parse_node = ParseNodeDepthK( + input="docs", + output=["docs"], + node_config={ + "verbose": self.config.get("verbose", False) + } + ) return BaseGraph( nodes=[ - fetch_node + fetch_node, + parse_node + ], + edges=[ + (fetch_node, parse_node), ], - edges=[], entry_point=fetch_node, graph_name=self.__class__.__name__ ) diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py index 7b994746..edb195a5 100644 --- a/scrapegraphai/nodes/__init__.py +++ b/scrapegraphai/nodes/__init__.py @@ -31,3 +31,4 @@ from .fetch_node_level_k import FetchNodeLevelK from .generate_answer_node_k_level import GenerateAnswerNodeKLevel from .description_node import DescriptionNode +from .parse_node_depth_k import ParseNodeDepthK diff --git a/scrapegraphai/nodes/parse_node_depth_k.py b/scrapegraphai/nodes/parse_node_depth_k.py index fd2f3810..30afa23c 100644 --- a/scrapegraphai/nodes/parse_node_depth_k.py +++ b/scrapegraphai/nodes/parse_node_depth_k.py @@ -1,19 +1,14 @@ """ -ParseNode Module +ParseNodeDepthK Module """ import re from typing import List, Optional, Tuple -from urllib.parse import urljoin -from langchain_community.document_transformers import Html2TextTransformer -from langchain_core.documents import Document from .base_node import BaseNode -from ..utils.split_text_into_chunks import split_text_into_chunks -from ..helpers import default_filters +from ..utils.convert_to_md import convert_to_md -class ParseNode(BaseNode): +class ParseNodeDepthK(BaseNode): """ - A node responsible for parsing HTML content from a document. - The parsed content is split into chunks for further processing. + A node responsible for parsing HTML content from a series of documents. This node enhances the scraping workflow by allowing for targeted extraction of content, thereby optimizing the processing of large HTML documents. 
@@ -33,26 +28,17 @@ def __init__( input: str, output: List[str], node_config: Optional[dict] = None, - node_name: str = "ParseNode", + node_name: str = "ParseNodeDepthK", ): super().__init__(node_name, "node", input, output, 1, node_config) self.verbose = ( False if node_config is None else node_config.get("verbose", False) ) - self.parse_html = ( - True if node_config is None else node_config.get("parse_html", True) - ) - self.parse_urls = ( - False if node_config is None else node_config.get("parse_urls", False) - ) - - self.llm_model = node_config.get("llm_model") - self.chunk_size = node_config.get("chunk_size") def execute(self, state: dict) -> dict: """ - Executes the node's logic to parse the HTML document content and split it into chunks. + Executes the node's logic to parse the HTML documents content. Args: state (dict): The current state of the graph. The input keys will be used to fetch the @@ -67,90 +53,18 @@ def execute(self, state: dict) -> dict: """ self.logger.info(f"--- Executing {self.node_name} Node ---") - + + # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) - + # Fetching data from the state based on the input keys input_data = [state[key] for key in input_keys] - docs_transformed = input_data[0] - source = input_data[1] if self.parse_urls else None - - if self.parse_html: - docs_transformed = Html2TextTransformer(ignore_links=False).transform_documents(input_data[0]) - docs_transformed = docs_transformed[0] - - link_urls, img_urls = self._extract_urls(docs_transformed.page_content, source) - - chunks = split_text_into_chunks(text=docs_transformed.page_content, - chunk_size=self.chunk_size-250, model=self.llm_model) - else: - docs_transformed = docs_transformed[0] - - link_urls, img_urls = self._extract_urls(docs_transformed.page_content, source) - - chunk_size = self.chunk_size - chunk_size = min(chunk_size - 500, int(chunk_size * 0.75)) - - if isinstance(docs_transformed, Document): - chunks = split_text_into_chunks(text=docs_transformed.page_content, - chunk_size=chunk_size, - model=self.llm_model) - else: - chunks = split_text_into_chunks(text=docs_transformed, - chunk_size=chunk_size, - model=self.llm_model) - - state.update({self.output[0]: chunks}) - if self.parse_urls: - state.update({self.output[1]: link_urls}) - state.update({self.output[2]: img_urls}) + documents = input_data[0] + + for doc in documents: + document_md = convert_to_md(doc["document"]) + doc["document_md"] = document_md + + state.update({self.output[0]: documents}) + return state - - def _extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]: - """ - Extracts URLs from the given text. - - Args: - text (str): The text to extract URLs from. - - Returns: - Tuple[List[str], List[str]]: A tuple containing the extracted link URLs and image URLs. 
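`ParseNodeDepthK` reduces each fetched page to markdown via the project's `convert_to_md` helper. An equivalent standalone conversion with `markdownify`, which the manual script earlier in this series already uses:

```python
import markdownify

html = "<h1>Projects</h1><p>A <a href='/rocket'>rocket</a> demo.</p>"
md = markdownify.markdownify(html, heading_style="ATX")
print(md)  # an ATX-style markdown rendering: "# Projects" followed by the paragraph
```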
- """ - if not self.parse_urls: - return [], [] - - image_extensions = default_filters.filter_dict["img_exts"] - image_extension_seq = '|'.join(image_extensions).replace('.','') - url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + image_extension_seq + '))') - - all_urls = url_pattern.findall(text) - all_urls = self._clean_urls(all_urls) - - if not source.startswith("http"): - all_urls = [url for url in all_urls if url.startswith("http")] - else: - all_urls = [urljoin(source, url) for url in all_urls] - - images = [url for url in all_urls if any(url.endswith(ext) for ext in image_extensions)] - links = [url for url in all_urls if url not in images] - - return links, images - - def _clean_urls(self, urls: List[str]) -> List[str]: - """ - Cleans the URLs extracted from the text. - - Args: - urls (List[str]): The list of URLs to clean. - - Returns: - List[str]: The cleaned URLs. - """ - cleaned_urls = [] - for url in urls: - url = re.sub(r'.*?\]\(', '', url) - url = url.rstrip(').') - - cleaned_urls.append(url) - - return cleaned_urls From 015c6fd90504b03981d6e259e2f1aa5b16fa2472 Mon Sep 17 00:00:00 2001 From: Matteo Vedovati Date: Wed, 2 Oct 2024 13:06:00 +0200 Subject: [PATCH 29/36] remove link from markdown --- scrapegraphai/nodes/fetch_node_level_k.py | 9 +++++---- scrapegraphai/nodes/parse_node_depth_k.py | 6 ++++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/scrapegraphai/nodes/fetch_node_level_k.py b/scrapegraphai/nodes/fetch_node_level_k.py index ff329a39..5cdd6571 100644 --- a/scrapegraphai/nodes/fetch_node_level_k.py +++ b/scrapegraphai/nodes/fetch_node_level_k.py @@ -95,8 +95,6 @@ def execute(self, state: dict) -> dict: documents = [{"source": source}] - self.logger.info(f"--- (Fetching HTML from: {source}) ---") - loader_kwargs = {} if self.node_config is not None: @@ -112,6 +110,8 @@ def execute(self, state: dict) -> dict: return state def fetch_content(self, source: str, loader_kwargs) -> Optional[str]: + self.logger.info(f"--- (Fetching HTML from: {source}) ---") + if self.browser_base is not None: try: from ..docloaders.browser_base import browser_base_fetch @@ -159,9 +159,10 @@ def obtain_content(self, documents: List, loader_kwargs) -> List: documents.remove(doc) continue - doc['document'] = document[0].page_content + #doc['document'] = document[0].page_content + doc['document'] = document - links = self.extract_links(doc['document']) + links = self.extract_links(doc['document'][0].page_content) full_links = self.get_full_links(source, links) # Check if the links are already present in other documents diff --git a/scrapegraphai/nodes/parse_node_depth_k.py b/scrapegraphai/nodes/parse_node_depth_k.py index 30afa23c..7b7ab194 100644 --- a/scrapegraphai/nodes/parse_node_depth_k.py +++ b/scrapegraphai/nodes/parse_node_depth_k.py @@ -5,6 +5,7 @@ from typing import List, Optional, Tuple from .base_node import BaseNode from ..utils.convert_to_md import convert_to_md +from langchain_community.document_transformers import Html2TextTransformer class ParseNodeDepthK(BaseNode): """ @@ -62,8 +63,9 @@ def execute(self, state: dict) -> dict: documents = input_data[0] for doc in documents: - document_md = convert_to_md(doc["document"]) - doc["document_md"] = document_md + document_md = Html2TextTransformer(ignore_links=True).transform_documents(doc["document"]) + #document_md = convert_to_md(doc["document"]) + doc["document"] = document_md[0].page_content state.update({self.output[0]: documents}) From 6124fbdfca9d5b20129e3737023d5e689f9dea7c Mon Sep 17 00:00:00 2001 
From: Marco Vinciguerra Date: Wed, 2 Oct 2024 13:41:15 +0200 Subject: [PATCH 30/36] add embeddings with openai --- .../nodes/generate_answer_node_k_level.py | 20 +++++++++-- scrapegraphai/nodes/rag_node.py | 36 +++++++++++++++++++ 2 files changed, 53 insertions(+), 3 deletions(-) diff --git a/scrapegraphai/nodes/generate_answer_node_k_level.py b/scrapegraphai/nodes/generate_answer_node_k_level.py index 10977617..8dea5c98 100644 --- a/scrapegraphai/nodes/generate_answer_node_k_level.py +++ b/scrapegraphai/nodes/generate_answer_node_k_level.py @@ -97,10 +97,24 @@ def execute(self, state: dict) -> dict: client = state["vectorial_db"] - answer_db = client.query( - collection_name="vectorial_collection", - query_text=state["question"] + if state.get("embeddings"): + import openai + openai_client = openai.Client() + + answer_db = client.search( + collection_name="collection", + query_vector=openai_client.embeddings.create( + input=["What is the best to use for vector search scaling?"], + model=state.get("embeddings").get("model"), + ) + .data[0] + .embedding, ) + else: + answer_db = client.query( + collection_name="vectorial_collection", + query_text=state["question"] + ) ## TODO: from the id get the data results_db = [elem for elem in state[answer_db]] diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py index cac41a99..3f861478 100644 --- a/scrapegraphai/nodes/rag_node.py +++ b/scrapegraphai/nodes/rag_node.py @@ -4,6 +4,7 @@ from typing import List, Optional from .base_node import BaseNode from qdrant_client import QdrantClient +from qdrant_client.models import PointStruct, VectorParams, Distance class RAGNode(BaseNode): """ @@ -52,6 +53,41 @@ def execute(self, state: dict) -> dict: docs = [elem.get("summary") for elem in state.get("descriptions", {})] ids = [elem.get("id") for elem in state.get("descriptions", {})] + if state.get("embeddings"): + import openai + openai_client = openai.Client() + + files = state.get("documents") + + array_of_embeddings = [] + i=0 + + for file in files: + embeddings = openai_client.embeddings.create(input=file, + model=state.get("embeddings").get("model")) + i+=1 + points = PointStruct( + id=i, + vector=embeddings, + payload={"text": file}, + ) + + array_of_embeddings.append(points) + + collection_name = "collection" + + client.create_collection( + collection_name, + vectors_config=VectorParams( + size=1536, + distance=Distance.COSINE, + ), + ) + client.upsert(collection_name, points) + + state["vectorial_db"] = client + return state + client.add( collection_name="vectorial_collection", documents=docs, From 4b371f4d94dae47986aad751508813d89ce87b93 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Thu, 3 Oct 2024 11:38:14 +0200 Subject: [PATCH 31/36] feat: add deep scraper implementation Co-Authored-By: Matteo Vedovati <68272450+vedovati-matteo@users.noreply.github.com> --- ..._links.py => depth_search_graph_openai.py} | 6 +- scrapegraphai/graphs/depth_search_graph.py | 62 ++++++++++++++++--- scrapegraphai/nodes/description_node.py | 3 +- 3 files changed, 57 insertions(+), 14 deletions(-) rename examples/openai/{fetch_multiple_links.py => depth_search_graph_openai.py} (89%) diff --git a/examples/openai/fetch_multiple_links.py b/examples/openai/depth_search_graph_openai.py similarity index 89% rename from examples/openai/fetch_multiple_links.py rename to examples/openai/depth_search_graph_openai.py index c9c07877..7cde7865 100644 --- a/examples/openai/fetch_multiple_links.py +++ b/examples/openai/depth_search_graph_openai.py @@ -1,4 
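This commit wires OpenAI embeddings into the RAG and answer nodes. An end-to-end sketch of that flow — embed each document, upsert the points as a batch, then search with the query's own embedding vector — assuming `text-embedding-3-small` (1536 dimensions), an in-memory Qdrant, and an API key in the environment:

```python
import openai
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance

oa = openai.OpenAI()  # assumes OPENAI_API_KEY is set
qc = QdrantClient(":memory:")

def embed(text: str) -> list:
    """Return the raw embedding vector for one string."""
    return oa.embeddings.create(input=[text], model="text-embedding-3-small").data[0].embedding

docs = ["Qdrant has LangChain integrations", "Qdrant also has LlamaIndex integrations"]

qc.create_collection(
    "collection",
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),  # 1536 dims for this model
)
qc.upsert(
    "collection",
    [PointStruct(id=i, vector=embed(d), payload={"text": d}) for i, d in enumerate(docs)],
)

hits = qc.search(collection_name="collection",
                 query_vector=embed("vector search integrations"), limit=1)
print(hits[0].payload["text"])
```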
+1,6 @@ - +""" +depth_search_graph_opeani example +""" from scrapegraphai.graphs import DepthSearchGraph graph_config = { @@ -19,4 +21,4 @@ ) result = search_graph.run() -print(result) \ No newline at end of file +print(result) diff --git a/scrapegraphai/graphs/depth_search_graph.py b/scrapegraphai/graphs/depth_search_graph.py index 6ad3b245..a93d8fcf 100644 --- a/scrapegraphai/graphs/depth_search_graph.py +++ b/scrapegraphai/graphs/depth_search_graph.py @@ -9,13 +9,18 @@ from ..utils.save_code_to_file import save_code_to_file from ..nodes import ( FetchNodeLevelK, - ParseNodeDepthK + ParseNodeDepthK, + DescriptionNode, + RAGNode, + GenerateAnswerNodeKLevel ) class DepthSearchGraph(AbstractGraph): """ - CodeGeneratorGraph is a script generator pipeline that generates the function extract_data(html: str) -> dict() for - extracting the wanted information from a HTML page. The code generated is in Python and uses the library BeautifulSoup. + CodeGeneratorGraph is a script generator pipeline that generates + the function extract_data(html: str) -> dict() for + extracting the wanted information from a HTML page. The + code generated is in Python and uses the library BeautifulSoup. It requires a user prompt, a source URL, and an output schema. Attributes: @@ -60,7 +65,7 @@ def _create_graph(self) -> BaseGraph: BaseGraph: A graph instance representing the web scraping workflow. """ - fetch_node = FetchNodeLevelK( + fetch_node_k = FetchNodeLevelK( input="url| local_dir", output=["docs"], node_config={ @@ -72,8 +77,8 @@ def _create_graph(self) -> BaseGraph: "only_inside_links": self.config.get("only_inside_links", False) } ) - - parse_node = ParseNodeDepthK( + + parse_node_k = ParseNodeDepthK( input="docs", output=["docs"], node_config={ @@ -81,15 +86,52 @@ def _create_graph(self) -> BaseGraph: } ) + description_node = DescriptionNode( + input="docs", + output=["docs"], + node_config={ + "llm_model": self.llm_model, + "verbose": self.config.get("verbose", False), + "cache_path": self.config.get("cache_path", False) + } + ) + + rag_node = RAGNode ( + input="docs", + output=["vectorial_db"], + node_config={ + "llm_model": self.llm_model, + "embedder_model": self.config.get("embedder_model", False), + "verbose": self.config.get("verbose", False), + } + ) + + generate_answer_k = GenerateAnswerNodeKLevel( + input="vectorial_db", + output=["answer"], + node_config={ + "llm_model": self.llm_model, + "embedder_model": self.config.get("embedder_model", False), + "verbose": self.config.get("verbose", False), + } + + ) + return BaseGraph( nodes=[ - fetch_node, - parse_node + fetch_node_k, + parse_node_k, + description_node, + rag_node, + generate_answer_k ], edges=[ - (fetch_node, parse_node), + (fetch_node_k, parse_node_k), + (parse_node_k, description_node), + (description_node, rag_node), + (rag_node, generate_answer_k) ], - entry_point=fetch_node, + entry_point=fetch_node_k, graph_name=self.__class__.__name__ ) diff --git a/scrapegraphai/nodes/description_node.py b/scrapegraphai/nodes/description_node.py index 6175133a..97ef2e8f 100644 --- a/scrapegraphai/nodes/description_node.py +++ b/scrapegraphai/nodes/description_node.py @@ -31,12 +31,11 @@ def __init__( input: str, output: List[str], node_config: Optional[dict] = None, - node_name: str = "RAG", + node_name: str = "DESCRIPTION", ): super().__init__(node_name, "node", input, output, 2, node_config) self.llm_model = node_config["llm_model"] - self.embedder_model = node_config.get("embedder_model", None) self.verbose = ( False if node_config is None 
else node_config.get("verbose", False) ) From 85cb9572971719f9f7c66171f5e2246376b6aed2 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Thu, 3 Oct 2024 13:13:04 +0200 Subject: [PATCH 32/36] feat: finished basic version of deep scraper Co-Authored-By: Matteo Vedovati <68272450+vedovati-matteo@users.noreply.github.com> --- examples/openai/depth_search_graph_openai.py | 12 +++- pyproject.toml | 6 +- requirements-dev.lock | 54 ++++++++++++++++++ requirements.lock | 56 +++++++++++++++++++ scrapegraphai/graphs/depth_search_graph.py | 4 +- scrapegraphai/nodes/description_node.py | 21 ++----- .../nodes/generate_answer_node_k_level.py | 21 ++++--- scrapegraphai/nodes/rag_node.py | 9 +-- .../prompts/description_node_prompts.py | 2 +- .../prompts/generate_answer_node_prompts.py | 2 + 10 files changed, 149 insertions(+), 38 deletions(-) diff --git a/examples/openai/depth_search_graph_openai.py b/examples/openai/depth_search_graph_openai.py index 7cde7865..dff07ad4 100644 --- a/examples/openai/depth_search_graph_openai.py +++ b/examples/openai/depth_search_graph_openai.py @@ -1,22 +1,28 @@ """ depth_search_graph_opeani example """ +import os +from dotenv import load_dotenv from scrapegraphai.graphs import DepthSearchGraph +load_dotenv() + +openai_key = os.getenv("OPENAI_APIKEY") + graph_config = { "llm": { - "api_key":"YOUR_API_KEY", + "api_key": openai_key, "model": "openai/gpt-4o-mini", }, "verbose": True, "headless": False, "depth": 2, - "only_inside_links": True, + "only_inside_links": False, } search_graph = DepthSearchGraph( prompt="List me all the projects with their description", - source="https://perinim.github.io/projects/", + source="https://perinim.github.io", config=graph_config ) diff --git a/pyproject.toml b/pyproject.toml index dde97395..deacd437 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,9 @@ dependencies = [ "google>=3.0.0", "langchain-ollama>=0.1.3", "semchunk==2.2.0", - "transformers==4.44.2" + "transformers==4.44.2", + "qdrant-client>=1.11.3", + "fastembed>=0.3.6" ] license = "MIT" @@ -99,7 +101,7 @@ screenshot_scraper = [ "pillow>=10.4.0", ] -# Group 5: Faiss CPU +# Group 5: qdrant qdrant = [ "qdrant-client>=1.11.3", "fastembed>=0.3.6" diff --git a/requirements-dev.lock b/requirements-dev.lock index 1d9d469a..3423cef0 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -64,6 +64,8 @@ click==8.1.7 # via burr # via streamlit # via uvicorn +coloredlogs==15.0.1 + # via onnxruntime contourpy==1.2.1 # via matplotlib cycler==0.12.1 @@ -84,9 +86,13 @@ fastapi==0.112.0 # via burr fastapi-pagination==0.12.26 # via burr +fastembed==0.3.6 + # via scrapegraphai filelock==3.15.4 # via huggingface-hub # via transformers +flatbuffers==24.3.25 + # via onnxruntime fonttools==4.53.1 # via matplotlib free-proxy==1.1.1 @@ -132,11 +138,19 @@ greenlet==3.0.3 grpcio==1.65.4 # via google-api-core # via grpcio-status + # via grpcio-tools + # via qdrant-client grpcio-status==1.62.3 # via google-api-core +grpcio-tools==1.62.3 + # via qdrant-client h11==0.14.0 # via httpcore # via uvicorn +h2==4.1.0 + # via httpx +hpack==4.0.0 + # via h2 html2text==2024.2.26 # via scrapegraphai httpcore==1.0.5 @@ -149,11 +163,17 @@ httpx==0.27.0 # via langsmith # via ollama # via openai + # via qdrant-client httpx-sse==0.4.0 # via langchain-mistralai huggingface-hub==0.24.5 + # via fastembed # via tokenizers # via transformers +humanfriendly==10.0 + # via coloredlogs +hyperframe==6.0.1 + # via h2 idna==3.7 # via anyio # via httpx @@ -218,6 +238,7 @@ langsmith==0.1.121 # via 
langchain-core loguru==0.7.2 # via burr + # via fastembed lxml==5.3.0 # via free-proxy markdown-it-py==3.0.0 @@ -236,8 +257,12 @@ minify-html==0.15.0 # via scrapegraphai mistral-common==1.4.1 # via scrapegraphai +mmh3==4.1.0 + # via fastembed mpire==2.10.2 # via semchunk +mpmath==1.3.0 + # via sympy multidict==6.0.5 # via aiohttp # via yarl @@ -249,19 +274,27 @@ narwhals==1.3.0 # via altair numpy==1.26.4 # via contourpy + # via fastembed # via langchain # via langchain-aws # via langchain-community # via matplotlib + # via onnx + # via onnxruntime # via opencv-python-headless # via pandas # via pyarrow # via pydeck + # via qdrant-client # via sf-hamilton # via streamlit # via transformers ollama==0.3.2 # via langchain-ollama +onnx==1.17.0 + # via fastembed +onnxruntime==1.19.2 + # via fastembed openai==1.40.3 # via burr # via langchain-openai @@ -275,6 +308,7 @@ packaging==24.1 # via langchain-core # via marshmallow # via matplotlib + # via onnxruntime # via pytest # via sphinx # via streamlit @@ -284,6 +318,7 @@ pandas==2.2.2 # via sf-hamilton # via streamlit pillow==10.4.0 + # via fastembed # via matplotlib # via mistral-common # via streamlit @@ -294,6 +329,8 @@ playwright==1.45.1 # via undetected-playwright pluggy==1.5.0 # via pytest +portalocker==2.10.1 + # via qdrant-client proto-plus==1.24.0 # via google-ai-generativelanguage # via google-api-core @@ -303,6 +340,9 @@ protobuf==4.25.4 # via google-generativeai # via googleapis-common-protos # via grpcio-status + # via grpcio-tools + # via onnx + # via onnxruntime # via proto-plus # via streamlit pyarrow==17.0.0 @@ -326,6 +366,7 @@ pydantic==2.8.2 # via mistral-common # via openai # via pydantic-settings + # via qdrant-client pydantic-core==2.20.1 # via pydantic pydantic-settings==2.5.2 @@ -343,6 +384,8 @@ pylint==3.2.6 pyparsing==3.1.2 # via httplib2 # via matplotlib +pystemmer==2.2.0.1 + # via fastembed pytest==8.0.0 # via pytest-mock pytest-mock==3.14.0 @@ -361,6 +404,8 @@ pyyaml==6.0.2 # via langchain-community # via langchain-core # via transformers +qdrant-client==1.11.3 + # via scrapegraphai referencing==0.35.1 # via jsonschema # via jsonschema-specifications @@ -369,6 +414,7 @@ regex==2024.7.24 # via transformers requests==2.32.3 # via burr + # via fastembed # via free-proxy # via google-api-core # via huggingface-hub @@ -395,6 +441,8 @@ semchunk==2.2.0 # via scrapegraphai sentencepiece==0.2.0 # via mistral-common +setuptools==75.1.0 + # via grpcio-tools sf-hamilton==1.73.1 # via burr six==1.16.0 @@ -406,6 +454,7 @@ sniffio==1.3.1 # via httpx # via openai snowballstemmer==2.2.0 + # via fastembed # via sphinx soupsieve==2.5 # via beautifulsoup4 @@ -434,6 +483,8 @@ starlette==0.37.2 # via fastapi streamlit==1.37.1 # via burr +sympy==1.13.3 + # via onnxruntime tenacity==8.5.0 # via langchain # via langchain-community @@ -444,6 +495,7 @@ tiktoken==0.7.0 # via mistral-common # via scrapegraphai tokenizers==0.19.1 + # via fastembed # via langchain-mistralai # via transformers toml==0.10.2 @@ -456,6 +508,7 @@ tomlkit==0.13.0 tornado==6.4.1 # via streamlit tqdm==4.66.5 + # via fastembed # via google-generativeai # via huggingface-hub # via mpire @@ -495,6 +548,7 @@ uritemplate==4.1.1 # via google-api-python-client urllib3==1.26.19 # via botocore + # via qdrant-client # via requests uvicorn==0.30.5 # via burr diff --git a/requirements.lock b/requirements.lock index 84e25a0f..8949648a 100644 --- a/requirements.lock +++ b/requirements.lock @@ -41,6 +41,8 @@ certifi==2024.7.4 # via requests charset-normalizer==3.3.2 # via requests 
+coloredlogs==15.0.1 + # via onnxruntime dataclasses-json==0.6.7 # via langchain-community dill==0.3.8 @@ -49,9 +51,13 @@ distro==1.9.0 # via openai exceptiongroup==1.2.2 # via anyio +fastembed==0.3.6 + # via scrapegraphai filelock==3.15.4 # via huggingface-hub # via transformers +flatbuffers==24.3.25 + # via onnxruntime free-proxy==1.1.1 # via scrapegraphai frozenlist==1.4.1 @@ -87,10 +93,18 @@ greenlet==3.0.3 grpcio==1.65.1 # via google-api-core # via grpcio-status + # via grpcio-tools + # via qdrant-client grpcio-status==1.62.2 # via google-api-core +grpcio-tools==1.62.3 + # via qdrant-client h11==0.14.0 # via httpcore +h2==4.1.0 + # via httpx +hpack==4.0.0 + # via h2 html2text==2024.2.26 # via scrapegraphai httpcore==1.0.5 @@ -103,11 +117,17 @@ httpx==0.27.0 # via langsmith # via ollama # via openai + # via qdrant-client httpx-sse==0.4.0 # via langchain-mistralai huggingface-hub==0.24.1 + # via fastembed # via tokenizers # via transformers +humanfriendly==10.0 + # via coloredlogs +hyperframe==6.0.1 + # via h2 idna==3.7 # via anyio # via httpx @@ -156,6 +176,8 @@ langsmith==0.1.121 # via langchain # via langchain-community # via langchain-core +loguru==0.7.2 + # via fastembed lxml==5.2.2 # via free-proxy marshmallow==3.21.3 @@ -164,8 +186,12 @@ minify-html==0.15.0 # via scrapegraphai mistral-common==1.4.1 # via scrapegraphai +mmh3==4.1.0 + # via fastembed mpire==2.10.2 # via semchunk +mpmath==1.3.0 + # via sympy multidict==6.0.5 # via aiohttp # via yarl @@ -174,14 +200,22 @@ multiprocess==0.70.16 mypy-extensions==1.0.0 # via typing-inspect numpy==1.26.4 + # via fastembed # via langchain # via langchain-aws # via langchain-community + # via onnx + # via onnxruntime # via opencv-python-headless # via pandas + # via qdrant-client # via transformers ollama==0.3.2 # via langchain-ollama +onnx==1.17.0 + # via fastembed +onnxruntime==1.19.2 + # via fastembed openai==1.41.0 # via langchain-openai opencv-python-headless==4.10.0.84 @@ -192,14 +226,18 @@ packaging==24.1 # via huggingface-hub # via langchain-core # via marshmallow + # via onnxruntime # via transformers pandas==2.2.2 # via scrapegraphai pillow==10.4.0 + # via fastembed # via mistral-common playwright==1.45.1 # via scrapegraphai # via undetected-playwright +portalocker==2.10.1 + # via qdrant-client proto-plus==1.24.0 # via google-ai-generativelanguage # via google-api-core @@ -209,6 +247,9 @@ protobuf==4.25.3 # via google-generativeai # via googleapis-common-protos # via grpcio-status + # via grpcio-tools + # via onnx + # via onnxruntime # via proto-plus pyasn1==0.6.0 # via pyasn1-modules @@ -226,6 +267,7 @@ pydantic==2.8.2 # via mistral-common # via openai # via pydantic-settings + # via qdrant-client pydantic-core==2.20.1 # via pydantic pydantic-settings==2.5.2 @@ -236,6 +278,8 @@ pygments==2.18.0 # via mpire pyparsing==3.1.2 # via httplib2 +pystemmer==2.2.0.1 + # via fastembed python-dateutil==2.9.0.post0 # via botocore # via pandas @@ -250,6 +294,8 @@ pyyaml==6.0.1 # via langchain-community # via langchain-core # via transformers +qdrant-client==1.11.3 + # via scrapegraphai referencing==0.35.1 # via jsonschema # via jsonschema-specifications @@ -257,6 +303,7 @@ regex==2024.5.15 # via tiktoken # via transformers requests==2.32.3 + # via fastembed # via free-proxy # via google-api-core # via huggingface-hub @@ -279,17 +326,23 @@ semchunk==2.2.0 # via scrapegraphai sentencepiece==0.2.0 # via mistral-common +setuptools==75.1.0 + # via grpcio-tools six==1.16.0 # via python-dateutil sniffio==1.3.1 # via anyio # via httpx # via openai 
+snowballstemmer==2.2.0 + # via fastembed soupsieve==2.5 # via beautifulsoup4 sqlalchemy==2.0.31 # via langchain # via langchain-community +sympy==1.13.3 + # via onnxruntime tenacity==8.5.0 # via langchain # via langchain-community @@ -299,9 +352,11 @@ tiktoken==0.7.0 # via mistral-common # via scrapegraphai tokenizers==0.19.1 + # via fastembed # via langchain-mistralai # via transformers tqdm==4.66.4 + # via fastembed # via google-generativeai # via huggingface-hub # via mpire @@ -333,6 +388,7 @@ uritemplate==4.1.1 # via google-api-python-client urllib3==1.26.19 # via botocore + # via qdrant-client # via requests yarl==1.9.4 # via aiohttp diff --git a/scrapegraphai/graphs/depth_search_graph.py b/scrapegraphai/graphs/depth_search_graph.py index a93d8fcf..13b39129 100644 --- a/scrapegraphai/graphs/depth_search_graph.py +++ b/scrapegraphai/graphs/depth_search_graph.py @@ -146,6 +146,6 @@ def run(self) -> str: inputs = {"user_prompt": self.prompt, self.input_key: self.source} self.final_state, self.execution_info = self.graph.execute(inputs) - docs = self.final_state.get("docs", "No docs") + docs = self.final_state.get("answer", "No answer") - return docs \ No newline at end of file + return docs diff --git a/scrapegraphai/nodes/description_node.py b/scrapegraphai/nodes/description_node.py index 97ef2e8f..60c56cec 100644 --- a/scrapegraphai/nodes/description_node.py +++ b/scrapegraphai/nodes/description_node.py @@ -44,34 +44,25 @@ def __init__( def execute(self, state: dict) -> dict: self.logger.info(f"--- Executing {self.node_name} Node ---") - input_keys = self.get_input_keys(state) - input_data = [state[key] for key in input_keys] - docs = input_data[1] + docs = [elem for elem in state.get("docs")] chains_dict = {} for i, chunk in enumerate(tqdm(docs, desc="Processing chunks", disable=not self.verbose)): prompt = PromptTemplate( template=DESCRIPTION_NODE_PROMPT, - partial_variables={"context": chunk, - "chunk_id": i + 1 - } + partial_variables={"content": chunk.get("document")} ) chain_name = f"chunk{i+1}" chains_dict[chain_name] = prompt | self.llm_model async_runner = RunnableParallel(**chains_dict) - batch_results = async_runner.invoke() + batch_results = async_runner.invoke({}) - temp_res = {} - for i, (summary, document) in enumerate(zip(batch_results, docs)): - temp_res[summary] = { - "id": i, - "summary": summary, - "document": document - } + for i in range(1, len(docs)+1): + docs[i-1]["summary"] = batch_results.get(f"chunk{i}").content - state["descriptions"] = temp_res + state.update({self.output[0]: docs}) return state diff --git a/scrapegraphai/nodes/generate_answer_node_k_level.py b/scrapegraphai/nodes/generate_answer_node_k_level.py index 8dea5c98..291109f2 100644 --- a/scrapegraphai/nodes/generate_answer_node_k_level.py +++ b/scrapegraphai/nodes/generate_answer_node_k_level.py @@ -52,9 +52,9 @@ def __init__( self.additional_info = node_config.get("additional_info") def execute(self, state: dict) -> dict: - input_keys = self.get_input_keys(state) - input_data = [state[key] for key in input_keys] - user_prompt = input_data[0] + self.logger.info(f"--- Executing {self.node_name} Node ---") + + user_prompt = state.get("user_prompt") if self.node_config.get("schema", None) is not None: if isinstance(self.llm_model, (ChatOpenAI, ChatMistralAI)): @@ -113,19 +113,18 @@ def execute(self, state: dict) -> dict: else: answer_db = client.query( collection_name="vectorial_collection", - query_text=state["question"] + query_text=user_prompt ) - ## TODO: from the id get the data - results_db 
= [elem for elem in state[answer_db]] - chains_dict = {} - for i, chunk in enumerate(tqdm(results_db, + elems =[state.get("docs")[elem.id-1] for elem in answer_db if elem.score>0.5] + + for i, chunk in enumerate(tqdm(elems, desc="Processing chunks", disable=not self.verbose)): prompt = PromptTemplate( template=template_chunks_prompt, - input_variables=["question"], - partial_variables={"context": chunk, + input_variables=["format_instructions"], + partial_variables={"context": chunk.get("document"), "chunk_id": i + 1, } ) @@ -133,7 +132,7 @@ def execute(self, state: dict) -> dict: chains_dict[chain_name] = prompt | self.llm_model async_runner = RunnableParallel(**chains_dict) - batch_results = async_runner.invoke({"question": user_prompt}) + batch_results = async_runner.invoke({"format_instructions": user_prompt}) merge_prompt = PromptTemplate( template=template_merge_prompt, diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py index 3f861478..b67c50e9 100644 --- a/scrapegraphai/nodes/rag_node.py +++ b/scrapegraphai/nodes/rag_node.py @@ -40,8 +40,9 @@ def __init__( ) def execute(self, state: dict) -> dict: - - if self.node_config.get("client_type") == "memory": + self.logger.info(f"--- Executing {self.node_name} Node ---") + + if self.node_config.get("client_type") in ["memory", None]: client = QdrantClient(":memory:") elif self.node_config.get("client_type") == "local_db": client = QdrantClient(path="path/to/db") @@ -50,8 +51,8 @@ def execute(self, state: dict) -> dict: else: raise ValueError("client_type provided not correct") - docs = [elem.get("summary") for elem in state.get("descriptions", {})] - ids = [elem.get("id") for elem in state.get("descriptions", {})] + docs = [elem.get("summary") for elem in state.get("docs")] + ids = [i for i in range(1, len(state.get("docs"))+1)] if state.get("embeddings"): import openai diff --git a/scrapegraphai/prompts/description_node_prompts.py b/scrapegraphai/prompts/description_node_prompts.py index 5cd78d7f..20df481a 100644 --- a/scrapegraphai/prompts/description_node_prompts.py +++ b/scrapegraphai/prompts/description_node_prompts.py @@ -5,6 +5,6 @@ DESCRIPTION_NODE_PROMPT = """ You are a scraper and you have just scraped the following content from a website. \n -Please provide a description summary of maximum of 10 words +Please provide a description summary of maximum of 20 words Content of the website: {content} """ \ No newline at end of file diff --git a/scrapegraphai/prompts/generate_answer_node_prompts.py b/scrapegraphai/prompts/generate_answer_node_prompts.py index 7c098fe2..1b336fb4 100644 --- a/scrapegraphai/prompts/generate_answer_node_prompts.py +++ b/scrapegraphai/prompts/generate_answer_node_prompts.py @@ -2,6 +2,7 @@ Generate answer node prompts """ + TEMPLATE_CHUNKS_MD = """ You are a website scraper and you have just scraped the following content from a website converted in markdown format. @@ -32,6 +33,7 @@ You are now asked to answer a user question about the content you have scraped.\n You have scraped many chunks since the website is big and now you are asked to merge them into a single answer without repetitions (if there are any).\n Make sure that if a maximum number of items is specified in the instructions that you get that maximum number and do not exceed it. \n +The structure should be coherent. \n Make sure the output format is a valid JSON and does not contain errors. 
\n
OUTPUT INSTRUCTIONS: {format_instructions}\n
USER QUESTION: {question}\n

From cb46efbe4622597ca6ecbdaa8f750eb7ccc74d14 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Thu, 3 Oct 2024 14:33:52 +0200
Subject: [PATCH 33/36] changed dependencies

---
 README.md | 6 ------
 pyproject.toml | 6 ------
 2 files changed, 12 deletions(-)

diff --git a/README.md b/README.md
index 51bc3fa9..5d79bf55 100644
--- a/README.md
+++ b/README.md
@@ -54,12 +54,6 @@ Additional dependencies can be added while installing the library:
 pip install scrapegraphai[more-browser-options]
 ```

-- qdrants Options: this group includes qdrant integration for RAGnode and DeepScraperGraph.
-
-  ```bash
-  pip install scrapegraphai[qdrant]
-  ```
-
diff --git a/pyproject.toml b/pyproject.toml
index deacd437..4c5e5117 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -101,12 +101,6 @@ screenshot_scraper = [
     "pillow>=10.4.0",
 ]

-# Group 5: qdrant
-qdrant = [
-    "qdrant-client>=1.11.3",
-    "fastembed>=0.3.6"
-]
-
 [build-system]
 requires = ["hatchling"]
 build-backend = "hatchling.build"

From c91975e0c81fd2f77007039503c0f1a02685c969 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Thu, 3 Oct 2024 15:32:29 +0200
Subject: [PATCH 34/36] update examples

---
 ...aper_haiku.py => csv_scraper_anthropic.py} | 0
 ...y => csv_scraper_graph_multi_anthropic.py} | 0
 ...aph_haiku.py => custom_graph_anthropic.py} | 0
 .../anthropic/depth_search_graph_anthropic.py | 28 ++++++++++++++
 ...per_haiku.py => json_scraper_anthropic.py} | 0
 ...iku.py => json_scraper_multi_anthropic.py} | 0
 ...aiku.py => pdf_scraper_graph_anthropic.py} | 0
 ...aiku.py => pdf_scraper_multi_anthropic.py} | 0
 ...limit_haiku.py => rate_limit_anthropic.py} | 0
 ...aiku.py => scrape_plain_text_anthropic.py} | 0
 ...haiku.py => script_generator_anthropic.py} | 0
 ...py => script_multi_generator_anthropic.py} | 0
 ...aph_haiku.py => search_graph_anthropic.py} | 0
 ...ku.py => search_graph_schema_anthropic.py} | 0
 ...aiku.py => search_link_graph_anthropic.py} | 0
 ...er_haiku.py => smart_scraper_anthropic.py} | 0
 ...ku.py => smart_scraper_multi_anthropic.py} | 0
 ...> smart_scraper_multi_concat_anthropic.py} | 0
 ...u.py => smart_scraper_schema_anthropic.py} | 0
 ...aper_haiku.py => xml_scraper_anthropic.py} | 0
 ...y => xml_scraper_graph_multi_anthropic.py} | 0
 examples/azure/code_generator_graph_azure.py | 2 +-
 examples/azure/csv_scraper_azure.py | 2 +-
 .../azure/csv_scraper_graph_multi_azure.py | 2 +-
 examples/azure/depth_search_graph_azure.py | 30 +++++++++++++++
 examples/azure/json_scraper_azure.py | 2 +-
 examples/azure/json_scraper_multi_azure.py | 2 +-
 examples/azure/pdf_scraper_azure.py | 2 +-
 examples/azure/rate_limit_azure.py | 2 +-
 examples/azure/scrape_plain_text_azure.py | 2 +-
 examples/azure/script_generator_azure.py | 2 +-
 .../azure/script_multi_generator_azure.py | 2 +-
 examples/azure/search_graph_azure.py | 2 +-
 examples/azure/search_graph_schema_azure.py | 2 +-
 examples/azure/search_link_graph_azure.py | 2 +-
 examples/azure/smart_scraper_azure.py | 2 +-
 examples/azure/smart_scraper_multi_azure.py | 2 +-
 .../azure/smart_scraper_multi_concat_azure.py | 2 +-
 examples/azure/smart_scraper_schema_azure.py | 2 +-
 examples/azure/xml_scraper_azure.py | 2 +-
 .../azure/xml_scraper_graph_multi_azure.py | 2 +-
 .../bedrock/depth_search_graph_bedrock.py | 31 +++++++++++++++
 .../deepseek/depth_search_graph_deepseek.py | 30 +++++++++++++++
 examples/ernie/custom_graph_ernie.py | 2 +-
 examples/ernie/depth_search_graph_ernie.py | 26 +++++++++++++
 .../fireworks/depth_search_graph_fireworks.py
| 30 +++++++++++++++ .../google_genai/depth_search_graph_gemini.py | 30 +++++++++++++++ .../depth_search_graph_gemini.py | 30 +++++++++++++++ examples/groq/depth_search_graph_groq.py | 31 +++++++++++++++ .../custom_graph_huggingfacehub.py | 1 - .../depth_search_graph_huggingfacehub.py | 38 +++++++++++++++++++ .../local_models/depth_search_graph_ollama.py | 32 ++++++++++++++++ .../local_models/json_scraper_multi_ollama.py | 1 + .../smart_scraper_schema_ollama.py | 1 - .../mistral/depth_search_graph_mistral.py | 30 +++++++++++++++ .../nemotron/depth_search_graph_nemotron.py | 30 +++++++++++++++ examples/oneapi/depth_search_graph_onenapi.py | 31 +++++++++++++++ .../together/depth_search_graph_together.py | 31 +++++++++++++++ 58 files changed, 479 insertions(+), 22 deletions(-) rename examples/anthropic/{csv_scraper_haiku.py => csv_scraper_anthropic.py} (100%) rename examples/anthropic/{csv_scraper_graph_multi_haiku.py => csv_scraper_graph_multi_anthropic.py} (100%) rename examples/anthropic/{custom_graph_haiku.py => custom_graph_anthropic.py} (100%) create mode 100644 examples/anthropic/depth_search_graph_anthropic.py rename examples/anthropic/{json_scraper_haiku.py => json_scraper_anthropic.py} (100%) rename examples/anthropic/{json_scraper_multi_haiku.py => json_scraper_multi_anthropic.py} (100%) rename examples/anthropic/{pdf_scraper_graph_haiku.py => pdf_scraper_graph_anthropic.py} (100%) rename examples/anthropic/{pdf_scraper_multi_haiku.py => pdf_scraper_multi_anthropic.py} (100%) rename examples/anthropic/{rate_limit_haiku.py => rate_limit_anthropic.py} (100%) rename examples/anthropic/{scrape_plain_text_haiku.py => scrape_plain_text_anthropic.py} (100%) rename examples/anthropic/{script_generator_haiku.py => script_generator_anthropic.py} (100%) rename examples/anthropic/{script_multi_generator_haiku.py => script_multi_generator_anthropic.py} (100%) rename examples/anthropic/{search_graph_haiku.py => search_graph_anthropic.py} (100%) rename examples/anthropic/{search_graph_schema_haiku.py => search_graph_schema_anthropic.py} (100%) rename examples/anthropic/{search_link_graph_haiku.py => search_link_graph_anthropic.py} (100%) rename examples/anthropic/{smart_scraper_haiku.py => smart_scraper_anthropic.py} (100%) rename examples/anthropic/{smart_scraper_multi_haiku.py => smart_scraper_multi_anthropic.py} (100%) rename examples/anthropic/{smart_scraper_multi_concat_haiku.py => smart_scraper_multi_concat_anthropic.py} (100%) rename examples/anthropic/{smart_scraper_schema_haiku.py => smart_scraper_schema_anthropic.py} (100%) rename examples/anthropic/{xml_scraper_haiku.py => xml_scraper_anthropic.py} (100%) rename examples/anthropic/{xml_scraper_graph_multi_haiku.py => xml_scraper_graph_multi_anthropic.py} (100%) create mode 100644 examples/azure/depth_search_graph_azure.py create mode 100644 examples/bedrock/depth_search_graph_bedrock.py create mode 100644 examples/deepseek/depth_search_graph_deepseek.py create mode 100644 examples/ernie/depth_search_graph_ernie.py create mode 100644 examples/fireworks/depth_search_graph_fireworks.py create mode 100644 examples/google_genai/depth_search_graph_gemini.py create mode 100644 examples/google_vertexai/depth_search_graph_gemini.py create mode 100644 examples/groq/depth_search_graph_groq.py create mode 100644 examples/huggingfacehub/depth_search_graph_huggingfacehub.py create mode 100644 examples/local_models/depth_search_graph_ollama.py create mode 100644 examples/mistral/depth_search_graph_mistral.py create mode 100644 
examples/nemotron/depth_search_graph_nemotron.py
 create mode 100644 examples/oneapi/depth_search_graph_onenapi.py
 create mode 100644 examples/together/depth_search_graph_together.py

diff --git a/examples/anthropic/csv_scraper_haiku.py b/examples/anthropic/csv_scraper_anthropic.py
similarity index 100%
rename from examples/anthropic/csv_scraper_haiku.py
rename to examples/anthropic/csv_scraper_anthropic.py
diff --git a/examples/anthropic/csv_scraper_graph_multi_haiku.py b/examples/anthropic/csv_scraper_graph_multi_anthropic.py
similarity index 100%
rename from examples/anthropic/csv_scraper_graph_multi_haiku.py
rename to examples/anthropic/csv_scraper_graph_multi_anthropic.py
diff --git a/examples/anthropic/custom_graph_haiku.py b/examples/anthropic/custom_graph_anthropic.py
similarity index 100%
rename from examples/anthropic/custom_graph_haiku.py
rename to examples/anthropic/custom_graph_anthropic.py
diff --git a/examples/anthropic/depth_search_graph_anthropic.py b/examples/anthropic/depth_search_graph_anthropic.py
new file mode 100644
index 00000000..8cac7bea
--- /dev/null
+++ b/examples/anthropic/depth_search_graph_anthropic.py
@@ -0,0 +1,28 @@
+"""
+depth_search_graph_anthropic example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+graph_config = {
+    "llm": {
+        "api_key": os.getenv("ANTHROPIC_API_KEY"),
+        "model": "anthropic/claude-3-haiku-20240307",  # assumed Anthropic model id, matching the renamed haiku examples
+    },
+    "verbose": True,
+    "headless": False,
+    "depth": 2,
+    "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/anthropic/json_scraper_haiku.py b/examples/anthropic/json_scraper_anthropic.py
similarity index 100%
rename from examples/anthropic/json_scraper_haiku.py
rename to examples/anthropic/json_scraper_anthropic.py
diff --git a/examples/anthropic/json_scraper_multi_haiku.py b/examples/anthropic/json_scraper_multi_anthropic.py
similarity index 100%
rename from examples/anthropic/json_scraper_multi_haiku.py
rename to examples/anthropic/json_scraper_multi_anthropic.py
diff --git a/examples/anthropic/pdf_scraper_graph_haiku.py b/examples/anthropic/pdf_scraper_graph_anthropic.py
similarity index 100%
rename from examples/anthropic/pdf_scraper_graph_haiku.py
rename to examples/anthropic/pdf_scraper_graph_anthropic.py
diff --git a/examples/anthropic/pdf_scraper_multi_haiku.py b/examples/anthropic/pdf_scraper_multi_anthropic.py
similarity index 100%
rename from examples/anthropic/pdf_scraper_multi_haiku.py
rename to examples/anthropic/pdf_scraper_multi_anthropic.py
diff --git a/examples/anthropic/rate_limit_haiku.py b/examples/anthropic/rate_limit_anthropic.py
similarity index 100%
rename from examples/anthropic/rate_limit_haiku.py
rename to examples/anthropic/rate_limit_anthropic.py
diff --git a/examples/anthropic/scrape_plain_text_haiku.py b/examples/anthropic/scrape_plain_text_anthropic.py
similarity index 100%
rename from examples/anthropic/scrape_plain_text_haiku.py
rename to examples/anthropic/scrape_plain_text_anthropic.py
diff --git a/examples/anthropic/script_generator_haiku.py b/examples/anthropic/script_generator_anthropic.py
similarity index 100%
rename from examples/anthropic/script_generator_haiku.py
rename to examples/anthropic/script_generator_anthropic.py
diff --git a/examples/anthropic/script_multi_generator_haiku.py
b/examples/anthropic/script_multi_generator_anthropic.py similarity index 100% rename from examples/anthropic/script_multi_generator_haiku.py rename to examples/anthropic/script_multi_generator_anthropic.py diff --git a/examples/anthropic/search_graph_haiku.py b/examples/anthropic/search_graph_anthropic.py similarity index 100% rename from examples/anthropic/search_graph_haiku.py rename to examples/anthropic/search_graph_anthropic.py diff --git a/examples/anthropic/search_graph_schema_haiku.py b/examples/anthropic/search_graph_schema_anthropic.py similarity index 100% rename from examples/anthropic/search_graph_schema_haiku.py rename to examples/anthropic/search_graph_schema_anthropic.py diff --git a/examples/anthropic/search_link_graph_haiku.py b/examples/anthropic/search_link_graph_anthropic.py similarity index 100% rename from examples/anthropic/search_link_graph_haiku.py rename to examples/anthropic/search_link_graph_anthropic.py diff --git a/examples/anthropic/smart_scraper_haiku.py b/examples/anthropic/smart_scraper_anthropic.py similarity index 100% rename from examples/anthropic/smart_scraper_haiku.py rename to examples/anthropic/smart_scraper_anthropic.py diff --git a/examples/anthropic/smart_scraper_multi_haiku.py b/examples/anthropic/smart_scraper_multi_anthropic.py similarity index 100% rename from examples/anthropic/smart_scraper_multi_haiku.py rename to examples/anthropic/smart_scraper_multi_anthropic.py diff --git a/examples/anthropic/smart_scraper_multi_concat_haiku.py b/examples/anthropic/smart_scraper_multi_concat_anthropic.py similarity index 100% rename from examples/anthropic/smart_scraper_multi_concat_haiku.py rename to examples/anthropic/smart_scraper_multi_concat_anthropic.py diff --git a/examples/anthropic/smart_scraper_schema_haiku.py b/examples/anthropic/smart_scraper_schema_anthropic.py similarity index 100% rename from examples/anthropic/smart_scraper_schema_haiku.py rename to examples/anthropic/smart_scraper_schema_anthropic.py diff --git a/examples/anthropic/xml_scraper_haiku.py b/examples/anthropic/xml_scraper_anthropic.py similarity index 100% rename from examples/anthropic/xml_scraper_haiku.py rename to examples/anthropic/xml_scraper_anthropic.py diff --git a/examples/anthropic/xml_scraper_graph_multi_haiku.py b/examples/anthropic/xml_scraper_graph_multi_anthropic.py similarity index 100% rename from examples/anthropic/xml_scraper_graph_multi_haiku.py rename to examples/anthropic/xml_scraper_graph_multi_anthropic.py diff --git a/examples/azure/code_generator_graph_azure.py b/examples/azure/code_generator_graph_azure.py index ad48933f..4bad1b0d 100644 --- a/examples/azure/code_generator_graph_azure.py +++ b/examples/azure/code_generator_graph_azure.py @@ -28,7 +28,7 @@ class Projects(BaseModel): graph_config = { "llm": { "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-3.5-turbo", + "model": "azure_openai/gpt-4o" }, "verbose": True, "headless": False, diff --git a/examples/azure/csv_scraper_azure.py b/examples/azure/csv_scraper_azure.py index efc99758..272527b3 100644 --- a/examples/azure/csv_scraper_azure.py +++ b/examples/azure/csv_scraper_azure.py @@ -25,7 +25,7 @@ graph_config = { "llm": { "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-3.5-turbo", + "model": "azure_openai/gpt-4o" }, "verbose": True, "headless": False diff --git a/examples/azure/csv_scraper_graph_multi_azure.py b/examples/azure/csv_scraper_graph_multi_azure.py index d9160c40..cccbf88e 100644 --- 
a/examples/azure/csv_scraper_graph_multi_azure.py
+++ b/examples/azure/csv_scraper_graph_multi_azure.py
@@ -25,7 +25,7 @@
 graph_config = {
     "llm": {
         "api_key": os.environ["AZURE_OPENAI_KEY"],
-        "model": "azure_openai/gpt-3.5-turbo",
+        "model": "azure_openai/gpt-4o"
     },
     "verbose": True,
     "headless": False
diff --git a/examples/azure/depth_search_graph_azure.py b/examples/azure/depth_search_graph_azure.py
new file mode 100644
index 00000000..88b2cd1b
--- /dev/null
+++ b/examples/azure/depth_search_graph_azure.py
@@ -0,0 +1,30 @@
+"""
+depth_search_graph_azure example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+openai_key = os.getenv("OPENAI_APIKEY")
+
+graph_config = {
+    "llm": {
+        "api_key": os.environ["AZURE_OPENAI_KEY"],
+        "model": "azure_openai/gpt-4o",
+    },
+    "verbose": True,
+    "headless": False,
+    "depth": 2,
+    "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/azure/json_scraper_azure.py b/examples/azure/json_scraper_azure.py
index 483544fe..5ba54f7b 100644
--- a/examples/azure/json_scraper_azure.py
+++ b/examples/azure/json_scraper_azure.py
@@ -23,7 +23,7 @@
 graph_config = {
     "llm": {
         "api_key": os.environ["AZURE_OPENAI_KEY"],
-        "model": "azure_openai/gpt-3.5-turbo",
+        "model": "azure_openai/gpt-4o"
     },
     "verbose": True,
     "headless": False
diff --git a/examples/azure/json_scraper_multi_azure.py b/examples/azure/json_scraper_multi_azure.py
index ecf97280..befc4e84 100644
--- a/examples/azure/json_scraper_multi_azure.py
+++ b/examples/azure/json_scraper_multi_azure.py
@@ -12,7 +12,7 @@
 graph_config = {
     "llm": {
         "api_key": os.environ["AZURE_OPENAI_KEY"],
-        "model": "azure_openai/gpt-3.5-turbo",
+        "model": "azure_openai/gpt-4o"
     },
     "verbose": True,
     "headless": False
diff --git a/examples/azure/pdf_scraper_azure.py b/examples/azure/pdf_scraper_azure.py
index f8926489..02b3b7e6 100644
--- a/examples/azure/pdf_scraper_azure.py
+++ b/examples/azure/pdf_scraper_azure.py
@@ -10,7 +10,7 @@
 graph_config = {
     "llm": {
         "api_key": os.environ["AZURE_OPENAI_KEY"],
-        "model": "azure_openai/gpt-3.5-turbo",
+        "model": "azure_openai/gpt-4o"
     },
     "verbose": True,
     "headless": False
diff --git a/examples/azure/rate_limit_azure.py b/examples/azure/rate_limit_azure.py
index cfd05f1f..892996c7 100644
--- a/examples/azure/rate_limit_azure.py
+++ b/examples/azure/rate_limit_azure.py
@@ -26,7 +26,7 @@
 graph_config = {
     "llm": {
         "api_key": os.environ["AZURE_OPENAI_KEY"],
-        "model": "azure_openai/gpt-3.5-turbo",
+        "model": "azure_openai/gpt-4o",
         "rate_limit": {
             "requests_per_second": 1
         },
diff --git a/examples/azure/scrape_plain_text_azure.py b/examples/azure/scrape_plain_text_azure.py
index ef0d7d1c..9ea18d07 100644
--- a/examples/azure/scrape_plain_text_azure.py
+++ b/examples/azure/scrape_plain_text_azure.py
@@ -28,7 +28,7 @@
 graph_config = {
     "llm": {
         "api_key": os.environ["AZURE_OPENAI_KEY"],
-        "model": "azure_openai/gpt-3.5-turbo",
+        "model": "azure_openai/gpt-4o"
     },
     "verbose": True,
     "headless": False
diff --git a/examples/azure/script_generator_azure.py b/examples/azure/script_generator_azure.py
index 12f5d6be..b2bbb220 100644
--- a/examples/azure/script_generator_azure.py
+++ b/examples/azure/script_generator_azure.py
@@ -15,7 +15,7 @@
 graph_config = {
     "llm": {
         "api_key": os.environ["AZURE_OPENAI_KEY"],
-        "model":
"azure_openai/gpt-3.5-turbo", + "model": "azure_openai/gpt-4o" }, "verbose": True, "headless": False diff --git a/examples/azure/script_multi_generator_azure.py b/examples/azure/script_multi_generator_azure.py index a1bb8dbd..8c52cb95 100644 --- a/examples/azure/script_multi_generator_azure.py +++ b/examples/azure/script_multi_generator_azure.py @@ -16,7 +16,7 @@ graph_config = { "llm": { "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-3.5-turbo", + "model": "azure_openai/gpt-4o" }, "verbose": True, "headless": False diff --git a/examples/azure/search_graph_azure.py b/examples/azure/search_graph_azure.py index 13547e06..949f134c 100644 --- a/examples/azure/search_graph_azure.py +++ b/examples/azure/search_graph_azure.py @@ -22,7 +22,7 @@ graph_config = { "llm": { "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-3.5-turbo", + "model": "azure_openai/gpt-4o" }, "verbose": True, "headless": False diff --git a/examples/azure/search_graph_schema_azure.py b/examples/azure/search_graph_schema_azure.py index 629c92ab..e8c10093 100644 --- a/examples/azure/search_graph_schema_azure.py +++ b/examples/azure/search_graph_schema_azure.py @@ -30,7 +30,7 @@ class Dishes(BaseModel): graph_config = { "llm": { "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-3.5-turbo", + "model": "azure_openai/gpt-4o" }, "verbose": True, "headless": False diff --git a/examples/azure/search_link_graph_azure.py b/examples/azure/search_link_graph_azure.py index aec2297b..42ed07ad 100644 --- a/examples/azure/search_link_graph_azure.py +++ b/examples/azure/search_link_graph_azure.py @@ -15,7 +15,7 @@ graph_config = { "llm": { "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-3.5-turbo", + "model": "azure_openai/gpt-4o" }, "verbose": True, "headless": False diff --git a/examples/azure/smart_scraper_azure.py b/examples/azure/smart_scraper_azure.py index bf3bc8d7..933dc5b0 100644 --- a/examples/azure/smart_scraper_azure.py +++ b/examples/azure/smart_scraper_azure.py @@ -26,7 +26,7 @@ graph_config = { "llm": { "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-3.5-turbo", + "model": "azure_openai/gpt-4o" }, "verbose": True, "headless": False diff --git a/examples/azure/smart_scraper_multi_azure.py b/examples/azure/smart_scraper_multi_azure.py index f1f3451e..e066eaf1 100644 --- a/examples/azure/smart_scraper_multi_azure.py +++ b/examples/azure/smart_scraper_multi_azure.py @@ -14,7 +14,7 @@ graph_config = { "llm": { "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-3.5-turbo", + "model": "azure_openai/gpt-4o" }, "verbose": True, "headless": False diff --git a/examples/azure/smart_scraper_multi_concat_azure.py b/examples/azure/smart_scraper_multi_concat_azure.py index e3870a4c..06d08b9a 100644 --- a/examples/azure/smart_scraper_multi_concat_azure.py +++ b/examples/azure/smart_scraper_multi_concat_azure.py @@ -15,7 +15,7 @@ graph_config = { "llm": { "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-3.5-turbo", + "model": "azure_openai/gpt-4o" }, "verbose": True, "headless": False diff --git a/examples/azure/smart_scraper_schema_azure.py b/examples/azure/smart_scraper_schema_azure.py index d0816bf5..d2766ecb 100644 --- a/examples/azure/smart_scraper_schema_azure.py +++ b/examples/azure/smart_scraper_schema_azure.py @@ -29,7 +29,7 @@ class Projects(BaseModel): graph_config = { "llm": { "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-3.5-turbo", + "model": 
"azure_openai/gpt-4o" }, "verbose": True, "headless": False diff --git a/examples/azure/xml_scraper_azure.py b/examples/azure/xml_scraper_azure.py index ecfb8743..1c40f3e7 100644 --- a/examples/azure/xml_scraper_azure.py +++ b/examples/azure/xml_scraper_azure.py @@ -24,7 +24,7 @@ graph_config = { "llm": { "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-3.5-turbo", + "model": "azure_openai/gpt-4o" }, "verbose": True, "headless": False diff --git a/examples/azure/xml_scraper_graph_multi_azure.py b/examples/azure/xml_scraper_graph_multi_azure.py index db4db108..972eb823 100644 --- a/examples/azure/xml_scraper_graph_multi_azure.py +++ b/examples/azure/xml_scraper_graph_multi_azure.py @@ -25,7 +25,7 @@ graph_config = { "llm": { "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-3.5-turbo", + "model": "azure_openai/gpt-4o", }, "verbose": True, "headless": False diff --git a/examples/bedrock/depth_search_graph_bedrock.py b/examples/bedrock/depth_search_graph_bedrock.py new file mode 100644 index 00000000..2ab88291 --- /dev/null +++ b/examples/bedrock/depth_search_graph_bedrock.py @@ -0,0 +1,31 @@ +""" +depth_search_graph_opeani example +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import DepthSearchGraph + +load_dotenv() + +openai_key = os.getenv("OPENAI_APIKEY") + +graph_config = { + "llm": { + "client": "client_name", + "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + "temperature": 0.0 + }, + "verbose": True, + "headless": False, + "depth": 2, + "only_inside_links": False, +} + +search_graph = DepthSearchGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io", + config=graph_config +) + +result = search_graph.run() +print(result) diff --git a/examples/deepseek/depth_search_graph_deepseek.py b/examples/deepseek/depth_search_graph_deepseek.py new file mode 100644 index 00000000..064690a5 --- /dev/null +++ b/examples/deepseek/depth_search_graph_deepseek.py @@ -0,0 +1,30 @@ +""" +depth_search_graph_opeani example +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import DepthSearchGraph + +load_dotenv() + +deepseek_key = os.getenv("DEEPSEEK_APIKEY") + +graph_config = { + "llm": { + "model": "deepseek/deepseek-chat", + "api_key": deepseek_key, + }, + "verbose": True, + "headless": False, + "depth": 2, + "only_inside_links": False, +} + +search_graph = DepthSearchGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io", + config=graph_config +) + +result = search_graph.run() +print(result) diff --git a/examples/ernie/custom_graph_ernie.py b/examples/ernie/custom_graph_ernie.py index 57d422e5..a3082cf7 100644 --- a/examples/ernie/custom_graph_ernie.py +++ b/examples/ernie/custom_graph_ernie.py @@ -14,7 +14,7 @@ # Define the configuration for the graph # ************************************************ -graph_config = { +graph_config = { "llm": { "model": "ernie/ernie-bot-turbo", "ernie_client_id": "", diff --git a/examples/ernie/depth_search_graph_ernie.py b/examples/ernie/depth_search_graph_ernie.py new file mode 100644 index 00000000..99470d8d --- /dev/null +++ b/examples/ernie/depth_search_graph_ernie.py @@ -0,0 +1,26 @@ +""" +depth_search_graph_opeani example +""" +from scrapegraphai.graphs import DepthSearchGraph + +graph_config = { + "llm": { + "model": "ernie/ernie-bot-turbo", + "ernie_client_id": "", + "ernie_client_secret": "", + "temperature": 0.1 + }, + "verbose": True, + "headless": 
False, + "depth": 2, + "only_inside_links": False, +} + +search_graph = DepthSearchGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io", + config=graph_config +) + +result = search_graph.run() +print(result) diff --git a/examples/fireworks/depth_search_graph_fireworks.py b/examples/fireworks/depth_search_graph_fireworks.py new file mode 100644 index 00000000..f467be9f --- /dev/null +++ b/examples/fireworks/depth_search_graph_fireworks.py @@ -0,0 +1,30 @@ +""" +depth_search_graph_opeani example +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import DepthSearchGraph + +load_dotenv() + +fireworks_api_key = os.getenv("FIREWORKS_APIKEY") + +graph_config = { + "llm": { + "api_key": fireworks_api_key, + "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" + }, + "verbose": True, + "headless": False, + "depth": 2, + "only_inside_links": False, +} + +search_graph = DepthSearchGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io", + config=graph_config +) + +result = search_graph.run() +print(result) diff --git a/examples/google_genai/depth_search_graph_gemini.py b/examples/google_genai/depth_search_graph_gemini.py new file mode 100644 index 00000000..956341f4 --- /dev/null +++ b/examples/google_genai/depth_search_graph_gemini.py @@ -0,0 +1,30 @@ +""" +depth_search_graph_opeani example +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import DepthSearchGraph + +load_dotenv() + +gemini_key = os.getenv("GOOGLE_APIKEY") + +graph_config = { + "llm": { + "api_key": gemini_key, + "model": "google_genai/gemini-pro", + }, + "verbose": True, + "headless": False, + "depth": 2, + "only_inside_links": False, +} + +search_graph = DepthSearchGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io", + config=graph_config +) + +result = search_graph.run() +print(result) diff --git a/examples/google_vertexai/depth_search_graph_gemini.py b/examples/google_vertexai/depth_search_graph_gemini.py new file mode 100644 index 00000000..13bba630 --- /dev/null +++ b/examples/google_vertexai/depth_search_graph_gemini.py @@ -0,0 +1,30 @@ +""" +depth_search_graph_opeani example +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import DepthSearchGraph + +load_dotenv() + +gemini_key = os.getenv("GOOGLE_APIKEY") + +graph_config = { + "llm": { + "api_key": gemini_key, + "model": "google_vertexai/gemini-1.5-pro", + }, + "verbose": True, + "headless": False, + "depth": 2, + "only_inside_links": False, +} + +search_graph = DepthSearchGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io", + config=graph_config +) + +result = search_graph.run() +print(result) diff --git a/examples/groq/depth_search_graph_groq.py b/examples/groq/depth_search_graph_groq.py new file mode 100644 index 00000000..2d1ed8b1 --- /dev/null +++ b/examples/groq/depth_search_graph_groq.py @@ -0,0 +1,31 @@ +""" +depth_search_graph_opeani example +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import DepthSearchGraph + +load_dotenv() + +groq_key = os.getenv("GROQ_APIKEY") + +graph_config = { + "llm": { + "model": "groq/gemma-7b-it", + "api_key": groq_key, + "temperature": 0 + }, + "verbose": True, + "headless": False, + "depth": 2, + "only_inside_links": False, +} + +search_graph = DepthSearchGraph( + prompt="List me all the projects with their 
description", + source="https://perinim.github.io", + config=graph_config +) + +result = search_graph.run() +print(result) diff --git a/examples/huggingfacehub/custom_graph_huggingfacehub.py b/examples/huggingfacehub/custom_graph_huggingfacehub.py index cec007b7..06b2f089 100644 --- a/examples/huggingfacehub/custom_graph_huggingfacehub.py +++ b/examples/huggingfacehub/custom_graph_huggingfacehub.py @@ -4,7 +4,6 @@ import os from dotenv import load_dotenv - from langchain_openai import OpenAIEmbeddings from langchain_openai import ChatOpenAI from scrapegraphai.graphs import BaseGraph diff --git a/examples/huggingfacehub/depth_search_graph_huggingfacehub.py b/examples/huggingfacehub/depth_search_graph_huggingfacehub.py new file mode 100644 index 00000000..48df3e37 --- /dev/null +++ b/examples/huggingfacehub/depth_search_graph_huggingfacehub.py @@ -0,0 +1,38 @@ +""" +depth_search_graph_opeani example +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import DepthSearchGraph +from langchain_community.llms import HuggingFaceEndpoint +from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings + +load_dotenv() +HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN') + +repo_id = "mistralai/Mistral-7B-Instruct-v0.2" + +llm_model_instance = HuggingFaceEndpoint( + repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN +) + +embedder_model_instance = HuggingFaceInferenceAPIEmbeddings( + api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2" +) + +graph_config = { + "llm": {"model_instance": llm_model_instance}, + "verbose": True, + "headless": False, + "depth": 2, + "only_inside_links": False, +} + +search_graph = DepthSearchGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io", + config=graph_config +) + +result = search_graph.run() +print(result) diff --git a/examples/local_models/depth_search_graph_ollama.py b/examples/local_models/depth_search_graph_ollama.py new file mode 100644 index 00000000..d0f960b5 --- /dev/null +++ b/examples/local_models/depth_search_graph_ollama.py @@ -0,0 +1,32 @@ +""" +depth_search_graph_opeani example +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import DepthSearchGraph + +load_dotenv() + +openai_key = os.getenv("OPENAI_APIKEY") + +graph_config = { + "llm": { + "model": "ollama/llama3.1", + "temperature": 0, + "format": "json", # Ollama needs the format to be specified explicitly + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "verbose": True, + "headless": False, + "depth": 2, + "only_inside_links": False, +} + +search_graph = DepthSearchGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io", + config=graph_config +) + +result = search_graph.run() +print(result) diff --git a/examples/local_models/json_scraper_multi_ollama.py b/examples/local_models/json_scraper_multi_ollama.py index 6e9c3da3..e80bf5ec 100644 --- a/examples/local_models/json_scraper_multi_ollama.py +++ b/examples/local_models/json_scraper_multi_ollama.py @@ -15,6 +15,7 @@ "verbose": True, "headless": False, } + FILE_NAME = "inputs/example.json" curr_dir = os.path.dirname(os.path.realpath(__file__)) file_path = os.path.join(curr_dir, FILE_NAME) diff --git a/examples/local_models/smart_scraper_schema_ollama.py b/examples/local_models/smart_scraper_schema_ollama.py index 35503bd7..5a5b3cea 100644 --- 
a/examples/local_models/smart_scraper_schema_ollama.py
+++ b/examples/local_models/smart_scraper_schema_ollama.py
@@ -24,7 +24,6 @@ class Projects(BaseModel):
         "format": "json",  # Ollama needs the format to be specified explicitly
         # "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
     },
-    "verbose": True,
     "headless": False
 }
diff --git a/examples/mistral/depth_search_graph_mistral.py b/examples/mistral/depth_search_graph_mistral.py
new file mode 100644
index 00000000..ae18ffba
--- /dev/null
+++ b/examples/mistral/depth_search_graph_mistral.py
@@ -0,0 +1,30 @@
+"""
+depth_search_graph_mistral example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+mistral_key = os.getenv("MISTRAL_API_KEY")
+
+graph_config = {
+    "llm": {
+        "api_key": mistral_key,
+        "model": "mistralai/open-mistral-nemo",
+    },
+    "verbose": True,
+    "headless": False,
+    "depth": 2,
+    "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/nemotron/depth_search_graph_nemotron.py b/examples/nemotron/depth_search_graph_nemotron.py
new file mode 100644
index 00000000..edd80463
--- /dev/null
+++ b/examples/nemotron/depth_search_graph_nemotron.py
@@ -0,0 +1,30 @@
+"""
+depth_search_graph_nemotron example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+openai_key = os.getenv("OPENAI_APIKEY")
+
+graph_config = {
+    "llm": {
+        "api_key": os.getenv("NEMOTRON_KEY"),
+        "model": "claude-3-haiku-20240307",
+    },
+    "verbose": True,
+    "headless": False,
+    "depth": 2,
+    "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/oneapi/depth_search_graph_onenapi.py b/examples/oneapi/depth_search_graph_onenapi.py
new file mode 100644
index 00000000..7a2e7f3e
--- /dev/null
+++ b/examples/oneapi/depth_search_graph_onenapi.py
@@ -0,0 +1,31 @@
+"""
+depth_search_graph_oneapi example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+openai_key = os.getenv("OPENAI_APIKEY")
+
+graph_config = {
+    "llm": {
+        "api_key": "***************************",
+        "model": "oneapi/qwen-turbo",
+        "base_url": "http://127.0.0.1:3000/v1",  # set the OneAPI URL
+    },
+    "verbose": True,
+    "headless": False,
+    "depth": 2,
+    "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/together/depth_search_graph_together.py b/examples/together/depth_search_graph_together.py
new file mode 100644
index 00000000..7a2e7f3e
--- /dev/null
+++ b/examples/together/depth_search_graph_together.py
@@ -0,0 +1,31 @@
+"""
+depth_search_graph_together example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+together_key = os.getenv("TOGETHER_KEY")
+
+graph_config = {
+    "llm": {
+        "api_key": together_key,
+        "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+        # key and model mirror code_generator_graph_togehter.py below
+    },
+    "verbose":
True, + "headless": False, + "depth": 2, + "only_inside_links": False, +} + +search_graph = DepthSearchGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io", + config=graph_config +) + +result = search_graph.run() +print(result) From db54d694334209f047c950e2f6ac2c02e2da1d39 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Fri, 4 Oct 2024 09:54:54 +0200 Subject: [PATCH 35/36] refactoring of code for pylint integration --- scrapegraphai/nodes/description_node.py | 1 - scrapegraphai/nodes/fetch_node_level_k.py | 178 ++++++++++++---------- scrapegraphai/nodes/parse_node_depth_k.py | 17 +-- 3 files changed, 107 insertions(+), 89 deletions(-) diff --git a/scrapegraphai/nodes/description_node.py b/scrapegraphai/nodes/description_node.py index 60c56cec..4201a61d 100644 --- a/scrapegraphai/nodes/description_node.py +++ b/scrapegraphai/nodes/description_node.py @@ -34,7 +34,6 @@ def __init__( node_name: str = "DESCRIPTION", ): super().__init__(node_name, "node", input, output, 2, node_config) - self.llm_model = node_config["llm_model"] self.verbose = ( False if node_config is None else node_config.get("verbose", False) diff --git a/scrapegraphai/nodes/fetch_node_level_k.py b/scrapegraphai/nodes/fetch_node_level_k.py index 5cdd6571..d321b33c 100644 --- a/scrapegraphai/nodes/fetch_node_level_k.py +++ b/scrapegraphai/nodes/fetch_node_level_k.py @@ -1,6 +1,3 @@ -""" -FetchNodeLevelK Module -""" from typing import List, Optional from .base_node import BaseNode from ..docloaders import ChromiumLoader @@ -18,14 +15,21 @@ class FetchNodeLevelK(BaseNode): (with proxy protection). Attributes: - llm_model: An instance of a language model client, configured for generating answers. + embedder_model: An optional model for embedding the fetched content. verbose (bool): A flag indicating whether to show print statements during execution. + cache_path (str): Path to cache fetched content. + headless (bool): Whether to run the Chromium browser in headless mode. + loader_kwargs (dict): Additional arguments for the content loader. + browser_base (dict): Optional configuration for the browser base API. + depth (int): Maximum depth of hyperlink graph traversal. + only_inside_links (bool): Whether to fetch only internal links. + min_input_len (int): Minimum required length of input data. Args: input (str): Boolean expression defining the input keys needed from the state. output (List[str]): List of output keys to be updated in the state. node_config (dict): Additional configuration for the node. - node_name (str): The unique identifier name for the node, defaulting to "Parse". + node_name (str): The unique identifier name for the node, defaulting to "FetchLevelK". """ def __init__( @@ -35,81 +39,68 @@ def __init__( node_config: Optional[dict] = None, node_name: str = "FetchLevelK", ): + """ + Initializes the FetchNodeLevelK instance. + + Args: + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (Optional[dict]): Additional configuration for the node. + node_name (str): The name of the node (default is "FetchLevelK"). 
+ """ super().__init__(node_name, "node", input, output, 2, node_config) - + self.embedder_model = node_config.get("embedder_model", None) - - self.verbose = ( - False if node_config is None else node_config.get("verbose", False) - ) - + self.verbose = node_config.get("verbose", False) if node_config else False self.cache_path = node_config.get("cache_path", False) - - self.headless = ( - True if node_config is None else node_config.get("headless", True) - ) - - self.loader_kwargs = ( - {} if node_config is None else node_config.get("loader_kwargs", {}) - ) - - self.browser_base = ( - None if node_config is None else node_config.get("browser_base", None) - ) - - self.depth = ( - 1 if node_config is None else node_config.get("depth", 1) - ) - - self.only_inside_links = ( - False if node_config is None else node_config.get("only_inside_links", False) - ) - + self.headless = node_config.get("headless", True) if node_config else True + self.loader_kwargs = node_config.get("loader_kwargs", {}) if node_config else {} + self.browser_base = node_config.get("browser_base", None) + self.depth = node_config.get("depth", 1) if node_config else 1 + self.only_inside_links = node_config.get("only_inside_links", False) if node_config else False self.min_input_len = 1 def execute(self, state: dict) -> dict: """ - Executes the node's logic to fetch the HTML content of a specified URL and all its sub-links - and update the graph's state with the content. + Executes the node's logic to fetch the HTML content of a specified URL and its sub-links + recursively, then updates the graph's state with the fetched content. Args: - state (dict): The current state of the graph. The input keys will be used - to fetch the correct data types from the state. + state (dict): The current state of the graph. Returns: dict: The updated state with a new output key containing the fetched HTML content. Raises: - KeyError: If the input key is not found in the state, indicating that the - necessary information to perform the operation is missing. + KeyError: If the input key is not found in the state. """ - self.logger.info(f"--- Executing {self.node_name} Node ---") - - # Interpret input keys based on the provided input expression + input_keys = self.get_input_keys(state) - # Fetching data from the state based on the input keys input_data = [state[key] for key in input_keys] - source = input_data[0] - + documents = [{"source": source}] - - loader_kwargs = {} + loader_kwargs = self.node_config.get("loader_kwargs", {}) if self.node_config else {} - if self.node_config is not None: - loader_kwargs = self.node_config.get("loader_kwargs", {}) - for _ in range(self.depth): documents = self.obtain_content(documents, loader_kwargs) - + filtered_documents = [doc for doc in documents if 'document' in doc] - state.update({self.output[0]: filtered_documents}) - return state - + def fetch_content(self, source: str, loader_kwargs) -> Optional[str]: + """ + Fetches the HTML content of a given source URL. + + Args: + source (str): The URL to fetch content from. + loader_kwargs (dict): Additional arguments for the content loader. + + Returns: + Optional[str]: The fetched HTML content or None if fetching failed. + """ self.logger.info(f"--- (Fetching HTML from: {source}) ---") if self.browser_base is not None: @@ -119,26 +110,40 @@ def fetch_content(self, source: str, loader_kwargs) -> Optional[str]: raise ImportError("""The browserbase module is not installed. 
Please install it using `pip install browserbase`.""") - data = browser_base_fetch(self.browser_base.get("api_key"), - self.browser_base.get("project_id"), [source]) - - document = [Document(page_content=content, - metadata={"source": source}) for content in data] - + data = browser_base_fetch(self.browser_base.get("api_key"), + self.browser_base.get("project_id"), [source]) + document = [Document(page_content=content, metadata={"source": source}) for content in data] else: loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs) - document = loader.load() - return document - + def extract_links(self, html_content: str) -> list: + """ + Extracts all hyperlinks from the HTML content. + + Args: + html_content (str): The HTML content to extract links from. + + Returns: + list: A list of extracted hyperlinks. + """ soup = BeautifulSoup(html_content, 'html.parser') links = [link['href'] for link in soup.find_all('a', href=True)] self.logger.info(f"Extracted {len(links)} links.") return links - + def get_full_links(self, base_url: str, links: list) -> list: + """ + Converts relative URLs to full URLs based on the base URL. + + Args: + base_url (str): The base URL for resolving relative links. + links (list): A list of links to convert. + + Returns: + list: A list of full URLs. + """ full_links = [] for link in links: if self.only_inside_links and link.startswith("http"): @@ -146,36 +151,55 @@ def get_full_links(self, base_url: str, links: list) -> list: full_link = link if link.startswith("http") else urljoin(base_url, link) full_links.append(full_link) return full_links - + def obtain_content(self, documents: List, loader_kwargs) -> List: + """ + Iterates through documents, fetching and updating content recursively. + + Args: + documents (List): A list of documents containing the source URLs. + loader_kwargs (dict): Additional arguments for the content loader. + + Returns: + List: The updated list of documents with fetched content. + """ new_documents = [] for doc in documents: source = doc['source'] if 'document' not in doc: document = self.fetch_content(source, loader_kwargs) - + if not document or not document[0].page_content.strip(): self.logger.warning(f"Failed to fetch content for {source}") documents.remove(doc) continue - - #doc['document'] = document[0].page_content + doc['document'] = document - links = self.extract_links(doc['document'][0].page_content) full_links = self.get_full_links(source, links) - - # Check if the links are already present in other documents + for link in full_links: - # Check if any document is from the same link if not any(d.get('source', '') == link for d in documents) and not any(d.get('source', '') == link for d in new_documents): - # Add the document new_documents.append({"source": link}) - + documents.extend(new_documents) return documents - - def process_links(self, base_url: str, links: list, loader_kwargs, depth: int, current_depth: int = 1) -> dict: + + def process_links(self, base_url: str, links: list, + loader_kwargs, depth: int, current_depth: int = 1) -> dict: + """ + Processes a list of links recursively up to a given depth. + + Args: + base_url (str): The base URL for resolving relative links. + links (list): A list of links to process. + loader_kwargs (dict): Additional arguments for the content loader. + depth (int): The maximum depth for recursion. + current_depth (int): The current depth of recursion (default is 1). + + Returns: + dict: A dictionary containing processed link content. 
+ """ content_dict = {} for idx, link in enumerate(links, start=1): full_link = link if link.startswith("http") else urljoin(base_url, link) @@ -184,7 +208,7 @@ def process_links(self, base_url: str, links: list, loader_kwargs, depth: int, c if current_depth < depth: new_links = self.extract_links(link_content) - content_dict.update(self.process_links(full_link, new_links, depth, current_depth + 1)) + content_dict.update(self.process_links(full_link, new_links, loader_kwargs, depth, current_depth + 1)) else: self.logger.warning(f"Failed to fetch content for {full_link}") - return content_dict \ No newline at end of file + return content_dict diff --git a/scrapegraphai/nodes/parse_node_depth_k.py b/scrapegraphai/nodes/parse_node_depth_k.py index 7b7ab194..6427b051 100644 --- a/scrapegraphai/nodes/parse_node_depth_k.py +++ b/scrapegraphai/nodes/parse_node_depth_k.py @@ -1,11 +1,9 @@ """ ParseNodeDepthK Module """ -import re -from typing import List, Optional, Tuple -from .base_node import BaseNode -from ..utils.convert_to_md import convert_to_md +from typing import List, Optional from langchain_community.document_transformers import Html2TextTransformer +from .base_node import BaseNode class ParseNodeDepthK(BaseNode): """ @@ -54,19 +52,16 @@ def execute(self, state: dict) -> dict: """ self.logger.info(f"--- Executing {self.node_name} Node ---") - - # Interpret input keys based on the provided input expression + input_keys = self.get_input_keys(state) - # Fetching data from the state based on the input keys input_data = [state[key] for key in input_keys] documents = input_data[0] - + for doc in documents: document_md = Html2TextTransformer(ignore_links=True).transform_documents(doc["document"]) - #document_md = convert_to_md(doc["document"]) doc["document"] = document_md[0].page_content - + state.update({self.output[0]: documents}) - + return state From d056c439cd4582b4c6b4bf6efc5ebd057cd5a3a1 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Fri, 4 Oct 2024 14:16:13 +0200 Subject: [PATCH 36/36] Create code_generator_graph_togehter.py --- .../together/code_generator_graph_togehter.py | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 examples/together/code_generator_graph_togehter.py diff --git a/examples/together/code_generator_graph_togehter.py b/examples/together/code_generator_graph_togehter.py new file mode 100644 index 00000000..aefbeba4 --- /dev/null +++ b/examples/together/code_generator_graph_togehter.py @@ -0,0 +1,60 @@ +""" +Basic example of scraping pipeline using Code Generator with schema +""" + +import os, json +from typing import List +from dotenv import load_dotenv +from pydantic import BaseModel, Field +from scrapegraphai.graphs import CodeGeneratorGraph + +load_dotenv() + +# ************************************************ +# Define the output schema for the graph +# ************************************************ + +class Project(BaseModel): + title: str = Field(description="The title of the project") + description: str = Field(description="The description of the project") + +class Projects(BaseModel): + projects: List[Project] + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +together_key = os.getenv("TOGETHER_KEY") + +graph_config = { + "llm": { + "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", + "api_key": together_key, + }, + "verbose": True, + "headless": False, + "reduction": 2, + "max_iterations": { + "overall": 10, 
+ "syntax": 3, + "execution": 3, + "validation": 3, + "semantic": 3 + }, + "output_file_name": "extracted_data.py" +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +code_generator_graph = CodeGeneratorGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io/projects/", + schema=Projects, + config=graph_config +) + +result = code_generator_graph.run() +print(result)