Skip to content

Commit 857f28d

Browse files
authored
Merge pull request #702 from vedovati-matteo/pre/beta
Reasoning Node added
2 parents e5ac020 + b1ce563 commit 857f28d

File tree

7 files changed

+175
-16
lines changed

7 files changed

+175
-16
lines changed

scrapegraphai/nodes/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,4 +25,5 @@
2525
from .concat_answers_node import ConcatAnswersNode
2626
from .prompt_refiner_node import PromptRefinerNode
2727
from .html_analyzer_node import HtmlAnalyzerNode
28-
from .generate_code_node import GenerateCodeNode
28+
from .generate_code_node import GenerateCodeNode
29+
from .reasoning_node import ReasoningNode

scrapegraphai/nodes/generate_code_node.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,26 +5,23 @@
55
from langchain.prompts import PromptTemplate
66
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
77
from langchain_core.output_parsers import StrOutputParser
8-
from langchain_core.runnables import RunnableParallel
9-
from langchain_core.utils.pydantic import is_basemodel_subclass
108
from langchain_community.chat_models import ChatOllama
119
import ast
1210
import sys
1311
from io import StringIO
1412
from bs4 import BeautifulSoup
1513
import re
16-
from tqdm import tqdm
17-
from .base_node import BaseNode
14+
import json
15+
from jsonschema import validate, ValidationError
1816
from pydantic import ValidationError
17+
from .base_node import BaseNode
1918
from ..utils import (transform_schema,
2019
extract_code,
2120
syntax_focused_analysis, syntax_focused_code_generation,
2221
execution_focused_analysis, execution_focused_code_generation,
2322
validation_focused_analysis, validation_focused_code_generation,
2423
semantic_focused_analysis, semantic_focused_code_generation,
2524
are_content_equal)
26-
from jsonschema import validate, ValidationError
27-
import json
2825
from ..prompts import (
2926
TEMPLATE_INIT_CODE_GENERATION, TEMPLATE_SEMANTIC_COMPARISON
3027
)

scrapegraphai/nodes/html_analyzer_node.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,7 @@
44
from typing import List, Optional
55
from langchain.prompts import PromptTemplate
66
from langchain_core.output_parsers import StrOutputParser
7-
from langchain_core.runnables import RunnableParallel
8-
from langchain_core.utils.pydantic import is_basemodel_subclass
97
from langchain_community.chat_models import ChatOllama
10-
from tqdm import tqdm
118
from .base_node import BaseNode
129
from ..utils import reduce_html
1310
from ..prompts import (

scrapegraphai/nodes/prompt_refiner_node.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,7 @@
44
from typing import List, Optional
55
from langchain.prompts import PromptTemplate
66
from langchain_core.output_parsers import StrOutputParser
7-
from langchain_core.runnables import RunnableParallel
8-
from langchain_core.utils.pydantic import is_basemodel_subclass
9-
from langchain_openai import ChatOpenAI, AzureChatOpenAI
10-
from langchain_mistralai import ChatMistralAI
117
from langchain_community.chat_models import ChatOllama
12-
from tqdm import tqdm
138
from .base_node import BaseNode
149
from ..utils import transform_schema
1510
from ..prompts import (

scrapegraphai/nodes/reasoning_node.py

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
"""
2+
ReasoningNode Module
3+
"""
4+
from typing import List, Optional
5+
from langchain.prompts import PromptTemplate
6+
from langchain_core.output_parsers import StrOutputParser
7+
from langchain_community.chat_models import ChatOllama
8+
from .base_node import BaseNode
9+
from ..utils import transform_schema
10+
from ..prompts import (
11+
TEMPLATE_REASONING, TEMPLATE_REASONING_WITH_CONTEXT
12+
)
13+
14+
class ReasoningNode(BaseNode):
    """
    A node that refines the user prompt using the output schema and optional
    additional context, producing a precise reasoning prompt that explicitly
    links elements of the user's original request to their corresponding
    fields in the JSON schema.

    Attributes:
        llm_model: An instance of a language model client, configured for generating answers.
        verbose (bool): A flag indicating whether to show print statements during execution.

    Args:
        input (str): Boolean expression defining the input keys needed from the state.
        output (List[str]): List of output keys to be updated in the state.
        node_config (dict): Additional configuration for the node.
        node_name (str): The unique identifier name for the node, defaulting to "Reasoning".
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        # Fixed copy-paste default: was "PromptRefiner", which would collide
        # with the real PromptRefinerNode when both appear in one graph.
        node_name: str = "Reasoning",
    ):
        super().__init__(node_name, "node", input, output, 2, node_config)

        self.llm_model = node_config["llm_model"]

        # Ollama chat models must be told explicitly to emit JSON.
        if isinstance(node_config["llm_model"], ChatOllama):
            self.llm_model.format = "json"

        # NOTE: node_config can never actually be None here — the
        # node_config["llm_model"] access above would already have raised —
        # so these fallbacks are purely defensive. Fixed the verbose branch
        # (was `True if node_config is None`) to match the `force` pattern.
        self.verbose = (
            False if node_config is None else node_config.get("verbose", False)
        )
        self.force = (
            False if node_config is None else node_config.get("force", False)
        )

        # Optional extra context injected into the reasoning template.
        self.additional_info = node_config.get("additional_info", None)

        # Pydantic model describing the desired output; .schema() is called in execute().
        self.output_schema = node_config.get("schema")

    def execute(self, state: dict) -> dict:
        """
        Generate a refined prompt for the reasoning task based on the user's
        input and the JSON schema.

        Args:
            state (dict): The current state of the graph. The input keys will be used
                to fetch the correct data from the state.

        Returns:
            dict: The updated state with the output key containing the generated answer.

        Raises:
            KeyError: If the input keys are not found in the state, indicating
                that the necessary information for generating an answer is missing.
        """
        self.logger.info(f"--- Executing {self.node_name} Node ---")

        user_prompt = state['user_prompt']

        # Fixed attribute typo: simplefied -> simplified (only referenced here).
        self.simplified_schema = transform_schema(self.output_schema.schema())

        if self.additional_info is not None:
            prompt = PromptTemplate(
                template=TEMPLATE_REASONING_WITH_CONTEXT,
                partial_variables={"user_input": user_prompt,
                                   "json_schema": str(self.simplified_schema),
                                   "additional_context": self.additional_info})
        else:
            prompt = PromptTemplate(
                template=TEMPLATE_REASONING,
                partial_variables={"user_input": user_prompt,
                                   "json_schema": str(self.simplified_schema)})

        output_parser = StrOutputParser()

        # All template variables are pre-bound via partial_variables,
        # so the chain is invoked with an empty input dict.
        chain = prompt | self.llm_model | output_parser
        refined_prompt = chain.invoke({})

        state.update({self.output[0]: refined_prompt})
        return state

scrapegraphai/prompts/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,5 @@
1818
TEMPLATE_EXECUTION_ANALYSIS, TEMPLATE_EXECUTION_CODE_GENERATION,
1919
TEMPLATE_VALIDATION_ANALYSIS, TEMPLATE_VALIDATION_CODE_GENERATION,
2020
TEMPLATE_SEMANTIC_COMPARISON, TEMPLATE_SEMANTIC_ANALYSIS,
21-
TEMPLATE_SEMANTIC_CODE_GENERATION)
21+
TEMPLATE_SEMANTIC_CODE_GENERATION)
22+
from .reasoning_node_prompts import TEMPLATE_REASONING, TEMPLATE_REASONING_WITH_CONTEXT
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
"""
2+
Reasoning prompts helper
3+
"""
4+
5+
TEMPLATE_REASONING = """
6+
**Task**: Analyze the user's request and the provided JSON schema to guide an LLM in extracting information directly from a markdown file previously parsed froma a HTML file.
7+
8+
**User's Request**:
9+
{user_input}
10+
11+
**Target JSON Schema**:
12+
```json
13+
{json_schema}
14+
```
15+
16+
**Analysis Instructions**:
17+
1. **Interpret User Request:**
18+
* Identify the key information types or entities the user is seeking.
19+
* Note any specific attributes, relationships, or constraints mentioned.
20+
21+
2. **Map to JSON Schema**:
22+
* For each identified element in the user request, locate its corresponding field in the JSON schema.
23+
* Explain how the schema structure represents the requested information.
24+
* Highlight any relevant schema elements not explicitly mentioned in the user's request.
25+
26+
3. **Data Transformation Guidance**:
27+
* Provide guidance on any necessary transformations to align extracted data with the JSON schema requirements.
28+
29+
This analysis will be used to instruct an LLM that has the HTML content in its context. The LLM will use this guidance to extract the information and return it directly in the specified JSON format.
30+
31+
**Reasoning Output**:
32+
[Your detailed analysis based on the above instructions]
33+
"""
34+
35+
TEMPLATE_REASONING_WITH_CONTEXT = """
36+
**Task**: Analyze the user's request and the provided JSON schema to guide an LLM in extracting information directly from a markdown file previously parsed froma a HTML file.
37+
38+
**User's Request**:
39+
{user_input}
40+
41+
**Target JSON Schema**:
42+
```json
43+
{json_schema}
44+
```
45+
46+
**Additional Context**:
47+
{additional_context}
48+
49+
**Analysis Instructions**:
50+
1. **Interpret User Request and Context:**
51+
* Identify the key information types or entities the user is seeking.
52+
* Note any specific attributes, relationships, or constraints mentioned.
53+
* Incorporate insights from the additional context to refine understanding of the task.
54+
55+
2. **Map to JSON Schema**:
56+
* For each identified element in the user request, locate its corresponding field in the JSON schema.
57+
* Explain how the schema structure represents the requested information.
58+
* Highlight any relevant schema elements not explicitly mentioned in the user's request.
59+
60+
3. **Extraction Strategy**:
61+
* Based on the additional context, suggest specific strategies for locating and extracting the required information from the HTML.
62+
* Highlight any potential challenges or special considerations mentioned in the context.
63+
64+
4. **Data Transformation Guidance**:
65+
* Provide guidance on any necessary transformations to align extracted data with the JSON schema requirements.
66+
* Note any special formatting, validation, or business logic considerations from the additional context.
67+
68+
This analysis will be used to instruct an LLM that has the HTML content in its context. The LLM will use this guidance to extract the information and return it directly in the specified JSON format.
69+
70+
**Reasoning Output**:
71+
[Your detailed analysis based on the above instructions, incorporating insights from the additional context]
72+
"""

0 commit comments

Comments
 (0)