ScrapeGraphAI · VinciGit00 · Aug 18, 2024 · Aug 19, 2024 · Aug 19, 2024 · Aug 20, 2024
diff --git a/examples/openai/smart_scraper_openai.py b/examples/openai/smart_scraper_openai.py
@@ -2,10 +2,12 @@
 Basic example of scraping pipeline using SmartScraper
 """
 
-import os, json
+import os
+import json
+from dotenv import load_dotenv
 from scrapegraphai.graphs import SmartScraperGraph
 from scrapegraphai.utils import prettify_exec_info
-from dotenv import load_dotenv
+
 load_dotenv()
 
 # ************************************************
@@ -16,7 +18,7 @@
 graph_config = {
     "llm": {
         "api_key": os.getenv("OPENAI_API_KEY"),
-        "model": "gpt-3.5-turbo",
+        "model": "gpt-4o",
     },
     "verbose": True,
     "headless": False,
@@ -40,4 +42,4 @@
 # ************************************************
 
 graph_exec_info = smart_scraper_graph.get_execution_info()
-print(prettify_exec_info(graph_exec_info))
+print(prettify_exec_info(graph_exec_info))
diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py
@@ -85,6 +85,7 @@ def _create_graph(self) -> BaseGraph:
                 "llm_model": self.llm_model,
                 "additional_info": self.config.get("additional_info"),
                 "schema": self.schema,
+                "config": self.config
             }
         )
 

diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py
@@ -1,33 +1,28 @@
-"""
-GenerateAnswerNode Module
-"""
+import re
+import json
 from typing import List, Optional
+import requests
+import asyncio
+from tqdm import tqdm
 from langchain.prompts import PromptTemplate
 from langchain_core.output_parsers import JsonOutputParser
 from langchain_core.runnables import RunnableParallel
 from langchain_openai import ChatOpenAI
 from langchain_community.chat_models import ChatOllama
-from tqdm import tqdm
 from ..utils.logging import get_logger
+from ..utils import parse_response_to_dict
 from .base_node import BaseNode
-from ..prompts import TEMPLATE_CHUNKS, TEMPLATE_NO_CHUNKS, TEMPLATE_MERGE, TEMPLATE_CHUNKS_MD, TEMPLATE_NO_CHUNKS_MD, TEMPLATE_MERGE_MD
+from ..prompts import (
+    TEMPLATE_CHUNKS, TEMPLATE_NO_CHUNKS, TEMPLATE_MERGE,
+    TEMPLATE_CHUNKS_MD, TEMPLATE_NO_CHUNKS_MD, TEMPLATE_MERGE_MD
+)
 
 class GenerateAnswerNode(BaseNode):
     """
     A node that generates an answer using a large language model (LLM) based on the user's input
     and the content extracted from a webpage. It constructs a prompt from the user's input
     and the scraped content, feeds it to the LLM, and parses the LLM's response to produce
     an answer.
-
-    Attributes:
-        llm_model: An instance of a language model client, configured for generating answers.
-        verbose (bool): A flag indicating whether to show print statements during execution.
-
-    Args:
-        input (str): Boolean expression defining the input keys needed from the state.
-        output (List[str]): List of output keys to be updated in the state.
-        node_config (dict): Additional configuration for the node.
-        node_name (str): The unique identifier name for the node, defaulting to "GenerateAnswer".
     """
 
     def __init__(
@@ -39,110 +34,163 @@ def __init__(
     ):
         super().__init__(node_name, "node", input, output, 2, node_config)
 
-        self.llm_model = node_config["llm_model"]
-
-        if isinstance(node_config["llm_model"], ChatOllama):
-            self.llm_model.format="json"
-
-        self.verbose = (
-            True if node_config is None else node_config.get("verbose", False)
-        )
-        self.force = (
-            False if node_config is None else node_config.get("force", False)
-        )
-        self.script_creator = (
-            False if node_config is None else node_config.get("script_creator", False)
-        )
-        self.is_md_scraper = (
-            False if node_config is None else node_config.get("is_md_scraper", False)
-        )
-
-        self.additional_info = node_config.get("additional_info")
+        self.llm_model = node_config.get("llm_model")
+        if isinstance(self.llm_model, ChatOllama):
+            self.llm_model.format = "json"
+
+        self.verbose = node_config.get("verbose", False)
+        self.force = node_config.get("force", False)
+        self.script_creator = node_config.get("script_creator", False)
+        self.is_md_scraper = node_config.get("is_md_scraper", False)
+        self.additional_info = node_config.get("additional_info", "")
+        self.api_key = node_config.get("config", {}).get("llm", {}).get("api_key", "")
+
+    async def _process_chunks_async(self, chunks, templates, user_prompt, format_instructions):
+        """Send one OpenAI chat request per chunk concurrently; return parsed answers in order."""
+        url = "https://api.openai.com/v1/chat/completions"
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {self.api_key}"
+        }
+
+        async def send_request(prompt):
+            # requests.post blocks and is not awaitable: run it in a worker thread instead.
+            response = await asyncio.to_thread(requests.post, url, headers=headers, json={
+                "model": self.llm_model.model_name,
+                "messages": [{"role": "user", "content": prompt}],
+                "temperature": 0
+            }, timeout=10)
+            response.raise_for_status()
+            response_text = response.json()['choices'][0]['message']['content']
+            return parse_response_to_dict(response_text)
+
+        tasks = [
+            send_request(templates['chunks'].format(
+                question=user_prompt, context=chunk, chunk_id=i + 1,
+                format_instructions=format_instructions
+            ))
+            for i, chunk in enumerate(chunks)
+        ]
+        return await asyncio.gather(*tasks)
 
     def execute(self, state: dict) -> dict:
         """
         Generates an answer by constructing a prompt from the user's input and the scraped
         content, querying the language model, and parsing its response.
 
         Args:
             state (dict): The current state of the graph. The input keys will be used
                             to fetch the correct data from the state.
 
         Returns:
             dict: The updated state with the output key containing the generated answer.
 
         Raises:
             KeyError: If the input keys are not found in the state, indicating
                       that the necessary information for generating an answer is missing.
+            requests.HTTPError: If the direct OpenAI request returns an error status.
+            ValueError: If the OpenAI response text cannot be parsed as JSON.
         """
 
         self.logger.info(f"--- Executing {self.node_name} Node ---")
 
-        # Interpret input keys based on the provided input expression
         input_keys = self.get_input_keys(state)
-        # Fetching data from the state based on the input keys
-        input_data = [state[key] for key in input_keys]
-        user_prompt = input_data[0]
-        doc = input_data[1]
-
-        # Initialize the output parser
-        if self.node_config.get("schema", None) is not None:
-            output_parser = JsonOutputParser(pydantic_object=self.node_config["schema"])
-        else:
-            output_parser = JsonOutputParser()
+        user_prompt, doc = [state[key] for key in input_keys][:2]
 
+        schema = self.node_config.get("schema")
+        output_parser = JsonOutputParser(pydantic_object=schema) if schema else JsonOutputParser()
         format_instructions = output_parser.get_format_instructions()
 
-        if  isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator or self.is_md_scraper:
-            template_no_chunks_prompt  = TEMPLATE_NO_CHUNKS_MD
-            template_chunks_prompt  = TEMPLATE_CHUNKS_MD
-            template_merge_prompt  = TEMPLATE_MERGE_MD
-        else:
-            template_no_chunks_prompt  = TEMPLATE_NO_CHUNKS
-            template_chunks_prompt  = TEMPLATE_CHUNKS
-            template_merge_prompt  = TEMPLATE_MERGE
-
-        if self.additional_info is not None:
-            template_no_chunks_prompt  = self.additional_info + template_no_chunks_prompt
-            template_chunks_prompt  = self.additional_info + template_chunks_prompt
-            template_merge_prompt  = self.additional_info + template_merge_prompt 
-
-        if len(doc) == 1:
-            prompt = PromptTemplate(
-                template=template_no_chunks_prompt ,
-                input_variables=["question"],
-                partial_variables={"context": doc,
-                                    "format_instructions": format_instructions})
-            chain =  prompt | self.llm_model | output_parser
-            answer = chain.invoke({"question": user_prompt})
-
-            state.update({self.output[0]: answer})
-            return state
-
-        chains_dict = {}
-        for i, chunk in enumerate(tqdm(doc, desc="Processing chunks", disable=not self.verbose)):
-
-            prompt = PromptTemplate(
-                template=TEMPLATE_CHUNKS,
-                input_variables=["question"],
-                partial_variables={"context": chunk,
-                                "chunk_id": i + 1,
-                                "format_instructions": format_instructions})
-            chain_name = f"chunk{i+1}"
-            chains_dict[chain_name] = prompt | self.llm_model | output_parser
+        is_openai = isinstance(self.llm_model, ChatOpenAI)
+        use_md = (is_openai and (not self.script_creator or self.force)) or self.is_md_scraper
+        if use_md:
+            templates = {
+                'no_chunks': TEMPLATE_NO_CHUNKS_MD,
+                'chunks': TEMPLATE_CHUNKS_MD,
+                'merge': TEMPLATE_MERGE_MD
+            }
+        else:
+            templates = {
+                'no_chunks': TEMPLATE_NO_CHUNKS,
+                'chunks': TEMPLATE_CHUNKS,
+                'merge': TEMPLATE_MERGE
+            }
+        # additional_info is prefixed for every backend, not only the LangChain one.
+        if self.additional_info:
+            templates = {k: self.additional_info + t for k, t in templates.items()}
+
+        # Raw REST call only for real OpenAI models; others use the LangChain path below.
+        if use_md and is_openai:
+            if len(doc) == 1:
+                prompt = templates['no_chunks'].format(
+                    question=user_prompt,
+                    context=doc[0],
+                    format_instructions=format_instructions
+                )
+            else:
+                chunks_responses = asyncio.run(
+                    self._process_chunks_async(doc, templates, user_prompt, format_instructions)
+                )
+                merge_context = " ".join([json.dumps(chunk) for chunk in chunks_responses])
+                prompt = templates['merge'].format(
+                    question=user_prompt,
+                    context=merge_context,
+                    format_instructions=format_instructions
+                )
+            response = requests.post(
+                url="https://api.openai.com/v1/chat/completions",
+                headers={
+                    "Content-Type": "application/json",
+                    "Authorization": f"Bearer {self.api_key}"
+                },
+                json={
+                    "model": self.llm_model.model_name,
+                    "messages": [{"role": "user", "content": prompt}],
+                    "temperature": 0
+                },
+                timeout=10
+            )
+            response.raise_for_status()
+            response_text = response.json()['choices'][0]['message']['content']
+            state.update({self.output[0]: parse_response_to_dict(response_text)})
+            return state
 
-        async_runner = RunnableParallel(**chains_dict)
-
-        batch_results =  async_runner.invoke({"question": user_prompt})
-
-        merge_prompt = PromptTemplate(
-                template = template_merge_prompt ,
+        else:
+            if len(doc) == 1:
+                prompt = PromptTemplate(
+                    template=templates['no_chunks'],
+                    input_variables=["question"],
+                    partial_variables={"context": doc[0],
+                                       "format_instructions": format_instructions}
+                )
+                chain = prompt | self.llm_model | output_parser
+                answer = chain.invoke({"question": user_prompt})
+                state.update({self.output[0]: answer})
+                return state
+
+            chains_dict = {}
+            for i, chunk in enumerate(tqdm(doc,
+                                           desc="Processing chunks",
+                                           disable=not self.verbose)):
+                prompt = PromptTemplate(
+                    template=templates['chunks'],
+                    input_variables=["question"],
+                    partial_variables={"context": chunk, "chunk_id": i + 1,
+                                       "format_instructions": format_instructions}
+                )
+                chain_name = f"chunk{i+1}"
+                chains_dict[chain_name] = prompt | self.llm_model | output_parser
+
+            async_runner = RunnableParallel(**chains_dict)
+            batch_results = async_runner.invoke({"question": user_prompt})
+
+            merge_prompt = PromptTemplate(
+                template=templates['merge'],
                 input_variables=["context", "question"],
-                partial_variables={"format_instructions": format_instructions},
+                partial_variables={"format_instructions": format_instructions}
             )
+            merge_chain = merge_prompt | self.llm_model | output_parser
+            answer = merge_chain.invoke({"context": batch_results, "question": user_prompt})
 
-        merge_chain = merge_prompt | self.llm_model | output_parser
-        answer = merge_chain.invoke({"context": batch_results, "question": user_prompt})
-
-        state.update({self.output[0]: answer})
-        return state
+            state.update({self.output[0]: answer})
+            return state
diff --git a/scrapegraphai/utils/__init__.py b/scrapegraphai/utils/__init__.py
@@ -1,7 +1,7 @@
 """
     __init__.py file for utils folder
 """
-
+from .response_to_dict import parse_response_to_dict
 from .convert_to_csv import convert_to_csv
 from .convert_to_json import convert_to_json
 from .prettify_exec_info import prettify_exec_info

diff --git a/scrapegraphai/utils/response_to_dict.py b/scrapegraphai/utils/response_to_dict.py
@@ -0,0 +1,37 @@
+"""
+parse_response_to_dict module
+"""
+import re
+import json
+
+def parse_response_to_dict(response_text: str) -> dict:
+    """
+    Parse the response text to a dictionary, handling different formats.
+
+    The payload is taken from a ```json fenced block when one is present, otherwise
+    the whole text is used. It is parsed verbatim first, so valid JSON escapes
+    (quotes, newlines, backslashes inside strings) are preserved; stripping
+    backslashes is only a fallback for double-escaped model output.
+
+    Args:
+        response_text (str): The raw text response from the model.
+
+    Returns:
+        dict: The parsed dictionary.
+
+    Raises:
+        ValueError: If the response cannot be parsed as JSON.
+    """
+    match = re.search(r'```json\s*(.*?)\s*```', response_text, re.DOTALL)
+    json_str = (match.group(1) if match else response_text).strip()
+
+    try:
+        return json.loads(json_str)
+    except json.JSONDecodeError:
+        # Legacy clean-up, kept for responses that arrive with escaped quotes/newlines.
+        cleaned_json_str = json_str.replace('\\n', '').replace('\\', '').strip()
+
+    try:
+        return json.loads(cleaned_json_str)
+    except json.JSONDecodeError as err:
+        raise ValueError("The response could not be parsed into a valid JSON dictionary.") from err