feat: merge graphs node implementation

VinciGit00 · DiTo97 · VinciGit00 · commit ca5821ac4b38 · 2024-06-20T19:11:47.000+02:00
Co-Authored-By: Federico Minutoli <40361744+DiTo97@users.noreply.github.com>
diff --git a/scrapegraphai/graphs/explore_graph.py b/scrapegraphai/graphs/explore_graph.py
@@ -89,10 +89,10 @@ def _create_graph(self) -> BaseGraph:
                 "schema": self.schema,
             }
         )
-    
+
         search_link_node = SearchLinkNode(
             input="doc",
-            output=[{"link": "description"}],
+            output=[{"relevant_links"}],
             node_config={
                 "llm_model": self.llm_model,
             }
@@ -115,7 +115,7 @@ def _create_graph(self) -> BaseGraph:
             entry_point=fetch_node
         )
 
-    def run(self) -> str:
+    def run(self) -> tuple[str, dict]:
         """
         Executes the scraping process and returns the answer to the prompt.
 
@@ -125,4 +125,5 @@ def run(self) -> str:
         inputs = {"user_prompt": self.prompt, self.input_key: self.source}
         self.final_state, self.execution_info = self.graph.execute(inputs)
 
-        return self.final_state.get("answer", "No answer found.")
+        return (self.final_state.get("answer", "No answer found."),
+                self.final_state.get("relevant_links", dict()))
diff --git a/scrapegraphai/nodes/merge_explore_graphs_node.py b/scrapegraphai/nodes/merge_explore_graphs_node.py
@@ -4,8 +4,8 @@
 
 # Imports from standard library
 from typing import List, Optional
-
-# Imports from Langchain
+from functools import reduce
+import operator
 from langchain.prompts import PromptTemplate
 from langchain_core.output_parsers import JsonOutputParser
 from tqdm import tqdm
@@ -68,20 +68,51 @@ def execute(self, state: dict) -> dict:
 
         self.logger.info(f"--- Executing {self.node_name} Node ---")
 
-        template_answer = ""
+         # merge the answers in one string
+     
+
+        # Initialize the output parser
+        if self.node_config.get("schema", None) is not None:
+            output_parser = JsonOutputParser(pydantic_object=self.node_config["schema"])
+        else:
+            output_parser = JsonOutputParser()
+
+        format_instructions = output_parser.get_format_instructions()
+
+        template_answer = """
+        You are a website scraper and you have just scraped some content from multiple websites.\n
+        You are now asked to provide an answer to a USER PROMPT based on the content you have scraped.\n
+        You need to merge the content from the different websites into a single answer without repetitions (if there are any). \n
+        The scraped contents are in a JSON format and you need to merge them based on the context and providing a correct JSON structure.\n
+        OUTPUT INSTRUCTIONS: {format_instructions}\n
+        USER PROMPT: {user_prompt}\n
+        WEBSITE CONTENT: {website_content}
+        """
+
+        input_keys = self.get_input_keys(state)
+        # Fetching data from the state based on the input keys
+        input_data = [state[key] for key in input_keys]
+
+        user_prompt = input_data[0]
+        #answers is a list of strings
+        answers, relevant_links = zip(*input_data[1])
+
+        answers_str = ""
+        for i, answer in enumerate(answers):
+            answers_str += f"CONTENT WEBSITE {i+1}: {answer}\n"
 
         answers = str(state.get("answer"))
         relevant_links = str(state.get("relevant_links"))
         answer = {}
 
         merge_prompt = PromptTemplate(
                 template=template_answer,
-                 #input_variables=["context", "question"],
-                 #partial_variables={"format_instructions": format_instructions},
+                input_variables=["context", "question"],
+                partial_variables={"format_instructions": format_instructions},
             )
 
-         #answer = merge_prompt.invoke({"question": user_prompt})
+        answer = merge_prompt.invoke({"question": user_prompt})
 
-        state.update({"relevant_links": "TODO"})
-        state.update({"answer": "TODO"})
+        state.update({"relevant_links": reduce(operator.ior, relevant_links, {})})
+        state.update({"answer": answer})
         return state

Original file line number	Diff line number	Diff line change
`@@ -89,10 +89,10 @@ def _create_graph(self) -> BaseGraph:`
`89`	`89`	`"schema": self.schema,`
`90`	`90`	`}`
`91`	`91`	`)`
`92`		`-`
	`92`	`+`
`93`	`93`	`search_link_node = SearchLinkNode(`
`94`	`94`	`input="doc",`
`95`		`- output=[{"link": "description"}],`
	`95`	`+ output=[{"relevant_links"}],`
`96`	`96`	`node_config={`
`97`	`97`	`"llm_model": self.llm_model,`
`98`	`98`	`}`
`@@ -115,7 +115,7 @@ def _create_graph(self) -> BaseGraph:`
`115`	`115`	`entry_point=fetch_node`
`116`	`116`	`)`
`117`	`117`
`118`		`- def run(self) -> str:`
	`118`	`+ def run(self) -> tuple[str, dict]:`
`119`	`119`	`"""`
`120`	`120`	`Executes the scraping process and returns the answer to the prompt.`
`121`	`121`
`@@ -125,4 +125,5 @@ def run(self) -> str:`
`125`	`125`	`inputs = {"user_prompt": self.prompt, self.input_key: self.source}`
`126`	`126`	`self.final_state, self.execution_info = self.graph.execute(inputs)`
`127`	`127`
`128`		`- return self.final_state.get("answer", "No answer found.")`
	`128`	`+ return (self.final_state.get("answer", "No answer found."),`
	`129`	`+ self.final_state.get("relevant_links", dict()))`