From d00cde60309935e283ba9116cf0b114e53cb9640 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Thu, 23 May 2024 20:03:16 +0200 Subject: [PATCH 1/3] fix(pdf_scraper): fix the pdf scraper graph --- scrapegraphai/graphs/abstract_graph.py | 32 ++++++++++++++--------- scrapegraphai/graphs/pdf_scraper_graph.py | 32 +++++------------------ 2 files changed, 25 insertions(+), 39 deletions(-) diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index 6a0c7a4c..e9ba1213 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -181,6 +181,7 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: try: self.model_token = models_tokens["ollama"][llm_params["model"]] except KeyError as exc: + print("model not found, using default token size (8192)") self.model_token = 8192 else: self.model_token = 8192 @@ -191,16 +192,18 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: elif "hugging_face" in llm_params["model"]: try: self.model_token = models_tokens["hugging_face"][llm_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc + except KeyError: + print("model not found, using default token size (8192)") + self.model_token = 8192 return HuggingFace(llm_params) elif "groq" in llm_params["model"]: llm_params["model"] = llm_params["model"].split("/")[-1] try: self.model_token = models_tokens["groq"][llm_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc + except KeyError: + print("model not found, using default token size (8192)") + self.model_token = 8192 return Groq(llm_params) elif "bedrock" in llm_params["model"]: llm_params["model"] = llm_params["model"].split("/")[-1] @@ -208,8 +211,9 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: client = llm_params.get('client', None) try: self.model_token = models_tokens["bedrock"][llm_params["model"]] - except KeyError as exc: - raise 
KeyError("Model not supported") from exc + except KeyError: + print("model not found, using default token size (8192)") + self.model_token = 8192 return Bedrock({ "client": client, "model_id": model_id, @@ -218,13 +222,18 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: } }) elif "claude-3-" in llm_params["model"]: - self.model_token = models_tokens["claude"]["claude3"] + try: + self.model_token = models_tokens["claude"]["claude3"] + except KeyError: + print("model not found, using default token size (8192)") + self.model_token = 8192 return Anthropic(llm_params) elif "deepseek" in llm_params["model"]: try: self.model_token = models_tokens["deepseek"][llm_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc + except KeyError: + print("model not found, using default token size (8192)") + self.model_token = 8192 return DeepSeek(llm_params) else: raise ValueError( @@ -312,10 +321,7 @@ def _create_embedder(self, embedder_config: dict) -> object: models_tokens["bedrock"][embedder_config["model"]] except KeyError as exc: raise KeyError("Model not supported") from exc - return BedrockEmbeddings(client=client, model_id=embedder_config["model"]) - else: - raise ValueError( - "Model provided by the configuration not supported") + return BedrockEmbeddings(client=client, model_id=embedder_config["model"]) def get_state(self, key=None) -> dict: """"" diff --git a/scrapegraphai/graphs/pdf_scraper_graph.py b/scrapegraphai/graphs/pdf_scraper_graph.py index 86ab2a49..39278ab7 100644 --- a/scrapegraphai/graphs/pdf_scraper_graph.py +++ b/scrapegraphai/graphs/pdf_scraper_graph.py @@ -11,7 +11,7 @@ FetchNode, ParseNode, RAGNode, - GenerateAnswerNode + GenerateAnswerPDFNode ) @@ -48,7 +48,7 @@ class PDFScraperGraph(AbstractGraph): """ def __init__(self, prompt: str, source: str, config: dict, schema: Optional[str] = None): - super().__init__(prompt, config, source, schema) + super().__init__(prompt, config, source) self.input_key = 
"pdf" if source.endswith("pdf") else "pdf_dir" @@ -64,41 +64,21 @@ def _create_graph(self) -> BaseGraph: input='pdf | pdf_dir', output=["doc", "link_urls", "img_urls"], ) - parse_node = ParseNode( - input="doc", - output=["parsed_doc"], - node_config={ - "chunk_size": self.model_token, - } - ) - rag_node = RAGNode( - input="user_prompt & (parsed_doc | doc)", - output=["relevant_chunks"], - node_config={ - "llm_model": self.llm_model, - "embedder_model": self.embedder_model, - } - ) - generate_answer_node = GenerateAnswerNode( + generate_answer_node_pdf = GenerateAnswerPDFNode( input="user_prompt & (relevant_chunks | parsed_doc | doc)", output=["answer"], node_config={ "llm_model": self.llm_model, - "schema": self.schema, } ) return BaseGraph( nodes=[ fetch_node, - parse_node, - rag_node, - generate_answer_node, + generate_answer_node_pdf, ], edges=[ - (fetch_node, parse_node), - (parse_node, rag_node), - (rag_node, generate_answer_node) + (fetch_node, generate_answer_node_pdf) ], entry_point=fetch_node ) @@ -114,4 +94,4 @@ def run(self) -> str: inputs = {"user_prompt": self.prompt, self.input_key: self.source} self.final_state, self.execution_info = self.graph.execute(inputs) - return self.final_state.get("answer", "No answer found.") \ No newline at end of file + return self.final_state.get("answer", "No answer found.") From 5fd7633c63710e3cb4e233b422379972420f6789 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Thu, 23 May 2024 21:09:09 +0200 Subject: [PATCH 2/3] Update pdf_scraper_graph.py --- scrapegraphai/graphs/pdf_scraper_graph.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/scrapegraphai/graphs/pdf_scraper_graph.py b/scrapegraphai/graphs/pdf_scraper_graph.py index 39278ab7..d966b0bc 100644 --- a/scrapegraphai/graphs/pdf_scraper_graph.py +++ b/scrapegraphai/graphs/pdf_scraper_graph.py @@ -9,8 +9,6 @@ from ..nodes import ( FetchNode, - ParseNode, - RAGNode, GenerateAnswerPDFNode ) From 8d5eb0bb0d5d008a63a96df94ce3842320376b8e Mon Sep 17 00:00:00 
2001 From: Marco Perini Date: Sat, 25 May 2024 00:13:47 +0200 Subject: [PATCH 3/3] fix(local_file): fixed textual input pdf, csv, json and xml graph --- examples/openai/pdf_scraper_openai.py | 74 ++++++++++++++++++++++ scrapegraphai/graphs/csv_scraper_graph.py | 18 ++---- scrapegraphai/graphs/json_scraper_graph.py | 12 +--- scrapegraphai/graphs/pdf_scraper_graph.py | 17 ++++- scrapegraphai/graphs/xml_scraper_graph.py | 16 +---- scrapegraphai/nodes/fetch_node.py | 3 +- 6 files changed, 98 insertions(+), 42 deletions(-) create mode 100644 examples/openai/pdf_scraper_openai.py diff --git a/examples/openai/pdf_scraper_openai.py b/examples/openai/pdf_scraper_openai.py new file mode 100644 index 00000000..874c4142 --- /dev/null +++ b/examples/openai/pdf_scraper_openai.py @@ -0,0 +1,74 @@ +""" +Basic example of scraping pipeline using PDFScraper +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import PDFScraperGraph + +load_dotenv() + + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +openai_key = os.getenv("OPENAI_APIKEY") + +graph_config = { + "llm": { + "api_key":openai_key, + "model": "gpt-3.5-turbo", + }, + "verbose": True, + "headless": False, +} + +# Convert to list +sources = [ + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. 
We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", + "The diffusion of social media coincided with a worsening of mental health conditions among adolescents and young adults in the United States, giving rise to speculation that social media might be detrimental to mental health. In this paper, we provide quasi-experimental estimates of the impact of social media on mental health by leveraging a unique natural experiment: the staggered introduction of Facebook across U.S. colleges. Our analysis couples data on student mental health around the years of Facebook's expansion with a generalized difference-in-differences empirical strategy. We find that the roll-out of Facebook at a college increased symptoms of poor mental health, especially depression. We also find that, among students predicted to be most susceptible to mental illness, the introduction of Facebook led to increased utilization of mental healthcare services. Lastly, we find that, after the introduction of Facebook, students were more likely to report experiencing impairments to academic performance resulting from poor mental health. Additional evidence on mechanisms suggests that the results are due to Facebook fostering unfavorable social comparisons.", + "Hollywood films are generally released first in the United States and then later abroad, with some variation in lags across films and countries. With the growth in movie piracy since the appearance of BitTorrent in 2003, films have become available through illegal piracy immediately after release in the US, while they are not available for legal viewing abroad until their foreign premieres in each country. 
We make use of this variation in international release lags to ask whether longer lags – which facilitate more local pre-release piracy – depress theatrical box office receipts, particularly after the widespread adoption of BitTorrent. We find that longer release windows are associated with decreased box office returns, even after controlling for film and country fixed effects. This relationship is much stronger in contexts where piracy is more prevalent: after BitTorrent’s adoption and in heavily-pirated genres. Our findings indicate that, as a lower bound, international box office returns in our sample were at least 7% lower than they would have been in the absence of pre-release piracy. By contrast, we do not see evidence of elevated sales displacement in US box office revenue following the adoption of BitTorrent, and we suggest that delayed legal availability of the content abroad may drive the losses to piracy." + # Add more sources here +] + +prompt = """ +You are an expert in reviewing academic manuscripts. Please analyze the abstracts provided from an academic journal article to extract and clearly identify the following elements: + +Independent Variable (IV): The variable that is manipulated or considered as the primary cause affecting other variables. +Dependent Variable (DV): The variable that is measured or observed, which is expected to change as a result of variations in the Independent Variable. +Exogenous Shock: Identify any external or unexpected events used in the study that serve as a natural experiment or provide a unique setting for observing the effects on the IV and DV. +Response Format: For each abstract, present your response in the following structured format: + +Independent Variable (IV): +Dependent Variable (DV): +Exogenous Shock: + +Example Queries and Responses: + +Query: This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. 
We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather the interaction between call center architecture and outdoor weather conditions in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking. + +Response: + +Independent Variable (IV): Employee happiness. +Dependent Variable (DV): Overall firm productivity. +Exogenous Shock: Sudden company-wide increase in bonus payments. + +Query: The diffusion of social media coincided with a worsening of mental health conditions among adolescents and young adults in the United States, giving rise to speculation that social media might be detrimental to mental health. In this paper, we provide quasi-experimental estimates of the impact of social media on mental health by leveraging a unique natural experiment: the staggered introduction of Facebook across U.S. colleges. Our analysis couples data on student mental health around the years of Facebook's expansion with a generalized difference-in-differences empirical strategy. We find that the roll-out of Facebook at a college increased symptoms of poor mental health, especially depression. We also find that, among students predicted to be most susceptible to mental illness, the introduction of Facebook led to increased utilization of mental healthcare services. 
Lastly, we find that, after the introduction of Facebook, students were more likely to report experiencing impairments to academic performance resulting from poor mental health. Additional evidence on mechanisms suggests that the results are due to Facebook fostering unfavorable social comparisons. + +Response: + +Independent Variable (IV): Exposure to social media. +Dependent Variable (DV): Mental health outcomes. +Exogenous Shock: staggered introduction of Facebook across U.S. colleges. +""" + +pdf_scraper_graph = PDFScraperGraph( + prompt=prompt, + source=sources[0], + config=graph_config +) +result = pdf_scraper_graph.run() + + +print(result) diff --git a/scrapegraphai/graphs/csv_scraper_graph.py b/scrapegraphai/graphs/csv_scraper_graph.py index 6ae8cbcb..df9d5676 100644 --- a/scrapegraphai/graphs/csv_scraper_graph.py +++ b/scrapegraphai/graphs/csv_scraper_graph.py @@ -9,7 +9,6 @@ from ..nodes import ( FetchNode, - ParseNode, RAGNode, GenerateAnswerCSVNode ) @@ -35,17 +34,10 @@ def _create_graph(self): """ fetch_node = FetchNode( input="csv | csv_dir", - output=["doc", "link_urls", "img_urls"], - ) - parse_node = ParseNode( - input="doc", - output=["parsed_doc"], - node_config={ - "chunk_size": self.model_token, - } + output=["doc"], ) rag_node = RAGNode( - input="user_prompt & (parsed_doc | doc)", + input="user_prompt & doc", output=["relevant_chunks"], node_config={ "llm_model": self.llm_model, @@ -53,7 +45,7 @@ def _create_graph(self): } ) generate_answer_node = GenerateAnswerCSVNode( - input="user_prompt & (relevant_chunks | parsed_doc | doc)", + input="user_prompt & (relevant_chunks | doc)", output=["answer"], node_config={ "llm_model": self.llm_model, @@ -64,13 +56,11 @@ def _create_graph(self): return BaseGraph( nodes=[ fetch_node, - parse_node, rag_node, generate_answer_node, ], edges=[ - (fetch_node, parse_node), - (parse_node, rag_node), + (fetch_node, rag_node), (rag_node, generate_answer_node) ], entry_point=fetch_node diff --git 
a/scrapegraphai/graphs/json_scraper_graph.py b/scrapegraphai/graphs/json_scraper_graph.py index 5b263f70..57527f47 100644 --- a/scrapegraphai/graphs/json_scraper_graph.py +++ b/scrapegraphai/graphs/json_scraper_graph.py @@ -9,7 +9,6 @@ from ..nodes import ( FetchNode, - ParseNode, RAGNode, GenerateAnswerNode ) @@ -62,13 +61,6 @@ def _create_graph(self) -> BaseGraph: input="json | json_dir", output=["doc", "link_urls", "img_urls"], ) - parse_node = ParseNode( - input="doc", - output=["parsed_doc"], - node_config={ - "chunk_size": self.model_token - } - ) rag_node = RAGNode( input="user_prompt & (parsed_doc | doc)", output=["relevant_chunks"], @@ -89,13 +81,11 @@ def _create_graph(self) -> BaseGraph: return BaseGraph( nodes=[ fetch_node, - parse_node, rag_node, generate_answer_node, ], edges=[ - (fetch_node, parse_node), - (parse_node, rag_node), + (fetch_node, rag_node), (rag_node, generate_answer_node) ], entry_point=fetch_node diff --git a/scrapegraphai/graphs/pdf_scraper_graph.py b/scrapegraphai/graphs/pdf_scraper_graph.py index d966b0bc..976b5f9b 100644 --- a/scrapegraphai/graphs/pdf_scraper_graph.py +++ b/scrapegraphai/graphs/pdf_scraper_graph.py @@ -9,6 +9,7 @@ from ..nodes import ( FetchNode, + RAGNode, GenerateAnswerPDFNode ) @@ -60,10 +61,18 @@ def _create_graph(self) -> BaseGraph: fetch_node = FetchNode( input='pdf | pdf_dir', - output=["doc", "link_urls", "img_urls"], + output=["doc"], + ) + rag_node = RAGNode( + input="user_prompt & doc", + output=["relevant_chunks"], + node_config={ + "llm_model": self.llm_model, + "embedder_model": self.embedder_model + } ) generate_answer_node_pdf = GenerateAnswerPDFNode( - input="user_prompt & (relevant_chunks | parsed_doc | doc)", + input="user_prompt & (relevant_chunks | doc)", output=["answer"], node_config={ "llm_model": self.llm_model, @@ -73,10 +82,12 @@ def _create_graph(self) -> BaseGraph: return BaseGraph( nodes=[ fetch_node, + rag_node, generate_answer_node_pdf, ], edges=[ - (fetch_node, 
generate_answer_node_pdf) + (fetch_node, rag_node), + (rag_node, generate_answer_node_pdf) ], entry_point=fetch_node ) diff --git a/scrapegraphai/graphs/xml_scraper_graph.py b/scrapegraphai/graphs/xml_scraper_graph.py index 1557ecd4..03d16158 100644 --- a/scrapegraphai/graphs/xml_scraper_graph.py +++ b/scrapegraphai/graphs/xml_scraper_graph.py @@ -9,7 +9,6 @@ from ..nodes import ( FetchNode, - ParseNode, RAGNode, GenerateAnswerNode ) @@ -64,15 +63,8 @@ def _create_graph(self) -> BaseGraph: input="xml | xml_dir", output=["doc", "link_urls", "img_urls"] ) - parse_node = ParseNode( - input="doc", - output=["parsed_doc"], - node_config={ - "chunk_size": self.model_token - } - ) rag_node = RAGNode( - input="user_prompt & (parsed_doc | doc)", + input="user_prompt & doc", output=["relevant_chunks"], node_config={ "llm_model": self.llm_model, @@ -80,7 +72,7 @@ def _create_graph(self) -> BaseGraph: } ) generate_answer_node = GenerateAnswerNode( - input="user_prompt & (relevant_chunks | parsed_doc | doc)", + input="user_prompt & (relevant_chunks | doc)", output=["answer"], node_config={ "llm_model": self.llm_model, @@ -91,13 +83,11 @@ def _create_graph(self) -> BaseGraph: return BaseGraph( nodes=[ fetch_node, - parse_node, rag_node, generate_answer_node, ], edges=[ - (fetch_node, parse_node), - (parse_node, rag_node), + (fetch_node, rag_node), (rag_node, generate_answer_node) ], entry_point=fetch_node diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index 6c9858c9..18907d54 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -89,8 +89,9 @@ def execute(self, state): or input_keys[0] == "pdf_dir" ): compressed_document = [ - Document(page_content=source, metadata={"source": "local_dir"}) + source ] + state.update({self.output[0]: compressed_document}) return state