From 930f67374752561903462a25728c739946f9449b Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Fri, 31 May 2024 21:03:48 +0200 Subject: [PATCH 1/9] feat: removed rag node --- scrapegraphai/graphs/pdf_scraper_graph.py | 17 ++++------------- scrapegraphai/graphs/smart_scraper_graph.py | 2 +- scrapegraphai/nodes/generate_answer_pdf_node.py | 6 ++---- 3 files changed, 7 insertions(+), 18 deletions(-) diff --git a/scrapegraphai/graphs/pdf_scraper_graph.py b/scrapegraphai/graphs/pdf_scraper_graph.py index 10556213..912f141e 100644 --- a/scrapegraphai/graphs/pdf_scraper_graph.py +++ b/scrapegraphai/graphs/pdf_scraper_graph.py @@ -1,3 +1,4 @@ + """ PDFScraperGraph Module """ @@ -9,7 +10,6 @@ from ..nodes import ( FetchNode, - RAGNode, GenerateAnswerPDFNode ) @@ -63,14 +63,7 @@ def _create_graph(self) -> BaseGraph: input='pdf | pdf_dir', output=["doc"], ) - rag_node = RAGNode( - input="user_prompt & doc", - output=["relevant_chunks"], - node_config={ - "llm_model": self.llm_model, - "embedder_model": self.embedder_model - } - ) + generate_answer_node_pdf = GenerateAnswerPDFNode( input="user_prompt & (relevant_chunks | doc)", output=["answer"], @@ -83,12 +76,10 @@ def _create_graph(self) -> BaseGraph: return BaseGraph( nodes=[ fetch_node, - rag_node, generate_answer_node_pdf, ], edges=[ - (fetch_node, rag_node), - (rag_node, generate_answer_node_pdf) + (fetch_node, generate_answer_node_pdf) ], entry_point=fetch_node ) @@ -104,4 +95,4 @@ def run(self) -> str: inputs = {"user_prompt": self.prompt, self.input_key: self.source} self.final_state, self.execution_info = self.graph.execute(inputs) - return self.final_state.get("answer", "No answer found.") + return self.final_state.get("answer", "No answer found.") \ No newline at end of file diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index ee230695..aadd0887 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ 
-117,4 +117,4 @@ def run(self) -> str: inputs = {"user_prompt": self.prompt, self.input_key: self.source} self.final_state, self.execution_info = self.graph.execute(inputs) - return self.final_state.get("answer", "No answer found.") \ No newline at end of file + return self.final_state.get("answer", "No answer found.") diff --git a/scrapegraphai/nodes/generate_answer_pdf_node.py b/scrapegraphai/nodes/generate_answer_pdf_node.py index 3a520745..1f468a55 100644 --- a/scrapegraphai/nodes/generate_answer_pdf_node.py +++ b/scrapegraphai/nodes/generate_answer_pdf_node.py @@ -95,9 +95,7 @@ def execute(self, state): output_parser = JsonOutputParser() format_instructions = output_parser.get_format_instructions() - chains_dict = {} - # Use tqdm to add progress bar for i, chunk in enumerate( tqdm(doc, desc="Processing chunks", disable=not self.verbose) @@ -107,7 +105,7 @@ def execute(self, state): template=template_no_chunks_pdf, input_variables=["question"], partial_variables={ - "context": chunk.page_content, + "context":chunk, "format_instructions": format_instructions, }, ) @@ -116,7 +114,7 @@ def execute(self, state): template=template_chunks_pdf, input_variables=["question"], partial_variables={ - "context": chunk.page_content, + "context":chunk, "chunk_id": i + 1, "format_instructions": format_instructions, }, From f5cbd80c977f51233ac1978d8450fcf0ec2ff461 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Sat, 1 Jun 2024 09:52:21 +0200 Subject: [PATCH 2/9] feat: add pdf scraper multi graph --- .../local_models/pdf_scraper_multi_ollama.py | 69 +++++++++++ scrapegraphai/graphs/__init__.py | 1 + scrapegraphai/graphs/pdf_scraper_multi.py | 117 ++++++++++++++++++ .../nodes/generate_answer_csv_node.py | 2 +- .../nodes/generate_answer_pdf_node.py | 2 +- scrapegraphai/nodes/generate_scraper_node.py | 1 - scrapegraphai/nodes/get_probable_tags_node.py | 2 - scrapegraphai/nodes/robots_node.py | 2 +- 8 files changed, 190 insertions(+), 6 deletions(-) create mode 100644 
examples/local_models/pdf_scraper_multi_ollama.py create mode 100644 scrapegraphai/graphs/pdf_scraper_multi.py diff --git a/examples/local_models/pdf_scraper_multi_ollama.py b/examples/local_models/pdf_scraper_multi_ollama.py new file mode 100644 index 00000000..c7b439bd --- /dev/null +++ b/examples/local_models/pdf_scraper_multi_ollama.py @@ -0,0 +1,69 @@ +""" +Module for showing how PDFScraper multi works +""" +from scrapegraphai.graphs import PdfScraperMultiGraph + +graph_config = { + "llm": { + "model": "ollama/llama3", + "temperature": 0, + "format": "json", # Ollama needs the format to be specified explicitly + "model_tokens": 4000, + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + }, + "verbose": True, + "headless": False, +} + +# Covert to list +sources = [ + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. 
We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. 
We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", +] + +prompt = """ +You are an expert in reviewing academic manuscripts. Please analyze the abstracts provided from an academic journal article to extract and clearly identify the following elements: + +Independent Variable (IV): The variable that is manipulated or considered as the primary cause affecting other variables. +Dependent Variable (DV): The variable that is measured or observed, which is expected to change as a result of variations in the Independent Variable. +Exogenous Shock: Identify any external or unexpected events used in the study that serve as a natural experiment or provide a unique setting for observing the effects on the IV and DV. +Response Format: For each abstract, present your response in the following structured format: + +Independent Variable (IV): +Dependent Variable (DV): +Exogenous Shock: + +Example Queries and Responses: + +Query: This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. 
We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather the interaction between call center architecture and outdoor weather conditions in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking. + +Response: + +Independent Variable (IV): Employee happiness. +Dependent Variable (DV): Overall firm productivity. +Exogenous Shock: Sudden company-wide increase in bonus payments. + +Query: The diffusion of social media coincided with a worsening of mental health conditions among adolescents and young adults in the United States, giving rise to speculation that social media might be detrimental to mental health. In this paper, we provide quasi-experimental estimates of the impact of social media on mental health by leveraging a unique natural experiment: the staggered introduction of Facebook across U.S. colleges. Our analysis couples data on student mental health around the years of Facebook's expansion with a generalized difference-in-differences empirical strategy. We find that the roll-out of Facebook at a college increased symptoms of poor mental health, especially depression. We also find that, among students predicted to be most susceptible to mental illness, the introduction of Facebook led to increased utilization of mental healthcare services. 
Lastly, we find that, after the introduction of Facebook, students were more likely to report experiencing impairments to academic performance resulting from poor mental health. Additional evidence on mechanisms suggests that the results are due to Facebook fostering unfavorable social comparisons. + +Response: + +Independent Variable (IV): Exposure to social media. +Dependent Variable (DV): Mental health outcomes. +Exogenous Shock: staggered introduction of Facebook across U.S. colleges. +""" +results = [] +for source in sources: + pdf_scraper_graph = PdfScraperMultiGraph( + prompt=prompt, + source=source, + config=graph_config + ) + result = pdf_scraper_graph.run() + results.append(result) + +print(results) diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py index 994b2e3a..b572905e 100644 --- a/scrapegraphai/graphs/__init__.py +++ b/scrapegraphai/graphs/__init__.py @@ -16,3 +16,4 @@ from .omni_scraper_graph import OmniScraperGraph from .omni_search_graph import OmniSearchGraph from .smart_scraper_multi_graph import SmartScraperMultiGraph +from .pdf_scraper_multi import PdfScraperMultiGraph diff --git a/scrapegraphai/graphs/pdf_scraper_multi.py b/scrapegraphai/graphs/pdf_scraper_multi.py new file mode 100644 index 00000000..125d70a0 --- /dev/null +++ b/scrapegraphai/graphs/pdf_scraper_multi.py @@ -0,0 +1,117 @@ +""" +PdfScraperMultiGraph Module +""" + +from copy import copy, deepcopy +from typing import List, Optional + +from .base_graph import BaseGraph +from .abstract_graph import AbstractGraph +from .pdf_scraper_graph import PDFScraperGraph + +from ..nodes import ( + GraphIteratorNode, + MergeAnswersNode +) + + +class PdfScraperMultiGraph(AbstractGraph): + """ + PdfScraperMultiGraph is a scraping pipeline that scrapes a + list of URLs and generates answers to a given prompt. + It only requires a user prompt and a list of URLs. + + Attributes: + prompt (str): The user prompt to search the internet. 
+ llm_model (dict): The configuration for the language model. + embedder_model (dict): The configuration for the embedder model. + headless (bool): A flag to run the browser in headless mode. + verbose (bool): A flag to display the execution information. + model_token (int): The token limit for the language model. + + Args: + prompt (str): The user prompt to search the internet. + source (List[str]): The source of the graph. + config (dict): Configuration parameters for the graph. + schema (Optional[str]): The schema for the graph output. + + Example: + >>> search_graph = MultipleSearchGraph( + ... "What is Chioggia famous for?", + ... {"llm": {"model": "gpt-3.5-turbo"}} + ... ) + >>> result = search_graph.run() + """ + + def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[str] = None): + + self.max_results = config.get("max_results", 3) + + if all(isinstance(value, str) for value in config.values()): + self.copy_config = copy(config) + else: + self.copy_config = deepcopy(config) + + super().__init__(prompt, config, source, schema) + + def _create_graph(self) -> BaseGraph: + """ + Creates the graph of nodes representing the workflow for web scraping and searching. + + Returns: + BaseGraph: A graph instance representing the web scraping and searching workflow. 
+ """ + + # ************************************************ + # Create a PDFScraperGraph instance + # ************************************************ + + pdf_scraper_instance = PDFScraperGraph( + prompt="", + source="", + config=self.copy_config, + ) + + # ************************************************ + # Define the graph nodes + # ************************************************ + + graph_iterator_node = GraphIteratorNode( + input="user_prompt & pdfs", + output=["results"], + node_config={ + "graph_instance": pdf_scraper_instance, + } + ) + + merge_answers_node = MergeAnswersNode( + input="user_prompt & results", + output=["answer"], + node_config={ + "llm_model": self.llm_model, + "schema": self.schema + } + ) + + return BaseGraph( + nodes=[ + graph_iterator_node, + merge_answers_node, + ], + edges=[ + (graph_iterator_node, merge_answers_node), + ], + entry_point=graph_iterator_node + ) + + def run(self) -> str: + """ + Executes the web scraping and searching process. + + Returns: + str: The answer to the prompt. + """ + inputs = {"user_prompt": self.prompt, "pdfs": self.source} + self.final_state, self.execution_info = self.graph.execute(inputs) + + return self.final_state.get("answer", "No answer found.") diff --git a/scrapegraphai/nodes/generate_answer_csv_node.py b/scrapegraphai/nodes/generate_answer_csv_node.py index e12c64f9..a7f8f13b 100644 --- a/scrapegraphai/nodes/generate_answer_csv_node.py +++ b/scrapegraphai/nodes/generate_answer_csv_node.py @@ -49,7 +49,7 @@ def __init__( input: str, output: List[str], node_config: Optional[dict] = None, - node_name: str = "GenerateAnswer", + node_name: str = "GenerateAnswerCSV", ): """ Initializes the GenerateAnswerNodeCsv with a language model client and a node name. 
diff --git a/scrapegraphai/nodes/generate_answer_pdf_node.py b/scrapegraphai/nodes/generate_answer_pdf_node.py index 1f468a55..475fd4f7 100644 --- a/scrapegraphai/nodes/generate_answer_pdf_node.py +++ b/scrapegraphai/nodes/generate_answer_pdf_node.py @@ -48,7 +48,7 @@ def __init__( input: str, output: List[str], node_config: Optional[dict] = None, - node_name: str = "GenerateAnswer", + node_name: str = "GenerateAnswerPDF", ): """ Initializes the GenerateAnswerNodePDF with a language model client and a node name. diff --git a/scrapegraphai/nodes/generate_scraper_node.py b/scrapegraphai/nodes/generate_scraper_node.py index 8c272533..a4d74792 100644 --- a/scrapegraphai/nodes/generate_scraper_node.py +++ b/scrapegraphai/nodes/generate_scraper_node.py @@ -10,7 +10,6 @@ from langchain_core.output_parsers import StrOutputParser from langchain_core.runnables import RunnableParallel from tqdm import tqdm - from ..utils.logging import get_logger # Imports from the library diff --git a/scrapegraphai/nodes/get_probable_tags_node.py b/scrapegraphai/nodes/get_probable_tags_node.py index a26ded38..f31633c0 100644 --- a/scrapegraphai/nodes/get_probable_tags_node.py +++ b/scrapegraphai/nodes/get_probable_tags_node.py @@ -3,10 +3,8 @@ """ from typing import List, Optional - from langchain.output_parsers import CommaSeparatedListOutputParser from langchain.prompts import PromptTemplate - from ..utils.logging import get_logger from .base_node import BaseNode diff --git a/scrapegraphai/nodes/robots_node.py b/scrapegraphai/nodes/robots_node.py index 2ed7755f..d77c7a08 100644 --- a/scrapegraphai/nodes/robots_node.py +++ b/scrapegraphai/nodes/robots_node.py @@ -47,7 +47,7 @@ def __init__( input: str, output: List[str], node_config: Optional[dict] = None, - node_name: str = "Robots", + node_name: str = "RobotNode", ): super().__init__(node_name, "node", input, output, 1) From 4d42d7bfc65e36620d6af91ea19c0e8bc52673d7 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Sat, 1 Jun 2024 
11:20:24 +0200 Subject: [PATCH 3/9] add example --- .../local_models/json_scraper_multi_ollama.py | 47 +++++++ .../local_models/pdf_scraper_multi_ollama.py | 23 ++-- examples/openai/smart_scraper_multi_openai.py | 3 +- scrapegraphai/graphs/__init__.py | 1 + scrapegraphai/graphs/json_scraper_multi.py | 116 ++++++++++++++++++ 5 files changed, 179 insertions(+), 11 deletions(-) create mode 100644 examples/local_models/json_scraper_multi_ollama.py create mode 100644 scrapegraphai/graphs/json_scraper_multi.py diff --git a/examples/local_models/json_scraper_multi_ollama.py b/examples/local_models/json_scraper_multi_ollama.py new file mode 100644 index 00000000..d3540301 --- /dev/null +++ b/examples/local_models/json_scraper_multi_ollama.py @@ -0,0 +1,47 @@ +""" +Module for showing how PDFScraper multi works +""" +import os +from scrapegraphai.graphs import PdfScraperMultiGraph + +graph_config = { + "llm": { + "model": "ollama/llama3", + "temperature": 0, + "format": "json", # Ollama needs the format to be specified explicitly + "model_tokens": 4000, + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + }, + "verbose": True, + "headless": False, +} +FILE_NAME = "inputs/example.json" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + + +json_scraper_graph = JSONScraperGraph( + prompt="List me all the authors, title and genres of the books", + source=text, # Pass the content of the file, not the file object + config=graph_config +) + + + +results = [] +for source in sources: + pdf_scraper_graph = PdfScraperMultiGraph( + prompt=prompt, + source=source, + config=graph_config + ) + result = pdf_scraper_graph.run() + results.append(result) + +print(results) diff --git a/examples/local_models/pdf_scraper_multi_ollama.py b/examples/local_models/pdf_scraper_multi_ollama.py index c7b439bd..77565918 100644 --- 
a/examples/local_models/pdf_scraper_multi_ollama.py +++ b/examples/local_models/pdf_scraper_multi_ollama.py @@ -1,6 +1,7 @@ """ Module for showing how PDFScraper multi works """ +import json from scrapegraphai.graphs import PdfScraperMultiGraph graph_config = { @@ -56,14 +57,16 @@ Dependent Variable (DV): Mental health outcomes. Exogenous Shock: staggered introduction of Facebook across U.S. colleges. """ -results = [] -for source in sources: - pdf_scraper_graph = PdfScraperMultiGraph( - prompt=prompt, - source=source, - config=graph_config - ) - result = pdf_scraper_graph.run() - results.append(result) +# ******************************************************* +# Create the SmartScraperMultiGraph instance and run it +# ******************************************************* -print(results) +multiple_search_graph = PdfScraperMultiGraph( + prompt=prompt, + source= sources, + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/openai/smart_scraper_multi_openai.py b/examples/openai/smart_scraper_multi_openai.py index ddfc6239..504e00a8 100644 --- a/examples/openai/smart_scraper_multi_openai.py +++ b/examples/openai/smart_scraper_multi_openai.py @@ -2,7 +2,8 @@ Basic example of scraping pipeline using SmartScraper """ -import os, json +import os +import json from dotenv import load_dotenv from scrapegraphai.graphs import SmartScraperMultiGraph diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py index b572905e..b70686a7 100644 --- a/scrapegraphai/graphs/__init__.py +++ b/scrapegraphai/graphs/__init__.py @@ -17,3 +17,4 @@ from .omni_search_graph import OmniSearchGraph from .smart_scraper_multi_graph import SmartScraperMultiGraph from .pdf_scraper_multi import PdfScraperMultiGraph +from .json_scraper_multi import JsonScraperMultiGraph diff --git a/scrapegraphai/graphs/json_scraper_multi.py b/scrapegraphai/graphs/json_scraper_multi.py new file mode 100644 index 
00000000..c7632d79 --- /dev/null +++ b/scrapegraphai/graphs/json_scraper_multi.py @@ -0,0 +1,116 @@ +""" +JsonScraperMultiGraph Module +""" + +from copy import copy, deepcopy +from typing import List, Optional + +from .base_graph import BaseGraph +from .abstract_graph import AbstractGraph +from .json_scraper_graph import JSONScraperGraph + +from ..nodes import ( + GraphIteratorNode, + MergeAnswersNode +) + + +class JsonScraperMultiGraph(AbstractGraph): + """ + JsonScraperMultiGraph is a scraping pipeline that scrapes a list of URLs and generates answers to a given prompt. + It only requires a user prompt and a list of URLs. + + Attributes: + prompt (str): The user prompt to search the internet. + llm_model (dict): The configuration for the language model. + embedder_model (dict): The configuration for the embedder model. + headless (bool): A flag to run the browser in headless mode. + verbose (bool): A flag to display the execution information. + model_token (int): The token limit for the language model. + + Args: + prompt (str): The user prompt to search the internet. + source (List[str]): The source of the graph. + config (dict): Configuration parameters for the graph. + schema (Optional[str]): The schema for the graph output. + + Example: + >>> search_graph = MultipleSearchGraph( + ... "What is Chioggia famous for?", + ... {"llm": {"model": "gpt-3.5-turbo"}} + ... ) + >>> result = search_graph.run() + """ + + def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[str] = None): + + self.max_results = config.get("max_results", 3) + + if all(isinstance(value, str) for value in config.values()): + self.copy_config = copy(config) + else: + self.copy_config = deepcopy(config) + + super().__init__(prompt, config, source, schema) + + def _create_graph(self) -> BaseGraph: + """ + Creates the graph of nodes representing the workflow for web scraping and searching. 
+ + Returns: + BaseGraph: A graph instance representing the web scraping and searching workflow. + """ + + # ************************************************ + # Create a SmartScraperGraph instance + # ************************************************ + + smart_scraper_instance = JSONScraperGraph( + prompt="", + source="", + config=self.copy_config, + ) + + # ************************************************ + # Define the graph nodes + # ************************************************ + + graph_iterator_node = GraphIteratorNode( + input="user_prompt & jsons", + output=["results"], + node_config={ + "graph_instance": smart_scraper_instance, + } + ) + + merge_answers_node = MergeAnswersNode( + input="user_prompt & results", + output=["answer"], + node_config={ + "llm_model": self.llm_model, + "schema": self.schema + } + ) + + return BaseGraph( + nodes=[ + graph_iterator_node, + merge_answers_node, + ], + edges=[ + (graph_iterator_node, merge_answers_node), + ], + entry_point=graph_iterator_node + ) + + def run(self) -> str: + """ + Executes the web scraping and searching process. + + Returns: + str: The answer to the prompt. 
+ """ + inputs = {"user_prompt": self.prompt, "jsons": self.source} + self.final_state, self.execution_info = self.graph.execute(inputs) + + return self.final_state.get("answer", "No answer found.") From 5bda918a39e4b50d86d784b4c592cc2ea1a68986 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Sat, 1 Jun 2024 12:04:19 +0200 Subject: [PATCH 4/9] feat: add json multiscraper --- .../local_models/json_scraper_multi_ollama.py | 28 +++++++------------ .../local_models/pdf_scraper_multi_ollama.py | 1 - scrapegraphai/graphs/__init__.py | 2 +- scrapegraphai/graphs/json_scraper_multi.py | 6 ++-- scrapegraphai/nodes/__init__.py | 2 +- 5 files changed, 15 insertions(+), 24 deletions(-) diff --git a/examples/local_models/json_scraper_multi_ollama.py b/examples/local_models/json_scraper_multi_ollama.py index d3540301..2754425c 100644 --- a/examples/local_models/json_scraper_multi_ollama.py +++ b/examples/local_models/json_scraper_multi_ollama.py @@ -2,7 +2,8 @@ Module for showing how PDFScraper multi works """ import os -from scrapegraphai.graphs import PdfScraperMultiGraph +import json +from scrapegraphai.graphs import JSONScraperMultiGraph graph_config = { "llm": { @@ -25,23 +26,14 @@ with open(file_path, 'r', encoding="utf-8") as file: text = file.read() - -json_scraper_graph = JSONScraperGraph( - prompt="List me all the authors, title and genres of the books", - source=text, # Pass the content of the file, not the file object +sources = [text, text] + +multiple_search_graph = JSONScraperMultiGraph( + prompt= "List me all the authors, title and genres of the books", + source= sources, + schema=None, config=graph_config ) - - -results = [] -for source in sources: - pdf_scraper_graph = PdfScraperMultiGraph( - prompt=prompt, - source=source, - config=graph_config - ) - result = pdf_scraper_graph.run() - results.append(result) - -print(results) +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git 
a/examples/local_models/pdf_scraper_multi_ollama.py b/examples/local_models/pdf_scraper_multi_ollama.py index 77565918..c0b65a63 100644 --- a/examples/local_models/pdf_scraper_multi_ollama.py +++ b/examples/local_models/pdf_scraper_multi_ollama.py @@ -16,7 +16,6 @@ "temperature": 0, }, "verbose": True, - "headless": False, } # Covert to list diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py index b70686a7..37814cd1 100644 --- a/scrapegraphai/graphs/__init__.py +++ b/scrapegraphai/graphs/__init__.py @@ -17,4 +17,4 @@ from .omni_search_graph import OmniSearchGraph from .smart_scraper_multi_graph import SmartScraperMultiGraph from .pdf_scraper_multi import PdfScraperMultiGraph -from .json_scraper_multi import JsonScraperMultiGraph +from .json_scraper_multi import JSONScraperMultiGraph diff --git a/scrapegraphai/graphs/json_scraper_multi.py b/scrapegraphai/graphs/json_scraper_multi.py index c7632d79..2010c856 100644 --- a/scrapegraphai/graphs/json_scraper_multi.py +++ b/scrapegraphai/graphs/json_scraper_multi.py @@ -1,5 +1,5 @@ """ -JsonScraperMultiGraph Module +JSONScraperMultiGraph Module """ from copy import copy, deepcopy @@ -15,9 +15,9 @@ ) -class JsonScraperMultiGraph(AbstractGraph): +class JSONScraperMultiGraph(AbstractGraph): """ - JsonScraperMultiGraph is a scraping pipeline that scrapes a list of URLs and generates answers to a given prompt. + JSONScraperMultiGraph is a scraping pipeline that scrapes a list of URLs and generates answers to a given prompt. It only requires a user prompt and a list of URLs. 
Attributes: diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py index 4577ee86..5c54937c 100644 --- a/scrapegraphai/nodes/__init__.py +++ b/scrapegraphai/nodes/__init__.py @@ -19,4 +19,4 @@ from .generate_answer_pdf_node import GenerateAnswerPDFNode from .graph_iterator_node import GraphIteratorNode from .merge_answers_node import MergeAnswersNode -from .generate_answer_omni_node import GenerateAnswerOmniNode \ No newline at end of file +from .generate_answer_omni_node import GenerateAnswerOmniNode From fff1232b8a51055b9b4b587a283d1710ef66b77f Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Sat, 1 Jun 2024 13:06:20 +0200 Subject: [PATCH 5/9] add rag node --- scrapegraphai/graphs/pdf_scraper_graph.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/scrapegraphai/graphs/pdf_scraper_graph.py b/scrapegraphai/graphs/pdf_scraper_graph.py index 912f141e..6afa13de 100644 --- a/scrapegraphai/graphs/pdf_scraper_graph.py +++ b/scrapegraphai/graphs/pdf_scraper_graph.py @@ -10,6 +10,7 @@ from ..nodes import ( FetchNode, + RAGNode, GenerateAnswerPDFNode ) @@ -63,7 +64,15 @@ def _create_graph(self) -> BaseGraph: input='pdf | pdf_dir', output=["doc"], ) - + + rag_node = RAGNode( + input="user_prompt & (parsed_doc | doc)", + output=["relevant_chunks"], + node_config={ + "llm_model": self.llm_model, + "embedder_model": self.embedder_model + } + ) generate_answer_node_pdf = GenerateAnswerPDFNode( input="user_prompt & (relevant_chunks | doc)", output=["answer"], @@ -76,10 +85,12 @@ def _create_graph(self) -> BaseGraph: return BaseGraph( nodes=[ fetch_node, + rag_node, generate_answer_node_pdf, ], edges=[ - (fetch_node, generate_answer_node_pdf) + (fetch_node, rag_node), + (rag_node, generate_answer_node_pdf) ], entry_point=fetch_node ) @@ -95,4 +106,4 @@ def run(self) -> str: inputs = {"user_prompt": self.prompt, self.input_key: self.source} self.final_state, self.execution_info = self.graph.execute(inputs) - return 
self.final_state.get("answer", "No answer found.") \ No newline at end of file + return self.final_state.get("answer", "No answer found.") From 1fe49753b9e64cecd5c91df9770b78dd4759dd50 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Sat, 1 Jun 2024 13:46:15 +0200 Subject: [PATCH 6/9] add openai and oneapi examples --- .../local_models/json_scraper_multi_ollama.py | 2 +- examples/oneapi/json_scraper_multi_oneapi..py | 32 +++++++++ examples/oneapi/json_scraper_oneapi.py | 4 -- examples/oneapi/pdf_scraper_multi_oneapi.py | 70 +++++++++++++++++++ examples/openai/deep_scraper_openai.py | 1 - examples/openai/json_scraper_multi_openai.py | 37 ++++++++++ examples/openai/pdf_scraper_multi_openai.py | 70 +++++++++++++++++++ 7 files changed, 210 insertions(+), 6 deletions(-) create mode 100644 examples/oneapi/json_scraper_multi_oneapi..py create mode 100644 examples/oneapi/pdf_scraper_multi_oneapi.py create mode 100644 examples/openai/json_scraper_multi_openai.py create mode 100644 examples/openai/pdf_scraper_multi_openai.py diff --git a/examples/local_models/json_scraper_multi_ollama.py b/examples/local_models/json_scraper_multi_ollama.py index 2754425c..91f4fab4 100644 --- a/examples/local_models/json_scraper_multi_ollama.py +++ b/examples/local_models/json_scraper_multi_ollama.py @@ -1,7 +1,7 @@ """ Module for showing how PDFScraper multi works """ -import os +import os import json from scrapegraphai.graphs import JSONScraperMultiGraph diff --git a/examples/oneapi/json_scraper_multi_oneapi..py b/examples/oneapi/json_scraper_multi_oneapi..py new file mode 100644 index 00000000..5dc365aa --- /dev/null +++ b/examples/oneapi/json_scraper_multi_oneapi..py @@ -0,0 +1,32 @@ +""" +Module for showing how PDFScraper multi works +""" +import os +import json +from scrapegraphai.graphs import JSONScraperMultiGraph + +graph_config = { + "llm": { + "api_key": "***************************", + "model": "oneapi/qwen-turbo", + "base_url": "http://127.0.0.1:3000/v1", # 设置 OneAPI URL + 
} +} +FILE_NAME = "inputs/example.json" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +sources = [text, text] + +multiple_search_graph = JSONScraperMultiGraph( + prompt= "List me all the authors, title and genres of the books", + source= sources, + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/oneapi/json_scraper_oneapi.py b/examples/oneapi/json_scraper_oneapi.py index 5f182594..87c7ea3c 100644 --- a/examples/oneapi/json_scraper_oneapi.py +++ b/examples/oneapi/json_scraper_oneapi.py @@ -3,10 +3,8 @@ """ import os -from dotenv import load_dotenv from scrapegraphai.graphs import JSONScraperGraph from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info -load_dotenv() # ************************************************ # Read the JSON file @@ -23,8 +21,6 @@ # Define the configuration for the graph # ************************************************ -openai_key = os.getenv("OPENAI_APIKEY") - graph_config = { "llm": { "api_key": "***************************", diff --git a/examples/oneapi/pdf_scraper_multi_oneapi.py b/examples/oneapi/pdf_scraper_multi_oneapi.py new file mode 100644 index 00000000..8b6c57a1 --- /dev/null +++ b/examples/oneapi/pdf_scraper_multi_oneapi.py @@ -0,0 +1,70 @@ +""" +Module for showing how PDFScraper multi works +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import PdfScraperMultiGraph + +load_dotenv() + +openai_key = os.getenv("OPENAI_APIKEY") + +graph_config = { + "llm": { + "api_key": openai_key, + "model": "gpt-3.5-turbo", + }, +} + +# Covert to list +sources = [ + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. 
We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. 
We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", +] + +prompt = """ +You are an expert in reviewing academic manuscripts. 
Please analyze the abstracts provided from an academic journal article to extract and clearly identify the following elements: + +Independent Variable (IV): The variable that is manipulated or considered as the primary cause affecting other variables. +Dependent Variable (DV): The variable that is measured or observed, which is expected to change as a result of variations in the Independent Variable. +Exogenous Shock: Identify any external or unexpected events used in the study that serve as a natural experiment or provide a unique setting for observing the effects on the IV and DV. +Response Format: For each abstract, present your response in the following structured format: + +Independent Variable (IV): +Dependent Variable (DV): +Exogenous Shock: + +Example Queries and Responses: + +Query: This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather the interaction between call center architecture and outdoor weather conditions in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking. + +Response: + +Independent Variable (IV): Employee happiness. +Dependent Variable (DV): Overall firm productivity. +Exogenous Shock: Sudden company-wide increase in bonus payments. 
+ +Query: The diffusion of social media coincided with a worsening of mental health conditions among adolescents and young adults in the United States, giving rise to speculation that social media might be detrimental to mental health. In this paper, we provide quasi-experimental estimates of the impact of social media on mental health by leveraging a unique natural experiment: the staggered introduction of Facebook across U.S. colleges. Our analysis couples data on student mental health around the years of Facebook's expansion with a generalized difference-in-differences empirical strategy. We find that the roll-out of Facebook at a college increased symptoms of poor mental health, especially depression. We also find that, among students predicted to be most susceptible to mental illness, the introduction of Facebook led to increased utilization of mental healthcare services. Lastly, we find that, after the introduction of Facebook, students were more likely to report experiencing impairments to academic performance resulting from poor mental health. Additional evidence on mechanisms suggests that the results are due to Facebook fostering unfavorable social comparisons. + +Response: + +Independent Variable (IV): Exposure to social media. +Dependent Variable (DV): Mental health outcomes. +Exogenous Shock: staggered introduction of Facebook across U.S. colleges. 
+""" +# ******************************************************* +# Create the SmartScraperMultiGraph instance and run it +# ******************************************************* + +multiple_search_graph = PdfScraperMultiGraph( + prompt=prompt, + source= sources, + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/openai/deep_scraper_openai.py b/examples/openai/deep_scraper_openai.py index 6a2e1347..4860a31f 100644 --- a/examples/openai/deep_scraper_openai.py +++ b/examples/openai/deep_scraper_openai.py @@ -9,7 +9,6 @@ load_dotenv() - # ************************************************ # Define the configuration for the graph # ************************************************ diff --git a/examples/openai/json_scraper_multi_openai.py b/examples/openai/json_scraper_multi_openai.py new file mode 100644 index 00000000..5f3d9fc2 --- /dev/null +++ b/examples/openai/json_scraper_multi_openai.py @@ -0,0 +1,37 @@ +""" +Module for showing how PDFScraper multi works +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import JSONScraperMultiGraph + +load_dotenv() + +openai_key = os.getenv("OPENAI_APIKEY") + +graph_config = { + "llm": { + "api_key": openai_key, + "model": "gpt-3.5-turbo", + }, +} + +FILE_NAME = "inputs/example.json" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +sources = [text, text] + +multiple_search_graph = JSONScraperMultiGraph( + prompt= "List me all the authors, title and genres of the books", + source= sources, + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/openai/pdf_scraper_multi_openai.py b/examples/openai/pdf_scraper_multi_openai.py new file mode 100644 index 00000000..8b6c57a1 --- /dev/null +++ 
b/examples/openai/pdf_scraper_multi_openai.py @@ -0,0 +1,70 @@ +""" +Module for showing how PDFScraper multi works +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import PdfScraperMultiGraph + +load_dotenv() + +openai_key = os.getenv("OPENAI_APIKEY") + +graph_config = { + "llm": { + "api_key": openai_key, + "model": "gpt-3.5-turbo", + }, +} + +# Covert to list +sources = [ + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. 
We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. 
We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", +] + +prompt = """ +You are an expert in reviewing academic manuscripts. Please analyze the abstracts provided from an academic journal article to extract and clearly identify the following elements: + +Independent Variable (IV): The variable that is manipulated or considered as the primary cause affecting other variables. +Dependent Variable (DV): The variable that is measured or observed, which is expected to change as a result of variations in the Independent Variable. +Exogenous Shock: Identify any external or unexpected events used in the study that serve as a natural experiment or provide a unique setting for observing the effects on the IV and DV. +Response Format: For each abstract, present your response in the following structured format: + +Independent Variable (IV): +Dependent Variable (DV): +Exogenous Shock: + +Example Queries and Responses: + +Query: This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather the interaction between call center architecture and outdoor weather conditions in order to provide a quasi-experimental test of the effect of happiness on productivity. 
We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking. + +Response: + +Independent Variable (IV): Employee happiness. +Dependent Variable (DV): Overall firm productivity. +Exogenous Shock: Sudden company-wide increase in bonus payments. + +Query: The diffusion of social media coincided with a worsening of mental health conditions among adolescents and young adults in the United States, giving rise to speculation that social media might be detrimental to mental health. In this paper, we provide quasi-experimental estimates of the impact of social media on mental health by leveraging a unique natural experiment: the staggered introduction of Facebook across U.S. colleges. Our analysis couples data on student mental health around the years of Facebook's expansion with a generalized difference-in-differences empirical strategy. We find that the roll-out of Facebook at a college increased symptoms of poor mental health, especially depression. We also find that, among students predicted to be most susceptible to mental illness, the introduction of Facebook led to increased utilization of mental healthcare services. Lastly, we find that, after the introduction of Facebook, students were more likely to report experiencing impairments to academic performance resulting from poor mental health. Additional evidence on mechanisms suggests that the results are due to Facebook fostering unfavorable social comparisons. + +Response: + +Independent Variable (IV): Exposure to social media. +Dependent Variable (DV): Mental health outcomes. +Exogenous Shock: staggered introduction of Facebook across U.S. colleges. 
+""" +# ******************************************************* +# Create the SmartScraperMultiGraph instance and run it +# ******************************************************* + +multiple_search_graph = PdfScraperMultiGraph( + prompt=prompt, + source= sources, + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) From fa9722d2b901947faecba5af488ebbce4e01593e Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Sun, 2 Jun 2024 14:43:02 +0200 Subject: [PATCH 7/9] add examples --- .../anthropic/json_scraper_multi_haiku.py | 36 +++++++++ examples/anthropic/pdf_scraper_graph_haiku.py | 4 +- examples/anthropic/pdf_scraper_multi_haiku.py | 72 +++++++++++++++++ .../deepseek/json_scraper_multi_deepseek.py | 38 +++++++++ .../deepseek/pdf_scraper_multi_deepseek.py | 75 ++++++++++++++++++ examples/gemini/json_scraper_multi_gemini.py | 38 +++++++++ examples/gemini/pdf_scraper_multi_gemini.py | 74 +++++++++++++++++ examples/groq/json_scraper_multi_groq.py | 38 +++++++++ examples/groq/pdf_scraper_multi_groq.py | 74 +++++++++++++++++ .../json_scraper_multi_huggingfacehub.py | 46 +++++++++++ .../pdf_scraper_multi_huggingfacehub.py | 79 +++++++++++++++++++ 11 files changed, 573 insertions(+), 1 deletion(-) create mode 100644 examples/anthropic/json_scraper_multi_haiku.py create mode 100644 examples/anthropic/pdf_scraper_multi_haiku.py create mode 100644 examples/deepseek/json_scraper_multi_deepseek.py create mode 100644 examples/deepseek/pdf_scraper_multi_deepseek.py create mode 100644 examples/gemini/json_scraper_multi_gemini.py create mode 100644 examples/gemini/pdf_scraper_multi_gemini.py create mode 100644 examples/groq/json_scraper_multi_groq.py create mode 100644 examples/groq/pdf_scraper_multi_groq.py create mode 100644 examples/huggingfacehub/json_scraper_multi_huggingfacehub.py create mode 100644 examples/huggingfacehub/pdf_scraper_multi_huggingfacehub.py diff --git 
a/examples/anthropic/json_scraper_multi_haiku.py b/examples/anthropic/json_scraper_multi_haiku.py new file mode 100644 index 00000000..0327673b --- /dev/null +++ b/examples/anthropic/json_scraper_multi_haiku.py @@ -0,0 +1,36 @@ +""" +Module for showing how JSONScraperMultiGraph multi works +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import JSONScraperMultiGraph + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("ANTHROPIC_API_KEY"), + "model": "claude-3-haiku-20240307", + "max_tokens": 4000 + }, +} + +FILE_NAME = "inputs/example.json" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +sources = [text, text] + +multiple_search_graph = JSONScraperMultiGraph( + prompt= "List me all the authors, title and genres of the books", + source= sources, + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/anthropic/pdf_scraper_graph_haiku.py b/examples/anthropic/pdf_scraper_graph_haiku.py index cf7e8326..10080b0f 100644 --- a/examples/anthropic/pdf_scraper_graph_haiku.py +++ b/examples/anthropic/pdf_scraper_graph_haiku.py @@ -1,10 +1,12 @@ +""" +Module for showing how PDFScraper multi works +""" import os, json from dotenv import load_dotenv from scrapegraphai.graphs import PDFScraperGraph load_dotenv() - # ************************************************ # Define the configuration for the graph # ************************************************ diff --git a/examples/anthropic/pdf_scraper_multi_haiku.py b/examples/anthropic/pdf_scraper_multi_haiku.py new file mode 100644 index 00000000..974dd2f8 --- /dev/null +++ b/examples/anthropic/pdf_scraper_multi_haiku.py @@ -0,0 +1,72 @@ +""" +Module for showing how PDFScraper multi works +""" +import os +import json +from dotenv import load_dotenv +from 
scrapegraphai.graphs import PdfScraperMultiGraph + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("ANTHROPIC_API_KEY"), + "model": "claude-3-haiku-20240307", + "max_tokens": 4000 + }, +} + +# *************** +# Covert to list +# *************** + +sources = [ + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. 
We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. 
We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", +] + +prompt = """ +You are an expert in reviewing academic manuscripts. Please analyze the abstracts provided from an academic journal article to extract and clearly identify the following elements: + +Independent Variable (IV): The variable that is manipulated or considered as the primary cause affecting other variables. +Dependent Variable (DV): The variable that is measured or observed, which is expected to change as a result of variations in the Independent Variable. +Exogenous Shock: Identify any external or unexpected events used in the study that serve as a natural experiment or provide a unique setting for observing the effects on the IV and DV. +Response Format: For each abstract, present your response in the following structured format: + +Independent Variable (IV): +Dependent Variable (DV): +Exogenous Shock: + +Example Queries and Responses: + +Query: This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather the interaction between call center architecture and outdoor weather conditions in order to provide a quasi-experimental test of the effect of happiness on productivity. 
We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking. + +Response: + +Independent Variable (IV): Employee happiness. +Dependent Variable (DV): Overall firm productivity. +Exogenous Shock: Sudden company-wide increase in bonus payments. + +Query: The diffusion of social media coincided with a worsening of mental health conditions among adolescents and young adults in the United States, giving rise to speculation that social media might be detrimental to mental health. In this paper, we provide quasi-experimental estimates of the impact of social media on mental health by leveraging a unique natural experiment: the staggered introduction of Facebook across U.S. colleges. Our analysis couples data on student mental health around the years of Facebook's expansion with a generalized difference-in-differences empirical strategy. We find that the roll-out of Facebook at a college increased symptoms of poor mental health, especially depression. We also find that, among students predicted to be most susceptible to mental illness, the introduction of Facebook led to increased utilization of mental healthcare services. Lastly, we find that, after the introduction of Facebook, students were more likely to report experiencing impairments to academic performance resulting from poor mental health. Additional evidence on mechanisms suggests that the results are due to Facebook fostering unfavorable social comparisons. + +Response: + +Independent Variable (IV): Exposure to social media. +Dependent Variable (DV): Mental health outcomes. +Exogenous Shock: staggered introduction of Facebook across U.S. colleges. 
+""" +# ******************************************************* +# Create the SmartScraperMultiGraph instance and run it +# ******************************************************* + +multiple_search_graph = PdfScraperMultiGraph( + prompt=prompt, + source= sources, + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/deepseek/json_scraper_multi_deepseek.py b/examples/deepseek/json_scraper_multi_deepseek.py new file mode 100644 index 00000000..b957dde0 --- /dev/null +++ b/examples/deepseek/json_scraper_multi_deepseek.py @@ -0,0 +1,38 @@ +""" +Module for showing how JSONScraperMultiGraph multi works +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import JSONScraperMultiGraph + +load_dotenv() + +deepseek_key = os.getenv("DEEPSEEK_APIKEY") + +graph_config = { + "llm": { + "model": "deepseek-chat", + "openai_api_key": deepseek_key, + "openai_api_base": 'https://api.deepseek.com/v1', + }, + "verbose": True, +} +FILE_NAME = "inputs/example.json" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +sources = [text, text] + +multiple_search_graph = JSONScraperMultiGraph( + prompt= "List me all the authors, title and genres of the books", + source= sources, + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/deepseek/pdf_scraper_multi_deepseek.py b/examples/deepseek/pdf_scraper_multi_deepseek.py new file mode 100644 index 00000000..211e4635 --- /dev/null +++ b/examples/deepseek/pdf_scraper_multi_deepseek.py @@ -0,0 +1,75 @@ +""" +Module for showing how PDFScraper multi works +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import PdfScraperMultiGraph + +load_dotenv() + +deepseek_key = 
os.getenv("DEEPSEEK_APIKEY") + +graph_config = { + "llm": { + "model": "deepseek-chat", + "openai_api_key": deepseek_key, + "openai_api_base": 'https://api.deepseek.com/v1', + }, + "verbose": True, +} + +# *************** +# Convert to list +# *************** + +sources = [ + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity.
We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. 
We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", +] + +prompt = """ +You are an expert in reviewing academic manuscripts. Please analyze the abstracts provided from an academic journal article to extract and clearly identify the following elements: + +Independent Variable (IV): The variable that is manipulated or considered as the primary cause affecting other variables. +Dependent Variable (DV): The variable that is measured or observed, which is expected to change as a result of variations in the Independent Variable. +Exogenous Shock: Identify any external or unexpected events used in the study that serve as a natural experiment or provide a unique setting for observing the effects on the IV and DV. +Response Format: For each abstract, present your response in the following structured format: + +Independent Variable (IV): +Dependent Variable (DV): +Exogenous Shock: + +Example Queries and Responses: + +Query: This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather the interaction between call center architecture and outdoor weather conditions in order to provide a quasi-experimental test of the effect of happiness on productivity. 
We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking. + +Response: + +Independent Variable (IV): Employee happiness. +Dependent Variable (DV): Overall firm productivity. +Exogenous Shock: Sudden company-wide increase in bonus payments. + +Query: The diffusion of social media coincided with a worsening of mental health conditions among adolescents and young adults in the United States, giving rise to speculation that social media might be detrimental to mental health. In this paper, we provide quasi-experimental estimates of the impact of social media on mental health by leveraging a unique natural experiment: the staggered introduction of Facebook across U.S. colleges. Our analysis couples data on student mental health around the years of Facebook's expansion with a generalized difference-in-differences empirical strategy. We find that the roll-out of Facebook at a college increased symptoms of poor mental health, especially depression. We also find that, among students predicted to be most susceptible to mental illness, the introduction of Facebook led to increased utilization of mental healthcare services. Lastly, we find that, after the introduction of Facebook, students were more likely to report experiencing impairments to academic performance resulting from poor mental health. Additional evidence on mechanisms suggests that the results are due to Facebook fostering unfavorable social comparisons. + +Response: + +Independent Variable (IV): Exposure to social media. +Dependent Variable (DV): Mental health outcomes. +Exogenous Shock: staggered introduction of Facebook across U.S. colleges. 
+""" +# ******************************************************* +# Create the PdfScraperMultiGraph instance and run it +# ******************************************************* + +multiple_search_graph = PdfScraperMultiGraph( + prompt=prompt, + source= sources, + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/gemini/json_scraper_multi_gemini.py b/examples/gemini/json_scraper_multi_gemini.py new file mode 100644 index 00000000..e914109b --- /dev/null +++ b/examples/gemini/json_scraper_multi_gemini.py @@ -0,0 +1,38 @@ +""" +Module for showing how JSONScraperMultiGraph multi works +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import JSONScraperMultiGraph + +load_dotenv() + +gemini_key = os.getenv("GOOGLE_APIKEY") + +graph_config = { + "llm": { + "api_key": gemini_key, + "model": "gemini-pro", + }, + "library": "beautifulsoup" +} + +FILE_NAME = "inputs/example.json" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +sources = [text, text] + +multiple_search_graph = JSONScraperMultiGraph( + prompt= "List me all the authors, title and genres of the books", + source= sources, + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/gemini/pdf_scraper_multi_gemini.py b/examples/gemini/pdf_scraper_multi_gemini.py new file mode 100644 index 00000000..66afbef2 --- /dev/null +++ b/examples/gemini/pdf_scraper_multi_gemini.py @@ -0,0 +1,74 @@ +""" +Module for showing how PDFScraper multi works +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import PdfScraperMultiGraph + +load_dotenv() + +gemini_key = os.getenv("GOOGLE_APIKEY") + +graph_config = { + "llm": { + "api_key": gemini_key, + "model":
"gemini-pro", + }, + "library": "beautifulsoup" +} + +# *************** +# Convert to list +# *************** + +sources = [ + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule.
We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. 
We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", +] + +prompt = """ +You are an expert in reviewing academic manuscripts. Please analyze the abstracts provided from an academic journal article to extract and clearly identify the following elements: + +Independent Variable (IV): The variable that is manipulated or considered as the primary cause affecting other variables. +Dependent Variable (DV): The variable that is measured or observed, which is expected to change as a result of variations in the Independent Variable. +Exogenous Shock: Identify any external or unexpected events used in the study that serve as a natural experiment or provide a unique setting for observing the effects on the IV and DV. +Response Format: For each abstract, present your response in the following structured format: + +Independent Variable (IV): +Dependent Variable (DV): +Exogenous Shock: + +Example Queries and Responses: + +Query: This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather the interaction between call center architecture and outdoor weather conditions in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking. 
+ +Response: + +Independent Variable (IV): Employee happiness. +Dependent Variable (DV): Overall firm productivity. +Exogenous Shock: Sudden company-wide increase in bonus payments. + +Query: The diffusion of social media coincided with a worsening of mental health conditions among adolescents and young adults in the United States, giving rise to speculation that social media might be detrimental to mental health. In this paper, we provide quasi-experimental estimates of the impact of social media on mental health by leveraging a unique natural experiment: the staggered introduction of Facebook across U.S. colleges. Our analysis couples data on student mental health around the years of Facebook's expansion with a generalized difference-in-differences empirical strategy. We find that the roll-out of Facebook at a college increased symptoms of poor mental health, especially depression. We also find that, among students predicted to be most susceptible to mental illness, the introduction of Facebook led to increased utilization of mental healthcare services. Lastly, we find that, after the introduction of Facebook, students were more likely to report experiencing impairments to academic performance resulting from poor mental health. Additional evidence on mechanisms suggests that the results are due to Facebook fostering unfavorable social comparisons. + +Response: + +Independent Variable (IV): Exposure to social media. +Dependent Variable (DV): Mental health outcomes. +Exogenous Shock: staggered introduction of Facebook across U.S. colleges. 
+""" +# ******************************************************* +# Create the PdfScraperMultiGraph instance and run it +# ******************************************************* + +multiple_search_graph = PdfScraperMultiGraph( + prompt=prompt, + source= sources, + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/groq/json_scraper_multi_groq.py b/examples/groq/json_scraper_multi_groq.py new file mode 100644 index 00000000..df3b9276 --- /dev/null +++ b/examples/groq/json_scraper_multi_groq.py @@ -0,0 +1,38 @@ +""" +Module for showing how JSONScraperMultiGraph multi works +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import JSONScraperMultiGraph + +load_dotenv() + +groq_key = os.getenv("GROQ_APIKEY") + +graph_config = { + "llm": { + "model": "groq/gemma-7b-it", + "api_key": groq_key, + "temperature": 0 + }, + "library": "beautifulsoup" +} +FILE_NAME = "inputs/example.json" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +sources = [text, text] + +multiple_search_graph = JSONScraperMultiGraph( + prompt= "List me all the authors, title and genres of the books", + source= sources, + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/groq/pdf_scraper_multi_groq.py b/examples/groq/pdf_scraper_multi_groq.py new file mode 100644 index 00000000..c43a7087 --- /dev/null +++ b/examples/groq/pdf_scraper_multi_groq.py @@ -0,0 +1,74 @@ +""" +Module for showing how PDFScraper multi works +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import PdfScraperMultiGraph + +load_dotenv() +groq_key = os.getenv("GROQ_APIKEY") + +graph_config = { + "llm": { + "model": "groq/gemma-7b-it", + "api_key": groq_key,
+ "temperature": 0 + }, + "library": "beautifulsoup" +} + +# *************** +# Convert to list +# *************** + +sources = [ + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule.
We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. 
We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", +] + +prompt = """ +You are an expert in reviewing academic manuscripts. Please analyze the abstracts provided from an academic journal article to extract and clearly identify the following elements: + +Independent Variable (IV): The variable that is manipulated or considered as the primary cause affecting other variables. +Dependent Variable (DV): The variable that is measured or observed, which is expected to change as a result of variations in the Independent Variable. +Exogenous Shock: Identify any external or unexpected events used in the study that serve as a natural experiment or provide a unique setting for observing the effects on the IV and DV. +Response Format: For each abstract, present your response in the following structured format: + +Independent Variable (IV): +Dependent Variable (DV): +Exogenous Shock: + +Example Queries and Responses: + +Query: This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather the interaction between call center architecture and outdoor weather conditions in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking. 
+ +Response: + +Independent Variable (IV): Employee happiness. +Dependent Variable (DV): Overall firm productivity. +Exogenous Shock: Sudden company-wide increase in bonus payments. + +Query: The diffusion of social media coincided with a worsening of mental health conditions among adolescents and young adults in the United States, giving rise to speculation that social media might be detrimental to mental health. In this paper, we provide quasi-experimental estimates of the impact of social media on mental health by leveraging a unique natural experiment: the staggered introduction of Facebook across U.S. colleges. Our analysis couples data on student mental health around the years of Facebook's expansion with a generalized difference-in-differences empirical strategy. We find that the roll-out of Facebook at a college increased symptoms of poor mental health, especially depression. We also find that, among students predicted to be most susceptible to mental illness, the introduction of Facebook led to increased utilization of mental healthcare services. Lastly, we find that, after the introduction of Facebook, students were more likely to report experiencing impairments to academic performance resulting from poor mental health. Additional evidence on mechanisms suggests that the results are due to Facebook fostering unfavorable social comparisons. + +Response: + +Independent Variable (IV): Exposure to social media. +Dependent Variable (DV): Mental health outcomes. +Exogenous Shock: staggered introduction of Facebook across U.S. colleges. 
+""" +# ******************************************************* +# Create the PdfScraperMultiGraph instance and run it +# ******************************************************* + +multiple_search_graph = PdfScraperMultiGraph( + prompt=prompt, + source= sources, + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/huggingfacehub/json_scraper_multi_huggingfacehub.py b/examples/huggingfacehub/json_scraper_multi_huggingfacehub.py new file mode 100644 index 00000000..8ca3ba51 --- /dev/null +++ b/examples/huggingfacehub/json_scraper_multi_huggingfacehub.py @@ -0,0 +1,46 @@ +""" +Module for showing how JSONScraperMultiGraph multi works +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import JSONScraperMultiGraph +from langchain_community.llms import HuggingFaceEndpoint +from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings + +load_dotenv() + +HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN') + +repo_id = "mistralai/Mistral-7B-Instruct-v0.2" + +llm_model_instance = HuggingFaceEndpoint( + repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN +) + +embedder_model_instance = HuggingFaceInferenceAPIEmbeddings( + api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2" +) + +graph_config = { + "llm": {"model_instance": llm_model_instance}, + "embeddings": {"model_instance": embedder_model_instance} +} +FILE_NAME = "inputs/example.json" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +sources = [text, text] + +multiple_search_graph = JSONScraperMultiGraph( + prompt= "List me all the authors, title and genres of the books", + source= sources, + schema=None, + config=graph_config +) + +result = multiple_search_graph.run()
+print(json.dumps(result, indent=4)) diff --git a/examples/huggingfacehub/pdf_scraper_multi_huggingfacehub.py b/examples/huggingfacehub/pdf_scraper_multi_huggingfacehub.py new file mode 100644 index 00000000..d24d522c --- /dev/null +++ b/examples/huggingfacehub/pdf_scraper_multi_huggingfacehub.py @@ -0,0 +1,79 @@ +""" +Module for showing how PDFScraper multi works +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import PdfScraperMultiGraph +from langchain_community.llms import HuggingFaceEndpoint +from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings +load_dotenv() + +HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN') + +repo_id = "mistralai/Mistral-7B-Instruct-v0.2" + +llm_model_instance = HuggingFaceEndpoint( + repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN +) + +embedder_model_instance = HuggingFaceInferenceAPIEmbeddings( + api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2" +) + +graph_config = { + "llm": {"model_instance": llm_model_instance}, + "embeddings": {"model_instance": embedder_model_instance} +} + +# Convert to list +sources = [ + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity.
We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. 
We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", +] + +prompt = """ +You are an expert in reviewing academic manuscripts. Please analyze the abstracts provided from an academic journal article to extract and clearly identify the following elements: + +Independent Variable (IV): The variable that is manipulated or considered as the primary cause affecting other variables. +Dependent Variable (DV): The variable that is measured or observed, which is expected to change as a result of variations in the Independent Variable. 
+Exogenous Shock: Identify any external or unexpected events used in the study that serve as a natural experiment or provide a unique setting for observing the effects on the IV and DV. +Response Format: For each abstract, present your response in the following structured format: + +Independent Variable (IV): +Dependent Variable (DV): +Exogenous Shock: + +Example Queries and Responses: + +Query: This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather the interaction between call center architecture and outdoor weather conditions in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking. + +Response: + +Independent Variable (IV): Employee happiness. +Dependent Variable (DV): Overall firm productivity. +Exogenous Shock: Sudden company-wide increase in bonus payments. + +Query: The diffusion of social media coincided with a worsening of mental health conditions among adolescents and young adults in the United States, giving rise to speculation that social media might be detrimental to mental health. In this paper, we provide quasi-experimental estimates of the impact of social media on mental health by leveraging a unique natural experiment: the staggered introduction of Facebook across U.S. colleges. 
Our analysis couples data on student mental health around the years of Facebook's expansion with a generalized difference-in-differences empirical strategy. We find that the roll-out of Facebook at a college increased symptoms of poor mental health, especially depression. We also find that, among students predicted to be most susceptible to mental illness, the introduction of Facebook led to increased utilization of mental healthcare services. Lastly, we find that, after the introduction of Facebook, students were more likely to report experiencing impairments to academic performance resulting from poor mental health. Additional evidence on mechanisms suggests that the results are due to Facebook fostering unfavorable social comparisons. + +Response: + +Independent Variable (IV): Exposure to social media. +Dependent Variable (DV): Mental health outcomes. +Exogenous Shock: staggered introduction of Facebook across U.S. colleges. +""" +# ******************************************************* +# Create the PdfScraperMultiGraph instance and run it +# ******************************************************* + +multiple_search_graph = PdfScraperMultiGraph( + prompt=prompt, + source= sources, + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) From b4086550cc9dc42b2fd91ee7ef60c6a2c2ac3fd2 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Sun, 2 Jun 2024 22:57:33 +0200 Subject: [PATCH 8/9] feat: add csv scraper and xml scraper multi --- .../csv_scraper_graph_multi_ollama.py | 62 ++++++++++ .../xml_scraper_graph_multi_ollama.py | 64 ++++++++++ scrapegraphai/graphs/__init__.py | 2 + .../graphs/csv_scraper_graph_multi.py | 116 +++++++++++++++++ .../graphs/xml_scraper_graph_multi.py | 117 ++++++++++++++++++ 5 files changed, 361 insertions(+) create mode 100644 examples/local_models/csv_scraper_graph_multi_ollama.py create mode 100644 examples/local_models/xml_scraper_graph_multi_ollama.py create mode 
100644 scrapegraphai/graphs/csv_scraper_graph_multi.py create mode 100644 scrapegraphai/graphs/xml_scraper_graph_multi.py diff --git a/examples/local_models/csv_scraper_graph_multi_ollama.py b/examples/local_models/csv_scraper_graph_multi_ollama.py new file mode 100644 index 00000000..fb6bce51 --- /dev/null +++ b/examples/local_models/csv_scraper_graph_multi_ollama.py @@ -0,0 +1,62 @@ +""" +Basic example of scraping pipeline using CSVScraperMultiGraph from CSV documents +""" + +import os +import pandas as pd +from scrapegraphai.graphs import CSVScraperMultiGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info + +# ************************************************ +# Read the CSV file +# ************************************************ + +FILE_NAME = "inputs/username.csv" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +text = pd.read_csv(file_path) + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "model": "ollama/llama3", + "temperature": 0, + "format": "json", # Ollama needs the format to be specified explicitly + # "model_tokens": 2000, # set context length arbitrarily + "base_url": "http://localhost:11434", + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + "base_url": "http://localhost:11434", + }, + "verbose": True, +} + +# ************************************************ +# Create the CSVScraperMultiGraph instance and run it +# ************************************************ + +csv_scraper_graph = CSVScraperMultiGraph( + prompt="List me all the last names", + source=[str(text), str(text)], + config=graph_config +) + +result = csv_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + 
+graph_exec_info = csv_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/local_models/xml_scraper_graph_multi_ollama.py b/examples/local_models/xml_scraper_graph_multi_ollama.py new file mode 100644 index 00000000..2ce9c456 --- /dev/null +++ b/examples/local_models/xml_scraper_graph_multi_ollama.py @@ -0,0 +1,64 @@ +""" +Basic example of scraping pipeline using XMLScraperMultiGraph from XML documents +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import XMLScraperMultiGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info +load_dotenv() + +# ************************************************ +# Read the XML file +# ************************************************ + +FILE_NAME = "inputs/books.xml" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "model": "ollama/llama3", + "temperature": 0, + "format": "json", # Ollama needs the format to be specified explicitly + # "model_tokens": 2000, # set context length arbitrarily + "base_url": "http://localhost:11434", + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + "base_url": "http://localhost:11434", + }, + "verbose": True, +} + +# ************************************************ +# Create the XMLScraperMultiGraph instance and run it +# ************************************************ + +xml_scraper_graph = XMLScraperMultiGraph( + prompt="List me all the authors, title and genres of the books", + source=[text, text], # Pass the content of the file, not the file object + 
config=graph_config +) + +result = xml_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = xml_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py index 37814cd1..29f001fa 100644 --- a/scrapegraphai/graphs/__init__.py +++ b/scrapegraphai/graphs/__init__.py @@ -18,3 +18,5 @@ from .smart_scraper_multi_graph import SmartScraperMultiGraph from .pdf_scraper_multi import PdfScraperMultiGraph from .json_scraper_multi import JSONScraperMultiGraph +from .csv_scraper_graph_multi import CSVScraperMultiGraph +from .xml_scraper_graph_multi import XMLScraperMultiGraph diff --git a/scrapegraphai/graphs/csv_scraper_graph_multi.py b/scrapegraphai/graphs/csv_scraper_graph_multi.py new file mode 100644 index 00000000..85ed1727 --- /dev/null +++ b/scrapegraphai/graphs/csv_scraper_graph_multi.py @@ -0,0 +1,116 @@ +""" +CSVScraperMultiGraph Module +""" + +from copy import copy, deepcopy +from typing import List, Optional + +from .base_graph import BaseGraph +from .abstract_graph import AbstractGraph +from .csv_scraper_graph import CSVScraperGraph + +from ..nodes import ( + GraphIteratorNode, + MergeAnswersNode +) + + +class CSVScraperMultiGraph(AbstractGraph): + """ + CSVScraperMultiGraph is a scraping pipeline that scrapes a list of CSV sources and generates answers to a given prompt. + It only requires a user prompt and a list of CSV sources. + + Attributes: + prompt (str): The user prompt to search the internet. + llm_model (dict): The configuration for the language model. + embedder_model (dict): The configuration for the embedder model. + headless (bool): A flag to run the browser in headless mode. + verbose (bool): A flag to display the execution information. 
+ model_token (int): The token limit for the language model. + + Args: + prompt (str): The user prompt to search the internet. + source (List[str]): The list of CSV sources to scrape. + config (dict): Configuration parameters for the graph. + schema (Optional[str]): The schema for the graph output. + + Example: + >>> search_graph = CSVScraperMultiGraph( + ... "List me all the last names", [csv_text_1, csv_text_2], + ... {"llm": {"model": "gpt-3.5-turbo"}} + ... ) + >>> result = search_graph.run() + """ + + def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[str] = None): + + self.max_results = config.get("max_results", 3) + + if all(isinstance(value, str) for value in config.values()): + self.copy_config = copy(config) + else: + self.copy_config = deepcopy(config) + + super().__init__(prompt, config, source, schema) + + def _create_graph(self) -> BaseGraph: + """ + Creates the graph of nodes representing the workflow for web scraping and searching. + + Returns: + BaseGraph: A graph instance representing the web scraping and searching workflow. 
+ """ + + # ************************************************ + # Create a CSVScraperGraph instance + # ************************************************ + + smart_scraper_instance = CSVScraperGraph( + prompt="", + source="", + config=self.copy_config, + ) + + # ************************************************ + # Define the graph nodes + # ************************************************ + + graph_iterator_node = GraphIteratorNode( + input="user_prompt & jsons", + output=["results"], + node_config={ + "graph_instance": smart_scraper_instance, + } + ) + + merge_answers_node = MergeAnswersNode( + input="user_prompt & results", + output=["answer"], + node_config={ + "llm_model": self.llm_model, + "schema": self.schema + } + ) + + return BaseGraph( + nodes=[ + graph_iterator_node, + merge_answers_node, + ], + edges=[ + (graph_iterator_node, merge_answers_node), + ], + entry_point=graph_iterator_node + ) + + def run(self) -> str: + """ + Executes the web scraping and searching process. + + Returns: + str: The answer to the prompt. + """ + inputs = {"user_prompt": self.prompt, "jsons": self.source} + self.final_state, self.execution_info = self.graph.execute(inputs) + + return self.final_state.get("answer", "No answer found.") diff --git a/scrapegraphai/graphs/xml_scraper_graph_multi.py b/scrapegraphai/graphs/xml_scraper_graph_multi.py new file mode 100644 index 00000000..1198f580 --- /dev/null +++ b/scrapegraphai/graphs/xml_scraper_graph_multi.py @@ -0,0 +1,117 @@ +""" +XMLScraperMultiGraph Module +""" + +from copy import copy, deepcopy +from typing import List, Optional + +from .base_graph import BaseGraph +from .abstract_graph import AbstractGraph +from .xml_scraper_graph import XMLScraperGraph + +from ..nodes import ( + GraphIteratorNode, + MergeAnswersNode +) + + +class XMLScraperMultiGraph(AbstractGraph): + """ + XMLScraperMultiGraph is a scraping pipeline that scrapes a list of XML sources and + generates answers to a given prompt. 
+ It only requires a user prompt and a list of XML sources. + + Attributes: + prompt (str): The user prompt to search the internet. + llm_model (dict): The configuration for the language model. + embedder_model (dict): The configuration for the embedder model. + headless (bool): A flag to run the browser in headless mode. + verbose (bool): A flag to display the execution information. + model_token (int): The token limit for the language model. + + Args: + prompt (str): The user prompt to search the internet. + source (List[str]): The list of XML sources to scrape. + config (dict): Configuration parameters for the graph. + schema (Optional[str]): The schema for the graph output. + + Example: + >>> search_graph = XMLScraperMultiGraph( + ... "List me all the authors, title and genres of the books", [xml_text_1, xml_text_2], + ... {"llm": {"model": "gpt-3.5-turbo"}} + ... ) + >>> result = search_graph.run() + """ + + def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[str] = None): + + self.max_results = config.get("max_results", 3) + + if all(isinstance(value, str) for value in config.values()): + self.copy_config = copy(config) + else: + self.copy_config = deepcopy(config) + + super().__init__(prompt, config, source, schema) + + def _create_graph(self) -> BaseGraph: + """ + Creates the graph of nodes representing the workflow for web scraping and searching. + + Returns: + BaseGraph: A graph instance representing the web scraping and searching workflow. 
+ """ + + # ************************************************ + # Create an XMLScraperGraph instance + # ************************************************ + + smart_scraper_instance = XMLScraperGraph( + prompt="", + source="", + config=self.copy_config, + ) + + # ************************************************ + # Define the graph nodes + # ************************************************ + + graph_iterator_node = GraphIteratorNode( + input="user_prompt & jsons", + output=["results"], + node_config={ + "graph_instance": smart_scraper_instance, + } + ) + + merge_answers_node = MergeAnswersNode( + input="user_prompt & results", + output=["answer"], + node_config={ + "llm_model": self.llm_model, + "schema": self.schema + } + ) + + return BaseGraph( + nodes=[ + graph_iterator_node, + merge_answers_node, + ], + edges=[ + (graph_iterator_node, merge_answers_node), + ], + entry_point=graph_iterator_node + ) + + def run(self) -> str: + """ + Executes the web scraping and searching process. + + Returns: + str: The answer to the prompt. 
+ """ + inputs = {"user_prompt": self.prompt, "jsons": self.source} + self.final_state, self.execution_info = self.graph.execute(inputs) + + return self.final_state.get("answer", "No answer found.") From 743dfe119191447c1111fa1cf4e539b106ef98bf Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Mon, 3 Jun 2024 12:19:43 +0200 Subject: [PATCH 9/9] add all possible examples --- .../csv_scraper_graph_multi_haiku.py | 55 +++++++++++++++++ .../xml_scraper_graph_multi_haiku.py | 55 +++++++++++++++++ .../csv_scraper_graph_multi_bedrock.py | 59 ++++++++++++++++++ .../xml_scraper_graph_multi_bedrock.py | 59 ++++++++++++++++++ .../csv_scraper_graph_multi_deepseek.py | 58 ++++++++++++++++++ .../xml_scraper_graph_multi_deepseek.py | 58 ++++++++++++++++++ .../gemini/csv_scraper_graph_multi_gemini.py | 57 ++++++++++++++++++ .../gemini/xml_scraper_graph_multi_gemini.py | 57 ++++++++++++++++++ examples/groq/csv_scraper_graph_multi_groq.py | 59 ++++++++++++++++++ examples/groq/xml_scraper_graph_multi_groq.py | 60 +++++++++++++++++++ .../xml_scraper_graph_multi_ollama.py | 2 - .../oneapi/csv_scraper_graph_multi_oneapi.py | 0 .../oneapi/xml_scraper_graph_multi_oneapi.py | 57 ++++++++++++++++++ examples/oneapi/xml_scraper_oneapi.py | 2 +- .../openai/csv_scraper_graph_multi_openai.py | 56 +++++++++++++++++ .../openai/xml_scraper_graph_multi_ollama.py | 57 ++++++++++++++++++ 16 files changed, 748 insertions(+), 3 deletions(-) create mode 100644 examples/anthropic/csv_scraper_graph_multi_haiku.py create mode 100644 examples/anthropic/xml_scraper_graph_multi_haiku.py create mode 100644 examples/bedrock/csv_scraper_graph_multi_bedrock.py create mode 100644 examples/bedrock/xml_scraper_graph_multi_bedrock.py create mode 100644 examples/deepseek/csv_scraper_graph_multi_deepseek.py create mode 100644 examples/deepseek/xml_scraper_graph_multi_deepseek.py create mode 100644 examples/gemini/csv_scraper_graph_multi_gemini.py create mode 100644 examples/gemini/xml_scraper_graph_multi_gemini.py 
create mode 100644 examples/groq/csv_scraper_graph_multi_groq.py create mode 100644 examples/groq/xml_scraper_graph_multi_groq.py create mode 100644 examples/oneapi/csv_scraper_graph_multi_oneapi.py create mode 100644 examples/oneapi/xml_scraper_graph_multi_oneapi.py create mode 100644 examples/openai/csv_scraper_graph_multi_openai.py create mode 100644 examples/openai/xml_scraper_graph_multi_ollama.py diff --git a/examples/anthropic/csv_scraper_graph_multi_haiku.py b/examples/anthropic/csv_scraper_graph_multi_haiku.py new file mode 100644 index 00000000..b833af01 --- /dev/null +++ b/examples/anthropic/csv_scraper_graph_multi_haiku.py @@ -0,0 +1,55 @@ +""" +Basic example of scraping pipeline using CSVScraperMultiGraph from CSV documents +""" + +import os +from dotenv import load_dotenv +import pandas as pd +from scrapegraphai.graphs import CSVScraperMultiGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info + +load_dotenv() +# ************************************************ +# Read the CSV file +# ************************************************ + +FILE_NAME = "inputs/username.csv" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +text = pd.read_csv(file_path) + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "api_key": os.getenv("ANTHROPIC_API_KEY"), + "model": "claude-3-haiku-20240307", + "max_tokens": 4000}, +} + +# ************************************************ +# Create the CSVScraperMultiGraph instance and run it +# ************************************************ + +csv_scraper_graph = CSVScraperMultiGraph( + prompt="List me all the last names", + source=[str(text), str(text)], + config=graph_config +) + +result = csv_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# 
************************************************ + +graph_exec_info = csv_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/anthropic/xml_scraper_graph_multi_haiku.py b/examples/anthropic/xml_scraper_graph_multi_haiku.py new file mode 100644 index 00000000..6b79f709 --- /dev/null +++ b/examples/anthropic/xml_scraper_graph_multi_haiku.py @@ -0,0 +1,55 @@ +""" +Basic example of scraping pipeline using XMLScraperMultiGraph from XML documents +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import XMLScraperMultiGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info +load_dotenv() + +# ************************************************ +# Read the XML file +# ************************************************ + +FILE_NAME = "inputs/books.xml" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "api_key": os.getenv("ANTHROPIC_API_KEY"), + "model": "claude-3-haiku-20240307", + "max_tokens": 4000}, +} + +# ************************************************ +# Create the XMLScraperMultiGraph instance and run it +# ************************************************ + +xml_scraper_graph = XMLScraperMultiGraph( + prompt="List me all the authors, title and genres of the books", + source=[text, text], # Pass the content of the file, not the file object + config=graph_config +) + +result = xml_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info 
= xml_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/bedrock/csv_scraper_graph_multi_bedrock.py b/examples/bedrock/csv_scraper_graph_multi_bedrock.py new file mode 100644 index 00000000..c776c508 --- /dev/null +++ b/examples/bedrock/csv_scraper_graph_multi_bedrock.py @@ -0,0 +1,59 @@ +""" +Basic example of scraping pipeline using CSVScraperMultiGraph from CSV documents +""" + +import os +from dotenv import load_dotenv +import pandas as pd +from scrapegraphai.graphs import CSVScraperMultiGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info + +load_dotenv() +# ************************************************ +# Read the CSV file +# ************************************************ + +FILE_NAME = "inputs/username.csv" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +text = pd.read_csv(file_path) + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "client": "client_name", + "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + "temperature": 0.0 + }, + "embeddings": { + "model": "bedrock/cohere.embed-multilingual-v3" + } +} + +# ************************************************ +# Create the CSVScraperMultiGraph instance and run it +# ************************************************ + +csv_scraper_graph = CSVScraperMultiGraph( + prompt="List me all the last names", + source=[str(text), str(text)], + config=graph_config +) + +result = csv_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = csv_scraper_graph.get_execution_info() 
+print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/bedrock/xml_scraper_graph_multi_bedrock.py b/examples/bedrock/xml_scraper_graph_multi_bedrock.py new file mode 100644 index 00000000..a0ed3560 --- /dev/null +++ b/examples/bedrock/xml_scraper_graph_multi_bedrock.py @@ -0,0 +1,59 @@ +""" +Basic example of scraping pipeline using XMLScraperMultiGraph from XML documents +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import XMLScraperMultiGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info +load_dotenv() + +# ************************************************ +# Read the XML file +# ************************************************ + +FILE_NAME = "inputs/books.xml" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "client": "client_name", + "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + "temperature": 0.0 + }, + "embeddings": { + "model": "bedrock/cohere.embed-multilingual-v3" + } +} + +# ************************************************ +# Create the XMLScraperMultiGraph instance and run it +# ************************************************ + +xml_scraper_graph = XMLScraperMultiGraph( + prompt="List me all the authors, title and genres of the books", + source=[text, text], # Pass the content of the file, not the file object + config=graph_config +) + +result = xml_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = 
xml_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/deepseek/csv_scraper_graph_multi_deepseek.py b/examples/deepseek/csv_scraper_graph_multi_deepseek.py new file mode 100644 index 00000000..ea5e9154 --- /dev/null +++ b/examples/deepseek/csv_scraper_graph_multi_deepseek.py @@ -0,0 +1,58 @@ +""" +Basic example of scraping pipeline using CSVScraperMultiGraph from CSV documents +""" + +import os +from dotenv import load_dotenv +import pandas as pd +from scrapegraphai.graphs import CSVScraperMultiGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info + +load_dotenv() +# ************************************************ +# Read the CSV file +# ************************************************ + +FILE_NAME = "inputs/username.csv" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +text = pd.read_csv(file_path) + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +deepseek_key = os.getenv("DEEPSEEK_APIKEY") + +graph_config = { + "llm": { + "model": "deepseek-chat", + "openai_api_key": deepseek_key, + "openai_api_base": 'https://api.deepseek.com/v1', + }, + "verbose": True, +} +# ************************************************ +# Create the CSVScraperMultiGraph instance and run it +# ************************************************ + +csv_scraper_graph = CSVScraperMultiGraph( + prompt="List me all the last names", + source=[str(text), str(text)], + config=graph_config +) + +result = csv_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = csv_scraper_graph.get_execution_info() 
+print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/deepseek/xml_scraper_graph_multi_deepseek.py b/examples/deepseek/xml_scraper_graph_multi_deepseek.py new file mode 100644 index 00000000..0f53a6b2 --- /dev/null +++ b/examples/deepseek/xml_scraper_graph_multi_deepseek.py @@ -0,0 +1,58 @@ +""" +Basic example of scraping pipeline using XMLScraperMultiGraph from XML documents +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import XMLScraperMultiGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info +load_dotenv() + +# ************************************************ +# Read the XML file +# ************************************************ + +FILE_NAME = "inputs/books.xml" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +deepseek_key = os.getenv("DEEPSEEK_APIKEY") + +graph_config = { + "llm": { + "model": "deepseek-chat", + "openai_api_key": deepseek_key, + "openai_api_base": 'https://api.deepseek.com/v1', + }, + "verbose": True, +} +# ************************************************ +# Create the XMLScraperMultiGraph instance and run it +# ************************************************ + +xml_scraper_graph = XMLScraperMultiGraph( + prompt="List me all the authors, title and genres of the books", + source=[text, text], # Pass the content of the file, not the file object + config=graph_config +) + +result = xml_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = 
xml_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/gemini/csv_scraper_graph_multi_gemini.py b/examples/gemini/csv_scraper_graph_multi_gemini.py new file mode 100644 index 00000000..bfe1b19a --- /dev/null +++ b/examples/gemini/csv_scraper_graph_multi_gemini.py @@ -0,0 +1,57 @@ +""" +Basic example of scraping pipeline using CSVScraperMultiGraph from CSV documents +""" + +import os +from dotenv import load_dotenv +import pandas as pd +from scrapegraphai.graphs import CSVScraperMultiGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info + +load_dotenv() +# ************************************************ +# Read the CSV file +# ************************************************ + +FILE_NAME = "inputs/username.csv" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +text = pd.read_csv(file_path) + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +gemini_key = os.getenv("GOOGLE_APIKEY") + +graph_config = { + "llm": { + "api_key": gemini_key, + "model": "gemini-pro", + }, +} + +# ************************************************ +# Create the CSVScraperMultiGraph instance and run it +# ************************************************ + +csv_scraper_graph = CSVScraperMultiGraph( + prompt="List me all the last names", + source=[str(text), str(text)], + config=graph_config +) + +result = csv_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = csv_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, 
"result") diff --git a/examples/gemini/xml_scraper_graph_multi_gemini.py b/examples/gemini/xml_scraper_graph_multi_gemini.py new file mode 100644 index 00000000..e0d979b7 --- /dev/null +++ b/examples/gemini/xml_scraper_graph_multi_gemini.py @@ -0,0 +1,57 @@ +""" +Basic example of scraping pipeline using XMLScraperMultiGraph from XML documents +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import XMLScraperMultiGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info +load_dotenv() + +# ************************************************ +# Read the XML file +# ************************************************ + +FILE_NAME = "inputs/books.xml" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +gemini_key = os.getenv("GOOGLE_APIKEY") + +graph_config = { + "llm": { + "api_key": gemini_key, + "model": "gemini-pro", + }, +} + +# ************************************************ +# Create the XMLScraperMultiGraph instance and run it +# ************************************************ + +xml_scraper_graph = XMLScraperMultiGraph( + prompt="List me all the authors, title and genres of the books", + source=[text, text], # Pass the content of the file, not the file object + config=graph_config +) + +result = xml_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = xml_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/groq/csv_scraper_graph_multi_groq.py 
b/examples/groq/csv_scraper_graph_multi_groq.py new file mode 100644 index 00000000..475b8cac --- /dev/null +++ b/examples/groq/csv_scraper_graph_multi_groq.py @@ -0,0 +1,59 @@ +""" +Basic example of scraping pipeline using CSVScraperMultiGraph from CSV documents +""" + +import os +from dotenv import load_dotenv +import pandas as pd +from scrapegraphai.graphs import CSVScraperMultiGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info + +load_dotenv() +# ************************************************ +# Read the CSV file +# ************************************************ + +FILE_NAME = "inputs/username.csv" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +text = pd.read_csv(file_path) + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +groq_key = os.getenv("GROQ_APIKEY") + +graph_config = { + "llm": { + "model": "groq/gemma-7b-it", + "api_key": groq_key, + "temperature": 0 + }, + "headless": False +} + +# ************************************************ +# Create the CSVScraperMultiGraph instance and run it +# ************************************************ + +csv_scraper_graph = CSVScraperMultiGraph( + prompt="List me all the last names", + source=[str(text), str(text)], + config=graph_config +) + +result = csv_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = csv_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/groq/xml_scraper_graph_multi_groq.py b/examples/groq/xml_scraper_graph_multi_groq.py new file mode 100644 index 00000000..62540671 --- /dev/null +++ 
b/examples/groq/xml_scraper_graph_multi_groq.py @@ -0,0 +1,60 @@ +""" +Basic example of scraping pipeline using XMLScraperMultiGraph from XML documents +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import XMLScraperMultiGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info +load_dotenv() + +# ************************************************ +# Read the XML file +# ************************************************ + +FILE_NAME = "inputs/books.xml" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +groq_key = os.getenv("GROQ_APIKEY") + +graph_config = { + "llm": { + "model": "groq/gemma-7b-it", + "api_key": groq_key, + "temperature": 0 + }, + "headless": False +} + + +# ************************************************ +# Create the XMLScraperMultiGraph instance and run it +# ************************************************ + +xml_scraper_graph = XMLScraperMultiGraph( + prompt="List me all the authors, title and genres of the books", + source=[text, text], # Pass the content of the file, not the file object + config=graph_config +) + +result = xml_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = xml_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/local_models/xml_scraper_graph_multi_ollama.py b/examples/local_models/xml_scraper_graph_multi_ollama.py index 2ce9c456..d84c6c9f 100644 --- a/examples/local_models/xml_scraper_graph_multi_ollama.py +++ 
b/examples/local_models/xml_scraper_graph_multi_ollama.py @@ -3,10 +3,8 @@ """ import os -from dotenv import load_dotenv from scrapegraphai.graphs import XMLScraperMultiGraph from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info -load_dotenv() # ************************************************ # Read the XML file diff --git a/examples/oneapi/csv_scraper_graph_multi_oneapi.py b/examples/oneapi/csv_scraper_graph_multi_oneapi.py new file mode 100644 index 00000000..e69de29b diff --git a/examples/oneapi/xml_scraper_graph_multi_oneapi.py b/examples/oneapi/xml_scraper_graph_multi_oneapi.py new file mode 100644 index 00000000..564c2a3a --- /dev/null +++ b/examples/oneapi/xml_scraper_graph_multi_oneapi.py @@ -0,0 +1,57 @@ +""" +Basic example of scraping pipeline using XMLScraperMultiGraph from XML documents +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import XMLScraperMultiGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info +load_dotenv() + +# ************************************************ +# Read the XML file +# ************************************************ + +FILE_NAME = "inputs/books.xml" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +openai_key = os.getenv("OPENAI_APIKEY") + +graph_config = { + "llm": { + "api_key": openai_key, + "model": "gpt-3.5-turbo", + }, +} + +# ************************************************ +# Create the XMLScraperMultiGraph instance and run it +# ************************************************ + +xml_scraper_graph = XMLScraperMultiGraph( + prompt="List me all the authors, title and genres of the books", + source=[text, text], # Pass the content of the file, 
not the file object + config=graph_config +) + +result = xml_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = xml_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/oneapi/xml_scraper_oneapi.py b/examples/oneapi/xml_scraper_oneapi.py index 5be5716e..15862052 100644 --- a/examples/oneapi/xml_scraper_oneapi.py +++ b/examples/oneapi/xml_scraper_oneapi.py @@ -23,7 +23,7 @@ # Define the configuration for the graph # ************************************************ -openai_key = os.getenv("OPENAI_APIKEY") +openai_key = os.getenv("ONEAPI_KEY") graph_config = { "llm": { diff --git a/examples/openai/csv_scraper_graph_multi_openai.py b/examples/openai/csv_scraper_graph_multi_openai.py new file mode 100644 index 00000000..890765df --- /dev/null +++ b/examples/openai/csv_scraper_graph_multi_openai.py @@ -0,0 +1,56 @@ +""" +Basic example of scraping pipeline using CSVScraperMultiGraph from CSV documents +""" + +import os +from dotenv import load_dotenv +import pandas as pd +from scrapegraphai.graphs import CSVScraperMultiGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info + +load_dotenv() +# ************************************************ +# Read the CSV file +# ************************************************ + +FILE_NAME = "inputs/username.csv" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +text = pd.read_csv(file_path) + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "api_key": "***************************", + "model": "oneapi/qwen-turbo", + "base_url":
"http://127.0.0.1:3000/v1", # Set the OneAPI URL + } +} + +# ************************************************ +# Create the CSVScraperMultiGraph instance and run it +# ************************************************ + +csv_scraper_graph = CSVScraperMultiGraph( + prompt="List me all the last names", + source=[str(text), str(text)], + config=graph_config +) + +result = csv_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = csv_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/openai/xml_scraper_graph_multi_ollama.py b/examples/openai/xml_scraper_graph_multi_ollama.py new file mode 100644 index 00000000..e0edfaef --- /dev/null +++ b/examples/openai/xml_scraper_graph_multi_ollama.py @@ -0,0 +1,57 @@ +""" +Basic example of scraping pipeline using XMLScraperMultiGraph from XML documents +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import XMLScraperMultiGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info +load_dotenv() + +# ************************************************ +# Read the XML file +# ************************************************ + +FILE_NAME = "inputs/books.xml" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "api_key": "***************************", + "model": "oneapi/qwen-turbo", + "base_url": "http://127.0.0.1:3000/v1", # Set the OneAPI URL + } +} + + +# ************************************************ +#
Create the XMLScraperMultiGraph instance and run it +# ************************************************ + +xml_scraper_graph = XMLScraperMultiGraph( + prompt="List me all the authors, title and genres of the books", + source=[text, text], # Pass the content of the file, not the file object + config=graph_config +) + +result = xml_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = xml_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result")