From 5c9843f1410a78568892635e53872793d5ba0d6f Mon Sep 17 00:00:00 2001
From: Marco Perini
Date: Wed, 5 Jun 2024 15:07:21 +0200
Subject: [PATCH 1/2] fix(schema): fixed json output

---
 requirements-dev.lock                            |  8 --------
 scrapegraphai/nodes/generate_answer_csv_node.py  |  7 ++-----
 scrapegraphai/nodes/generate_answer_node.py      | 10 ++++------
 scrapegraphai/nodes/generate_answer_omni_node.py |  7 ++-----
 scrapegraphai/nodes/generate_answer_pdf_node.py  |  6 +++---
 scrapegraphai/nodes/merge_answers_node.py        | 11 +++--------
 6 files changed, 14 insertions(+), 35 deletions(-)

diff --git a/requirements-dev.lock b/requirements-dev.lock
index fcbcdd7d..a1e9a303 100644
--- a/requirements-dev.lock
+++ b/requirements-dev.lock
@@ -185,10 +185,6 @@ idna==3.7
     # via yarl
 imagesize==1.4.1
     # via sphinx
-importlib-metadata==7.1.0
-    # via sphinx
-importlib-resources==6.4.0
-    # via matplotlib
 iniconfig==2.0.0
     # via pytest
 jinja2==3.1.4
@@ -475,7 +471,6 @@ typing-extensions==4.12.0
     # via pyee
     # via sf-hamilton
     # via sqlalchemy
-    # via starlette
     # via streamlit
     # via typer
     # via typing-inspect
@@ -507,6 +502,3 @@ win32-setctime==1.1.0
     # via loguru
 yarl==1.9.4
     # via aiohttp
-zipp==3.19.1
-    # via importlib-metadata
-    # via importlib-resources
diff --git a/scrapegraphai/nodes/generate_answer_csv_node.py b/scrapegraphai/nodes/generate_answer_csv_node.py
index 6f3f5e16..7440d17d 100644
--- a/scrapegraphai/nodes/generate_answer_csv_node.py
+++ b/scrapegraphai/nodes/generate_answer_csv_node.py
@@ -8,7 +8,7 @@
 
 # Imports from Langchain
 from langchain.prompts import PromptTemplate
-from langchain_core.output_parsers import JsonOutputParser, PydanticOutputParser
+from langchain_core.output_parsers import JsonOutputParser
 from langchain_core.runnables import RunnableParallel
 from tqdm import tqdm
 
@@ -96,7 +96,7 @@ def execute(self, state):
 
         # Initialize the output parser
         if self.node_config.get("schema", None) is not None:
-            output_parser = PydanticOutputParser(pydantic_object=self.node_config.get("schema", None))
+            output_parser = JsonOutputParser(pydantic_object=self.node_config["schema"])
         else:
             output_parser = JsonOutputParser()
 
@@ -150,9 +150,6 @@ def execute(self, state):
             single_chain = list(chains_dict.values())[0]
             answer = single_chain.invoke({"question": user_prompt})
 
-        if type(answer) == PydanticOutputParser:
-            answer = answer.model_dump()
-
         # Update the state with the generated answer
         state.update({self.output[0]: answer})
         return state
diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py
index 0cd21732..c7c45d02 100644
--- a/scrapegraphai/nodes/generate_answer_node.py
+++ b/scrapegraphai/nodes/generate_answer_node.py
@@ -7,10 +7,11 @@
 
 # Imports from Langchain
 from langchain.prompts import PromptTemplate
-from langchain_core.output_parsers import JsonOutputParser, PydanticOutputParser
+from langchain_core.output_parsers import JsonOutputParser
 from langchain_core.runnables import RunnableParallel
 from tqdm import tqdm
+
 from ..utils.logging import get_logger
 from ..models import Ollama
 # Imports from the library
 
@@ -81,8 +82,8 @@ def execute(self, state: dict) -> dict:
         doc = input_data[1]
 
         # Initialize the output parser
-        if self.node_config.get("schema",None) is not None:
-            output_parser = PydanticOutputParser(pydantic_object=self.node_config.get("schema", None))
+        if self.node_config.get("schema", None) is not None:
+            output_parser = JsonOutputParser(pydantic_object=self.node_config["schema"])
         else:
             output_parser = JsonOutputParser()
 
@@ -129,9 +130,6 @@ def execute(self, state: dict) -> dict:
             single_chain = list(chains_dict.values())[0]
             answer = single_chain.invoke({"question": user_prompt})
 
-        if type(answer) == PydanticOutputParser:
-            answer = answer.model_dump()
-
         # Update the state with the generated answer
         state.update({self.output[0]: answer})
         return state
diff --git a/scrapegraphai/nodes/generate_answer_omni_node.py b/scrapegraphai/nodes/generate_answer_omni_node.py
index 627033db..480459e3 100644
--- a/scrapegraphai/nodes/generate_answer_omni_node.py
+++ b/scrapegraphai/nodes/generate_answer_omni_node.py
@@ -7,7 +7,7 @@
 
 # Imports from Langchain
 from langchain.prompts import PromptTemplate
-from langchain_core.output_parsers import JsonOutputParser, PydanticOutputParser
+from langchain_core.output_parsers import JsonOutputParser
 from langchain_core.runnables import RunnableParallel
 from tqdm import tqdm
 from ..models import Ollama
@@ -82,7 +82,7 @@ def execute(self, state: dict) -> dict:
 
         # Initialize the output parser
         if self.node_config.get("schema", None) is not None:
-            output_parser = PydanticOutputParser(pydantic_object=self.node_config.get("schema", None))
+            output_parser = JsonOutputParser(pydantic_object=self.node_config["schema"])
         else:
             output_parser = JsonOutputParser()
 
@@ -141,9 +141,6 @@ def execute(self, state: dict) -> dict:
             single_chain = list(chains_dict.values())[0]
             answer = single_chain.invoke({"question": user_prompt})
 
-        if type(answer) == PydanticOutputParser:
-            answer = answer.model_dump()
-
         # Update the state with the generated answer
         state.update({self.output[0]: answer})
         return state
diff --git a/scrapegraphai/nodes/generate_answer_pdf_node.py b/scrapegraphai/nodes/generate_answer_pdf_node.py
index 8457b248..897e1c56 100644
--- a/scrapegraphai/nodes/generate_answer_pdf_node.py
+++ b/scrapegraphai/nodes/generate_answer_pdf_node.py
@@ -7,7 +7,7 @@
 
 # Imports from Langchain
 from langchain.prompts import PromptTemplate
-from langchain_core.output_parsers import JsonOutputParser, PydanticOutputParser
+from langchain_core.output_parsers import JsonOutputParser
 from langchain_core.runnables import RunnableParallel
 from tqdm import tqdm
 from ..models import Ollama
@@ -96,8 +96,8 @@ def execute(self, state):
         doc = input_data[1]
 
         # Initialize the output parser
-        if self.node_config.get("schema",None) is not None:
-            output_parser = PydanticOutputParser(pydantic_object=self.node_config.get("schema", None))
+        if self.node_config.get("schema", None) is not None:
+            output_parser = JsonOutputParser(pydantic_object=self.node_config["schema"])
         else:
             output_parser = JsonOutputParser()
 
diff --git a/scrapegraphai/nodes/merge_answers_node.py b/scrapegraphai/nodes/merge_answers_node.py
index eaeb424e..0efd8ec8 100644
--- a/scrapegraphai/nodes/merge_answers_node.py
+++ b/scrapegraphai/nodes/merge_answers_node.py
@@ -8,7 +8,7 @@
 
 # Imports from Langchain
 from langchain.prompts import PromptTemplate
-from langchain_core.output_parsers import JsonOutputParser, PydanticOutputParser
+from langchain_core.output_parsers import JsonOutputParser
 from tqdm import tqdm
 
 from ..utils.logging import get_logger
@@ -80,10 +80,8 @@ def execute(self, state: dict) -> dict:
             answers_str += f"CONTENT WEBSITE {i+1}: {answer}\n"
 
         # Initialize the output parser
-        if self.node_config["schema"] is not None:
-            output_parser = PydanticOutputParser(
-                pydantic_object=self.node_config["schema"]
-            )
+        if self.node_config.get("schema", None) is not None:
+            output_parser = JsonOutputParser(pydantic_object=self.node_config["schema"])
         else:
             output_parser = JsonOutputParser()
 
@@ -111,9 +109,6 @@ def execute(self, state: dict) -> dict:
         merge_chain = prompt_template | self.llm_model | output_parser
         answer = merge_chain.invoke({"user_prompt": user_prompt})
 
-        if type(answer) == PydanticOutputParser:
-            answer = answer.model_dump()
-
         # Update the state with the generated answer
         state.update({self.output[0]: answer})
         return state

From 5d1fbf806a20746931ebb7fcb32c383d9d549d93 Mon Sep 17 00:00:00 2001
From: Marco Perini
Date: Wed, 5 Jun 2024 18:45:37 +0200
Subject: [PATCH 2/2] feat(indexify-node): add example

---
 .../integrations/indexify_node_example.py   | 72 +++++++++++++++++
 scrapegraphai/integrations/__init__.py      |  3 +-
 scrapegraphai/integrations/indexify_node.py | 79 +++++++++++++++++++
 3 files changed, 153 insertions(+), 1 deletion(-)
 create mode 100644 examples/integrations/indexify_node_example.py
 create mode 100644 scrapegraphai/integrations/indexify_node.py

diff --git a/examples/integrations/indexify_node_example.py b/examples/integrations/indexify_node_example.py
new file mode 100644
index 00000000..07a184ec
--- /dev/null
+++ b/examples/integrations/indexify_node_example.py
@@ -0,0 +1,72 @@
+"""
+Basic example of a scraping pipeline using SmartScraperGraph with a schema and a custom IndexifyNode
+"""
+
+import os, json
+from typing import List
+
+from dotenv import load_dotenv
+load_dotenv()
+
+from pydantic import BaseModel, Field
+from scrapegraphai.graphs import SmartScraperGraph
+from scrapegraphai.integrations import IndexifyNode
+
+
+# ************************************************
+# Define the output schema for the graph
+# ************************************************
+
+class Image(BaseModel):
+    url: str = Field(description="The url of the image")
+
+class Images(BaseModel):
+    images: List[Image]
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+openai_key = os.getenv("OPENAI_APIKEY")
+
+graph_config = {
+    "llm": {
+        "api_key": openai_key,
+        "model": "gpt-3.5-turbo",
+    },
+    "verbose": True,
+    "headless": False,
+}
+
+# ************************************************
+# Define the custom nodes for the graph
+# ************************************************
+
+indexify_node = IndexifyNode(
+    input="answer & img_urls",
+    output=["is_indexed"],
+    node_config={
+        "verbose": True
+    }
+)
+
+# ************************************************
+# Create the SmartScraperGraph instance
+# ************************************************
+
+smart_scraper_graph = SmartScraperGraph(
+    prompt="List me all the images with their url",
+    source="https://giphy.com/",
+    schema=Images,
+    config=graph_config
+)
+
+# Add the custom node to the graph
+smart_scraper_graph.append_node(indexify_node)
+
+# ************************************************
+# Run the SmartScraperGraph
+# ************************************************
+
+result = smart_scraper_graph.run()
+print(json.dumps(result, indent=2))
diff --git a/scrapegraphai/integrations/__init__.py b/scrapegraphai/integrations/__init__.py
index 556ccc2f..be6b4bf7 100644
--- a/scrapegraphai/integrations/__init__.py
+++ b/scrapegraphai/integrations/__init__.py
@@ -2,4 +2,5 @@
 Init file for integrations module
 """
 
-from .burr_bridge import BurrBridge
\ No newline at end of file
+from .burr_bridge import BurrBridge
+from .indexify_node import IndexifyNode
\ No newline at end of file
diff --git a/scrapegraphai/integrations/indexify_node.py b/scrapegraphai/integrations/indexify_node.py
new file mode 100644
index 00000000..e12adc69
--- /dev/null
+++ b/scrapegraphai/integrations/indexify_node.py
@@ -0,0 +1,79 @@
+"""
+IndexifyNode Module
+"""
+
+from typing import List, Optional
+
+from ..utils.logging import get_logger
+from ..nodes.base_node import BaseNode
+
+# try:
+#     import indexify
+# except ImportError:
+#     raise ImportError("indexify package is not installed. Please install it with 'pip install scrapegraphai[indexify]'")
+
+
+class IndexifyNode(BaseNode):
+    """
+    A node responsible for indexing the content present in the state.
+
+    Attributes:
+        verbose (bool): A flag indicating whether to show print statements during execution.
+
+    Args:
+        input (str): Boolean expression defining the input keys needed from the state.
+        output (List[str]): List of output keys to be updated in the state.
+        node_config (dict): Additional configuration for the node.
+        node_name (str): The unique identifier name for the node, defaulting to "Indexify".
+    """
+
+    def __init__(
+        self,
+        input: str,
+        output: List[str],
+        node_config: Optional[dict] = None,
+        node_name: str = "Indexify",
+    ):
+        super().__init__(node_name, "node", input, output, 2, node_config)
+
+        self.verbose = (
+            False if node_config is None else node_config.get("verbose", False)
+        )
+
+    def execute(self, state: dict) -> dict:
+        """
+        Executes the node's logic to index the content present in the state.
+
+        Args:
+            state (dict): The current state of the graph. The input keys will be used to fetch the
+                correct data from the state.
+
+        Returns:
+            dict: The updated state with the output key set to True once the content is indexed.
+
+        Raises:
+            KeyError: If the input keys are not found in the state, indicating that the
+                necessary information for indexing the content is missing.
+        """
+
+        self.logger.info(f"--- Executing {self.node_name} Node ---")
+
+        # Interpret input keys based on the provided input expression;
+        # their number matches the min_input_len parameter passed to __init__,
+        # e.g. "answer & parsed_doc" or "answer | img_urls"
+
+        input_keys = self.get_input_keys(state)
+
+        # Fetch the data to index from the state based on the input keys
+        input_data = [state[key] for key in input_keys]
+
+        answer = input_data[0]
+        img_urls = input_data[1]
+
+        # Index the content
+        # ...
+
+        is_indexed = True
+        state.update({self.output[0]: is_indexed})
+
+        return state
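
Note on the parser swap in PATCH 1/2: JsonOutputParser uses its pydantic_object only to build schema-aware format instructions, and it parses the model output into a plain dict rather than a Pydantic model instance. The removed model_dump() branches were therefore dead code: answer held a dict, and the old check compared its type against the parser class itself, which could never match. Below is a minimal sketch of the behavior the fix relies on, assuming the langchain_core API current at the time of this patch; the Page model is hypothetical and used only for illustration.

# Sketch (assumed API): JsonOutputParser(pydantic_object=...) shapes the
# prompt but returns a plain dict, so no model_dump() call is needed.
from langchain_core.output_parsers import JsonOutputParser
from pydantic import BaseModel, Field

class Page(BaseModel):  # hypothetical schema, not part of the repository
    title: str = Field(description="The page title")

parser = JsonOutputParser(pydantic_object=Page)

# The schema still reaches the LLM through the format instructions...
print(parser.get_format_instructions())

# ...but parsing yields a dict, ready to be stored in the graph state.
result = parser.parse('{"title": "Example"}')
print(result, type(result))  # {'title': 'Example'} <class 'dict'>

If validated Pydantic instances are ever needed again downstream, the dict can be rehydrated with Page(**result).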