diff --git a/cookbook/README.md b/cookbook/README.md deleted file mode 100644 index fe1cb2c6..00000000 --- a/cookbook/README.md +++ /dev/null @@ -1,9 +0,0 @@ -## 📚 Official Cookbook - -Looking for examples and guides? Then head over to the official ScrapeGraph SDK [Cookbook](https://github.com/ScrapeGraphAI/scrapegraph-sdk/tree/main/cookbook)! - -The cookbook provides step-by-step instructions, practical examples, and tips to help you get started and make the most out of ScrapeGraph SDK. - -You will find some colab notebooks with our partners as well, such as Langchain 🦜 and LlamaIndex 🦙 - -Happy scraping! 🚀 diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst index 666322af..30acfb5a 100644 --- a/docs/source/getting_started/installation.rst +++ b/docs/source/getting_started/installation.rst @@ -25,18 +25,18 @@ The library is available on PyPI, so it can be installed using the following com It is higly recommended to install the library in a virtual environment (conda, venv, etc.) -If your clone the repository, it is recommended to use a package manager like `rye `_. -To install the library using rye, you can run the following command: +If you clone the repository, it is recommended to use a package manager like `uv `_. +To install the library using uv, you can run the following command: .. code-block:: bash - rye pin 3.10 - rye sync - rye build + uv python pin 3.10 + uv sync + uv build .. caution:: - **Rye** must be installed first by following the instructions on the `official website `_. + **uv** must be installed first by following the instructions on the `official website `_. 
Additionally on Windows when using WSL ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/docs/source/introduction/overview.rst b/docs/source/introduction/overview.rst index 4e2bd604..4dcaadbe 100644 --- a/docs/source/introduction/overview.rst +++ b/docs/source/introduction/overview.rst @@ -30,37 +30,93 @@ ScrapGraphAI supports a wide range of AI models from various providers. Each mod OpenAI Models ------------- - GPT-3.5 Turbo (16,385 tokens) -- GPT-4 (8,192 tokens) +- GPT-3.5 (4,096 tokens) +- GPT-3.5 Turbo Instruct (4,096 tokens) - GPT-4 Turbo Preview (128,000 tokens) -- GPT-4o (128000 tokens) -- GTP-4o-mini (128000 tokens) +- GPT-4 Vision Preview (128,000 tokens) +- GPT-4 (8,192 tokens) +- GPT-4 32k (32,768 tokens) +- GPT-4o (128,000 tokens) +- O1 Preview (128,000 tokens) +- O1 Mini (128,000 tokens) Azure OpenAI Models ------------------- - GPT-3.5 Turbo (16,385 tokens) -- GPT-4 (8,192 tokens) +- GPT-3.5 (4,096 tokens) - GPT-4 Turbo Preview (128,000 tokens) -- GPT-4o (128000 tokens) -- GTP-4o-mini (128000 tokens) +- GPT-4 (8,192 tokens) +- GPT-4 32k (32,768 tokens) +- GPT-4o (128,000 tokens) +- O1 Preview (128,000 tokens) +- O1 Mini (128,000 tokens) Google AI Models ---------------- - Gemini Pro (128,000 tokens) +- Gemini 1.5 Flash (128,000 tokens) - Gemini 1.5 Pro (128,000 tokens) +- Gemini 1.0 Pro (128,000 tokens) Anthropic Models ---------------- - Claude Instant (100,000 tokens) -- Claude 2 (200,000 tokens) +- Claude 2 (9,000 tokens) +- Claude 2.1 (200,000 tokens) - Claude 3 (200,000 tokens) +- Claude 3.5 (200,000 tokens) +- Claude 3 Opus (200,000 tokens) +- Claude 3 Sonnet (200,000 tokens) +- Claude 3 Haiku (200,000 tokens) Mistral AI Models ----------------- -- Mistral Large (128,000 tokens) +- Mistral Large Latest (128,000 tokens) +- Open Mistral Nemo (128,000 tokens) +- Codestral Latest (32,000 tokens) - Open Mistral 7B (32,000 tokens) - Open Mixtral 8x7B (32,000 tokens) +- Open Mixtral 8x22B (64,000 tokens) +- Open Codestral Mamba (256,000 tokens) 
-For a complete list of supported models and their token limits, please refer to the API documentation. +Ollama Models +------------- +- Command-R (12,800 tokens) +- CodeLlama (16,000 tokens) +- DBRX (32,768 tokens) +- DeepSeek Coder 33B (16,000 tokens) +- Llama2 Series (4,096 tokens) +- Llama3 Series (8,192-128,000 tokens) +- Mistral Models (32,000-128,000 tokens) +- Mixtral 8x22B Instruct (65,536 tokens) +- Phi3 Series (12,800-128,000 tokens) +- Qwen Series (32,000 tokens) + +Hugging Face Models +------------------- +- Grok-1 (8,192 tokens) +- Meta Llama 3 Series (8,192 tokens) +- Google Gemma Series (8,192 tokens) +- Microsoft Phi Series (2,048-131,072 tokens) +- GPT-2 Series (1,024 tokens) +- DeepSeek V2 Series (131,072 tokens) + +Bedrock Models +-------------- +- Claude 3 Series (200,000 tokens) +- Llama2 & Llama3 Series (4,096-8,192 tokens) +- Mistral Series (32,768 tokens) +- Titan Embed Text (8,000 tokens) +- Cohere Embed (512 tokens) + +Fireworks Models +---------------- +- Llama V2 7B (4,096 tokens) +- Mixtral 8x7B Instruct (4,096 tokens) +- Llama 3.1 Series (131,072 tokens) +- Mixtral MoE Series (65,536 tokens) + +For a complete and up-to-date list of supported models and their token limits, please refer to the API documentation. Understanding token limits is crucial for optimizing your scraping tasks. Larger token limits allow for processing more text in a single API call, which can be beneficial for scraping lengthy web pages or documents. @@ -139,3 +195,8 @@ Sponsors :width: 15% :alt: Stat Proxies :target: https://dashboard.statproxies.com/?refferal=scrapegraph + +.. 
image:: ../../assets/scrapedo.png + :width: 11% + :alt: Scrapedo + :target: https://scrape.do diff --git a/docs/source/modules/scrapegraphai.helpers.models_tokens.rst b/docs/source/modules/scrapegraphai.helpers.models_tokens.rst index 173e1bc3..82615b3b 100644 --- a/docs/source/modules/scrapegraphai.helpers.models_tokens.rst +++ b/docs/source/modules/scrapegraphai.helpers.models_tokens.rst @@ -19,7 +19,7 @@ Example usage: print(f"GPT-4 token limit: {gpt4_limit}") # Check the token limit for a specific model - model_name = "gpt-3.5-turbo" + model_name = "gpt-4o-mini" if model_name in models_tokens['openai']: print(f"{model_name} token limit: {models_tokens['openai'][model_name]}") else: diff --git a/docs/source/scrapers/benchmarks.rst b/docs/source/scrapers/benchmarks.rst deleted file mode 100644 index b5521ef1..00000000 --- a/docs/source/scrapers/benchmarks.rst +++ /dev/null @@ -1,23 +0,0 @@ -Benchmarks -========== - -SearchGraph -^^^^^^^^^^^ - -`SearchGraph` instantiates multiple `SmartScraperGraph` object for each URL and extract the data from the HTML. -A concurrent approach is used to speed up the process and the following table shows the time required for a scraping task with different **batch sizes**. -Only two results are taken into account. - -.. 
list-table:: SearchGraph - :header-rows: 1 - - * - Batch Size - - Total Time (s) - * - 1 - - 31.1 - * - 2 - - 33.52 - * - 4 - - 28.47 - * - 16 - - 21.80 diff --git a/examples/ScrapegraphAI_cookbook.ipynb b/examples/ScrapegraphAI_cookbook.ipynb new file mode 100644 index 00000000..b58bf0ea --- /dev/null +++ b/examples/ScrapegraphAI_cookbook.ipynb @@ -0,0 +1,915 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9_CQrFgOj78b" + }, + "outputs": [], + "source": [ + "%%capture\n", + "!pip install scrapegraphai\n", + "!apt install chromium-chromedriver\n", + "!pip install nest_asyncio\n", + "!pip install playwright\n", + "!playwright install" + ] + }, + { + "cell_type": "code", + "source": [ + "import nest_asyncio\n", + "nest_asyncio.apply()" + ], + "metadata": { + "id": "tb33AcRHywFb" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "00a84YVhhxJr" + }, + "outputs": [], + "source": [ + "# correct APIKEY\n", + "OPENAI_API_KEY = \"YOUR API KEY\"" + ] + }, + { + "cell_type": "markdown", + "source": [ + "For more examples visit [the examples folder](https://github.com/ScrapeGraphAI/Scrapegraph-ai/tree/main/examples)" + ], + "metadata": { + "id": "vGDjka17pqqg" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Mrujgp-nlp12" + }, + "source": [ + "# SmartScraperGraph\n", + "**SmartScraperGraph** is a class representing one of the default scraping pipelines. It uses a direct graph implementation where each node has its own function, from retrieving html from a website to extracting relevant information based on your query and generate a coherent answer." 
+ ] + }, + { + "cell_type": "markdown", + "source": [ + "![Screenshot 2024-09-19 alle 17.04.56.png]()" + ], + "metadata": { + "id": "M-dmSB0_zHCQ" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uqYBNOM2YZD9" + }, + "source": [ + "## Using OpenAI models" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ogiF4g5Z-bzG" + }, + "outputs": [], + "source": [ + "from scrapegraphai.graphs import SmartScraperGraph" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7ZzONlJ6-oe_" + }, + "source": [ + "Define the configuration for the graph" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MPZgrZ12-eRc" + }, + "outputs": [], + "source": [ + "graph_config = {\n", + " \"llm\": {\n", + " \"api_key\": OPENAI_API_KEY,\n", + " \"model\": \"openai/gpt-4o-mini\",\n", + " \"temperature\":0,\n", + " },\n", + " \"verbose\":True,\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DjDt_10r-q8P" + }, + "source": [ + "Create the SmartScraperGraph instance and run it" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "aV4VTnx9-h_d" + }, + "outputs": [], + "source": [ + "smart_scraper_graph = SmartScraperGraph(\n", + " prompt=\"List me all the projects with their descriptions.\",\n", + " # also accepts a string with the already downloaded HTML code\n", + " source=\"https://perinim.github.io/projects/\",\n", + " config=graph_config\n", + ")" + ] + }, + { + "cell_type": "code", + "source": [ + "graph_config = {\n", + " \"llm\": {\n", + " \"api_key\": OPENAI_API_KEY,\n", + " \"model\": \"openai/gpt-4o-mini\",\n", + " },\n", + " \"verbose\": True,\n", + " \"headless\": True,\n", + "}\n", + "\n", + "# ************************************************\n", + "# Create the SmartScraperGraph instance and run it\n", + "# ************************************************\n", + "\n", + "smart_scraper_graph = SmartScraperGraph(\n", + " 
prompt=\"List me all the projects with their description\",\n", + " source=\"https://perinim.github.io/projects/\",\n", + " config=graph_config\n", + ")" + ], + "metadata": { + "id": "E3pyGQZLTiZ8" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Zty23idsAtwU", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "419dd75f-18c6-44d2-da82-ca8967d17e0f" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "--- Executing Fetch Node ---\n", + "--- (Fetching HTML from: https://perinim.github.io/projects/) ---\n", + "--- Executing ParseNode Node ---\n", + "--- Executing GenerateAnswer Node ---\n" + ] + } + ], + "source": [ + "result = smart_scraper_graph.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rnGhLGCuAqRU", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "062aeab2-3e96-4fec-d04a-b9acae142f40" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "{\n", + " \"projects\": [\n", + " {\n", + " \"name\": \"Rotary Pendulum RL\",\n", + " \"description\": \"Open Source project aimed at controlling a real life rotary pendulum using RL algorithms\"\n", + " },\n", + " {\n", + " \"name\": \"DQN Implementation from scratch\",\n", + " \"description\": \"Developed a Deep Q-Network algorithm to train a simple and double pendulum\"\n", + " },\n", + " {\n", + " \"name\": \"Multi Agents HAED\",\n", + " \"description\": \"University project which focuses on simulating a multi-agent system to perform environment mapping. Agents, equipped with sensors, explore and record their surroundings, considering uncertainties in their readings.\"\n", + " },\n", + " {\n", + " \"name\": \"Wireless ESC for Modular Drones\",\n", + " \"description\": \"Modular drone architecture proposal and proof of concept. 
The project received maximum grade.\"\n", + " }\n", + " ]\n", + "}\n" + ] + } + ], + "source": [ + "import json\n", + "\n", + "output = json.dumps(result, indent=2)\n", + "\n", + "line_list = output.split(\"\\n\") # Sort of line replacing \"\\n\" with a new line\n", + "\n", + "for line in line_list:\n", + " print(line)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5poLHYLVa-6E" + }, + "source": [ + "# Search graph\n", + "This graph **transforms** the user prompt into an **internet search query**, fetches the relevant URLs, and starts the scraping process. Similar to the **SmartScraperGraph** but with the addition of the **SearchInternetNode** node." + ] + }, + { + "cell_type": "markdown", + "source": [ + "![image.png]()" + ], + "metadata": { + "id": "NRIoaXSzzP8M" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "RIvbQjyhbHhW" + }, + "outputs": [], + "source": [ + "from scrapegraphai.graphs import SearchGraph\n", + "\n", + "# Define the configuration for the graph\n", + "graph_config = {\n", + " \"llm\": {\n", + " \"api_key\": OPENAI_API_KEY,\n", + " \"model\": \"openai/gpt-4o-mini\",\n", + " \"temperature\": 0,\n", + " },\n", + "}\n", + "\n", + "# Create the SearchGraph instance\n", + "search_graph = SearchGraph(\n", + " prompt=\"List me all the European countries. 
Look in wikipedia.\",\n", + " config=graph_config\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XnVtc7SzCkUY" + }, + "outputs": [], + "source": [ + "result = search_graph.run()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3LPAh-yQCqkY" + }, + "source": [ + "Prettify the result and display the JSON" + ] + }, + { + "cell_type": "code", + "source": [ + "import json\n", + "\n", + "output = json.dumps(result, indent=2)\n", + "\n", + "line_list = output.split(\"\\n\") # Sort of line replacing \"\\n\" with a new line\n", + "\n", + "for line in line_list:\n", + " print(line)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "xgnWDLTjzHwv", + "outputId": "f0c8ebf4-5ba5-4330-dbd8-1c9fdd93eaeb" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "{\n", + " \"European_countries\": [\n", + " \"Albania\",\n", + " \"Andorra\",\n", + " \"Armenia\",\n", + " \"Austria\",\n", + " \"Azerbaijan\",\n", + " \"Belarus\",\n", + " \"Belgium\",\n", + " \"Bosnia and Herzegovina\",\n", + " \"Bulgaria\",\n", + " \"Croatia\",\n", + " \"Cyprus\",\n", + " \"Czech Republic\",\n", + " \"Denmark\",\n", + " \"Estonia\",\n", + " \"Finland\",\n", + " \"France\",\n", + " \"Georgia\",\n", + " \"Germany\",\n", + " \"Greece\",\n", + " \"Hungary\",\n", + " \"Iceland\",\n", + " \"Ireland\",\n", + " \"Italy\",\n", + " \"Jersey\",\n", + " \"Isle of Man\",\n", + " \"Kazakhstan\",\n", + " \"Latvia\",\n", + " \"Liechtenstein\",\n", + " \"Lithuania\",\n", + " \"Luxembourg\",\n", + " \"Malta\",\n", + " \"Moldova\",\n", + " \"Monaco\",\n", + " \"Montenegro\",\n", + " \"Netherlands\",\n", + " \"North Macedonia\",\n", + " \"Norway\",\n", + " \"Poland\",\n", + " \"Portugal\",\n", + " \"Romania\",\n", + " \"Russia\",\n", + " \"San Marino\",\n", + " \"Serbia\",\n", + " \"Slovakia\",\n", + " \"Slovenia\",\n", + " \"Spain\",\n", + " \"Sweden\",\n", + " 
\"Switzerland\",\n", + " \"Turkey\",\n", + " \"Ukraine\",\n", + " \"United Kingdom\",\n", + " \"Vatican City\",\n", + " \"Kosovo\",\n", + " \"Gibraltar\",\n", + " \"Faroe Islands\",\n", + " \"Guernsey\",\n", + " \"Jersey\"\n", + " ],\n", + " \"sources\": [\n", + " \"https://simple.wikipedia.org/wiki/List_of_European_countries\",\n", + " \"https://en.wikipedia.org/wiki/List_of_European_countries_by_population\",\n", + " \"https://en.wikipedia.org/wiki/Member_state_of_the_European_Union\"\n", + " ]\n", + "}\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "N5IMdKHvlXFY" + }, + "source": [ + "# SpeechGraph\n", + "**SpeechGraph** is a class representing one of the default scraping pipelines that generate the answer together with an audio file. Similar to the **SmartScraperGraph** but with the addition of the **TextToSpeechNode** node.\n" + ] + }, + { + "cell_type": "markdown", + "source": [ + "![image.png]()" + ], + "metadata": { + "id": "pqJsEVgizs-M" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "W9KhWlT3lXFd" + }, + "outputs": [], + "source": [ + "from scrapegraphai.graphs import SpeechGraph\n", + "\n", + "# Define the configuration for the graph\n", + "graph_config = {\n", + " \"llm\": {\n", + " \"api_key\": OPENAI_API_KEY,\n", + " \"model\": \"gpt-3.5-turbo\",\n", + " },\n", + " \"tts_model\": {\n", + " \"api_key\": OPENAI_API_KEY,\n", + " \"model\": \"tts-1\",\n", + " \"voice\": \"alloy\"\n", + " },\n", + " \"output_path\": \"website_summary.mp3\",\n", + "}\n", + "\n", + "# Create the SpeechGraph instance\n", + "speech_graph = SpeechGraph(\n", + " prompt=\"Create a summary of the website\",\n", + " source=\"https://perinim.github.io/projects/\",\n", + " config=graph_config,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "nVolb3paEczD", + "outputId": "d7d316a0-7580-4a6c-8f20-7e1cb1fc3f07" + 
}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--- Executing Fetch Node ---\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Fetching pages: 100%|##########| 1/1 [00:00<00:00, 17.07it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--- Executing Parse Node ---\n", + "--- Executing RAG Node ---\n", + "--- (updated chunks metadata) ---\n", + "--- (tokens compressed and vector stored) ---\n", + "--- Executing GenerateAnswer Node ---\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Processing chunks: 100%|██████████| 1/1 [00:00<00:00, 339.78it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--- Executing TextToSpeech Node ---\n", + "Audio saved to website_summary.mp3\n" + ] + } + ], + "source": [ + "result = speech_graph.run()\n", + "answer = result.get(\"answer\", \"No answer found\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "znt2EOKZE3z2" + }, + "source": [ + "Prettify the result and display the JSON" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "QqY0TbwbEp-O", + "outputId": "c2b1127d-0c49-4121-922e-39da65c329ee" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "{\n", + " \"summary\": {\n", + " \"title\": \"Projects | Marco Perini\",\n", + " \"projects\": [\n", + " {\n", + " \"title\": \"Rotary Pendulum RL\",\n", + " \"description\": \"Open Source project aimed at controlling a real life rotary pendulum using RL algorithms\"\n", + " },\n", + " {\n", + " \"title\": \"DQN Implementation from scratch\",\n", + " \"description\": \"Developed a Deep Q-Network algorithm to train a simple and double pendulum\"\n", + " },\n", + " {\n", + " \"title\": \"Multi Agents HAED\",\n", + " \"description\": \"University project which focuses on simulating a multi-agent 
system to perform environment mapping. Agents, equipped with sensors, explore and record their surroundings, considering uncertainties in their readings.\"\n", + " },\n", + " {\n", + " \"title\": \"Wireless ESC for Modular Drones\",\n", + " \"description\": \"Modular drone architecture proposal and proof of concept. The project received maximum grade.\"\n", + " }\n", + " ]\n", + " }\n", + "}\n" + ] + } + ], + "source": [ + "import json\n", + "\n", + "output = json.dumps(answer, indent=2)\n", + "\n", + "line_list = output.split(\"\\n\") # Sort of line replacing \"\\n\" with a new line\n", + "\n", + "for line in line_list:\n", + " print(line)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 75 + }, + "id": "lfJ_jVwklXFd", + "outputId": "dc4ad491-4422-4edb-91ae-35775b23168a" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + } + ], + "source": [ + "from IPython.display import Audio\n", + "wn = Audio(\"website_summary.mp3\", autoplay=True)\n", + "display(wn)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "p9kC0x4NuLTx" + }, + "source": [ + "# Build a Custom Graph\n", + "It is possible to **build your own scraping pipeline** by using the default nodes and place them as you wish, without using pre-defined graphs." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Pr6DIqt2uLUI" + }, + "source": [ + "You can create **custom graphs** based on your necessities, using standard nodes provided by the library.\n", + "\n", + "The list of the existing nodes can be found through the *nodes_metadata* json construct.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "-o29vDSIvG4t", + "outputId": "be469b65-ba01-437a-e217-ed1c4f3ad264" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "dict_keys(['SearchInternetNode', 'FetchNode', 'GetProbableTagsNode', 'ParseNode', 'RAGNode', 'GenerateAnswerNode', 'ConditionalNode', 'ImageToTextNode', 'TextToSpeechNode'])" + ] + }, + "metadata": {}, + "execution_count": 17 + } + ], + "source": [ + "# check available nodes\n", + "from scrapegraphai.helpers import nodes_metadata\n", + "\n", + "nodes_metadata.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "829wW5E6vrjJ", + "outputId": "58203025-64ce-4107-f6d3-3b3cfa5537d5" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{'description': 'Converts image content to text by \\n extracting visual information and interpreting it.',\n", + " 'type': 'node',\n", + " 'args': {'image_data': 'Data of the image to be processed.'},\n", + " 'returns': \"Updated state with the textual description of the image under 'image_text' key.\"}" + ] + }, + "metadata": {}, + "execution_count": 18 + } + ], + "source": [ + "# to get more information about a node\n", + "nodes_metadata['ImageToTextNode']" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3pnNFDckwWy7" + }, + "source": [ + "To create a custom graph we must:\n", + "\n", + "1. **Instantiate the nodes** you want to use\n", + "2. 
Create the graph using **BaseGraph** class, which must have a **list of nodes**, tuples representing the **edges** of the graph, an **entry_point**\n", + "3. Run it using the **execute** method\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "eQLZJyg4uLUJ" + }, + "outputs": [], + "source": [ + "from langchain_openai import OpenAIEmbeddings\n", + "from scrapegraphai.models import OpenAI\n", + "from scrapegraphai.graphs import BaseGraph\n", + "from scrapegraphai.nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode\n", + "\n", + "# Define the configuration for the graph\n", + "graph_config = {\n", + " \"llm\": {\n", + " \"api_key\": OPENAI_API_KEY,\n", + " \"model\": \"openai/gpt-4o\",\n", + " \"temperature\": 0,\n", + " \"streaming\": True\n", + " },\n", + "}\n", + "\n", + "llm_model = OpenAI(graph_config[\"llm\"])\n", + "embedder = OpenAIEmbeddings(api_key=llm_model.openai_api_key)\n", + "\n", + "# define the nodes for the graph\n", + "fetch_node = FetchNode(\n", + " input=\"url | local_dir\",\n", + " output=[\"doc\", \"link_urls\", \"img_urls\"],\n", + " node_config={\n", + " \"verbose\": True,\n", + " \"headless\": True,\n", + " }\n", + ")\n", + "parse_node = ParseNode(\n", + " input=\"doc\",\n", + " output=[\"parsed_doc\"],\n", + " node_config={\n", + " \"chunk_size\": 4096,\n", + " \"verbose\": True,\n", + " }\n", + ")\n", + "rag_node = RAGNode(\n", + " input=\"user_prompt & (parsed_doc | doc)\",\n", + " output=[\"relevant_chunks\"],\n", + " node_config={\n", + " \"llm_model\": llm_model,\n", + " \"embedder_model\": embedder,\n", + " \"verbose\": True,\n", + " }\n", + ")\n", + "generate_answer_node = GenerateAnswerNode(\n", + " input=\"user_prompt & (relevant_chunks | parsed_doc | doc)\",\n", + " output=[\"answer\"],\n", + " node_config={\n", + " \"llm_model\": llm_model,\n", + " \"verbose\": True,\n", + " }\n", + ")\n", + "\n", + "# create the graph by defining the nodes and their connections\n", + 
"graph = BaseGraph(\n", + " nodes=[\n", + " fetch_node,\n", + " parse_node,\n", + " rag_node,\n", + " generate_answer_node,\n", + " ],\n", + " edges=[\n", + " (fetch_node, parse_node),\n", + " (parse_node, rag_node),\n", + " (rag_node, generate_answer_node)\n", + " ],\n", + " entry_point=fetch_node\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5FYKF9H1Fvb8", + "outputId": "666d51fe-5e2f-4398-a3b0-bb820960a0d1" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--- Executing Fetch Node ---\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching pages: 100%|##########| 1/1 [00:00<00:00, 28.65it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--- Executing Parse Node ---\n", + "--- Executing RAG Node ---\n", + "--- (updated chunks metadata) ---\n", + "--- (tokens compressed and vector stored) ---\n", + "--- Executing GenerateAnswer Node ---\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing chunks: 100%|██████████| 1/1 [00:00<00:00, 911.01it/s]\n" + ] + } + ], + "source": [ + "# execute the graph\n", + "result, execution_info = graph.execute({\n", + " \"user_prompt\": \"List me the projects with their description\",\n", + " \"url\": \"https://perinim.github.io/projects/\"\n", + "})\n", + "\n", + "# get the answer from the result\n", + "result = result.get(\"answer\", \"No answer found.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JEP8_zZ9GHW2" + }, + "source": [ + "Prettify the result and display the JSON" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "nx9qGaxvFmfT", + "outputId": "fb327a6a-0dfa-417b-8dbb-505bebc96fe8" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"{\n", + " \"projects\": [\n", + " {\n", + " \"title\": \"Rotary Pendulum RL\",\n", + " \"description\": \"Open Source project aimed at controlling a real life rotary pendulum using RL algorithms\"\n", + " },\n", + " {\n", + " \"title\": \"DQN Implementation from scratch\",\n", + " \"description\": \"Developed a Deep Q-Network algorithm to train a simple and double pendulum\"\n", + " },\n", + " {\n", + " \"title\": \"Multi Agents HAED\",\n", + " \"description\": \"University project which focuses on simulating a multi-agent system to perform environment mapping. Agents, equipped with sensors, explore and record their surroundings, considering uncertainties in their readings.\"\n", + " },\n", + " {\n", + " \"title\": \"Wireless ESC for Modular Drones\",\n", + " \"description\": \"Modular drone architecture proposal and proof of concept. The project received maximum grade.\"\n", + " }\n", + " ]\n", + "}\n" + ] + } + ], + "source": [ + "import json\n", + "\n", + "output = json.dumps(result, indent=2)\n", + "\n", + "line_list = output.split(\"\\n\") # Sort of line replacing \"\\n\" with a new line\n", + "\n", + "for line in line_list:\n", + " print(line)" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [ + "N5IMdKHvlXFY" + ], + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/examples/anthropic/.env.example b/examples/anthropic/.env.example deleted file mode 100644 index 2789e380..00000000 --- a/examples/anthropic/.env.example +++ /dev/null @@ -1 +0,0 @@ -ANTHROPIC_API_KEY="YOUR ANTHROPIC API KEY" \ No newline at end of file diff --git a/examples/anthropic/code_generator_graph_anthropic.py b/examples/anthropic/code_generator_graph_anthropic.py deleted file mode 100644 index 71160b8c..00000000 --- a/examples/anthropic/code_generator_graph_anthropic.py +++ /dev/null @@ -1,59 
+0,0 @@ -""" -Basic example of scraping pipeline using Code Generator with schema -""" -import os, json -from typing import List -from dotenv import load_dotenv -from pydantic import BaseModel, Field -from scrapegraphai.graphs import CodeGeneratorGraph - -load_dotenv() - -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Project(BaseModel): - title: str = Field(description="The title of the project") - description: str = Field(description="The description of the project") - -class Projects(BaseModel): - projects: List[Project] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -anthropic_key = os.getenv("ANTHROPIC_API_KEY") - -graph_config = { - "llm": { - "api_key":anthropic_key, - "model": "anthropic/claude-3-haiku-20240307", - }, - "verbose": True, - "headless": False, - "reduction": 2, - "max_iterations": { - "overall": 10, - "syntax": 3, - "execution": 3, - "validation": 3, - "semantic": 3 - }, - "output_file_name": "extracted_data.py" -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -code_generator_graph = CodeGeneratorGraph( - prompt="List me all the projects with their description", - source="https://perinim.github.io/projects/", - schema=Projects, - config=graph_config -) - -result = code_generator_graph.run() -print(result) diff --git a/examples/anthropic/csv_scraper_anthropic.py b/examples/anthropic/csv_scraper_anthropic.py deleted file mode 100644 index 4fd5aaaf..00000000 --- a/examples/anthropic/csv_scraper_anthropic.py +++ /dev/null @@ -1,56 +0,0 @@ -""" -Basic example of scraping pipeline using CSVScraperGraph from CSV documents -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import CSVScraperGraph -from 
scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the CSV file -# ************************************************ - -FILE_NAME = "inputs/username.csv" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r') as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -# required environment variables in .env -# HUGGINGFACEHUB_API_TOKEN -# ANTHROPIC_API_KEY -load_dotenv() - -graph_config = { - "llm": { - "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "anthropic/claude-3-haiku-20240307", - }, -} - -# ************************************************ -# Create the CSVScraperGraph instance and run it -# ************************************************ - -csv_scraper_graph = CSVScraperGraph( - prompt="List me all the last names", - source=text, # Pass the content of the file - config=graph_config -) - -result = csv_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = csv_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) \ No newline at end of file diff --git a/examples/anthropic/csv_scraper_graph_multi_anthropic.py b/examples/anthropic/csv_scraper_graph_multi_anthropic.py deleted file mode 100644 index ed0bcbc5..00000000 --- a/examples/anthropic/csv_scraper_graph_multi_anthropic.py +++ /dev/null @@ -1,50 +0,0 @@ -""" -Basic example of scraping pipeline using CSVScraperMultiGraph from CSV documents -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import CSVScraperMultiGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() -# ************************************************ -# Read the CSV 
file -# ************************************************ - -FILE_NAME = "inputs/username.csv" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r') as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "anthropic/claude-3-haiku-20240307", - }, -} - -# ************************************************ -# Create the CSVScraperMultiGraph instance and run it -# ************************************************ - -csv_scraper_graph = CSVScraperMultiGraph( - prompt="List me all the last names", - source=[str(text), str(text)], - config=graph_config -) - -result = csv_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = csv_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/anthropic/custom_graph_anthropic.py b/examples/anthropic/custom_graph_anthropic.py deleted file mode 100644 index 6df51108..00000000 --- a/examples/anthropic/custom_graph_anthropic.py +++ /dev/null @@ -1,94 +0,0 @@ -""" -Example of custom graph using existing nodes -""" -import os -from dotenv import load_dotenv -from langchain_anthropic import ChatAnthropic -from scrapegraphai.graphs import BaseGraph -from scrapegraphai.nodes import FetchNode, ParseNode, GenerateAnswerNode, RobotsNode -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "claude-3-haiku-20240307", - }, -} - -# ************************************************ -# Define the graph 
nodes -# ************************************************ - -llm_model = ChatAnthropic(graph_config["llm"]) - -# define the nodes for the graph -robot_node = RobotsNode( - input="url", - output=["is_scrapable"], - node_config={ - "llm_model": llm_model, - "force_scraping": True, - "verbose": True, - } -) - -fetch_node = FetchNode( - input="url | local_dir", - output=["doc"], - node_config={ - "verbose": True, - "headless": True, - } -) -parse_node = ParseNode( - input="doc", - output=["parsed_doc"], - node_config={ - "chunk_size": 4096, - "verbose": True, - } -) -generate_answer_node = GenerateAnswerNode( - input="user_prompt & (relevant_chunks | parsed_doc | doc)", - output=["answer"], - node_config={ - "llm_model": llm_model, - "verbose": True, - } -) - -# ************************************************ -# Create the graph by defining the connections -# ************************************************ - -graph = BaseGraph( - nodes=[ - robot_node, - fetch_node, - parse_node, - generate_answer_node, - ], - edges=[ - (robot_node, fetch_node), - (fetch_node, parse_node), - (parse_node, generate_answer_node) - ], - entry_point=robot_node -) - -# ************************************************ -# Execute the graph -# ************************************************ - -result, execution_info = graph.execute({ - "user_prompt": "Describe the content", - "url": "https://example.com/" -}) - -# get the answer from the result -result = result.get("answer", "No answer found.") -print(result) diff --git a/examples/anthropic/depth_search_graph_anthropic.py b/examples/anthropic/depth_search_graph_anthropic.py deleted file mode 100644 index 565934ed..00000000 --- a/examples/anthropic/depth_search_graph_anthropic.py +++ /dev/null @@ -1,28 +0,0 @@ -""" -depth_search_graph_opeani example -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import DepthSearchGraph - -load_dotenv() - -graph_config = { - "llm": { - "api_key": os.getenv("ANTHROPIC_API_KEY"), - 
"model": "anthropic/claude-3-haiku-20240307", - }, - "verbose": True, - "headless": False, - "depth": 2, - "only_inside_links": False, -} - -search_graph = DepthSearchGraph( - prompt="List me all the projects with their description", - source="https://perinim.github.io", - config=graph_config -) - -result = search_graph.run() -print(result) diff --git a/examples/anthropic/document_scraper_anthropic.py b/examples/anthropic/document_scraper_anthropic.py deleted file mode 100644 index a8f253be..00000000 --- a/examples/anthropic/document_scraper_anthropic.py +++ /dev/null @@ -1,42 +0,0 @@ -""" -document_scraper example -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import DocumentScraperGraph - -load_dotenv() - - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "anthropic/claude-3-haiku-20240307", - } -} - - -source = """ - The Divine Comedy, Italian La Divina Commedia, original name La commedia, long narrative poem written in Italian - circa 1308/21 by Dante. It is usually held to be one of the world s great works of literature. - Divided into three major sections—Inferno, Purgatorio, and Paradiso—the narrative traces the journey of Dante - from darkness and error to the revelation of the divine light, culminating in the Beatific Vision of God. - Dante is guided by the Roman poet Virgil, who represents the epitome of human knowledge, from the dark wood - through the descending circles of the pit of Hell (Inferno). He then climbs the mountain of Purgatory, guided - by the Roman poet Statius, who represents the fulfilment of human knowledge, and is finally led by his lifelong love, - the Beatrice of his earlier poetry, through the celestial spheres of Paradise. 
-""" - -pdf_scraper_graph = DocumentScraperGraph( - prompt="Summarize the text and find the main topics", - source=source, - config=graph_config, -) -result = pdf_scraper_graph.run() - -print(json.dumps(result, indent=4)) \ No newline at end of file diff --git a/examples/anthropic/json_scraper_anthropic.py b/examples/anthropic/json_scraper_anthropic.py deleted file mode 100644 index fd5aa4e8..00000000 --- a/examples/anthropic/json_scraper_anthropic.py +++ /dev/null @@ -1,43 +0,0 @@ -""" -Basic example of scraping pipeline using JSONScraperGraph from JSON documents -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import JSONScraperGraph - -load_dotenv() - -# ************************************************ -# Read the JSON file -# ************************************************ - -FILE_NAME = "inputs/example.json" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "anthropic/claude-3-haiku-20240307", - } -} - -# ************************************************ -# Create the JSONScraperGraph instance and run it -# ************************************************ - -json_scraper_graph = JSONScraperGraph( - prompt="List me all the authors, title and genres of the books", - source=text, # Pass the content of the file, not the file object - config=graph_config -) - -result = json_scraper_graph.run() -print(result) diff --git a/examples/anthropic/json_scraper_multi_anthropic.py b/examples/anthropic/json_scraper_multi_anthropic.py deleted file mode 100644 index d016439d..00000000 --- a/examples/anthropic/json_scraper_multi_anthropic.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -Module for 
showing how JSONScraperMultiGraph multi works -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import JSONScraperMultiGraph - -load_dotenv() - -graph_config = { - "llm": { - "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "anthropic/claude-3-haiku-20240307", - }, -} - -FILE_NAME = "inputs/example.json" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -sources = [text, text] - -multiple_search_graph = JSONScraperMultiGraph( - prompt= "List me all the authors, title and genres of the books", - source= sources, - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/anthropic/rate_limit_anthropic.py b/examples/anthropic/rate_limit_anthropic.py deleted file mode 100644 index f9321770..00000000 --- a/examples/anthropic/rate_limit_anthropic.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper while setting an API rate limit. -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - - -# required environment variables in .env -# ANTHROPIC_API_KEY -load_dotenv() - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -graph_config = { - "llm": { - "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "anthropic/claude-3-haiku-20240307", - "rate_limit": { - "requests_per_second": 1 - } - }, -} - -smart_scraper_graph = SmartScraperGraph( - prompt="""Don't say anything else. Output JSON only. 
List me all the events, with the following fields: company_name, event_name, event_start_date, event_start_time, - event_end_date, event_end_time, location, event_mode, event_category, - third_party_redirect, no_of_days, - time_in_hours, hosted_or_attending, refreshments_type, - registration_available, registration_link""", - # also accepts a string with the already downloaded HTML code - source="https://www.hmhco.com/event", - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/anthropic/scrape_plain_text_anthropic.py b/examples/anthropic/scrape_plain_text_anthropic.py deleted file mode 100644 index fd8ebd1d..00000000 --- a/examples/anthropic/scrape_plain_text_anthropic.py +++ /dev/null @@ -1,52 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper from text -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the text file -# ************************************************ - -FILE_NAME = "inputs/plain_html_example.txt" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -# It could be also a http request using the request model -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "anthropic/claude-3-haiku-20240307", - }, -} - -# ************************************************ -# Create 
the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the projects with their description.", - source=text, - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/anthropic/script_generator_anthropic.py b/examples/anthropic/script_generator_anthropic.py deleted file mode 100644 index 8c9333e1..00000000 --- a/examples/anthropic/script_generator_anthropic.py +++ /dev/null @@ -1,42 +0,0 @@ -""" -Basic example of scraping pipeline using ScriptCreatorGraph -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import ScriptCreatorGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "anthropic/claude-3-haiku-20240307", - }, -} - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -script_creator_graph = ScriptCreatorGraph( - prompt="List me all the projects with their description.", - # also accepts a string with the already downloaded HTML code - source="https://perinim.github.io/projects", - config=graph_config -) - -result = script_creator_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = script_creator_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - 
diff --git a/examples/anthropic/script_multi_generator_anthropic.py b/examples/anthropic/script_multi_generator_anthropic.py deleted file mode 100644 index d47e60e9..00000000 --- a/examples/anthropic/script_multi_generator_anthropic.py +++ /dev/null @@ -1,51 +0,0 @@ -""" -Basic example of scraping pipeline using ScriptCreatorGraph -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import ScriptCreatorMultiGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "anthropic/claude-3-haiku-20240307", - }, - "library": "beautifulsoup" -} - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -urls=[ - "https://schultzbergagency.com/emil-raste-karlsen/", - "https://schultzbergagency.com/johanna-hedberg/", -] - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -script_creator_graph = ScriptCreatorMultiGraph( - prompt="Find information about actors", - # also accepts a string with the already downloaded HTML code - source=urls, - config=graph_config -) - -result = script_creator_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = script_creator_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/anthropic/search_graph_anthropic.py b/examples/anthropic/search_graph_anthropic.py deleted file mode 100644 index 0e1d7b45..00000000 --- a/examples/anthropic/search_graph_anthropic.py +++ /dev/null @@ -1,43 +0,0 @@ -""" -Example 
of Search Graph -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SearchGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "anthropic/claude-3-haiku-20240307", - }, -} - -# ************************************************ -# Create the SearchGraph instance and run it -# ************************************************ - -search_graph = SearchGraph( - prompt="List me Chioggia's famous dishes", - config=graph_config -) - -result = search_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = search_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json and csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/anthropic/search_graph_schema_anthropic.py b/examples/anthropic/search_graph_schema_anthropic.py deleted file mode 100644 index 926e72ea..00000000 --- a/examples/anthropic/search_graph_schema_anthropic.py +++ /dev/null @@ -1,44 +0,0 @@ -""" -Example of Search Graph -""" -import os -from typing import List -from dotenv import load_dotenv -from pydantic import BaseModel, Field -from scrapegraphai.graphs import SearchGraph - -load_dotenv() - -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Dish(BaseModel): - name: str = Field(description="The name of the dish") - description: str = Field(description="The description of the dish") - -class Dishes(BaseModel): - dishes: List[Dish] - -# ************************************************ -# Define the 
configuration for the graph -# ************************************************ -graph_config = { - "llm": { - "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "anthropic/claude-3-haiku-20240307", - }, -} - -# ************************************************ -# Create the SearchGraph instance and run it -# ************************************************ - -search_graph = SearchGraph( - prompt="List me Chioggia's famous dishes", - config=graph_config, - schema=Dishes -) - -result = search_graph.run() -print(result) diff --git a/examples/anthropic/search_link_graph_anthropic.py b/examples/anthropic/search_link_graph_anthropic.py deleted file mode 100644 index 70798cf3..00000000 --- a/examples/anthropic/search_link_graph_anthropic.py +++ /dev/null @@ -1,45 +0,0 @@ -""" -Example of Search Graph -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SearchGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info -from langchain_openai import AzureChatOpenAI -from langchain_openai import AzureOpenAIEmbeddings - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -load_dotenv() - -graph_config = { - "llm": { - "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "anthropic/claude-3-haiku-20240307", - }, -} - -# ************************************************ -# Create the SearchGraph instance and run it -# ************************************************ - -search_graph = SearchGraph( - prompt="List me the best escursions near Trento", - config=graph_config -) - -result = search_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = search_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json and csv -convert_to_csv(result, "result") 
-convert_to_json(result, "result") diff --git a/examples/anthropic/smart_scraper_anthropic.py b/examples/anthropic/smart_scraper_anthropic.py deleted file mode 100644 index 7eb655d5..00000000 --- a/examples/anthropic/smart_scraper_anthropic.py +++ /dev/null @@ -1,41 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper using Azure OpenAI Key -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -graph_config = { - "llm": { - "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "anthropic/claude-3-haiku-20240307", - }, -} - -smart_scraper_graph = SmartScraperGraph( - prompt="""Don't say anything else. Output JSON only. List me all the events, with the following fields: company_name, event_name, event_start_date, event_start_time, - event_end_date, event_end_time, location, event_mode, event_category, - third_party_redirect, no_of_days, - time_in_hours, hosted_or_attending, refreshments_type, - registration_available, registration_link""", - # also accepts a string with the already downloaded HTML code - source="https://www.hmhco.com/event", - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/anthropic/smart_scraper_lite_anthropic.py b/examples/anthropic/smart_scraper_lite_anthropic.py deleted file mode 100644 index 698623c6..00000000 --- a/examples/anthropic/smart_scraper_lite_anthropic.py +++ /dev/null @@ -1,32 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os 
-import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperLiteGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -graph_config = { - "llm": { - "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "anthropic/claude-3-haiku-20240307", - }, - "verbose": True, - "headless": False, -} - -smart_scraper_lite_graph = SmartScraperLiteGraph( - prompt="Who is Marco Perini?", - source="https://perinim.github.io/", - config=graph_config -) - -result = smart_scraper_lite_graph.run() -print(json.dumps(result, indent=4)) - -graph_exec_info = smart_scraper_lite_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - diff --git a/examples/anthropic/smart_scraper_multi_anthropic.py b/examples/anthropic/smart_scraper_multi_anthropic.py deleted file mode 100644 index e4dc0aca..00000000 --- a/examples/anthropic/smart_scraper_multi_anthropic.py +++ /dev/null @@ -1,56 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiGraph - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -load_dotenv() - -graph_config = { - "llm": { - "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "anthropic/claude-3-haiku-20240307", - }, -} - -# ******************************************************* -# Create the SmartScraperMultiGraph instance and run it -# ******************************************************* - -multiple_search_graph = SmartScraperMultiGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) - -# ******************************************************* -# Create the SmartScraperMultiGraph 
instance and run it -# ******************************************************* - -multiple_search_graph = SmartScraperMultiGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/anthropic/smart_scraper_multi_concat_anthropic.py b/examples/anthropic/smart_scraper_multi_concat_anthropic.py deleted file mode 100644 index d5c65a14..00000000 --- a/examples/anthropic/smart_scraper_multi_concat_anthropic.py +++ /dev/null @@ -1,38 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiConcatGraph - -load_dotenv() - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -graph_config = { - "llm": { - "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "anthropic/claude-3-haiku-20240307", - }, -} - - -# ******************************************************* -# Create the SmartScraperMultiGraph instance and run it -# ******************************************************* - -multiple_search_graph = SmartScraperMultiConcatGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/anthropic/smart_scraper_multi_lite_anthropic.py b/examples/anthropic/smart_scraper_multi_lite_anthropic.py deleted file mode 100644 index 7cf3c09d..00000000 --- a/examples/anthropic/smart_scraper_multi_lite_anthropic.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv 
-from scrapegraphai.graphs import SmartScraperMultiLiteGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -graph_config = { - "llm": { - "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "anthropic/claude-3-haiku-20240307", - }, - "verbose": True, - "headless": False, -} - -smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - config=graph_config -) - -result = smart_scraper_multi_lite_graph.run() -print(json.dumps(result, indent=4)) - -graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - diff --git a/examples/anthropic/smart_scraper_schema_anthropic.py b/examples/anthropic/smart_scraper_schema_anthropic.py deleted file mode 100644 index 3cebd257..00000000 --- a/examples/anthropic/smart_scraper_schema_anthropic.py +++ /dev/null @@ -1,51 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper using Azure OpenAI Key -""" -import os -from typing import List -from pydantic import BaseModel, Field -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Project(BaseModel): - title: str = Field(description="The title of the project") - description: str = Field(description="The description of the project") - -class Projects(BaseModel): - projects: List[Project] - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -graph_config = { - "llm": { - "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "anthropic/claude-3-haiku-20240307", - }, -} - -smart_scraper_graph = SmartScraperGraph( - 
prompt="List me all the projects with their description", - # also accepts a string with the already downloaded HTML code - schema=Projects, - source="https://perinim.github.io/projects/", - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/anthropic/xml_scraper_anthropic.py b/examples/anthropic/xml_scraper_anthropic.py deleted file mode 100644 index 5568f0a3..00000000 --- a/examples/anthropic/xml_scraper_anthropic.py +++ /dev/null @@ -1,55 +0,0 @@ -""" -Basic example of scraping pipeline using XMLScraperGraph from XML documents -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import XMLScraperGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the XML file -# ************************************************ - -FILE_NAME = "inputs/books.xml" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "anthropic/claude-3-haiku-20240307", - }, -} - -# ************************************************ -# Create the XMLScraperGraph instance and run it -# ************************************************ - -xml_scraper_graph = XMLScraperGraph( - prompt="List me all the authors, title and genres of the books", - source=text, # Pass the content of the file, not the file object - config=graph_config -) 
- -result = xml_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = xml_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/anthropic/xml_scraper_graph_multi_anthropic.py b/examples/anthropic/xml_scraper_graph_multi_anthropic.py deleted file mode 100644 index 577e2e1d..00000000 --- a/examples/anthropic/xml_scraper_graph_multi_anthropic.py +++ /dev/null @@ -1,55 +0,0 @@ -""" -Basic example of scraping pipeline using XMLScraperMultiGraph from XML documents -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import XMLScraperMultiGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the XML file -# ************************************************ - -FILE_NAME = "inputs/books.xml" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "anthropic/claude-3-haiku-20240307", - }, -} - -# ************************************************ -# Create the XMLScraperMultiGraph instance and run it -# ************************************************ - -xml_scraper_graph = XMLScraperMultiGraph( - prompt="List me all the authors, title and genres of the books", - source=[text, text], # Pass the content of the file, not the file object - config=graph_config -) - -result = xml_scraper_graph.run() -print(result) - -# 
************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = xml_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/azure/code_generator_graph_azure.py b/examples/azure/code_generator_graph_azure.py deleted file mode 100644 index 7dc13602..00000000 --- a/examples/azure/code_generator_graph_azure.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -Basic example of scraping pipeline using Code Generator with schema -""" -import os -from typing import List -from dotenv import load_dotenv -from pydantic import BaseModel, Field -from scrapegraphai.graphs import CodeGeneratorGraph - -load_dotenv() - -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Project(BaseModel): - title: str = Field(description="The title of the project") - description: str = Field(description="The description of the project") - -class Projects(BaseModel): - projects: List[Project] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-4o" - }, - "verbose": True, - "headless": False, - "reduction": 2, - "max_iterations": { - "overall": 10, - "syntax": 3, - "execution": 3, - "validation": 3, - "semantic": 3 - }, - "output_file_name": "extracted_data.py" -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -code_generator_graph = CodeGeneratorGraph( - prompt="List me all the projects with their description", - source="https://perinim.github.io/projects/", - schema=Projects, - 
config=graph_config -) - -result = code_generator_graph.run() -print(result) diff --git a/examples/azure/csv_scraper_azure.py b/examples/azure/csv_scraper_azure.py deleted file mode 100644 index 5bc9ca50..00000000 --- a/examples/azure/csv_scraper_azure.py +++ /dev/null @@ -1,56 +0,0 @@ -""" -Basic example of scraping pipeline using CSVScraperGraph from CSV documents -""" -import os -from dotenv import load_dotenv -import pandas as pd -from scrapegraphai.graphs import CSVScraperGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the CSV file -# ************************************************ - -FILE_NAME = "inputs/username.csv" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -text = pd.read_csv(file_path) - -# ************************************************ -# Define the configuration for the graph -# ************************************************ -graph_config = { - "llm": { - "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-4o" - }, - "verbose": True, - "headless": False -} - -# ************************************************ -# Create the CSVScraperGraph instance and run it -# ************************************************ - -csv_scraper_graph = CSVScraperGraph( - prompt="List me all the last names", - source=str(text), # Pass the content of the file, not the file object - config=graph_config -) - -result = csv_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = csv_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/azure/csv_scraper_graph_multi_azure.py 
b/examples/azure/csv_scraper_graph_multi_azure.py deleted file mode 100644 index 0c599427..00000000 --- a/examples/azure/csv_scraper_graph_multi_azure.py +++ /dev/null @@ -1,55 +0,0 @@ -""" -Basic example of scraping pipeline using CSVScraperMultiGraph from CSV documents -""" -import os -from dotenv import load_dotenv -import pandas as pd -from scrapegraphai.graphs import CSVScraperMultiGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() -# ************************************************ -# Read the CSV file -# ************************************************ - -FILE_NAME = "inputs/username.csv" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -text = pd.read_csv(file_path) - -# ************************************************ -# Define the configuration for the graph -# ************************************************ -graph_config = { - "llm": { - "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-4o" - }, - "verbose": True, - "headless": False -} - -# ************************************************ -# Create the CSVScraperMultiGraph instance and run it -# ************************************************ - -csv_scraper_graph = CSVScraperMultiGraph( - prompt="List me all the last names", - source=[str(text), str(text)], - config=graph_config -) - -result = csv_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = csv_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/azure/depth_search_graph_azure.py b/examples/azure/depth_search_graph_azure.py deleted file mode 100644 index 96ccc23e..00000000 --- a/examples/azure/depth_search_graph_azure.py +++ /dev/null 
@@ -1,28 +0,0 @@ -""" -depth_search_graph_azure example -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import DepthSearchGraph - -load_dotenv() - -graph_config = { - "llm": { - "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-4o", - }, - "verbose": True, - "headless": False, - "depth": 2, - "only_inside_links": False, -} - -search_graph = DepthSearchGraph( - prompt="List me all the projects with their description", - source="https://perinim.github.io", - config=graph_config -) - -result = search_graph.run() -print(result) diff --git a/examples/azure/document_scraper_azure.py b/examples/azure/document_scraper_azure.py deleted file mode 100644 index 43f00678..00000000 --- a/examples/azure/document_scraper_azure.py +++ /dev/null @@ -1,44 +0,0 @@ -""" -document_scraper example -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import DocumentScraperGraph - -load_dotenv() - - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-4o" - }, - "verbose": True, - "headless": False -} - - -source = """ - The Divine Comedy, Italian La Divina Commedia, original name La commedia, long narrative poem written in Italian - circa 1308/21 by Dante. It is usually held to be one of the world s great works of literature. - Divided into three major sections—Inferno, Purgatorio, and Paradiso—the narrative traces the journey of Dante - from darkness and error to the revelation of the divine light, culminating in the Beatific Vision of God. - Dante is guided by the Roman poet Virgil, who represents the epitome of human knowledge, from the dark wood - through the descending circles of the pit of Hell (Inferno). 
He then climbs the mountain of Purgatory, guided - by the Roman poet Statius, who represents the fulfilment of human knowledge, and is finally led by his lifelong love, - the Beatrice of his earlier poetry, through the celestial spheres of Paradise. -""" - -pdf_scraper_graph = DocumentScraperGraph( - prompt="Summarize the text and find the main topics", - source=source, - config=graph_config, -) -result = pdf_scraper_graph.run() - -print(json.dumps(result, indent=4)) \ No newline at end of file diff --git a/examples/azure/json_scraper_azure.py b/examples/azure/json_scraper_azure.py deleted file mode 100644 index 5224f9bb..00000000 --- a/examples/azure/json_scraper_azure.py +++ /dev/null @@ -1,45 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper using Azure OpenAI Key -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import JSONScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -FILE_NAME = "inputs/example.json" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Initialize the model instances -# ************************************************ - -graph_config = { - "llm": { - "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-4o" - }, - "verbose": True, - "headless": False -} - -smart_scraper_graph = JSONScraperGraph( - prompt="List me all the authors, title and genres of the books", - source=text, # Pass the content of the file, not the file object - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git 
a/examples/azure/json_scraper_multi_azure.py b/examples/azure/json_scraper_multi_azure.py deleted file mode 100644 index 93ac02e3..00000000 --- a/examples/azure/json_scraper_multi_azure.py +++ /dev/null @@ -1,38 +0,0 @@ -""" -Module for showing how JSONScraperMultiGraph multi works -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import JSONScraperMultiGraph - -load_dotenv() - - -graph_config = { - "llm": { - "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-4o" - }, - "verbose": True, - "headless": False -} - -FILE_NAME = "inputs/example.json" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -sources = [text, text] - -multiple_search_graph = JSONScraperMultiGraph( - prompt= "List me all the authors, title and genres of the books", - source= sources, - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/azure/rate_limit_azure.py b/examples/azure/rate_limit_azure.py deleted file mode 100644 index aa0f943d..00000000 --- a/examples/azure/rate_limit_azure.py +++ /dev/null @@ -1,56 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper with a custom rate limit -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - - -# required environment variable in .env -# AZURE_OPENAI_ENDPOINT -# AZURE_OPENAI_CHAT_DEPLOYMENT_NAME -# MODEL_NAME -# AZURE_OPENAI_API_KEY -# OPENAI_API_TYPE -# AZURE_OPENAI_API_VERSION -# AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME -load_dotenv() - - -# ************************************************ -# Initialize the model instances -# ************************************************ - -graph_config = { - "llm": { - "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": 
"azure_openai/gpt-4o", - "rate_limit": { - "requests_per_second": 1 - }, - }, - "verbose": True, - "headless": False -} - -smart_scraper_graph = SmartScraperGraph( - prompt="""List me all the events, with the following fields: company_name, event_name, event_start_date, event_start_time, - event_end_date, event_end_time, location, event_mode, event_category, - third_party_redirect, no_of_days, - time_in_hours, hosted_or_attending, refreshments_type, - registration_available, registration_link""", - # also accepts a string with the already downloaded HTML code - source="https://www.hmhco.com/event", - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/azure/scrape_plain_text_azure.py b/examples/azure/scrape_plain_text_azure.py deleted file mode 100644 index 0beb1526..00000000 --- a/examples/azure/scrape_plain_text_azure.py +++ /dev/null @@ -1,54 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper from text -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the text file -# ************************************************ - -FILE_NAME = "inputs/plain_html_example.txt" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -# It could be also a http request using the request model -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "api_key": 
os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-4o" - }, - "verbose": True, - "headless": False -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the projects with their description.", - source=text, - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/azure/script_generator_azure.py b/examples/azure/script_generator_azure.py deleted file mode 100644 index 5eb40b1c..00000000 --- a/examples/azure/script_generator_azure.py +++ /dev/null @@ -1,43 +0,0 @@ -""" -Basic example of scraping pipeline using ScriptCreatorGraph -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import ScriptCreatorGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ -graph_config = { - "llm": { - "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-4o" - }, - "verbose": True, - "headless": False -} - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -script_creator_graph = ScriptCreatorGraph( - prompt="List me all the projects with their description.", - # also accepts a string with the already downloaded HTML code - source="https://perinim.github.io/projects", - config=graph_config -) - -result = script_creator_graph.run() -print(result) - -# ************************************************ -# Get graph 
execution info -# ************************************************ - -graph_exec_info = script_creator_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - diff --git a/examples/azure/script_multi_generator_azure.py b/examples/azure/script_multi_generator_azure.py deleted file mode 100644 index 6bb94051..00000000 --- a/examples/azure/script_multi_generator_azure.py +++ /dev/null @@ -1,52 +0,0 @@ -""" -Basic example of scraping pipeline using ScriptCreatorGraph -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import ScriptCreatorMultiGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-4o" - }, - "verbose": True, - "headless": False -} - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -urls=[ - "https://schultzbergagency.com/emil-raste-karlsen/", - "https://schultzbergagency.com/johanna-hedberg/", -] - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -script_creator_graph = ScriptCreatorMultiGraph( - prompt="Find information about actors", - # also accepts a string with the already downloaded HTML code - source=urls, - config=graph_config -) - -result = script_creator_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = script_creator_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/azure/search_graph_azure.py b/examples/azure/search_graph_azure.py deleted file 
mode 100644 index 8c7d9a9e..00000000 --- a/examples/azure/search_graph_azure.py +++ /dev/null @@ -1,51 +0,0 @@ -""" -Example of Search Graph -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SearchGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info -load_dotenv() - -FILE_NAME = "inputs/example.json" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Initialize the model instances -# ************************************************ - -graph_config = { - "llm": { - "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-4o" - }, - "verbose": True, - "headless": False -} - -# ************************************************ -# Create the SearchGraph instance and run it -# ************************************************ - -search_graph = SearchGraph( - prompt="List me the best escursions near Trento", - config=graph_config -) - -result = search_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = search_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json and csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/azure/search_graph_schema_azure.py b/examples/azure/search_graph_schema_azure.py deleted file mode 100644 index bc22f7bc..00000000 --- a/examples/azure/search_graph_schema_azure.py +++ /dev/null @@ -1,60 +0,0 @@ -""" -Example of Search Graph -""" -import os -from typing import List -from dotenv import load_dotenv -from scrapegraphai.graphs import SearchGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -from pydantic import BaseModel, Field - 
-load_dotenv() - -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Dish(BaseModel): - name: str = Field(description="The name of the dish") - description: str = Field(description="The description of the dish") - -class Dishes(BaseModel): - dishes: List[Dish] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-4o" - }, - "verbose": True, - "headless": False -} - -# ************************************************ -# Create the SearchGraph instance and run it -# ************************************************ - -search_graph = SearchGraph( - prompt="List me Chioggia's famous dishes", - config=graph_config, - schema=Dishes -) - -result = search_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = search_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json and csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/azure/search_link_graph_azure.py b/examples/azure/search_link_graph_azure.py deleted file mode 100644 index 42ed07ad..00000000 --- a/examples/azure/search_link_graph_azure.py +++ /dev/null @@ -1,45 +0,0 @@ -""" -Example of Search Graph -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SearchGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -load_dotenv() - -graph_config = { - "llm": { - "api_key": os.environ["AZURE_OPENAI_KEY"], - 
"model": "azure_openai/gpt-4o" - }, - "verbose": True, - "headless": False -} - -# ************************************************ -# Create the SearchGraph instance and run it -# ************************************************ - -search_graph = SearchGraph( - prompt="List me the best escursions near Trento", - config=graph_config -) - -result = search_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = search_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json and csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/azure/smart_scraper_azure.py b/examples/azure/smart_scraper_azure.py deleted file mode 100644 index 11643a6d..00000000 --- a/examples/azure/smart_scraper_azure.py +++ /dev/null @@ -1,44 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper using Azure OpenAI Key -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Initialize the model instances -# ************************************************ - -graph_config = { - "llm": { - "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-4o" - }, - "verbose": True, - "headless": False -} - -smart_scraper_graph = SmartScraperGraph( - prompt="""List me all the events, with the following fields: - company_name, event_name, event_start_date, event_start_time, - event_end_date, event_end_time, location, event_mode, event_category, - third_party_redirect, no_of_days, - time_in_hours, hosted_or_attending, refreshments_type, - registration_available, registration_link""", - # also accepts a string with the already downloaded HTML code - source="https://www.hmhco.com/event", - config=graph_config -) - -result 
= smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/azure/smart_scraper_lite_azure.py b/examples/azure/smart_scraper_lite_azure.py deleted file mode 100644 index 335c4832..00000000 --- a/examples/azure/smart_scraper_lite_azure.py +++ /dev/null @@ -1,31 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperLiteGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -graph_config = { - "llm": { - "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-4o" - }, - "verbose": True, - "headless": False -} - -smart_scraper_lite_graph = SmartScraperLiteGraph( - prompt="Who is Marco Perini?", - source="https://perinim.github.io/", - config=graph_config -) - -result = smart_scraper_lite_graph.run() -print(json.dumps(result, indent=4)) - -graph_exec_info = smart_scraper_lite_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/azure/smart_scraper_multi_azure.py b/examples/azure/smart_scraper_multi_azure.py deleted file mode 100644 index e066eaf1..00000000 --- a/examples/azure/smart_scraper_multi_azure.py +++ /dev/null @@ -1,38 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiGraph - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ -graph_config = { - "llm": { - "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-4o" - }, - "verbose": True, - "headless": False -} - -# 
******************************************************* -# Create the SmartScraperMultiGraph instance and run it -# ******************************************************* - -multiple_search_graph = SmartScraperMultiGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/azure/smart_scraper_multi_concat_azure.py b/examples/azure/smart_scraper_multi_concat_azure.py deleted file mode 100644 index 072cb190..00000000 --- a/examples/azure/smart_scraper_multi_concat_azure.py +++ /dev/null @@ -1,38 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiConcatGraph - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ -graph_config = { - "llm": { - "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-4o" - }, - "verbose": True, - "headless": False -} - -# ******************************************************* -# Create the SmartScraperMultiGraph instance and run it -# ******************************************************* - -multiple_search_graph = SmartScraperMultiConcatGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/azure/smart_scraper_multi_lite_azure.py b/examples/azure/smart_scraper_multi_lite_azure.py deleted file mode 100644 index b9046d9f..00000000 --- a/examples/azure/smart_scraper_multi_lite_azure.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" 
-import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiLiteGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -graph_config = { - "llm": { - "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-4o" - }, - "verbose": True, - "headless": False -} - -smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - config=graph_config -) - -result = smart_scraper_multi_lite_graph.run() -print(json.dumps(result, indent=4)) - -graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - diff --git a/examples/azure/smart_scraper_schema_azure.py b/examples/azure/smart_scraper_schema_azure.py deleted file mode 100644 index 28d8b87e..00000000 --- a/examples/azure/smart_scraper_schema_azure.py +++ /dev/null @@ -1,49 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper with schema -""" -import os -import json -from typing import List -from pydantic import BaseModel, Field -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph - -load_dotenv() - -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Project(BaseModel): - title: str = Field(description="The title of the project") - description: str = Field(description="The description of the project") - -class Projects(BaseModel): - projects: List[Project] - -# ************************************************ -# Initialize the model instances -# ************************************************ - -graph_config = { - "llm": { - "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-4o" - }, - "verbose": True, - "headless": False -} - -# ************************************************ -# Create the 
SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the projects with their description", - source="https://perinim.github.io/projects/", - schema=Projects, - config=graph_config -) - -result = smart_scraper_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/azure/xml_scraper_azure.py b/examples/azure/xml_scraper_azure.py deleted file mode 100644 index cd53242c..00000000 --- a/examples/azure/xml_scraper_azure.py +++ /dev/null @@ -1,46 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper using Azure OpenAI Key -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import XMLScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -FILE_NAME = "inputs/books.xml" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - - -# ************************************************ -# Initialize the model instances -# ************************************************ - -graph_config = { - "llm": { - "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-4o" - }, - "verbose": True, - "headless": False -} - -smart_scraper_graph = XMLScraperGraph( - prompt="List me all the authors, title and genres of the books", - source=text, # Pass the content of the file, not the file object - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/azure/xml_scraper_graph_multi_azure.py b/examples/azure/xml_scraper_graph_multi_azure.py deleted file mode 100644 index e7aaf382..00000000 --- 
a/examples/azure/xml_scraper_graph_multi_azure.py +++ /dev/null @@ -1,56 +0,0 @@ -""" -Basic example of scraping pipeline using XMLScraperMultiGraph from XML documents -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import XMLScraperMultiGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the XML file -# ************************************************ - -FILE_NAME = "inputs/books.xml" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ -graph_config = { - "llm": { - "api_key": os.environ["AZURE_OPENAI_KEY"], - "model": "azure_openai/gpt-4o", - }, - "verbose": True, - "headless": False -} - -# ************************************************ -# Create the XMLScraperMultiGraph instance and run it -# ************************************************ - -xml_scraper_graph = XMLScraperMultiGraph( - prompt="List me all the authors, title and genres of the books", - source=[text, text], # Pass the content of the file, not the file object - config=graph_config -) - -result = xml_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = xml_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/bedrock/.env.example b/examples/bedrock/.env.example deleted file mode 100644 index cd27769e..00000000 --- a/examples/bedrock/.env.example +++ /dev/null @@ -1,4 +0,0 @@ -AWS_ACCESS_KEY_ID="..." 
-AWS_SECRET_ACCESS_KEY="..." -AWS_SESSION_TOKEN="..." -AWS_DEFAULT_REGION="..." \ No newline at end of file diff --git a/examples/bedrock/README.md b/examples/bedrock/README.md deleted file mode 100644 index 88edd82c..00000000 --- a/examples/bedrock/README.md +++ /dev/null @@ -1,3 +0,0 @@ -This folder contains examples of how to use ScrapeGraphAI with [Amazon Bedrock](https://aws.amazon.com/bedrock/) ⛰️. The examples show how to extract information from websites and files using a natural language prompt. - -![](scrapegraphai_bedrock.png) \ No newline at end of file diff --git a/examples/bedrock/code_generator_graph_bedrock.py b/examples/bedrock/code_generator_graph_bedrock.py deleted file mode 100644 index 7a0561fe..00000000 --- a/examples/bedrock/code_generator_graph_bedrock.py +++ /dev/null @@ -1,60 +0,0 @@ -""" -Basic example of scraping pipeline using Code Generator with schema -""" - -import os, json -from typing import List -from dotenv import load_dotenv -from pydantic import BaseModel, Field -from scrapegraphai.graphs import CodeGeneratorGraph - -load_dotenv() - -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Project(BaseModel): - title: str = Field(description="The title of the project") - description: str = Field(description="The description of the project") - -class Projects(BaseModel): - projects: List[Project] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - - -graph_config = { - "llm": { - "client": "client_name", - "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", - "temperature": 0.0 - }, - "verbose": True, - "headless": False, - "reduction": 2, - "max_iterations": { - "overall": 10, - "syntax": 3, - "execution": 3, - "validation": 3, - "semantic": 3 - }, - "output_file_name": "extracted_data.py" -} - -# 
************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -code_generator_graph = CodeGeneratorGraph( - prompt="List me all the projects with their description", - source="https://perinim.github.io/projects/", - schema=Projects, - config=graph_config -) - -result = code_generator_graph.run() -print(result) \ No newline at end of file diff --git a/examples/bedrock/csv_scraper_bedrock.py b/examples/bedrock/csv_scraper_bedrock.py deleted file mode 100644 index cf453ab3..00000000 --- a/examples/bedrock/csv_scraper_bedrock.py +++ /dev/null @@ -1,50 +0,0 @@ -""" -Basic example of scraping pipeline using CSVScraperGraph from CSV documents -""" - -import os -import json - -from dotenv import load_dotenv - -import pandas as pd - -from scrapegraphai.graphs import CSVScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the CSV file -# ************************************************ - -FILE_NAME = "inputs/username.csv" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -text = pd.read_csv(file_path) - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "client": "client_name", - "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", - "temperature": 0.0 - } -} -# ************************************************ -# Create the CSVScraperGraph instance and run it -# ************************************************ - -csv_scraper_graph = CSVScraperGraph( - prompt="List me all the last names", - source=str(text), # Pass the content of the file, not the file object - config=graph_config -) - -result = csv_scraper_graph.run() -print(json.dumps(result, indent=4)) - diff --git 
a/examples/bedrock/csv_scraper_graph_multi_bedrock.py b/examples/bedrock/csv_scraper_graph_multi_bedrock.py deleted file mode 100644 index b9dd7f6f..00000000 --- a/examples/bedrock/csv_scraper_graph_multi_bedrock.py +++ /dev/null @@ -1,56 +0,0 @@ -""" -Basic example of scraping pipeline using CSVScraperMultiGraph from CSV documents -""" - -import os -from dotenv import load_dotenv -import pandas as pd -from scrapegraphai.graphs import CSVScraperMultiGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() -# ************************************************ -# Read the CSV file -# ************************************************ - -FILE_NAME = "inputs/username.csv" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -text = pd.read_csv(file_path) - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "client": "client_name", - "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", - "temperature": 0.0 - } -} - -# ************************************************ -# Create the CSVScraperMultiGraph instance and run it -# ************************************************ - -csv_scraper_graph = CSVScraperMultiGraph( - prompt="List me all the last names", - source=[str(text), str(text)], - config=graph_config -) - -result = csv_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = csv_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/bedrock/custom_graph_bedrock.py b/examples/bedrock/custom_graph_bedrock.py deleted file mode 100644 index d72f6999..00000000 --- 
a/examples/bedrock/custom_graph_bedrock.py +++ /dev/null @@ -1,125 +0,0 @@ -""" -Example of custom graph using existing nodes -""" - -import json - -from dotenv import load_dotenv - -from langchain_aws import BedrockEmbeddings -from scrapegraphai.models import Bedrock -from scrapegraphai.graphs import BaseGraph -from scrapegraphai.nodes import ( - FetchNode, - ParseNode, - RAGNode, - GenerateAnswerNode, - RobotsNode -) - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "client": "client_name", - "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", - "temperature": 0.0 - } -} - -# ************************************************ -# Define the graph nodes -# ************************************************ - -llm_model = Bedrock({ - 'model_id': graph_config["llm"]["model"].split("/")[-1], - 'model_kwargs': { - 'temperature': 0.0 - }}) -embedder = BedrockEmbeddings(model_id=graph_config["embeddings"]["model"].split("/")[-1]) - -# Define the nodes for the graph -robot_node = RobotsNode( - input="url", - output=["is_scrapable"], - node_config={ - "llm_model": llm_model, - "force_scraping": True, - "verbose": True, - } -) - -fetch_node = FetchNode( - input="url | local_dir", - output=["doc"], - node_config={ - "verbose": True, - "headless": True, - } -) - -parse_node = ParseNode( - input="doc", - output=["parsed_doc"], - node_config={ - "chunk_size": 4096, - "verbose": True, - } -) - -rag_node = RAGNode( - input="user_prompt & (parsed_doc | doc)", - output=["relevant_chunks"], - node_config={ - "llm_model": llm_model, - "embedder_model": embedder, - "verbose": True, - } -) - -generate_answer_node = GenerateAnswerNode( - input="user_prompt & (relevant_chunks | parsed_doc | doc)", - output=["answer"], - node_config={ - "llm_model": llm_model, - "verbose": True, - } -) - -# ************************************************ 
-# Create the graph by defining the connections -# ************************************************ - -graph = BaseGraph( - nodes=[ - robot_node, - fetch_node, - parse_node, - rag_node, - generate_answer_node, - ], - edges=[ - (robot_node, fetch_node), - (fetch_node, parse_node), - (parse_node, rag_node), - (rag_node, generate_answer_node) - ], - entry_point=robot_node -) - -# ************************************************ -# Execute the graph -# ************************************************ - -result, execution_info = graph.execute({ - "user_prompt": "List me all the articles", - "url": "https://perinim.github.io/projects" -}) - -# Get the answer from the result -result = result.get("answer", "No answer found.") -print(json.dumps(result, indent=4)) diff --git a/examples/bedrock/depth_search_graph_bedrock.py b/examples/bedrock/depth_search_graph_bedrock.py deleted file mode 100644 index 243547a4..00000000 --- a/examples/bedrock/depth_search_graph_bedrock.py +++ /dev/null @@ -1,25 +0,0 @@ -""" -depth_search_graph_opeani example -""" -from scrapegraphai.graphs import DepthSearchGraph - -graph_config = { - "llm": { - "client": "client_name", - "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", - "temperature": 0.0 - }, - "verbose": True, - "headless": False, - "depth": 2, - "only_inside_links": False, -} - -search_graph = DepthSearchGraph( - prompt="List me all the projects with their description", - source="https://perinim.github.io", - config=graph_config -) - -result = search_graph.run() -print(result) diff --git a/examples/bedrock/document_scraper_bedrock.py b/examples/bedrock/document_scraper_bedrock.py deleted file mode 100644 index f9b99e1f..00000000 --- a/examples/bedrock/document_scraper_bedrock.py +++ /dev/null @@ -1,42 +0,0 @@ -""" -document_scraper example -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import DocumentScraperGraph - -load_dotenv() - - -# ************************************************ 
-# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "client": "client_name", - "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", - "temperature": 0.0 - } -} - -source = """ - The Divine Comedy, Italian La Divina Commedia, original name La commedia, long narrative poem written in Italian - circa 1308/21 by Dante. It is usually held to be one of the world s great works of literature. - Divided into three major sections—Inferno, Purgatorio, and Paradiso—the narrative traces the journey of Dante - from darkness and error to the revelation of the divine light, culminating in the Beatific Vision of God. - Dante is guided by the Roman poet Virgil, who represents the epitome of human knowledge, from the dark wood - through the descending circles of the pit of Hell (Inferno). He then climbs the mountain of Purgatory, guided - by the Roman poet Statius, who represents the fulfilment of human knowledge, and is finally led by his lifelong love, - the Beatrice of his earlier poetry, through the celestial spheres of Paradise. -""" - -pdf_scraper_graph = DocumentScraperGraph( - prompt="Summarize the text and find the main topics", - source=source, - config=graph_config, -) -result = pdf_scraper_graph.run() - -print(json.dumps(result, indent=4)) \ No newline at end of file diff --git a/examples/bedrock/inputs/books.xml b/examples/bedrock/inputs/books.xml deleted file mode 100644 index e3d1fe87..00000000 --- a/examples/bedrock/inputs/books.xml +++ /dev/null @@ -1,120 +0,0 @@ - - - - Gambardella, Matthew - XML Developer's Guide - Computer - 44.95 - 2000-10-01 - An in-depth look at creating applications - with XML. - - - Ralls, Kim - Midnight Rain - Fantasy - 5.95 - 2000-12-16 - A former architect battles corporate zombies, - an evil sorceress, and her own childhood to become queen - of the world. 
- - - Corets, Eva - Maeve Ascendant - Fantasy - 5.95 - 2000-11-17 - After the collapse of a nanotechnology - society in England, the young survivors lay the - foundation for a new society. - - - Corets, Eva - Oberon's Legacy - Fantasy - 5.95 - 2001-03-10 - In post-apocalypse England, the mysterious - agent known only as Oberon helps to create a new life - for the inhabitants of London. Sequel to Maeve - Ascendant. - - - Corets, Eva - The Sundered Grail - Fantasy - 5.95 - 2001-09-10 - The two daughters of Maeve, half-sisters, - battle one another for control of England. Sequel to - Oberon's Legacy. - - - Randall, Cynthia - Lover Birds - Romance - 4.95 - 2000-09-02 - When Carla meets Paul at an ornithology - conference, tempers fly as feathers get ruffled. - - - Thurman, Paula - Splish Splash - Romance - 4.95 - 2000-11-02 - A deep sea diver finds true love twenty - thousand leagues beneath the sea. - - - Knorr, Stefan - Creepy Crawlies - Horror - 4.95 - 2000-12-06 - An anthology of horror stories about roaches, - centipedes, scorpions and other insects. - - - Kress, Peter - Paradox Lost - Science Fiction - 6.95 - 2000-11-02 - After an inadvertant trip through a Heisenberg - Uncertainty Device, James Salway discovers the problems - of being quantum. - - - O'Brien, Tim - Microsoft .NET: The Programming Bible - Computer - 36.95 - 2000-12-09 - Microsoft's .NET initiative is explored in - detail in this deep programmer's reference. - - - O'Brien, Tim - MSXML3: A Comprehensive Guide - Computer - 36.95 - 2000-12-01 - The Microsoft MSXML3 parser is covered in - detail, with attention to XML DOM interfaces, XSLT processing, - SAX and more. - - - Galos, Mike - Visual Studio 7: A Comprehensive Guide - Computer - 49.95 - 2001-04-16 - Microsoft Visual Studio 7 is explored in depth, - looking at how Visual Basic, Visual C++, C#, and ASP+ are - integrated into a comprehensive development - environment. 
- - \ No newline at end of file diff --git a/examples/bedrock/inputs/example.json b/examples/bedrock/inputs/example.json deleted file mode 100644 index d729b76a..00000000 --- a/examples/bedrock/inputs/example.json +++ /dev/null @@ -1,38 +0,0 @@ -{ - "quiz": { - "sport": { - "q1": { - "question": "Which one is correct team name in NBA?", - "options": [ - "New York Bulls", - "Los Angeles Kings", - "Golden State Warriros", - "Huston Rocket" - ], - "answer": "Huston Rocket" - } - }, - "maths": { - "q1": { - "question": "5 + 7 = ?", - "options": [ - "10", - "11", - "12", - "13" - ], - "answer": "12" - }, - "q2": { - "question": "12 - 8 = ?", - "options": [ - "1", - "2", - "3", - "4" - ], - "answer": "4" - } - } - } -} \ No newline at end of file diff --git a/examples/bedrock/inputs/username.csv b/examples/bedrock/inputs/username.csv deleted file mode 100644 index 8c039d7e..00000000 --- a/examples/bedrock/inputs/username.csv +++ /dev/null @@ -1,6 +0,0 @@ -Username; Identifier;First name;Last name -booker12;9012;Rachel;Booker -grey07;2070;Laura;Grey -johnson81;4081;Craig;Johnson -jenkins46;9346;Mary;Jenkins -smith79;5079;Jamie;Smith \ No newline at end of file diff --git a/examples/bedrock/json_scraper_bedrock.py b/examples/bedrock/json_scraper_bedrock.py deleted file mode 100644 index c34cb1bd..00000000 --- a/examples/bedrock/json_scraper_bedrock.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -Basic example of scraping pipeline using JSONScraperGraph from JSON documents -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import JSONScraperGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the JSON file -# ************************************************ - -FILE_NAME = "inputs/example.json" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', 
encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "client": "client_name", - "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", - "temperature": 0.0 - } -} - -# ************************************************ -# Create the JSONScraperGraph instance and run it -# ************************************************ - -json_scraper_graph = JSONScraperGraph( - prompt="List me all questions and options in the math section, no answers.", - source=text, # Pass the content of the file, not the file object - config=graph_config -) - -result = json_scraper_graph.run() -print(json.dumps(result, indent=4)) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = json_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/bedrock/json_scraper_multi_bedrock.py b/examples/bedrock/json_scraper_multi_bedrock.py deleted file mode 100644 index 5848ef17..00000000 --- a/examples/bedrock/json_scraper_multi_bedrock.py +++ /dev/null @@ -1,32 +0,0 @@ -""" -Module for showing how JSONScraperMultiGraph multi works -""" -import os -import json -from scrapegraphai.graphs import JSONScraperMultiGraph - -graph_config = { - "llm": { - "client": "client_name", - "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", - "temperature": 0.0 - } -} -FILE_NAME = "inputs/example.json" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -sources = [text, text] - -multiple_search_graph = JSONScraperMultiGraph( - prompt= "List me all the authors, title and 
genres of the books", - source= sources, - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/bedrock/rate_limit_bedrock.py b/examples/bedrock/rate_limit_bedrock.py deleted file mode 100644 index 98e2e3db..00000000 --- a/examples/bedrock/rate_limit_bedrock.py +++ /dev/null @@ -1,44 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper with a custom rate limit -""" -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "client": "client_name", - "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", - "temperature": 0.0, - "rate_limit": { - "requests_per_second": 1 - }, - } -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the projects with their description", - # also accepts a string with the already downloaded HTML code - source="https://perinim.github.io/projects/", - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/bedrock/scrape_plain_text_bedrock.py b/examples/bedrock/scrape_plain_text_bedrock.py deleted file mode 100644 index 1a89786e..00000000 --- a/examples/bedrock/scrape_plain_text_bedrock.py +++ /dev/null @@ -1,54 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper from text -""" -import os -import json 
-from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the text file -# ************************************************ - -FILE_NAME = "inputs/plain_html_example.txt" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -# It could be also a http request using the request model -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "client": "client_name", - "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", - "temperature": 0.0 - } -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the projects with their description.", - source=text, - config=graph_config -) - -result = smart_scraper_graph.run() -print(json.dumps(result, indent=4)) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/bedrock/scrapegraphai_bedrock.png b/examples/bedrock/scrapegraphai_bedrock.png deleted file mode 100644 index 918cf191..00000000 Binary files a/examples/bedrock/scrapegraphai_bedrock.png and /dev/null differ diff --git a/examples/bedrock/script_generator_bedrock.py b/examples/bedrock/script_generator_bedrock.py deleted file mode 100644 index 4adb13f1..00000000 --- a/examples/bedrock/script_generator_bedrock.py +++ /dev/null @@ -1,43 +0,0 @@ -""" -Basic example of scraping pipeline using 
ScriptCreatorGraph -""" -from dotenv import load_dotenv -from scrapegraphai.graphs import ScriptCreatorGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "client": "client_name", - "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", - "temperature": 0.0 - }, - "library": "beautifulsoup" -} - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -script_creator_graph = ScriptCreatorGraph( - prompt="List me all the projects with their description.", - # also accepts a string with the already downloaded HTML code - source="https://perinim.github.io/projects", - config=graph_config -) - -result = script_creator_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = script_creator_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - diff --git a/examples/bedrock/script_multi_generator_bedrock.py b/examples/bedrock/script_multi_generator_bedrock.py deleted file mode 100644 index 2491a1f9..00000000 --- a/examples/bedrock/script_multi_generator_bedrock.py +++ /dev/null @@ -1,48 +0,0 @@ -""" -Basic example of scraping pipeline using ScriptCreatorGraph -""" -from scrapegraphai.graphs import ScriptCreatorMultiGraph -from scrapegraphai.utils import prettify_exec_info - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "client": "client_name", - "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", - "temperature": 0.0 - }, - "library": "beautifulsoup" -} - -# 
************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -urls=[ - "https://schultzbergagency.com/emil-raste-karlsen/", - "https://schultzbergagency.com/johanna-hedberg/", -] - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -script_creator_graph = ScriptCreatorMultiGraph( - prompt="Find information about actors", - # also accepts a string with the already downloaded HTML code - source=urls, - config=graph_config -) - -result = script_creator_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = script_creator_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/bedrock/search_graph_bedrock.py b/examples/bedrock/search_graph_bedrock.py deleted file mode 100644 index 6369f647..00000000 --- a/examples/bedrock/search_graph_bedrock.py +++ /dev/null @@ -1,27 +0,0 @@ -""" -Example of Search Graph -""" -from scrapegraphai.graphs import SearchGraph - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "client": "client_name", - "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", - "temperature": 0.0 - } -} -# ************************************************ -# Create the SearchGraph instance and run it -# ************************************************ - -search_graph = SearchGraph( - prompt="List me Chioggia's famous dishes", - config=graph_config -) - -result = search_graph.run() -print(result) diff --git a/examples/bedrock/search_graph_schema_bedrock.py b/examples/bedrock/search_graph_schema_bedrock.py deleted file mode 100644 index 55ad772c..00000000 --- 
a/examples/bedrock/search_graph_schema_bedrock.py +++ /dev/null @@ -1,54 +0,0 @@ -""" -Example of Search Graph -""" -from typing import List -from pydantic import BaseModel, Field -from scrapegraphai.graphs import SearchGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Dish(BaseModel): - name: str = Field(description="The name of the dish") - description: str = Field(description="The description of the dish") - -class Dishes(BaseModel): - dishes: List[Dish] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "client": "client_name", - "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", - "temperature": 0.0 - } -} - -# ************************************************ -# Create the SearchGraph instance and run it -# ************************************************ - -search_graph = SearchGraph( - prompt="List me Chioggia's famous dishes", - config=graph_config, - schema=Dishes -) - -result = search_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = search_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json and csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/bedrock/search_link_graph_bedrock.py b/examples/bedrock/search_link_graph_bedrock.py deleted file mode 100644 index 64e62710..00000000 --- a/examples/bedrock/search_link_graph_bedrock.py +++ /dev/null @@ -1,40 +0,0 @@ -""" -Example of Search Graph -""" -from scrapegraphai.graphs import SearchGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, 
prettify_exec_info - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "client": "client_name", - "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", - "temperature": 0.0 - } -} - -# ************************************************ -# Create the SearchGraph instance and run it -# ************************************************ - -search_graph = SearchGraph( - prompt="List me the best escursions near Trento", - config=graph_config -) - -result = search_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = search_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json and csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/bedrock/smart_scraper_bedrock.py b/examples/bedrock/smart_scraper_bedrock.py deleted file mode 100644 index d63f1ece..00000000 --- a/examples/bedrock/smart_scraper_bedrock.py +++ /dev/null @@ -1,42 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "client": "client_name", - "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", - "temperature": 0.0 - } -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the projects with their description", - # also accepts a string with the already 
downloaded HTML code - source="https://perinim.github.io/projects/", - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/bedrock/smart_scraper_lite_bedrock.py b/examples/bedrock/smart_scraper_lite_bedrock.py deleted file mode 100644 index 2bf0471c..00000000 --- a/examples/bedrock/smart_scraper_lite_bedrock.py +++ /dev/null @@ -1,26 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import json -from scrapegraphai.graphs import SmartScraperLiteGraph -from scrapegraphai.utils import prettify_exec_info - -graph_config = { - "llm": { - "client": "client_name", - "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", - "temperature": 0.0 - } -} - -smart_scraper_lite_graph = SmartScraperLiteGraph( - prompt="Who is Marco Perini?", - source="https://perinim.github.io/", - config=graph_config -) - -result = smart_scraper_lite_graph.run() -print(json.dumps(result, indent=4)) - -graph_exec_info = smart_scraper_lite_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/bedrock/smart_scraper_multi_bedrock.py b/examples/bedrock/smart_scraper_multi_bedrock.py deleted file mode 100644 index 9de097b0..00000000 --- a/examples/bedrock/smart_scraper_multi_bedrock.py +++ /dev/null @@ -1,34 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import json -from scrapegraphai.graphs import SmartScraperMultiGraph - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "client": "client_name", - "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", - "temperature": 0.0 - } -} - -# 
******************************************************* -# Create the SmartScraperMultiGraph instance and run it -# ******************************************************* - -multiple_search_graph = SmartScraperMultiGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/bedrock/smart_scraper_multi_concat_bedrock.py b/examples/bedrock/smart_scraper_multi_concat_bedrock.py deleted file mode 100644 index 74c30a3f..00000000 --- a/examples/bedrock/smart_scraper_multi_concat_bedrock.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import json -from scrapegraphai.graphs import SmartScraperMultiConcatGraph - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "client": "client_name", - "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", - "temperature": 0.0 - } -} - - -# ******************************************************* -# Create the SmartScraperMultiGraph instance and run it -# ******************************************************* - -multiple_search_graph = SmartScraperMultiConcatGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/bedrock/smart_scraper_multi_lite_bedrock.py b/examples/bedrock/smart_scraper_multi_lite_bedrock.py deleted file mode 100644 index 5cb26067..00000000 --- a/examples/bedrock/smart_scraper_multi_lite_bedrock.py +++ /dev/null @@ -1,29 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import json -from scrapegraphai.graphs 
import SmartScraperMultiLiteGraph -from scrapegraphai.utils import prettify_exec_info - -graph_config = { - "llm": { - "client": "client_name", - "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", - "temperature": 0.0 - } -} - -smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - config=graph_config -) - -result = smart_scraper_multi_lite_graph.run() -print(json.dumps(result, indent=4)) - -graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/bedrock/smart_scraper_schema_bedrock.py b/examples/bedrock/smart_scraper_schema_bedrock.py deleted file mode 100644 index 2829efec..00000000 --- a/examples/bedrock/smart_scraper_schema_bedrock.py +++ /dev/null @@ -1,52 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -from typing import List -from pydantic import BaseModel, Field -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Project(BaseModel): - title: str = Field(description="The title of the project") - description: str = Field(description="The description of the project") - -class Projects(BaseModel): - projects: List[Project] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "client": "client_name", - "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", - "temperature": 0.0 - } -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - 
prompt="List me all the projects with their description", - # also accepts a string with the already downloaded HTML code - source="https://perinim.github.io/projects/", - schema=Projects, - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/bedrock/xml_scraper_bedrock.py b/examples/bedrock/xml_scraper_bedrock.py deleted file mode 100644 index 2110fc9f..00000000 --- a/examples/bedrock/xml_scraper_bedrock.py +++ /dev/null @@ -1,58 +0,0 @@ -""" -Basic example of scraping pipeline using XMLScraperGraph from XML documents -""" - -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import XMLScraperGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the XML file -# ************************************************ - -FILE_NAME = "inputs/books.xml" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "client": "client_name", - "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", - "temperature": 0.0 - }, -} - -# ************************************************ -# Create the XMLScraperGraph instance and run it -# ************************************************ - -xml_scraper_graph = XMLScraperGraph( - prompt="List me all the authors, title and genres of the books. 
Skip the preamble.", - source=text, # Pass the content of the file, not the file object - config=graph_config -) - -result = xml_scraper_graph.run() -print(json.dumps(result, indent=4)) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = xml_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/bedrock/xml_scraper_graph_multi_bedrock.py b/examples/bedrock/xml_scraper_graph_multi_bedrock.py deleted file mode 100644 index ab7bd4ad..00000000 --- a/examples/bedrock/xml_scraper_graph_multi_bedrock.py +++ /dev/null @@ -1,56 +0,0 @@ -""" -Basic example of scraping pipeline using XMLScraperMultiGraph from XML documents -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import XMLScraperMultiGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the XML file -# ************************************************ - -FILE_NAME = "inputs/books.xml" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "client": "client_name", - "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", - "temperature": 0.0 - }, -} - -# ************************************************ -# Create the XMLScraperMultiGraph instance and run it -# ************************************************ - -xml_scraper_graph = XMLScraperMultiGraph( - prompt="List me all the authors, title and genres of the books", - source=[text, 
text], # Pass the content of the file, not the file object - config=graph_config -) - -result = xml_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = xml_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/benchmarks/GenerateScraper/.env.example b/examples/benchmarks/GenerateScraper/.env.example deleted file mode 100644 index 599a2397..00000000 --- a/examples/benchmarks/GenerateScraper/.env.example +++ /dev/null @@ -1 +0,0 @@ -OPENAI_APIKEY="your openai key here" \ No newline at end of file diff --git a/examples/benchmarks/GenerateScraper/Readme.md b/examples/benchmarks/GenerateScraper/Readme.md deleted file mode 100644 index 79201d22..00000000 --- a/examples/benchmarks/GenerateScraper/Readme.md +++ /dev/null @@ -1,43 +0,0 @@ -# Local models -# Local models -The two websites benchmark are: -- Example 1: https://perinim.github.io/projects -- Example 2: https://www.wired.com (at 17/4/2024) - -Both are strored locally as txt file in .txt format because in this way we do not have to think about the internet connection - -The time is measured in seconds - -The model runned for this benchmark is Mistral on Ollama with nomic-embed-text - -| Hardware | Model | Example 1 | Example 2 | -| ---------------------- | --------------------------------------- | --------- | --------- | -| Macbook 14' m1 pro | Mistral on Ollama with nomic-embed-text | 30.54s | 35.76s | -| Macbook m2 max | Mistral on Ollama with nomic-embed-text | | | -| Macbook 14' m1 pro
| Llama3 on Ollama with nomic-embed-text | 27.82s | 29.986s | -| Macbook m2 max
| Llama3 on Ollama with nomic-embed-text | | | - - -**Note**: the examples on Docker are not runned on other devices than the Macbook because the performance are to slow (10 times slower than Ollama). -# Performance on APIs services -### Example 1: personal portfolio -**URL**: https://perinim.github.io/projects -**Task**: List me all the projects with their description. - -| Name | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD | -| --------------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- | -| gpt-3.5-turbo | 24.21 | 1892 | 1802 | 90 | 1 | 0.002883 | -| gpt-4-turbo-preview | 6.614 | 1936 | 1802 | 134 | 1 | 0.02204 | -| Grooq with nomic-embed-text | 6.71 | 2201 | 2024 | 177 | 1 | 0 | - -### Example 2: Wired -**URL**: https://www.wired.com -**Task**: List me all the articles with their description. - -| Name | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD | -| --------------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- | -| gpt-3.5-turbo | | | | | | | -| gpt-4-turbo-preview | | | | | | | -| Grooq with nomic-embed-text | | | | | | | - - diff --git a/examples/benchmarks/GenerateScraper/benchmark_docker.py b/examples/benchmarks/GenerateScraper/benchmark_docker.py deleted file mode 100644 index e69de29b..00000000 diff --git a/examples/benchmarks/GenerateScraper/benchmark_groq.py b/examples/benchmarks/GenerateScraper/benchmark_groq.py deleted file mode 100644 index bef4e8b6..00000000 --- a/examples/benchmarks/GenerateScraper/benchmark_groq.py +++ /dev/null @@ -1,61 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper from text -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import ScriptCreatorGraph -from scrapegraphai.utils 
import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the text file -# ************************************************ -files = ["inputs/example_1.txt", "inputs/example_2.txt"] -tasks = ["List me all the projects with their description.", - "List me all the articles with their description."] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -groq_key = os.getenv("GROQ_APIKEY") - -graph_config = { - "llm": { - "model": "groq/gemma-7b-it", - "api_key": groq_key, - "temperature": 0 - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - "base_url": "http://localhost:11434", # set ollama URL arbitrarily - }, - "headless": False, - "library": "beautifoulsoup" -} - - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -for i in range(0, 2): - with open(files[i], 'r', encoding="utf-8") as file: - text = file.read() - - smart_scraper_graph = ScriptCreatorGraph( - prompt=tasks[i], - source=text, - config=graph_config - ) - - result = smart_scraper_graph.run() - print(result) - # ************************************************ - # Get graph execution info - # ************************************************ - - graph_exec_info = smart_scraper_graph.get_execution_info() - print(prettify_exec_info(graph_exec_info)) diff --git a/examples/benchmarks/GenerateScraper/benchmark_llama3.py b/examples/benchmarks/GenerateScraper/benchmark_llama3.py deleted file mode 100644 index a80b2e71..00000000 --- a/examples/benchmarks/GenerateScraper/benchmark_llama3.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper from text -""" - -from scrapegraphai.graphs import ScriptCreatorGraph -from scrapegraphai.utils import prettify_exec_info - -# 
************************************************ -# Read the text file -# ************************************************ -files = ["inputs/example_1.txt", "inputs/example_2.txt"] -tasks = ["List me all the projects with their description.", - "List me all the articles with their description."] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - - -graph_config = { - "llm": { - "model": "ollama/llama3", - "temperature": 0, - # "model_tokens": 2000, # set context length arbitrarily, - "base_url": "http://localhost:11434", # set ollama URL arbitrarily - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - "base_url": "http://localhost:11434", # set ollama URL arbitrarily - }, - "library": "beautifoulsoup" -} - - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -for i in range(0, 2): - with open(files[i], 'r', encoding="utf-8") as file: - text = file.read() - - smart_scraper_graph = ScriptCreatorGraph( - prompt=tasks[i], - source=text, - config=graph_config - ) - - result = smart_scraper_graph.run() - print(result) - # ************************************************ - # Get graph execution info - # ************************************************ - - graph_exec_info = smart_scraper_graph.get_execution_info() - print(prettify_exec_info(graph_exec_info)) diff --git a/examples/benchmarks/GenerateScraper/benchmark_mistral.py b/examples/benchmarks/GenerateScraper/benchmark_mistral.py deleted file mode 100644 index 87219eb4..00000000 --- a/examples/benchmarks/GenerateScraper/benchmark_mistral.py +++ /dev/null @@ -1,62 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper from text -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import ScriptCreatorGraph -from scrapegraphai.utils import 
prettify_exec_info -load_dotenv() - -# ************************************************ -# Read the text file -# ************************************************ -files = ["inputs/example_1.txt", "inputs/example_2.txt"] -tasks = ["List me all the projects with their description.", - "List me all the articles with their description."] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -openai_key = os.getenv("GPT4_KEY") - - -graph_config = { - "llm": { - "model": "ollama/mistral", - "temperature": 0, - # "model_tokens": 2000, # set context length arbitrarily, - "base_url": "http://localhost:11434", # set ollama URL arbitrarily - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - "base_url": "http://localhost:11434", # set ollama URL arbitrarily - }, - "library": "beautifoulsoup" -} - - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -for i in range(0, 2): - with open(files[i], 'r', encoding="utf-8") as file: - text = file.read() - - smart_scraper_graph = ScriptCreatorGraph( - prompt=tasks[i], - source=text, - config=graph_config - ) - - result = smart_scraper_graph.run() - print(result) - # ************************************************ - # Get graph execution info - # ************************************************ - - graph_exec_info = smart_scraper_graph.get_execution_info() - print(prettify_exec_info(graph_exec_info)) diff --git a/examples/benchmarks/GenerateScraper/benchmark_openai_gpt35.py b/examples/benchmarks/GenerateScraper/benchmark_openai_gpt35.py deleted file mode 100644 index 83ed3913..00000000 --- a/examples/benchmarks/GenerateScraper/benchmark_openai_gpt35.py +++ /dev/null @@ -1,53 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper from text -""" - -import os -from dotenv import load_dotenv -from 
scrapegraphai.graphs import ScriptCreatorGraph -from scrapegraphai.utils import prettify_exec_info -load_dotenv() - -# ************************************************ -# Read the text file -# ************************************************ -files = ["inputs/example_1.txt", "inputs/example_2.txt"] -tasks = ["List me all the projects with their description.", - "List me all the articles with their description."] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -openai_key = os.getenv("OPENAI_APIKEY") - -graph_config = { - "llm": { - "api_key": openai_key, - "model": "openai/gpt-3.5-turbo", - }, - "library": "beautifoulsoup" -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -for i in range(0, 2): - with open(files[i], 'r', encoding="utf-8") as file: - text = file.read() - - smart_scraper_graph = ScriptCreatorGraph( - prompt=tasks[i], - source=text, - config=graph_config - ) - - result = smart_scraper_graph.run() - print(result) - # ************************************************ - # Get graph execution info - # ************************************************ - - graph_exec_info = smart_scraper_graph.get_execution_info() - print(prettify_exec_info(graph_exec_info)) diff --git a/examples/benchmarks/GenerateScraper/benchmark_openai_gpt4.py b/examples/benchmarks/GenerateScraper/benchmark_openai_gpt4.py deleted file mode 100644 index 37791c29..00000000 --- a/examples/benchmarks/GenerateScraper/benchmark_openai_gpt4.py +++ /dev/null @@ -1,53 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper from text -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import ScriptCreatorGraph -from scrapegraphai.utils import prettify_exec_info -load_dotenv() - -# ************************************************ -# Read the text file 
-# ************************************************ -files = ["inputs/example_1.txt", "inputs/example_2.txt"] -tasks = ["List me all the projects with their description.", - "List me all the articles with their description."] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -openai_key = os.getenv("OPENAI_APIKEY") - -graph_config = { - "llm": { - "api_key": openai_key, - "model": "openai/gpt-4-turbo-2024-04-09", - }, - "library": "beautifoulsoup" -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -for i in range(0, 2): - with open(files[i], 'r', encoding="utf-8") as file: - text = file.read() - - smart_scraper_graph = ScriptCreatorGraph( - prompt=tasks[i], - source=text, - config=graph_config - ) - - result = smart_scraper_graph.run() - print(result) - # ************************************************ - # Get graph execution info - # ************************************************ - - graph_exec_info = smart_scraper_graph.get_execution_info() - print(prettify_exec_info(graph_exec_info)) diff --git a/examples/benchmarks/GenerateScraper/inputs/example_1.txt b/examples/benchmarks/GenerateScraper/inputs/example_1.txt deleted file mode 100644 index 78f814ae..00000000 --- a/examples/benchmarks/GenerateScraper/inputs/example_1.txt +++ /dev/null @@ -1,105 +0,0 @@ - -
- - -
-
-
- - -
- \ No newline at end of file diff --git a/examples/benchmarks/GenerateScraper/inputs/example_2.txt b/examples/benchmarks/GenerateScraper/inputs/example_2.txt deleted file mode 100644 index b7810eed..00000000 --- a/examples/benchmarks/GenerateScraper/inputs/example_2.txt +++ /dev/null @@ -1,400 +0,0 @@ -WIRED - The Latest in Technology, Science, Culture and Business | WIRED
Skip to main content

WIRED

Book Excerpt

They Experimented on Themselves in Secret. What They Discovered Helped Win a War

The untold, top-secret story of the British researchers who found the key to keeping humans alive underwater—and helped make D-Day a success.
Business

Microsoft in the age of Satya Nadella

Originally published February 2015: More than five years before Microsoft invested its first $1 billion in OpenAI, its engineers were hard at work on something that they believed would transform consumer computing, and it wasn’t artificial intelligence.
- WIRED - The Latest in Technology, Science, Culture and Business | WIRED -
Skip to main content

WIRED

Book Excerpt

They Experimented on Themselves in Secret. What They Discovered Helped Win a War

The untold, top-secret story of the British researchers who found the key to keeping humans alive underwater—and helped make D-Day a success.
Business

Microsoft in the age of Satya Nadella

Originally published February 2015: More than five years before Microsoft invested its first $1 billion in OpenAI, its engineers were hard at work on something that they believed would transform consumer computing, and it wasn’t artificial intelligence.
-WIRED - The Latest in Technology, Science, Culture and Business | WIRED
Skip to main content

WIRED

Book Excerpt

They Experimented on Themselves in Secret. What They Discovered Helped Win a War

The untold, top-secret story of the British researchers who found the key to keeping humans alive underwater—and helped make D-Day a success.
Business

Microsoft in the age of Satya Nadella

Originally published February 2015: More than five years before Microsoft invested its first $1 billion in OpenAI, its engineers were hard at work on something that they believed would transform consumer computing, and it wasn’t artificial intelligence.
\ No newline at end of file diff --git a/examples/benchmarks/SmartScraper/.env.example b/examples/benchmarks/SmartScraper/.env.example deleted file mode 100644 index 599a2397..00000000 --- a/examples/benchmarks/SmartScraper/.env.example +++ /dev/null @@ -1 +0,0 @@ -OPENAI_APIKEY="your openai key here" \ No newline at end of file diff --git a/examples/benchmarks/SmartScraper/Readme.md b/examples/benchmarks/SmartScraper/Readme.md deleted file mode 100644 index 9c9f9c37..00000000 --- a/examples/benchmarks/SmartScraper/Readme.md +++ /dev/null @@ -1,42 +0,0 @@ -# Local models -# Local models -The two websites benchmark are: -- Example 1: https://perinim.github.io/projects -- Example 2: https://www.wired.com (at 17/4/2024) - -Both are strored locally as txt file in .txt format because in this way we do not have to think about the internet connection - -| Hardware | Model | Example 1 | Example 2 | -| ---------------------- | --------------------------------------- | --------- | --------- | -| Macbook 14' m1 pro | Mistral on Ollama with nomic-embed-text | 16.291s | 38.74s | -| Macbook m2 max | Mistral on Ollama with nomic-embed-text | | | -| Macbook 14' m1 pro
| Llama3 on Ollama with nomic-embed-text | 12.88s | 13.84s | -| Macbook m2 max
| Llama3 on Ollama with nomic-embed-text | | | - -**Note**: the examples on Docker are not runned on other devices than the Macbook because the performance are to slow (10 times slower than Ollama). Indeed the results are the following: - -| Hardware | Example 1 | Example 2 | -| ------------------ | --------- | --------- | -| Macbook 14' m1 pro | 139.89 | Too long | -# Performance on APIs services -### Example 1: personal portfolio -**URL**: https://perinim.github.io/projects -**Task**: List me all the projects with their description. - -| Name | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD | -| ------------------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- | -| gpt-3.5-turbo | 4.132s | 438 | 303 | 135 | 1 | 0.000724 | -| gpt-4-turbo-preview | 6.965s | 442 | 303 | 139 | 1 | 0.0072 | -| gpt-4-o | 4.446s | 444 | 305 | 139 | 1 | 0 | -| Grooq with nomic-embed-text
| 1.335s | 648 | 482 | 166 | 1 | 0 | - -### Example 2: Wired -**URL**: https://www.wired.com -**Task**: List me all the articles with their description. - -| Name | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD | -| ------------------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- | -| gpt-3.5-turbo | 8.836s | 1167 | 726 | 441 | 1 | 0.001971 | -| gpt-4-turbo-preview | 21.53s | 1205 | 726 | 479 | 1 | 0.02163 | -| gpt-4-o | 15.27s | 1400 | 715 | 685 | 1 | 0 | -| Grooq with nomic-embed-text
| 3.82s | 2459 | 2192 | 267 | 1 | 0 | diff --git a/examples/benchmarks/SmartScraper/benchmark_docker.py b/examples/benchmarks/SmartScraper/benchmark_docker.py deleted file mode 100644 index e5754c4b..00000000 --- a/examples/benchmarks/SmartScraper/benchmark_docker.py +++ /dev/null @@ -1,51 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper from text -""" - -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -files = ["inputs/example_1.txt", "inputs/example_2.txt"] -tasks = ["List me all the projects with their description.", - "List me all the articles with their description."] - - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "model": "ollama/mistral", - "temperature": 0, - "format": "json", # Ollama needs the format to be specified explicitly - # "model_tokens": 2000, # set context length arbitrarily - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - } -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -for i in range(0, 2): - with open(files[i], 'r', encoding="utf-8") as file: - text = file.read() - - smart_scraper_graph = SmartScraperGraph( - prompt=tasks[i], - source=text, - config=graph_config - ) - - result = smart_scraper_graph.run() - print(result) - # ************************************************ - # Get graph execution info - # ************************************************ - - graph_exec_info = smart_scraper_graph.get_execution_info() - print(prettify_exec_info(graph_exec_info)) diff --git a/examples/benchmarks/SmartScraper/benchmark_groq.py b/examples/benchmarks/SmartScraper/benchmark_groq.py deleted file mode 100644 index e769ee52..00000000 --- a/examples/benchmarks/SmartScraper/benchmark_groq.py +++ 
/dev/null @@ -1,57 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper from text -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -files = ["inputs/example_1.txt", "inputs/example_2.txt"] -tasks = ["List me all the projects with their description.", - "List me all the articles with their description."] - - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -groq_key = os.getenv("GROQ_APIKEY") - -graph_config = { - "llm": { - "model": "groq/gemma-7b-it", - "api_key": groq_key, - "temperature": 0 - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - "base_url": "http://localhost:11434", # set ollama URL arbitrarily - }, - "headless": False -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -for i in range(0, 2): - with open(files[i], 'r', encoding="utf-8") as file: - text = file.read() - - smart_scraper_graph = SmartScraperGraph( - prompt=tasks[i], - source=text, - config=graph_config - ) - - result = smart_scraper_graph.run() - print(result) - # ************************************************ - # Get graph execution info - # ************************************************ - - graph_exec_info = smart_scraper_graph.get_execution_info() - print(prettify_exec_info(graph_exec_info)) diff --git a/examples/benchmarks/SmartScraper/benchmark_llama3.py b/examples/benchmarks/SmartScraper/benchmark_llama3.py deleted file mode 100644 index 2b182f20..00000000 --- a/examples/benchmarks/SmartScraper/benchmark_llama3.py +++ /dev/null @@ -1,53 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper from text -""" - -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import 
prettify_exec_info - -files = ["inputs/example_1.txt", "inputs/example_2.txt"] -tasks = ["List me all the projects with their description.", - "List me all the articles with their description."] - - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "model": "ollama/llama3", - "temperature": 0, - "format": "json", # Ollama needs the format to be specified explicitly - # "model_tokens": 2000, # set context length arbitrarily - "base_url": "http://localhost:11434", - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - "base_url": "http://localhost:11434", - } -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -for i in range(0, 2): - with open(files[i], 'r', encoding="utf-8") as file: - text = file.read() - - smart_scraper_graph = SmartScraperGraph( - prompt=tasks[i], - source=text, - config=graph_config - ) - - result = smart_scraper_graph.run() - print(result) - # ************************************************ - # Get graph execution info - # ************************************************ - - graph_exec_info = smart_scraper_graph.get_execution_info() - print(prettify_exec_info(graph_exec_info)) diff --git a/examples/benchmarks/SmartScraper/benchmark_mistral.py b/examples/benchmarks/SmartScraper/benchmark_mistral.py deleted file mode 100644 index 0e6e53e5..00000000 --- a/examples/benchmarks/SmartScraper/benchmark_mistral.py +++ /dev/null @@ -1,54 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper from text -""" - -import os -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -files = ["inputs/example_1.txt", "inputs/example_2.txt"] -tasks = ["List me all the projects with their description.", - "List me all the articles with 
their description."] - - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "model": "ollama/mistral", - "temperature": 0, - "format": "json", # Ollama needs the format to be specified explicitly - # "model_tokens": 2000, # set context length arbitrarily - "base_url": "http://localhost:11434", - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - "base_url": "http://localhost:11434", - } -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -for i in range(0, 2): - with open(files[i], 'r', encoding="utf-8") as file: - text = file.read() - - smart_scraper_graph = SmartScraperGraph( - prompt=tasks[i], - source=text, - config=graph_config - ) - - result = smart_scraper_graph.run() - print(result) - # ************************************************ - # Get graph execution info - # ************************************************ - - graph_exec_info = smart_scraper_graph.get_execution_info() - print(prettify_exec_info(graph_exec_info)) diff --git a/examples/benchmarks/SmartScraper/benchmark_openai_gpt35.py b/examples/benchmarks/SmartScraper/benchmark_openai_gpt35.py deleted file mode 100644 index 659d2c78..00000000 --- a/examples/benchmarks/SmartScraper/benchmark_openai_gpt35.py +++ /dev/null @@ -1,52 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper from text -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info -load_dotenv() - -# ************************************************ -# Read the text file -# ************************************************ -files = ["inputs/example_1.txt", "inputs/example_2.txt"] -tasks = ["List me all the projects with their description.", - "List me all the 
articles with their description."] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -openai_key = os.getenv("OPENAI_APIKEY") - -graph_config = { - "llm": { - "api_key": openai_key, - "model": "openai/gpt-3.5-turbo", - }, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -for i in range(0, 2): - with open(files[i], 'r', encoding="utf-8") as file: - text = file.read() - - smart_scraper_graph = SmartScraperGraph( - prompt=tasks[i], - source=text, - config=graph_config - ) - - result = smart_scraper_graph.run() - print(result) - # ************************************************ - # Get graph execution info - # ************************************************ - - graph_exec_info = smart_scraper_graph.get_execution_info() - print(prettify_exec_info(graph_exec_info)) diff --git a/examples/benchmarks/SmartScraper/benchmark_openai_gpt4.py b/examples/benchmarks/SmartScraper/benchmark_openai_gpt4.py deleted file mode 100644 index a23901a9..00000000 --- a/examples/benchmarks/SmartScraper/benchmark_openai_gpt4.py +++ /dev/null @@ -1,53 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper from text -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info -load_dotenv() - -# ************************************************ -# Read the text file -# ************************************************ -files = ["inputs/example_1.txt", "inputs/example_2.txt"] -tasks = ["List me all the projects with their description.", - "List me all the articles with their description."] - - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -openai_key = os.getenv("OPENAI_APIKEY") - 
-graph_config = { - "llm": { - "api_key": openai_key, - "model": "openai/gpt-4-turbo", - }, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -for i in range(0, 2): - with open(files[i], 'r', encoding="utf-8") as file: - text = file.read() - - smart_scraper_graph = SmartScraperGraph( - prompt=tasks[i], - source=text, - config=graph_config - ) - - result = smart_scraper_graph.run() - print(result) - # ************************************************ - # Get graph execution info - # ************************************************ - - graph_exec_info = smart_scraper_graph.get_execution_info() - print(prettify_exec_info(graph_exec_info)) diff --git a/examples/benchmarks/SmartScraper/benchmark_openai_gpt4o.py b/examples/benchmarks/SmartScraper/benchmark_openai_gpt4o.py deleted file mode 100644 index 8b2da6d7..00000000 --- a/examples/benchmarks/SmartScraper/benchmark_openai_gpt4o.py +++ /dev/null @@ -1,53 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper from text -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info -load_dotenv() - -# ************************************************ -# Read the text file -# ************************************************ -files = ["inputs/example_1.txt", "inputs/example_2.txt"] -tasks = ["List me all the projects with their description.", - "List me all the articles with their description."] - - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -openai_key = os.getenv("OPENAI_APIKEY") - -graph_config = { - "llm": { - "api_key": openai_key, - "model": "openai/gpt-4o", - }, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# 
************************************************ - -for i in range(0, 2): - with open(files[i], 'r', encoding="utf-8") as file: - text = file.read() - - smart_scraper_graph = SmartScraperGraph( - prompt=tasks[i], - source=text, - config=graph_config - ) - - result = smart_scraper_graph.run() - print(result) - # ************************************************ - # Get graph execution info - # ************************************************ - - graph_exec_info = smart_scraper_graph.get_execution_info() - print(prettify_exec_info(graph_exec_info)) diff --git a/examples/benchmarks/SmartScraper/inputs/example_1.txt b/examples/benchmarks/SmartScraper/inputs/example_1.txt deleted file mode 100644 index 78f814ae..00000000 --- a/examples/benchmarks/SmartScraper/inputs/example_1.txt +++ /dev/null @@ -1,105 +0,0 @@ - -
- - -
-
-
- - -
- \ No newline at end of file diff --git a/examples/benchmarks/SmartScraper/inputs/example_2.txt b/examples/benchmarks/SmartScraper/inputs/example_2.txt deleted file mode 100644 index b7810eed..00000000 --- a/examples/benchmarks/SmartScraper/inputs/example_2.txt +++ /dev/null @@ -1,400 +0,0 @@ -WIRED - The Latest in Technology, Science, Culture and Business | WIRED
Skip to main content

WIRED

Book Excerpt

They Experimented on Themselves in Secret. What They Discovered Helped Win a War

The untold, top-secret story of the British researchers who found the key to keeping humans alive underwater—and helped make D-Day a success.
Business

Microsoft in the age of Satya Nadella

Originally published February 2015: More than five years before Microsoft invested its first $1 billion in OpenAI, its engineers were hard at work on something that they believed would transform consumer computing, and it wasn’t artificial intelligence.
- WIRED - The Latest in Technology, Science, Culture and Business | WIRED -
Skip to main content

WIRED

Book Excerpt

They Experimented on Themselves in Secret. What They Discovered Helped Win a War

The untold, top-secret story of the British researchers who found the key to keeping humans alive underwater—and helped make D-Day a success.
Business

Microsoft in the age of Satya Nadella

Originally published February 2015: More than five years before Microsoft invested its first $1 billion in OpenAI, its engineers were hard at work on something that they believed would transform consumer computing, and it wasn’t artificial intelligence.
-WIRED - The Latest in Technology, Science, Culture and Business | WIRED
Skip to main content

WIRED

Book Excerpt

They Experimented on Themselves in Secret. What They Discovered Helped Win a War

The untold, top-secret story of the British researchers who found the key to keeping humans alive underwater—and helped make D-Day a success.
Business

Microsoft in the age of Satya Nadella

Originally published February 2015: More than five years before Microsoft invested its first $1 billion in OpenAI, its engineers were hard at work on something that they believed would transform consumer computing, and it wasn’t artificial intelligence.
\ No newline at end of file diff --git a/examples/benchmarks/readme.md b/examples/benchmarks/readme.md deleted file mode 100644 index ca672ad0..00000000 --- a/examples/benchmarks/readme.md +++ /dev/null @@ -1,4 +0,0 @@ -These 2 subfolders contain all the scripts and performance documents for the 2 graphs used for the scrapers. -In particular: -* __GenerateScraper__: contains the benchmarks for GenerateScraper class -* __SmartScraper__: contains the benchamrks for SmartScraper class \ No newline at end of file diff --git a/examples/code_generator_graph/.env.example b/examples/code_generator_graph/.env.example new file mode 100644 index 00000000..a93912e4 --- /dev/null +++ b/examples/code_generator_graph/.env.example @@ -0,0 +1,14 @@ +# OpenAI API Configuration +OPENAI_API_KEY=your-openai-api-key-here + +# Optional Configurations +MAX_TOKENS=4000 +MODEL_NAME=gpt-4-1106-preview +TEMPERATURE=0.7 + +# Code Generator Settings +DEFAULT_LANGUAGE=python +GENERATE_TESTS=true +ADD_DOCUMENTATION=true +CODE_STYLE=pep8 +TYPE_CHECKING=true \ No newline at end of file diff --git a/examples/code_generator_graph/README.md b/examples/code_generator_graph/README.md new file mode 100644 index 00000000..bc4b5dec --- /dev/null +++ b/examples/code_generator_graph/README.md @@ -0,0 +1,30 @@ +# Code Generator Graph Example + +This example demonstrates how to use Scrapegraph-ai to generate code based on specifications and requirements. + +## Features + +- Code generation from specifications +- Multiple programming languages support +- Code documentation +- Best practices implementation + +## Setup + +1. Install required dependencies +2. Copy `.env.example` to `.env` +3. 
Configure your API keys in the `.env` file + +## Usage + +```python +from scrapegraphai.graphs import CodeGeneratorGraph + +graph = CodeGeneratorGraph() +code = graph.generate("code specification") +``` + +## Environment Variables + +Required environment variables: +- `OPENAI_API_KEY`: Your OpenAI API key \ No newline at end of file diff --git a/examples/local_models/code_generator_graph_ollama.py b/examples/code_generator_graph/ollama/code_generator_graph_ollama.py similarity index 100% rename from examples/local_models/code_generator_graph_ollama.py rename to examples/code_generator_graph/ollama/code_generator_graph_ollama.py diff --git a/examples/openai/code_generator_graph_openai.py b/examples/code_generator_graph/openai/code_generator_graph_openai.py similarity index 100% rename from examples/openai/code_generator_graph_openai.py rename to examples/code_generator_graph/openai/code_generator_graph_openai.py diff --git a/examples/csv_scraper_graph/.env.example b/examples/csv_scraper_graph/.env.example new file mode 100644 index 00000000..1917f9aa --- /dev/null +++ b/examples/csv_scraper_graph/.env.example @@ -0,0 +1,11 @@ +# OpenAI API Configuration +OPENAI_API_KEY=your-openai-api-key-here + +# Optional Configurations +MAX_TOKENS=4000 +MODEL_NAME=gpt-4-1106-preview +TEMPERATURE=0.7 + +# CSV Scraper Settings +CSV_DELIMITER=, +MAX_ROWS=1000 \ No newline at end of file diff --git a/examples/csv_scraper_graph/README.md b/examples/csv_scraper_graph/README.md new file mode 100644 index 00000000..d39858b0 --- /dev/null +++ b/examples/csv_scraper_graph/README.md @@ -0,0 +1,30 @@ +# CSV Scraper Graph Example + +This example demonstrates how to use Scrapegraph-ai to extract data from web sources and save it in CSV format. + +## Features + +- Table data extraction +- CSV formatting +- Data cleaning +- Structured output + +## Setup + +1. Install required dependencies +2. Copy `.env.example` to `.env` +3. 
Configure your API keys in the `.env` file + +## Usage + +```python +from scrapegraphai.graphs import CsvScraperGraph + +graph = CsvScraperGraph() +csv_data = graph.scrape("https://example.com/table") +``` + +## Environment Variables + +Required environment variables: +- `OPENAI_API_KEY`: Your OpenAI API key \ No newline at end of file diff --git a/examples/local_models/csv_scraper_graph_multi_ollama.py b/examples/csv_scraper_graph/ollama/csv_scraper_graph_multi_ollama.py similarity index 86% rename from examples/local_models/csv_scraper_graph_multi_ollama.py rename to examples/csv_scraper_graph/ollama/csv_scraper_graph_multi_ollama.py index fb6bce51..558a876f 100644 --- a/examples/local_models/csv_scraper_graph_multi_ollama.py +++ b/examples/csv_scraper_graph/ollama/csv_scraper_graph_multi_ollama.py @@ -3,9 +3,9 @@ """ import os -import pandas as pd + from scrapegraphai.graphs import CSVScraperMultiGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info +from scrapegraphai.utils import prettify_exec_info # ************************************************ # Read the CSV file @@ -15,7 +15,8 @@ curr_dir = os.path.dirname(os.path.realpath(__file__)) file_path = os.path.join(curr_dir, FILE_NAME) -text = pd.read_csv(file_path) +with open(file_path, "r") as file: + text = file.read() # ************************************************ # Define the configuration for the graph @@ -44,7 +45,7 @@ csv_scraper_graph = CSVScraperMultiGraph( prompt="List me all the last names", source=[str(text), str(text)], - config=graph_config + config=graph_config, ) result = csv_scraper_graph.run() @@ -56,7 +57,3 @@ graph_exec_info = csv_scraper_graph.get_execution_info() print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/local_models/csv_scraper_ollama.py b/examples/csv_scraper_graph/ollama/csv_scraper_ollama.py similarity index 86% rename from 
examples/local_models/csv_scraper_ollama.py rename to examples/csv_scraper_graph/ollama/csv_scraper_ollama.py index 8d1edbd7..d6e6eab2 100644 --- a/examples/local_models/csv_scraper_ollama.py +++ b/examples/csv_scraper_graph/ollama/csv_scraper_ollama.py @@ -3,9 +3,9 @@ """ import os -import pandas as pd + from scrapegraphai.graphs import CSVScraperGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info +from scrapegraphai.utils import prettify_exec_info # ************************************************ # Read the CSV file @@ -15,7 +15,8 @@ curr_dir = os.path.dirname(os.path.realpath(__file__)) file_path = os.path.join(curr_dir, FILE_NAME) -text = pd.read_csv(file_path) +with open(file_path, "r") as file: + text = file.read() # ************************************************ # Define the configuration for the graph @@ -44,7 +45,7 @@ csv_scraper_graph = CSVScraperGraph( prompt="List me all the last names", source=str(text), # Pass the content of the file, not the file object - config=graph_config + config=graph_config, ) result = csv_scraper_graph.run() @@ -56,7 +57,3 @@ graph_exec_info = csv_scraper_graph.get_execution_info() print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/anthropic/inputs/username.csv b/examples/csv_scraper_graph/ollama/inputs/username.csv similarity index 100% rename from examples/anthropic/inputs/username.csv rename to examples/csv_scraper_graph/ollama/inputs/username.csv diff --git a/examples/openai/csv_scraper_graph_multi_openai.py b/examples/csv_scraper_graph/openai/csv_scraper_graph_multi_openai.py similarity index 83% rename from examples/openai/csv_scraper_graph_multi_openai.py rename to examples/csv_scraper_graph/openai/csv_scraper_graph_multi_openai.py index 6ed33c90..b7bc83ae 100644 --- a/examples/openai/csv_scraper_graph_multi_openai.py +++ 
b/examples/csv_scraper_graph/openai/csv_scraper_graph_multi_openai.py @@ -1,11 +1,13 @@ """ Basic example of scraping pipeline using CSVScraperMultiGraph from CSV documents """ + import os + from dotenv import load_dotenv -import pandas as pd + from scrapegraphai.graphs import CSVScraperMultiGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info +from scrapegraphai.utils import prettify_exec_info load_dotenv() # ************************************************ @@ -16,7 +18,8 @@ curr_dir = os.path.dirname(os.path.realpath(__file__)) file_path = os.path.join(curr_dir, FILE_NAME) -text = pd.read_csv(file_path) +with open(file_path, "r") as file: + text = file.read() # ************************************************ # Define the configuration for the graph @@ -24,7 +27,7 @@ openai_key = os.getenv("OPENAI_APIKEY") graph_config = { - "llm": { + "llm": { "api_key": openai_key, "model": "openai/gpt-4o", }, @@ -37,7 +40,7 @@ csv_scraper_graph = CSVScraperMultiGraph( prompt="List me all the last names", source=[str(text), str(text)], - config=graph_config + config=graph_config, ) result = csv_scraper_graph.run() @@ -49,7 +52,3 @@ graph_exec_info = csv_scraper_graph.get_execution_info() print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/openai/csv_scraper_openai.py b/examples/csv_scraper_graph/openai/csv_scraper_openai.py similarity index 84% rename from examples/openai/csv_scraper_openai.py rename to examples/csv_scraper_graph/openai/csv_scraper_openai.py index d9527b86..a0abd714 100644 --- a/examples/openai/csv_scraper_openai.py +++ b/examples/csv_scraper_graph/openai/csv_scraper_openai.py @@ -1,11 +1,13 @@ """ Basic example of scraping pipeline using CSVScraperGraph from CSV documents """ + import os + from dotenv import load_dotenv -import pandas as pd + from scrapegraphai.graphs import CSVScraperGraph -from scrapegraphai.utils 
import convert_to_csv, convert_to_json, prettify_exec_info +from scrapegraphai.utils import prettify_exec_info load_dotenv() @@ -17,7 +19,8 @@ curr_dir = os.path.dirname(os.path.realpath(__file__)) file_path = os.path.join(curr_dir, FILE_NAME) -text = pd.read_csv(file_path) +with open(file_path, "r") as file: + text = file.read() # ************************************************ # Define the configuration for the graph @@ -39,7 +42,7 @@ csv_scraper_graph = CSVScraperGraph( prompt="List me all the last names", source=str(text), # Pass the content of the file, not the file object - config=graph_config + config=graph_config, ) result = csv_scraper_graph.run() @@ -51,7 +54,3 @@ graph_exec_info = csv_scraper_graph.get_execution_info() print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/azure/inputs/username.csv b/examples/csv_scraper_graph/openai/inputs/username.csv similarity index 100% rename from examples/azure/inputs/username.csv rename to examples/csv_scraper_graph/openai/inputs/username.csv diff --git a/examples/custom_graph/.env.example b/examples/custom_graph/.env.example new file mode 100644 index 00000000..9eac4cb8 --- /dev/null +++ b/examples/custom_graph/.env.example @@ -0,0 +1,13 @@ +# OpenAI API Configuration +OPENAI_API_KEY=your-openai-api-key-here + +# Optional Configurations +MAX_TOKENS=4000 +MODEL_NAME=gpt-4-1106-preview +TEMPERATURE=0.7 + +# Custom Graph Settings +CUSTOM_NODE_TIMEOUT=30 +MAX_NODES=10 +DEBUG_MODE=false +LOG_LEVEL=info \ No newline at end of file diff --git a/examples/custom_graph/README.md b/examples/custom_graph/README.md new file mode 100644 index 00000000..e6d3b88a --- /dev/null +++ b/examples/custom_graph/README.md @@ -0,0 +1,31 @@ +# Custom Graph Example + +This example demonstrates how to create and implement custom graphs using Scrapegraph-ai. 
+ +## Features + +- Custom node creation +- Graph customization +- Pipeline configuration +- Custom data processing + +## Setup + +1. Install required dependencies +2. Copy `.env.example` to `.env` +3. Configure your API keys in the `.env` file + +## Usage + +```python +from scrapegraphai.graphs import CustomGraph + +graph = CustomGraph() +graph.add_node("custom_node", CustomNode()) +results = graph.process() +``` + +## Environment Variables + +Required environment variables: +- `OPENAI_API_KEY`: Your OpenAI API key \ No newline at end of file diff --git a/examples/local_models/custom_graph_ollama.py b/examples/custom_graph/ollama/custom_graph_ollama.py similarity index 100% rename from examples/local_models/custom_graph_ollama.py rename to examples/custom_graph/ollama/custom_graph_ollama.py diff --git a/examples/openai/custom_graph_openai.py b/examples/custom_graph/openai/custom_graph_openai.py similarity index 100% rename from examples/openai/custom_graph_openai.py rename to examples/custom_graph/openai/custom_graph_openai.py diff --git a/examples/deepseek/.env.example b/examples/deepseek/.env.example deleted file mode 100644 index 37511138..00000000 --- a/examples/deepseek/.env.example +++ /dev/null @@ -1 +0,0 @@ -DEEPSEEK_APIKEY="your api key" \ No newline at end of file diff --git a/examples/deepseek/code_generator_graph_deepseek.py b/examples/deepseek/code_generator_graph_deepseek.py deleted file mode 100644 index f78a42b6..00000000 --- a/examples/deepseek/code_generator_graph_deepseek.py +++ /dev/null @@ -1,59 +0,0 @@ -""" -Basic example of scraping pipeline using Code Generator with schema -""" -import os -from typing import List -from dotenv import load_dotenv -from pydantic import BaseModel, Field -from scrapegraphai.graphs import CodeGeneratorGraph - -load_dotenv() - -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Project(BaseModel): - title: str = 
Field(description="The title of the project") - description: str = Field(description="The description of the project") - -class Projects(BaseModel): - projects: List[Project] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -deepseek_key = os.getenv("DEEPSEEK_APIKEY") - -graph_config = { - "llm": { - "model": "deepseek/deepseek-chat", - "api_key": deepseek_key, - }, - "verbose": True, - "headless": False, - "reduction": 2, - "max_iterations": { - "overall": 10, - "syntax": 3, - "execution": 3, - "validation": 3, - "semantic": 3 - }, - "output_file_name": "extracted_data.py" -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -code_generator_graph = CodeGeneratorGraph( - prompt="List me all the projects with their description", - source="https://perinim.github.io/projects/", - schema=Projects, - config=graph_config -) - -result = code_generator_graph.run() -print(result) diff --git a/examples/deepseek/csv_scraper_deepseek.py b/examples/deepseek/csv_scraper_deepseek.py deleted file mode 100644 index 6ef0ac92..00000000 --- a/examples/deepseek/csv_scraper_deepseek.py +++ /dev/null @@ -1,56 +0,0 @@ -""" -Basic example of scraping pipeline using CSVScraperGraph from CSV documents -""" -import os -from dotenv import load_dotenv -import pandas as pd -from scrapegraphai.graphs import CSVScraperGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info -load_dotenv() - -# ************************************************ -# Read the CSV file -# ************************************************ - -FILE_NAME = "inputs/username.csv" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -text = pd.read_csv(file_path) - -# ************************************************ -# Define the 
configuration for the graph -# ************************************************ - -deepseek_key = os.getenv("DEEPSEEK_APIKEY") - -graph_config = { - "llm": { - "model": "deepseek/deepseek-chat", - "api_key": deepseek_key, - }, - "verbose": True, -} -# ************************************************ -# Create the CSVScraperGraph instance and run it -# ************************************************ - -csv_scraper_graph = CSVScraperGraph( - prompt="List me all the last names", - source=str(text), # Pass the content of the file, not the file object - config=graph_config -) - -result = csv_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = csv_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/deepseek/csv_scraper_graph_multi_deepseek.py b/examples/deepseek/csv_scraper_graph_multi_deepseek.py deleted file mode 100644 index 95474360..00000000 --- a/examples/deepseek/csv_scraper_graph_multi_deepseek.py +++ /dev/null @@ -1,56 +0,0 @@ -""" -Basic example of scraping pipeline using CSVScraperMultiGraph from CSV documents -""" -import os -from dotenv import load_dotenv -import pandas as pd -from scrapegraphai.graphs import CSVScraperMultiGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() -# ************************************************ -# Read the CSV file -# ************************************************ - -FILE_NAME = "inputs/username.csv" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -text = pd.read_csv(file_path) - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - 
-deepseek_key = os.getenv("DEEPSEEK_APIKEY") - -graph_config = { - "llm": { - "model": "deepseek/deepseek-chat", - "api_key": deepseek_key, - }, - "verbose": True, -} -# ************************************************ -# Create the CSVScraperMultiGraph instance and run it -# ************************************************ - -csv_scraper_graph = CSVScraperMultiGraph( - prompt="List me all the last names", - source=[str(text), str(text)], - config=graph_config -) - -result = csv_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = csv_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/deepseek/depth_search_graph_deepseek.py b/examples/deepseek/depth_search_graph_deepseek.py deleted file mode 100644 index 064690a5..00000000 --- a/examples/deepseek/depth_search_graph_deepseek.py +++ /dev/null @@ -1,30 +0,0 @@ -""" -depth_search_graph_opeani example -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import DepthSearchGraph - -load_dotenv() - -deepseek_key = os.getenv("DEEPSEEK_APIKEY") - -graph_config = { - "llm": { - "model": "deepseek/deepseek-chat", - "api_key": deepseek_key, - }, - "verbose": True, - "headless": False, - "depth": 2, - "only_inside_links": False, -} - -search_graph = DepthSearchGraph( - prompt="List me all the projects with their description", - source="https://perinim.github.io", - config=graph_config -) - -result = search_graph.run() -print(result) diff --git a/examples/deepseek/document_scraper_deepseek.py b/examples/deepseek/document_scraper_deepseek.py deleted file mode 100644 index e94826d3..00000000 --- a/examples/deepseek/document_scraper_deepseek.py +++ /dev/null @@ -1,44 +0,0 @@ -""" -document_scraper example -""" -import os -import json 
-from dotenv import load_dotenv -from scrapegraphai.graphs import DocumentScraperGraph - -load_dotenv() - - -# ************************************************ -# Define the configuration for the graph -# ************************************************ -deepseek_key = os.getenv("DEEPSEEK_APIKEY") - -graph_config = { - "llm": { - "model": "deepseek/deepseek-chat", - "api_key": deepseek_key, - }, - "verbose": True, -} - - -source = """ - The Divine Comedy, Italian La Divina Commedia, original name La commedia, long narrative poem written in Italian - circa 1308/21 by Dante. It is usually held to be one of the world s great works of literature. - Divided into three major sections—Inferno, Purgatorio, and Paradiso—the narrative traces the journey of Dante - from darkness and error to the revelation of the divine light, culminating in the Beatific Vision of God. - Dante is guided by the Roman poet Virgil, who represents the epitome of human knowledge, from the dark wood - through the descending circles of the pit of Hell (Inferno). He then climbs the mountain of Purgatory, guided - by the Roman poet Statius, who represents the fulfilment of human knowledge, and is finally led by his lifelong love, - the Beatrice of his earlier poetry, through the celestial spheres of Paradise. -""" - -pdf_scraper_graph = DocumentScraperGraph( - prompt="Summarize the text and find the main topics", - source=source, - config=graph_config, -) -result = pdf_scraper_graph.run() - -print(json.dumps(result, indent=4)) \ No newline at end of file diff --git a/examples/deepseek/inputs/books.xml b/examples/deepseek/inputs/books.xml deleted file mode 100644 index e3d1fe87..00000000 --- a/examples/deepseek/inputs/books.xml +++ /dev/null @@ -1,120 +0,0 @@ - - - - Gambardella, Matthew - XML Developer's Guide - Computer - 44.95 - 2000-10-01 - An in-depth look at creating applications - with XML. 
- - - Ralls, Kim - Midnight Rain - Fantasy - 5.95 - 2000-12-16 - A former architect battles corporate zombies, - an evil sorceress, and her own childhood to become queen - of the world. - - - Corets, Eva - Maeve Ascendant - Fantasy - 5.95 - 2000-11-17 - After the collapse of a nanotechnology - society in England, the young survivors lay the - foundation for a new society. - - - Corets, Eva - Oberon's Legacy - Fantasy - 5.95 - 2001-03-10 - In post-apocalypse England, the mysterious - agent known only as Oberon helps to create a new life - for the inhabitants of London. Sequel to Maeve - Ascendant. - - - Corets, Eva - The Sundered Grail - Fantasy - 5.95 - 2001-09-10 - The two daughters of Maeve, half-sisters, - battle one another for control of England. Sequel to - Oberon's Legacy. - - - Randall, Cynthia - Lover Birds - Romance - 4.95 - 2000-09-02 - When Carla meets Paul at an ornithology - conference, tempers fly as feathers get ruffled. - - - Thurman, Paula - Splish Splash - Romance - 4.95 - 2000-11-02 - A deep sea diver finds true love twenty - thousand leagues beneath the sea. - - - Knorr, Stefan - Creepy Crawlies - Horror - 4.95 - 2000-12-06 - An anthology of horror stories about roaches, - centipedes, scorpions and other insects. - - - Kress, Peter - Paradox Lost - Science Fiction - 6.95 - 2000-11-02 - After an inadvertant trip through a Heisenberg - Uncertainty Device, James Salway discovers the problems - of being quantum. - - - O'Brien, Tim - Microsoft .NET: The Programming Bible - Computer - 36.95 - 2000-12-09 - Microsoft's .NET initiative is explored in - detail in this deep programmer's reference. - - - O'Brien, Tim - MSXML3: A Comprehensive Guide - Computer - 36.95 - 2000-12-01 - The Microsoft MSXML3 parser is covered in - detail, with attention to XML DOM interfaces, XSLT processing, - SAX and more. 
- - - Galos, Mike - Visual Studio 7: A Comprehensive Guide - Computer - 49.95 - 2001-04-16 - Microsoft Visual Studio 7 is explored in depth, - looking at how Visual Basic, Visual C++, C#, and ASP+ are - integrated into a comprehensive development - environment. - - \ No newline at end of file diff --git a/examples/deepseek/inputs/example.json b/examples/deepseek/inputs/example.json deleted file mode 100644 index 2263184c..00000000 --- a/examples/deepseek/inputs/example.json +++ /dev/null @@ -1,182 +0,0 @@ -{ - "kind":"youtube#searchListResponse", - "etag":"q4ibjmYp1KA3RqMF4jFLl6PBwOg", - "nextPageToken":"CAUQAA", - "regionCode":"NL", - "pageInfo":{ - "totalResults":1000000, - "resultsPerPage":5 - }, - "items":[ - { - "kind":"youtube#searchResult", - "etag":"QCsHBifbaernVCbLv8Cu6rAeaDQ", - "id":{ - "kind":"youtube#video", - "videoId":"TvWDY4Mm5GM" - }, - "snippet":{ - "publishedAt":"2023-07-24T14:15:01Z", - "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", - "title":"3 Football Clubs Kylian Mbappe Should Avoid Signing ✍️❌⚽️ #football #mbappe #shorts", - "description":"", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"FC Motivate", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T14:15:01Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"0NG5QHdtIQM_V-DBJDEf-jK_Y9k", - "id":{ - "kind":"youtube#video", - "videoId":"aZM_42CcNZ4" - }, - "snippet":{ - "publishedAt":"2023-07-24T16:09:27Z", - "channelId":"UCM5gMM_HqfKHYIEJ3lstMUA", - "title":"Which Football Club Could Cristiano Ronaldo Afford To Buy? 
💰", - "description":"Sign up to Sorare and get a FREE card: https://sorare.pxf.io/NellisShorts Give Soraredata a go for FREE: ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"John Nellis", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T16:09:27Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"WbBz4oh9I5VaYj91LjeJvffrBVY", - "id":{ - "kind":"youtube#video", - "videoId":"wkP3XS3aNAY" - }, - "snippet":{ - "publishedAt":"2023-07-24T16:00:50Z", - "channelId":"UC4EP1dxFDPup_aFLt0ElsDw", - "title":"PAULO DYBALA vs THE WORLD'S LONGEST FREEKICK WALL", - "description":"Can Paulo Dybala curl a football around the World's longest free kick wall? We met up with the World Cup winner and put him to ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"Shoot for Love", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T16:00:50Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"juxv_FhT_l4qrR05S1QTrb4CGh8", - "id":{ - "kind":"youtube#video", - "videoId":"rJkDZ0WvfT8" - }, - "snippet":{ - "publishedAt":"2023-07-24T10:00:39Z", - "channelId":"UCO8qj5u80Ga7N_tP3BZWWhQ", - "title":"TOP 10 DEFENDERS 2023", - "description":"SoccerKingz https://soccerkingz.nl Use code: 'ILOVEHOF' to get 10% off. TOP 10 DEFENDERS 2023 Follow us! 
• Instagram ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"Home of Football", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T10:00:39Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"wtuknXTmI1txoULeH3aWaOuXOow", - "id":{ - "kind":"youtube#video", - "videoId":"XH0rtu4U6SE" - }, - "snippet":{ - "publishedAt":"2023-07-21T16:30:05Z", - "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", - "title":"3 Things You Didn't Know About Erling Haaland ⚽️🇳🇴 #football #haaland #shorts", - "description":"", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"FC Motivate", - "liveBroadcastContent":"none", - "publishTime":"2023-07-21T16:30:05Z" - } - } - ] -} \ No newline at end of file diff --git a/examples/deepseek/inputs/username.csv b/examples/deepseek/inputs/username.csv deleted file mode 100644 index 006ac8e6..00000000 --- a/examples/deepseek/inputs/username.csv +++ /dev/null @@ -1,7 +0,0 @@ -Username; Identifier;First name;Last name -booker12;9012;Rachel;Booker -grey07;2070;Laura;Grey -johnson81;4081;Craig;Johnson -jenkins46;9346;Mary;Jenkins -smith79;5079;Jamie;Smith - diff --git a/examples/deepseek/json_scraper_deepseek.py b/examples/deepseek/json_scraper_deepseek.py deleted file mode 100644 index d714c1db..00000000 --- a/examples/deepseek/json_scraper_deepseek.py +++ /dev/null @@ -1,46 +0,0 @@ -""" -Basic example of scraping pipeline using 
JSONScraperGraph from JSON documents -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import JSONScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the JSON file -# ************************************************ -deepseek_key = os.getenv("DEEPSEEK_APIKEY") - -FILE_NAME = "inputs/example.json" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "model": "deepseek/deepseek-chat", - "api_key": deepseek_key, - }, - "verbose": True, -} - -# ************************************************ -# Create the JSONScraperGraph instance and run it -# ************************************************ - -json_scraper_graph = JSONScraperGraph( - prompt="List me all the authors, title and genres of the books", - source=text, # Pass the content of the file, not the file object - config=graph_config -) - -result = json_scraper_graph.run() -print(result) diff --git a/examples/deepseek/json_scraper_multi_deepseek.py b/examples/deepseek/json_scraper_multi_deepseek.py deleted file mode 100644 index 893937cd..00000000 --- a/examples/deepseek/json_scraper_multi_deepseek.py +++ /dev/null @@ -1,37 +0,0 @@ -""" -Module for showing how JSONScraperMultiGraph multi works -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import JSONScraperMultiGraph - -load_dotenv() - -deepseek_key = os.getenv("DEEPSEEK_APIKEY") - -graph_config = { - "llm": { - "model": "deepseek/deepseek-chat", - "api_key": deepseek_key, - }, - "verbose": True, -} -FILE_NAME = "inputs/example.json" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = 
os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -sources = [text, text] - -multiple_search_graph = JSONScraperMultiGraph( - prompt= "List me all the authors, title and genres of the books", - source= sources, - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/deepseek/rate_limit_deepseek.py b/examples/deepseek/rate_limit_deepseek.py deleted file mode 100644 index 16781f39..00000000 --- a/examples/deepseek/rate_limit_deepseek.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper with a custom rate limit -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -deepseek_key = os.getenv("DEEPSEEK_APIKEY") - -graph_config = { - "llm": { - "model": "deepseek/deepseek-chat", - "api_key": deepseek_key, - "rate_limit": { - "requests_per_second": 1 - } - }, - "verbose": True, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the projects with their description.", - # also accepts a string with the already downloaded HTML code - source="https://perinim.github.io/projects/", - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/deepseek/scrape_plain_text_deepseek.py 
b/examples/deepseek/scrape_plain_text_deepseek.py deleted file mode 100644 index 2b243d35..00000000 --- a/examples/deepseek/scrape_plain_text_deepseek.py +++ /dev/null @@ -1,54 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper from text -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the text file -# ************************************************ - -FILE_NAME = "inputs/plain_html_example.txt" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -# It could be also a http request using the request model -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -deepseek_key = os.getenv("DEEPSEEK_APIKEY") - -graph_config = { - "llm": { - "model": "deepseek/deepseek-chat", - "api_key": deepseek_key, - }, - "verbose": True, -} -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the news with their description.", - source=text, - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/deepseek/script_generator_deepseek.py b/examples/deepseek/script_generator_deepseek.py deleted file mode 100644 index 899c7a35..00000000 --- a/examples/deepseek/script_generator_deepseek.py +++ /dev/null @@ -1,44 +0,0 @@ -""" -Basic 
example of scraping pipeline using ScriptCreatorGraph -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import ScriptCreatorGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -deepseek_key = os.getenv("DEEPSEEK_APIKEY") - -graph_config = { - "llm": { - "model": "deepseek/deepseek-chat", - "api_key": deepseek_key, - }, - "library": "beautifulsoup" -} - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -script_creator_graph = ScriptCreatorGraph( - prompt="List me all the projects with their description.", - # also accepts a string with the already downloaded HTML code - source="https://perinim.github.io/projects", - config=graph_config -) - -result = script_creator_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = script_creator_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/deepseek/script_multi_generator_deepseek.py b/examples/deepseek/script_multi_generator_deepseek.py deleted file mode 100644 index 48ca2d20..00000000 --- a/examples/deepseek/script_multi_generator_deepseek.py +++ /dev/null @@ -1,53 +0,0 @@ -""" -Basic example of scraping pipeline using ScriptCreatorGraph -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import ScriptCreatorMultiGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -deepseek_key = os.getenv("DEEPSEEK_APIKEY") - -graph_config = { - "llm": { - "model": 
"deepseek/deepseek-chat", - "api_key": deepseek_key, - }, - "library": "beautifulsoup" -} - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -urls=[ - "https://schultzbergagency.com/emil-raste-karlsen/", - "https://schultzbergagency.com/johanna-hedberg/", -] - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -script_creator_graph = ScriptCreatorMultiGraph( - prompt="Find information about actors", - # also accepts a string with the already downloaded HTML code - source=urls, - config=graph_config -) - -result = script_creator_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = script_creator_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/deepseek/search_graph_deepseek.py b/examples/deepseek/search_graph_deepseek.py deleted file mode 100644 index 7a3baf0d..00000000 --- a/examples/deepseek/search_graph_deepseek.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -Example of Search Graph -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SearchGraph - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -deepseek_key = os.getenv("DEEPSEEK_APIKEY") - -graph_config = { - "llm": { - "model": "deepseek/deepseek-chat", - "api_key": deepseek_key, - }, - "max_results": 2, - "verbose": True, -} - -# ************************************************ -# Create the SearchGraph instance and run it -# ************************************************ - -search_graph = SearchGraph( - prompt="List me the best escursions near Trento", - config=graph_config -) - -result 
= search_graph.run() -print(result) diff --git a/examples/deepseek/search_graph_schema_deepseek.py b/examples/deepseek/search_graph_schema_deepseek.py deleted file mode 100644 index f5f20e25..00000000 --- a/examples/deepseek/search_graph_schema_deepseek.py +++ /dev/null @@ -1,60 +0,0 @@ -""" -Example of Search Graph -""" -import os -from typing import List -from dotenv import load_dotenv -from pydantic import BaseModel, Field -from scrapegraphai.graphs import SearchGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Dish(BaseModel): - name: str = Field(description="The name of the dish") - description: str = Field(description="The description of the dish") - -class Dishes(BaseModel): - dishes: List[Dish] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -deepseek_key = os.getenv("DEEPSEEK_APIKEY") - -graph_config = { - "llm": { - "model": "deepseek/deepseek-chat", - "api_key": deepseek_key, - }, - "verbose": True, -} - -# ************************************************ -# Create the SearchGraph instance and run it -# ************************************************ - -search_graph = SearchGraph( - prompt="List me Chioggia's famous dishes", - config=graph_config, - schema=Dishes -) - -result = search_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = search_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json and csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/deepseek/search_link_graph_deepseek.py b/examples/deepseek/search_link_graph_deepseek.py 
deleted file mode 100644 index dac13737..00000000 --- a/examples/deepseek/search_link_graph_deepseek.py +++ /dev/null @@ -1,46 +0,0 @@ -""" -Example of Search Graph -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SearchGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -load_dotenv() - -deepseek_key = os.getenv("DEEPSEEK_APIKEY") - -graph_config = { - "llm": { - "model": "deepseek/deepseek-chat", - "api_key": deepseek_key, - }, - "verbose": True, -} - -# ************************************************ -# Create the SearchGraph instance and run it -# ************************************************ - -search_graph = SearchGraph( - prompt="List me the best escursions near Trento", - config=graph_config -) - -result = search_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = search_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json and csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/deepseek/smart_scraper_deepseek.py b/examples/deepseek/smart_scraper_deepseek.py deleted file mode 100644 index 0eac94e8..00000000 --- a/examples/deepseek/smart_scraper_deepseek.py +++ /dev/null @@ -1,44 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -deepseek_key = os.getenv("DEEPSEEK_APIKEY") - -graph_config = { - "llm": 
{ - "model": "deepseek/deepseek-chat", - "api_key": deepseek_key, - }, - "verbose": True, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the projects with their description.", - # also accepts a string with the already downloaded HTML code - source="https://perinim.github.io/projects/", - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/deepseek/smart_scraper_lite_deepseek.py b/examples/deepseek/smart_scraper_lite_deepseek.py deleted file mode 100644 index a70d76b0..00000000 --- a/examples/deepseek/smart_scraper_lite_deepseek.py +++ /dev/null @@ -1,31 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperLiteGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -graph_config = { - "llm": { - "api_key": os.getenv("DEEPSEEK_API_KEY"), - "model": "deepseek/deepseek-coder-33b-instruct", - }, - "verbose": True, - "headless": False, -} - -smart_scraper_lite_graph = SmartScraperLiteGraph( - prompt="Who is Marco Perini?", - source="https://perinim.github.io/", - config=graph_config -) - -result = smart_scraper_lite_graph.run() -print(json.dumps(result, indent=4)) - -graph_exec_info = smart_scraper_lite_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/deepseek/smart_scraper_multi_concat_deepseek.py b/examples/deepseek/smart_scraper_multi_concat_deepseek.py deleted file mode 100644 index eeb1816c..00000000 --- 
a/examples/deepseek/smart_scraper_multi_concat_deepseek.py +++ /dev/null @@ -1,41 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiConcatGraph - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -deepseek_key = os.getenv("DEEPSEEK_APIKEY") - -graph_config = { - "llm": { - "model": "deepseek/deepseek-chat", - "api_key": deepseek_key, - }, - "verbose": True, -} - - -# ******************************************************* -# Create the SmartScraperMultiGraph instance and run it -# ******************************************************* - -multiple_search_graph = SmartScraperMultiConcatGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/deepseek/smart_scraper_multi_deepseek.py b/examples/deepseek/smart_scraper_multi_deepseek.py deleted file mode 100644 index 5923e302..00000000 --- a/examples/deepseek/smart_scraper_multi_deepseek.py +++ /dev/null @@ -1,41 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiGraph - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -deepseek_key = os.getenv("DEEPSEEK_APIKEY") - -graph_config = { - "llm": { - "model": "deepseek/deepseek-chat", - "api_key": deepseek_key, - }, - "verbose": True, -} - - -# ******************************************************* -# Create the SmartScraperMultiGraph instance and run it -# 
******************************************************* - -multiple_search_graph = SmartScraperMultiGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/deepseek/smart_scraper_multi_lite_deepseek.py b/examples/deepseek/smart_scraper_multi_lite_deepseek.py deleted file mode 100644 index eb5eea01..00000000 --- a/examples/deepseek/smart_scraper_multi_lite_deepseek.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiLiteGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -graph_config = { - "llm": { - "api_key": os.getenv("DEEPSEEK_API_KEY"), - "model": "deepseek/deepseek-coder-33b-instruct", - }, - "verbose": True, - "headless": False, -} - -smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - config=graph_config -) - -result = smart_scraper_multi_lite_graph.run() -print(json.dumps(result, indent=4)) - -graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - diff --git a/examples/deepseek/smart_scraper_schema_deepseek.py b/examples/deepseek/smart_scraper_schema_deepseek.py deleted file mode 100644 index fd87fbdc..00000000 --- a/examples/deepseek/smart_scraper_schema_deepseek.py +++ /dev/null @@ -1,58 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -from typing import List -from pydantic import BaseModel, Field -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# 
************************************************ -# Define the output schema for the graph -# ************************************************ - -class Project(BaseModel): - title: str = Field(description="The title of the project") - description: str = Field(description="The description of the project") - -class Projects(BaseModel): - projects: List[Project] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -deepseek_key = os.getenv("DEEPSEEK_APIKEY") - -graph_config = { - "llm": { - "model": "deepseek/deepseek-chat", - "api_key": deepseek_key, - }, - "verbose": True, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the projects with their description.", - # also accepts a string with the already downloaded HTML code - source="https://perinim.github.io/projects/", - schema=Projects, - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/deepseek/xml_scraper_deepseek.py b/examples/deepseek/xml_scraper_deepseek.py deleted file mode 100644 index d66b0eab..00000000 --- a/examples/deepseek/xml_scraper_deepseek.py +++ /dev/null @@ -1,59 +0,0 @@ -""" -Basic example of scraping pipeline using XMLScraperGraph from XML documents -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import XMLScraperGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the XML file -# 
************************************************ - -FILE_NAME = "inputs/books.xml" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - - -deepseek_key = os.getenv("DEEPSEEK_APIKEY") - -graph_config = { - "llm": { - "model": "deepseek/deepseek-chat", - "api_key": deepseek_key, - }, - "verbose": True, -} - -# ************************************************ -# Create the XMLScraperGraph instance and run it -# ************************************************ - -xml_scraper_graph = XMLScraperGraph( - prompt="List me all the authors, title and genres of the books", - source=text, # Pass the content of the file, not the file object - config=graph_config -) - -result = xml_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = xml_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/deepseek/xml_scraper_graph_multi_deepseek.py b/examples/deepseek/xml_scraper_graph_multi_deepseek.py deleted file mode 100644 index 2d190926..00000000 --- a/examples/deepseek/xml_scraper_graph_multi_deepseek.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -Basic example of scraping pipeline using XMLScraperMultiGraph from XML documents -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import XMLScraperMultiGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the XML file -# 
************************************************ - -FILE_NAME = "inputs/books.xml" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -deepseek_key = os.getenv("DEEPSEEK_APIKEY") - -graph_config = { - "llm": { - "model": "deepseek/deepseek-chat", - "api_key": deepseek_key, - }, - "verbose": True, -} -# ************************************************ -# Create the XMLScraperMultiGraph instance and run it -# ************************************************ - -xml_scraper_graph = XMLScraperMultiGraph( - prompt="List me all the authors, title and genres of the books", - source=[text, text], # Pass the content of the file, not the file object - config=graph_config -) - -result = xml_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = xml_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/depth_search_graph/.env.example b/examples/depth_search_graph/.env.example new file mode 100644 index 00000000..8c10cfbb --- /dev/null +++ b/examples/depth_search_graph/.env.example @@ -0,0 +1,14 @@ +# OpenAI API Configuration +OPENAI_API_KEY=your-openai-api-key-here + +# Optional Configurations +MAX_TOKENS=4000 +MODEL_NAME=gpt-4-1106-preview +TEMPERATURE=0.7 + +# Depth Search Settings +MAX_DEPTH=5 +CRAWL_DELAY=1 +RESPECT_ROBOTS_TXT=true +MAX_PAGES_PER_DOMAIN=100 +USER_AGENT=Mozilla/5.0 \ No newline at end of file diff --git a/examples/depth_search_graph/README.md b/examples/depth_search_graph/README.md new file mode 100644 index 
00000000..c4ce05df --- /dev/null +++ b/examples/depth_search_graph/README.md @@ -0,0 +1,30 @@ +# Depth Search Graph Example + +This example demonstrates how to use Scrapegraph-ai for deep web crawling and content exploration. + +## Features + +- Deep web crawling +- Content discovery +- Link analysis +- Recursive search + +## Setup + +1. Install required dependencies +2. Copy `.env.example` to `.env` +3. Configure your API keys in the `.env` file + +## Usage + +```python +from scrapegraphai.graphs import DepthSearchGraph + +graph = DepthSearchGraph() +results = graph.search("https://example.com", depth=3) +``` + +## Environment Variables + +Required environment variables: +- `OPENAI_API_KEY`: Your OpenAI API key \ No newline at end of file diff --git a/examples/local_models/depth_search_graph_ollama.py b/examples/depth_search_graph/ollama/depth_search_graph_ollama.py similarity index 100% rename from examples/local_models/depth_search_graph_ollama.py rename to examples/depth_search_graph/ollama/depth_search_graph_ollama.py diff --git a/examples/openai/depth_search_graph_openai.py b/examples/depth_search_graph/openai/depth_search_graph_openai.py similarity index 100% rename from examples/openai/depth_search_graph_openai.py rename to examples/depth_search_graph/openai/depth_search_graph_openai.py diff --git a/examples/document_scraper_graph/.env.example b/examples/document_scraper_graph/.env.example new file mode 100644 index 00000000..2e7bab46 --- /dev/null +++ b/examples/document_scraper_graph/.env.example @@ -0,0 +1,13 @@ +# OpenAI API Configuration +OPENAI_API_KEY=your-openai-api-key-here + +# Optional Configurations +MAX_TOKENS=4000 +MODEL_NAME=gpt-4-1106-preview +TEMPERATURE=0.7 + +# Document Scraper Settings +OCR_ENABLED=true +EXTRACT_METADATA=true +MAX_FILE_SIZE=10485760 # 10MB +SUPPORTED_FORMATS=pdf,doc,docx,txt \ No newline at end of file diff --git a/examples/document_scraper_graph/README.md b/examples/document_scraper_graph/README.md new file mode 100644 
index 00000000..f8561ee7 --- /dev/null +++ b/examples/document_scraper_graph/README.md @@ -0,0 +1,30 @@ +# Document Scraper Graph Example + +This example demonstrates how to use Scrapegraph-ai to extract data from various document formats (PDF, DOC, DOCX, etc.). + +## Features + +- Multi-format document support +- Text extraction +- Document parsing +- Metadata extraction + +## Setup + +1. Install required dependencies +2. Copy `.env.example` to `.env` +3. Configure your API keys in the `.env` file + +## Usage + +```python +from scrapegraphai.graphs import DocumentScraperGraph + +graph = DocumentScraperGraph() +content = graph.scrape("document.pdf") +``` + +## Environment Variables + +Required environment variables: +- `OPENAI_API_KEY`: Your OpenAI API key \ No newline at end of file diff --git a/examples/local_models/document_scraper_ollama.py b/examples/document_scraper_graph/ollama/document_scraper_ollama.py similarity index 100% rename from examples/local_models/document_scraper_ollama.py rename to examples/document_scraper_graph/ollama/document_scraper_ollama.py diff --git a/examples/anthropic/inputs/plain_html_example.txt b/examples/document_scraper_graph/ollama/inputs/plain_html_example.txt similarity index 100% rename from examples/anthropic/inputs/plain_html_example.txt rename to examples/document_scraper_graph/ollama/inputs/plain_html_example.txt diff --git a/examples/openai/document_scraper_openai.py b/examples/document_scraper_graph/openai/document_scraper_openai.py similarity index 100% rename from examples/openai/document_scraper_openai.py rename to examples/document_scraper_graph/openai/document_scraper_openai.py diff --git a/examples/mistral/inputs/markdown_example.md b/examples/document_scraper_graph/openai/inputs/markdown_example.md similarity index 100% rename from examples/mistral/inputs/markdown_example.md rename to examples/document_scraper_graph/openai/inputs/markdown_example.md diff --git a/examples/bedrock/inputs/plain_html_example.txt 
b/examples/document_scraper_graph/openai/inputs/plain_html_example.txt similarity index 100% rename from examples/bedrock/inputs/plain_html_example.txt rename to examples/document_scraper_graph/openai/inputs/plain_html_example.txt diff --git a/examples/ernie/code_generator_graph_ernie.py b/examples/ernie/code_generator_graph_ernie.py deleted file mode 100644 index 65b8e4b9..00000000 --- a/examples/ernie/code_generator_graph_ernie.py +++ /dev/null @@ -1,61 +0,0 @@ -""" -Basic example of scraping pipeline using Code Generator with schema -""" -import os -from typing import List -from dotenv import load_dotenv -from pydantic import BaseModel, Field -from scrapegraphai.graphs import CodeGeneratorGraph - -load_dotenv() - -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Project(BaseModel): - title: str = Field(description="The title of the project") - description: str = Field(description="The description of the project") - -class Projects(BaseModel): - projects: List[Project] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -openai_key = os.getenv("OPENAI_APIKEY") - -graph_config = { - "llm": { - "model": "ernie/ernie-bot-turbo", - "ernie_client_id": "", - "ernie_client_secret": "", - "temperature": 0.1 - }, - "verbose": True, - "headless": False, - "reduction": 2, - "max_iterations": { - "overall": 10, - "syntax": 3, - "execution": 3, - "validation": 3, - "semantic": 3 - }, - "output_file_name": "extracted_data.py" -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -code_generator_graph = CodeGeneratorGraph( - prompt="List me all the projects with their description", - source="https://perinim.github.io/projects/", - schema=Projects, - config=graph_config -) 
- -result = code_generator_graph.run() -print(result) \ No newline at end of file diff --git a/examples/ernie/csv_scraper_ernie.py b/examples/ernie/csv_scraper_ernie.py deleted file mode 100644 index 6f4335b6..00000000 --- a/examples/ernie/csv_scraper_ernie.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -Basic example of scraping pipeline using CSVScraperGraph from CSV documents -""" -import os -from dotenv import load_dotenv -import pandas as pd -from scrapegraphai.graphs import CSVScraperGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the CSV file -# ************************************************ - -FILE_NAME = "inputs/username.csv" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -text = pd.read_csv(file_path) - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "model": "ernie/ernie-bot-turbo", - "ernie_client_id": "", - "ernie_client_secret": "", - "temperature": 0.1 - } -} - -# ************************************************ -# Create the CSVScraperGraph instance and run it -# ************************************************ - -csv_scraper_graph = CSVScraperGraph( - prompt="List me all the last names", - source=str(text), # Pass the content of the file, not the file object - config=graph_config -) - -result = csv_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = csv_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/ernie/custom_graph_ernie.py b/examples/ernie/custom_graph_ernie.py 
deleted file mode 100644 index a987560e..00000000 --- a/examples/ernie/custom_graph_ernie.py +++ /dev/null @@ -1,106 +0,0 @@ -""" -Example of custom graph using existing nodes -""" -from langchain_openai import OpenAIEmbeddings -from langchain_openai import ChatOpenAI -from scrapegraphai.graphs import BaseGraph -from scrapegraphai.nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode, RobotsNode - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "model": "ernie/ernie-bot-turbo", - "ernie_client_id": "", - "ernie_client_secret": "", - "temperature": 0.1 - } -} - -# ************************************************ -# Define the graph nodes -# ************************************************ - -llm_model = ChatOpenAI(graph_config["llm"]) -embedder = OpenAIEmbeddings(api_key=llm_model.openai_api_key) - -# define the nodes for the graph -robot_node = RobotsNode( - input="url", - output=["is_scrapable"], - node_config={ - "llm_model": llm_model, - "force_scraping": True, - "verbose": True, - } -) - -fetch_node = FetchNode( - input="url | local_dir", - output=["doc"], - node_config={ - "verbose": True, - "headless": True, - } -) -parse_node = ParseNode( - input="doc", - output=["parsed_doc"], - node_config={ - "chunk_size": 4096, - "verbose": True, - } -) -rag_node = RAGNode( - input="user_prompt & (parsed_doc | doc)", - output=["relevant_chunks"], - node_config={ - "llm_model": llm_model, - "embedder_model": embedder, - "verbose": True, - } -) -generate_answer_node = GenerateAnswerNode( - input="user_prompt & (relevant_chunks | parsed_doc | doc)", - output=["answer"], - node_config={ - "llm_model": llm_model, - "verbose": True, - } -) - -# ************************************************ -# Create the graph by defining the connections -# ************************************************ - -graph = BaseGraph( - nodes=[ - robot_node, - 
fetch_node, - parse_node, - rag_node, - generate_answer_node, - ], - edges=[ - (robot_node, fetch_node), - (fetch_node, parse_node), - (parse_node, rag_node), - (rag_node, generate_answer_node) - ], - entry_point=robot_node -) - -# ************************************************ -# Execute the graph -# ************************************************ - -result, execution_info = graph.execute({ - "user_prompt": "Describe the content", - "url": "https://example.com/" -}) - -# get the answer from the result -result = result.get("answer", "No answer found.") -print(result) diff --git a/examples/ernie/depth_search_graph_ernie.py b/examples/ernie/depth_search_graph_ernie.py deleted file mode 100644 index 99470d8d..00000000 --- a/examples/ernie/depth_search_graph_ernie.py +++ /dev/null @@ -1,26 +0,0 @@ -""" -depth_search_graph_opeani example -""" -from scrapegraphai.graphs import DepthSearchGraph - -graph_config = { - "llm": { - "model": "ernie/ernie-bot-turbo", - "ernie_client_id": "", - "ernie_client_secret": "", - "temperature": 0.1 - }, - "verbose": True, - "headless": False, - "depth": 2, - "only_inside_links": False, -} - -search_graph = DepthSearchGraph( - prompt="List me all the projects with their description", - source="https://perinim.github.io", - config=graph_config -) - -result = search_graph.run() -print(result) diff --git a/examples/ernie/document_scraper_anthropic_ernie.py b/examples/ernie/document_scraper_anthropic_ernie.py deleted file mode 100644 index 74d91be1..00000000 --- a/examples/ernie/document_scraper_anthropic_ernie.py +++ /dev/null @@ -1,39 +0,0 @@ -""" -document_scraper example -""" -import os -import json -from scrapegraphai.graphs import DocumentScraperGraph - -# ************************************************ -# Define the configuration for the graph -# ************************************************ -graph_config = { - "llm": { - "model": "ernie/ernie-bot-turbo", - "ernie_client_id": "", - "ernie_client_secret": "", - "temperature": 
0.1 - } -} - - -source = """ - The Divine Comedy, Italian La Divina Commedia, original name La commedia, long narrative poem written in Italian - circa 1308/21 by Dante. It is usually held to be one of the world s great works of literature. - Divided into three major sections—Inferno, Purgatorio, and Paradiso—the narrative traces the journey of Dante - from darkness and error to the revelation of the divine light, culminating in the Beatific Vision of God. - Dante is guided by the Roman poet Virgil, who represents the epitome of human knowledge, from the dark wood - through the descending circles of the pit of Hell (Inferno). He then climbs the mountain of Purgatory, guided - by the Roman poet Statius, who represents the fulfilment of human knowledge, and is finally led by his lifelong love, - the Beatrice of his earlier poetry, through the celestial spheres of Paradise. -""" - -pdf_scraper_graph = DocumentScraperGraph( - prompt="Summarize the text and find the main topics", - source=source, - config=graph_config, -) -result = pdf_scraper_graph.run() - -print(json.dumps(result, indent=4)) \ No newline at end of file diff --git a/examples/ernie/inputs/books.xml b/examples/ernie/inputs/books.xml deleted file mode 100644 index e3d1fe87..00000000 --- a/examples/ernie/inputs/books.xml +++ /dev/null @@ -1,120 +0,0 @@ - - - - Gambardella, Matthew - XML Developer's Guide - Computer - 44.95 - 2000-10-01 - An in-depth look at creating applications - with XML. - - - Ralls, Kim - Midnight Rain - Fantasy - 5.95 - 2000-12-16 - A former architect battles corporate zombies, - an evil sorceress, and her own childhood to become queen - of the world. - - - Corets, Eva - Maeve Ascendant - Fantasy - 5.95 - 2000-11-17 - After the collapse of a nanotechnology - society in England, the young survivors lay the - foundation for a new society. 
- - - Corets, Eva - Oberon's Legacy - Fantasy - 5.95 - 2001-03-10 - In post-apocalypse England, the mysterious - agent known only as Oberon helps to create a new life - for the inhabitants of London. Sequel to Maeve - Ascendant. - - - Corets, Eva - The Sundered Grail - Fantasy - 5.95 - 2001-09-10 - The two daughters of Maeve, half-sisters, - battle one another for control of England. Sequel to - Oberon's Legacy. - - - Randall, Cynthia - Lover Birds - Romance - 4.95 - 2000-09-02 - When Carla meets Paul at an ornithology - conference, tempers fly as feathers get ruffled. - - - Thurman, Paula - Splish Splash - Romance - 4.95 - 2000-11-02 - A deep sea diver finds true love twenty - thousand leagues beneath the sea. - - - Knorr, Stefan - Creepy Crawlies - Horror - 4.95 - 2000-12-06 - An anthology of horror stories about roaches, - centipedes, scorpions and other insects. - - - Kress, Peter - Paradox Lost - Science Fiction - 6.95 - 2000-11-02 - After an inadvertant trip through a Heisenberg - Uncertainty Device, James Salway discovers the problems - of being quantum. - - - O'Brien, Tim - Microsoft .NET: The Programming Bible - Computer - 36.95 - 2000-12-09 - Microsoft's .NET initiative is explored in - detail in this deep programmer's reference. - - - O'Brien, Tim - MSXML3: A Comprehensive Guide - Computer - 36.95 - 2000-12-01 - The Microsoft MSXML3 parser is covered in - detail, with attention to XML DOM interfaces, XSLT processing, - SAX and more. - - - Galos, Mike - Visual Studio 7: A Comprehensive Guide - Computer - 49.95 - 2001-04-16 - Microsoft Visual Studio 7 is explored in depth, - looking at how Visual Basic, Visual C++, C#, and ASP+ are - integrated into a comprehensive development - environment. 
- - \ No newline at end of file diff --git a/examples/ernie/inputs/example.json b/examples/ernie/inputs/example.json deleted file mode 100644 index 2263184c..00000000 --- a/examples/ernie/inputs/example.json +++ /dev/null @@ -1,182 +0,0 @@ -{ - "kind":"youtube#searchListResponse", - "etag":"q4ibjmYp1KA3RqMF4jFLl6PBwOg", - "nextPageToken":"CAUQAA", - "regionCode":"NL", - "pageInfo":{ - "totalResults":1000000, - "resultsPerPage":5 - }, - "items":[ - { - "kind":"youtube#searchResult", - "etag":"QCsHBifbaernVCbLv8Cu6rAeaDQ", - "id":{ - "kind":"youtube#video", - "videoId":"TvWDY4Mm5GM" - }, - "snippet":{ - "publishedAt":"2023-07-24T14:15:01Z", - "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", - "title":"3 Football Clubs Kylian Mbappe Should Avoid Signing ✍️❌⚽️ #football #mbappe #shorts", - "description":"", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"FC Motivate", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T14:15:01Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"0NG5QHdtIQM_V-DBJDEf-jK_Y9k", - "id":{ - "kind":"youtube#video", - "videoId":"aZM_42CcNZ4" - }, - "snippet":{ - "publishedAt":"2023-07-24T16:09:27Z", - "channelId":"UCM5gMM_HqfKHYIEJ3lstMUA", - "title":"Which Football Club Could Cristiano Ronaldo Afford To Buy? 
💰", - "description":"Sign up to Sorare and get a FREE card: https://sorare.pxf.io/NellisShorts Give Soraredata a go for FREE: ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"John Nellis", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T16:09:27Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"WbBz4oh9I5VaYj91LjeJvffrBVY", - "id":{ - "kind":"youtube#video", - "videoId":"wkP3XS3aNAY" - }, - "snippet":{ - "publishedAt":"2023-07-24T16:00:50Z", - "channelId":"UC4EP1dxFDPup_aFLt0ElsDw", - "title":"PAULO DYBALA vs THE WORLD'S LONGEST FREEKICK WALL", - "description":"Can Paulo Dybala curl a football around the World's longest free kick wall? We met up with the World Cup winner and put him to ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"Shoot for Love", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T16:00:50Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"juxv_FhT_l4qrR05S1QTrb4CGh8", - "id":{ - "kind":"youtube#video", - "videoId":"rJkDZ0WvfT8" - }, - "snippet":{ - "publishedAt":"2023-07-24T10:00:39Z", - "channelId":"UCO8qj5u80Ga7N_tP3BZWWhQ", - "title":"TOP 10 DEFENDERS 2023", - "description":"SoccerKingz https://soccerkingz.nl Use code: 'ILOVEHOF' to get 10% off. TOP 10 DEFENDERS 2023 Follow us! 
• Instagram ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"Home of Football", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T10:00:39Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"wtuknXTmI1txoULeH3aWaOuXOow", - "id":{ - "kind":"youtube#video", - "videoId":"XH0rtu4U6SE" - }, - "snippet":{ - "publishedAt":"2023-07-21T16:30:05Z", - "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", - "title":"3 Things You Didn't Know About Erling Haaland ⚽️🇳🇴 #football #haaland #shorts", - "description":"", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"FC Motivate", - "liveBroadcastContent":"none", - "publishTime":"2023-07-21T16:30:05Z" - } - } - ] -} \ No newline at end of file diff --git a/examples/ernie/inputs/plain_html_example.txt b/examples/ernie/inputs/plain_html_example.txt deleted file mode 100644 index 78f814ae..00000000 --- a/examples/ernie/inputs/plain_html_example.txt +++ /dev/null @@ -1,105 +0,0 @@ - -
- - -
-
-
- - -
- \ No newline at end of file diff --git a/examples/ernie/inputs/username.csv b/examples/ernie/inputs/username.csv deleted file mode 100644 index 006ac8e6..00000000 --- a/examples/ernie/inputs/username.csv +++ /dev/null @@ -1,7 +0,0 @@ -Username; Identifier;First name;Last name -booker12;9012;Rachel;Booker -grey07;2070;Laura;Grey -johnson81;4081;Craig;Johnson -jenkins46;9346;Mary;Jenkins -smith79;5079;Jamie;Smith - diff --git a/examples/ernie/json_scraper_ernie.py b/examples/ernie/json_scraper_ernie.py deleted file mode 100644 index 35324da2..00000000 --- a/examples/ernie/json_scraper_ernie.py +++ /dev/null @@ -1,43 +0,0 @@ -""" -Basic example of scraping pipeline using JSONScraperGraph from JSON documents -""" -import os -from scrapegraphai.graphs import JSONScraperGraph -from scrapegraphai.utils import prettify_exec_info - -# ************************************************ -# Read the JSON file -# ************************************************ - -FILE_NAME = "inputs/example.json" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "model": "ernie/ernie-bot-turbo", - "ernie_client_id": "", - "ernie_client_secret": "", - "temperature": 0.1 - } -} - -# ************************************************ -# Create the JSONScraperGraph instance and run it -# ************************************************ - -json_scraper_graph = JSONScraperGraph( - prompt="List me all the authors, title and genres of the books", - source=text, # Pass the content of the file, not the file object - config=graph_config -) - -result = json_scraper_graph.run() -print(result) diff --git a/examples/ernie/rate_limit_ernie.py b/examples/ernie/rate_limit_ernie.py deleted file mode 100644 
index 043029a7..00000000 --- a/examples/ernie/rate_limit_ernie.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper with a custom rate limit -""" -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "model": "ernie/ernie-bot-turbo", - "ernie_client_id": "", - "ernie_client_secret": "", - "temperature": 0.1, - "rate_limit": { - "requests_per_second": 1 - }, - }, - "library": "beautifulsoup" -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the projects with their description", - # also accepts a string with the already downloaded HTML code - source="https://perinim.github.io/projects/", - config=graph_config, -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/ernie/scrape_plain_text_ernie.py b/examples/ernie/scrape_plain_text_ernie.py deleted file mode 100644 index dde49537..00000000 --- a/examples/ernie/scrape_plain_text_ernie.py +++ /dev/null @@ -1,54 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper from text -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the text file -# 
************************************************ - -FILE_NAME = "inputs/plain_html_example.txt" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -# It could be also a http request using the request model -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "model": "ernie/ernie-bot-turbo", - "ernie_client_id": "", - "ernie_client_secret": "", - "temperature": 0.1 - } -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the projects with their description.", - source=text, - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/ernie/script_generator_ernie.py b/examples/ernie/script_generator_ernie.py deleted file mode 100644 index f518739c..00000000 --- a/examples/ernie/script_generator_ernie.py +++ /dev/null @@ -1,43 +0,0 @@ -""" -Basic example of scraping pipeline using ScriptCreatorGraph -""" -from dotenv import load_dotenv -from scrapegraphai.graphs import ScriptCreatorGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "model": "ernie/ernie-bot-turbo", - "ernie_client_id": "", - "ernie_client_secret": "", - "temperature": 0.1 - }, - "library": 
"beautifulsoup" -} - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -script_creator_graph = ScriptCreatorGraph( - prompt="List me all the projects with their description.", - # also accepts a string with the already downloaded HTML code - source="https://perinim.github.io/projects", - config=graph_config -) - -result = script_creator_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = script_creator_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/ernie/script_multi_generator_ernie.py b/examples/ernie/script_multi_generator_ernie.py deleted file mode 100644 index 4b3c88f7..00000000 --- a/examples/ernie/script_multi_generator_ernie.py +++ /dev/null @@ -1,49 +0,0 @@ -""" -Basic example of scraping pipeline using ScriptCreatorGraph -""" -from scrapegraphai.graphs import ScriptCreatorMultiGraph -from scrapegraphai.utils import prettify_exec_info - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "model": "ernie/ernie-bot-turbo", - "ernie_client_id": "", - "ernie_client_secret": "", - "temperature": 0.1 - }, - "library": "beautifulsoup" -} - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -urls=[ - "https://schultzbergagency.com/emil-raste-karlsen/", - "https://schultzbergagency.com/johanna-hedberg/", -] - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -script_creator_graph = ScriptCreatorMultiGraph( - prompt="Find information about actors", - 
# also accepts a string with the already downloaded HTML code - source=urls, - config=graph_config -) - -result = script_creator_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = script_creator_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/ernie/search_graph_ernie.py b/examples/ernie/search_graph_ernie.py deleted file mode 100644 index ff9b3d8b..00000000 --- a/examples/ernie/search_graph_ernie.py +++ /dev/null @@ -1,33 +0,0 @@ -""" -Example of Search Graph -""" -from dotenv import load_dotenv -from scrapegraphai.graphs import SearchGraph - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "model": "ernie/ernie-bot-turbo", - "ernie_client_id": "", - "ernie_client_secret": "", - "temperature": 0.1 - }, - "library": "beautifulsoup" -} - -# ************************************************ -# Create the SearchGraph instance and run it -# ************************************************ - -search_graph = SearchGraph( - prompt="List me Chioggia's famous dishes", - config=graph_config -) - -result = search_graph.run() -print(result) diff --git a/examples/ernie/search_link_graph_ernie.py b/examples/ernie/search_link_graph_ernie.py deleted file mode 100644 index 645dd505..00000000 --- a/examples/ernie/search_link_graph_ernie.py +++ /dev/null @@ -1,42 +0,0 @@ -""" -Example of Search Graph -""" -from scrapegraphai.graphs import SearchGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "model": "ernie/ernie-bot-turbo", - "ernie_client_id": "", 
- "ernie_client_secret": "", - "temperature": 0.1 - }, - "library": "beautifulsoup" -} - -# ************************************************ -# Create the SearchGraph instance and run it -# ************************************************ - -search_graph = SearchGraph( - prompt="List me the best escursions near Trento", - config=graph_config -) - -result = search_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = search_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json and csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/ernie/smart_scraper_ernie.py b/examples/ernie/smart_scraper_ernie.py deleted file mode 100644 index 4bbe608a..00000000 --- a/examples/ernie/smart_scraper_ernie.py +++ /dev/null @@ -1,40 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "model": "ernie/ernie-bot-turbo", - "ernie_client_id": "", - "ernie_client_secret": "", - "temperature": 0.1 - }, - "library": "beautifulsoup" -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the projects with their description", - # also accepts a string with the already downloaded HTML code - source="https://perinim.github.io/projects/", - config=graph_config, -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# 
************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/ernie/smart_scraper_lite_ernie.py b/examples/ernie/smart_scraper_lite_ernie.py deleted file mode 100644 index 5d3ba9d9..00000000 --- a/examples/ernie/smart_scraper_lite_ernie.py +++ /dev/null @@ -1,31 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperLiteGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -graph_config = { - "llm": { - "api_key": os.getenv("ERNIE_API_KEY"), - "model": "ernie/ernie-bot-4", - }, - "verbose": True, - "headless": False, -} - -smart_scraper_lite_graph = SmartScraperLiteGraph( - prompt="Who is Marco Perini?", - source="https://perinim.github.io/", - config=graph_config -) - -result = smart_scraper_lite_graph.run() -print(json.dumps(result, indent=4)) - -graph_exec_info = smart_scraper_lite_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/ernie/smart_scraper_multi_concat_ernie.py b/examples/ernie/smart_scraper_multi_concat_ernie.py deleted file mode 100644 index 5be9898d..00000000 --- a/examples/ernie/smart_scraper_multi_concat_ernie.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import json -from scrapegraphai.graphs import SmartScraperMultiConcatGraph - -# ************************************************ -# Define the configuration for the graph -# ************************************************ -graph_config = { - "llm": { - "model": "ernie/ernie-bot-turbo", - "ernie_client_id": "", - "ernie_client_secret": "", - "temperature": 0.1 - }, - "library": "beautifulsoup" -} - -# ******************************************************* -# Create the SmartScraperMultiGraph instance and run it -# 
******************************************************* - -multiple_search_graph = SmartScraperMultiConcatGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/ernie/smart_scraper_multi_ernie.py b/examples/ernie/smart_scraper_multi_ernie.py deleted file mode 100644 index 4e44ab6a..00000000 --- a/examples/ernie/smart_scraper_multi_ernie.py +++ /dev/null @@ -1,41 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiGraph - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "model": "ernie/ernie-bot-turbo", - "ernie_client_id": "", - "ernie_client_secret": "", - "temperature": 0.1 - }, - "verbose": True, - "headless": False, -} - -# ******************************************************* -# Create the SmartScraperMultiGraph instance and run it -# ******************************************************* - -multiple_search_graph = SmartScraperMultiGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/ernie/smart_scraper_multi_lite_ernie.py b/examples/ernie/smart_scraper_multi_lite_ernie.py deleted file mode 100644 index 777a760e..00000000 --- a/examples/ernie/smart_scraper_multi_lite_ernie.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import 
SmartScraperMultiLiteGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -graph_config = { - "llm": { - "api_key": os.getenv("ERNIE_API_KEY"), - "model": "ernie/ernie-bot-4", - }, - "verbose": True, - "headless": False, -} - -smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - config=graph_config -) - -result = smart_scraper_multi_lite_graph.run() -print(json.dumps(result, indent=4)) - -graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - diff --git a/examples/ernie/smart_scraper_schema_ernie.py b/examples/ernie/smart_scraper_schema_ernie.py deleted file mode 100644 index e9d9ab0a..00000000 --- a/examples/ernie/smart_scraper_schema_ernie.py +++ /dev/null @@ -1,54 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper with schema -""" -import json -import os -from typing import Dict -from dotenv import load_dotenv -from pydantic import BaseModel -from scrapegraphai.graphs import SmartScraperGraph - -load_dotenv() - -# ************************************************ -# Define the output schema for the graph -# ************************************************ -class Project(BaseModel): - title: str - description: str - - -class Projects(BaseModel): - Projects: Dict[str, Project] - - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -openai_key = os.getenv("OPENAI_APIKEY") - -graph_config = { - "llm": { - "model": "ernie/ernie-bot-turbo", - "ernie_client_id": "", - "ernie_client_secret": "", - "temperature": 0.1 - }, - "verbose": True, - "headless": False, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = 
SmartScraperGraph( - prompt="List me all the projects with their description", - source="https://perinim.github.io/projects/", - schema=Projects, - config=graph_config, -) - -result = smart_scraper_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/ernie/speech_graph_ernie.py b/examples/ernie/speech_graph_ernie.py deleted file mode 100644 index 0b4ed620..00000000 --- a/examples/ernie/speech_graph_ernie.py +++ /dev/null @@ -1,58 +0,0 @@ -""" -Basic example of scraping pipeline using SpeechSummaryGraph -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SpeechGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define audio output path -# ************************************************ - -FILE_NAME = "website_summary.mp3" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -output_path = os.path.join(curr_dir, FILE_NAME) - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -openai_key = os.getenv("OPENAI_APIKEY") - -graph_config = { - "llm": { - "model": "ernie/ernie-bot-turbo", - "ernie_client_id": "", - "ernie_client_secret": "", - "temperature": 0.1 - }, - "tts_model": { - "api_key": openai_key, - "model": "tts-1", - "voice": "alloy" - }, - "output_path": output_path, -} - -# ************************************************ -# Create the SpeechGraph instance and run it -# ************************************************ - -speech_graph = SpeechGraph( - prompt="Make a detailed audio summary of the projects.", - source="https://perinim.github.io/projects/", - config=graph_config, -) - -result = speech_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = speech_graph.get_execution_info() 
-print(prettify_exec_info(graph_exec_info)) diff --git a/examples/ernie/xml_scraper_ernie.py b/examples/ernie/xml_scraper_ernie.py deleted file mode 100644 index 90a1230a..00000000 --- a/examples/ernie/xml_scraper_ernie.py +++ /dev/null @@ -1,61 +0,0 @@ -""" -Basic example of scraping pipeline using XMLScraperGraph from XML documents -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import XMLScraperGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the XML file -# ************************************************ - -FILE_NAME = "inputs/books.xml" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -openai_key = os.getenv("OPENAI_APIKEY") - -graph_config = { - "llm": { - "model": "ernie/ernie-bot-turbo", - "ernie_client_id": "", - "ernie_client_secret": "", - "temperature": 0.1 - }, - "verbose":False, -} - -# ************************************************ -# Create the XMLScraperGraph instance and run it -# ************************************************ - -xml_scraper_graph = XMLScraperGraph( - prompt="List me all the authors, title and genres of the books", - source=text, # Pass the content of the file, not the file object - config=graph_config -) - -result = xml_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = xml_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") - diff --git 
a/examples/fireworks/.env.example b/examples/fireworks/.env.example deleted file mode 100644 index ab200215..00000000 --- a/examples/fireworks/.env.example +++ /dev/null @@ -1 +0,0 @@ -FIREWORKS_APIKEY="your fireworks api key" diff --git a/examples/fireworks/code_generator_graph_fireworks.py b/examples/fireworks/code_generator_graph_fireworks.py deleted file mode 100644 index e38c48a1..00000000 --- a/examples/fireworks/code_generator_graph_fireworks.py +++ /dev/null @@ -1,60 +0,0 @@ -""" -Basic example of scraping pipeline using Code Generator with schema -""" -import os -import json -from typing import List -from dotenv import load_dotenv -from pydantic import BaseModel, Field -from scrapegraphai.graphs import CodeGeneratorGraph - -load_dotenv() - -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Project(BaseModel): - title: str = Field(description="The title of the project") - description: str = Field(description="The description of the project") - -class Projects(BaseModel): - projects: List[Project] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -fireworks_api_key = os.getenv("FIREWORKS_APIKEY") - -graph_config = { - "llm": { - "api_key": fireworks_api_key, - "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "verbose": True, - "headless": False, - "reduction": 2, - "max_iterations": { - "overall": 10, - "syntax": 3, - "execution": 3, - "validation": 3, - "semantic": 3 - }, - "output_file_name": "extracted_data.py" -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -code_generator_graph = CodeGeneratorGraph( - prompt="List me all the projects with their description", - source="https://perinim.github.io/projects/", - 
schema=Projects, - config=graph_config -) - -result = code_generator_graph.run() -print(result) \ No newline at end of file diff --git a/examples/fireworks/csv_scraper_fireworks.py b/examples/fireworks/csv_scraper_fireworks.py deleted file mode 100644 index c380f9bd..00000000 --- a/examples/fireworks/csv_scraper_fireworks.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -Basic example of scraping pipeline using CSVScraperGraph from CSV documents -""" -import os -from dotenv import load_dotenv -import pandas as pd -from scrapegraphai.graphs import CSVScraperGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the CSV file -# ************************************************ - -FILE_NAME = "inputs/username.csv" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -text = pd.read_csv(file_path) - -# ************************************************ -# Define the configuration for the graph -# ************************************************ -fireworks_api_key = os.getenv("FIREWORKS_APIKEY") - -graph_config = { - "llm": { - "api_key": fireworks_api_key, - "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "headless": False, -} - -# ************************************************ -# Create the CSVScraperGraph instance and run it -# ************************************************ - -csv_scraper_graph = CSVScraperGraph( - prompt="List me all the last names", - source=str(text), # Pass the content of the file, not the file object - config=graph_config -) - -result = csv_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = csv_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, 
"result") -convert_to_json(result, "result") diff --git a/examples/fireworks/csv_scraper_graph_multi_fireworks.py b/examples/fireworks/csv_scraper_graph_multi_fireworks.py deleted file mode 100644 index 61518822..00000000 --- a/examples/fireworks/csv_scraper_graph_multi_fireworks.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -Basic example of scraping pipeline using CSVScraperMultiGraph from CSV documents -""" -import os -from dotenv import load_dotenv -import pandas as pd -from scrapegraphai.graphs import CSVScraperMultiGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() -# ************************************************ -# Read the CSV file -# ************************************************ - -FILE_NAME = "inputs/username.csv" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -text = pd.read_csv(file_path) - -# ************************************************ -# Define the configuration for the graph -# ************************************************ -fireworks_api_key = os.getenv("FIREWORKS_APIKEY") - -graph_config = { - "llm": { - "api_key": fireworks_api_key, - "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "verbose": True, - "headless": False, -} - -# ************************************************ -# Create the CSVScraperMultiGraph instance and run it -# ************************************************ - -csv_scraper_graph = CSVScraperMultiGraph( - prompt="List me all the last names", - source=[str(text), str(text)], - config=graph_config -) - -result = csv_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = csv_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff 
--git a/examples/fireworks/custom_graph_fireworks.py b/examples/fireworks/custom_graph_fireworks.py deleted file mode 100644 index 518e9df3..00000000 --- a/examples/fireworks/custom_graph_fireworks.py +++ /dev/null @@ -1,98 +0,0 @@ -""" -Example of custom graph using existing nodes -""" -import os -from dotenv import load_dotenv -from langchain_openai import ChatOpenAI -from scrapegraphai.graphs import BaseGraph -from scrapegraphai.nodes import FetchNode, ParseNode, GenerateAnswerNode, RobotsNode -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -fireworks_api_key = os.getenv("FIREWORKS_APIKEY") - -graph_config = { - "llm": { - "api_key": fireworks_api_key, - "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "verbose": True, - "headless": False, -} - -# ************************************************ -# Define the graph nodes -# ************************************************ - -llm_model = ChatOpenAI(graph_config["llm"]) - -# define the nodes for the graph -robot_node = RobotsNode( - input="url", - output=["is_scrapable"], - node_config={ - "llm_model": llm_model, - "force_scraping": True, - "verbose": True, - } -) - -fetch_node = FetchNode( - input="url | local_dir", - output=["doc"], - node_config={ - "verbose": True, - "headless": True, - } -) -parse_node = ParseNode( - input="doc", - output=["parsed_doc"], - node_config={ - "chunk_size": 4096, - "verbose": True, - } -) - -generate_answer_node = GenerateAnswerNode( - input="user_prompt & (relevant_chunks | parsed_doc | doc)", - output=["answer"], - node_config={ - "llm_model": llm_model, - "verbose": True, - } -) - -# ************************************************ -# Create the graph by defining the connections -# ************************************************ - -graph = BaseGraph( - nodes=[ - robot_node, - fetch_node, - parse_node, - ], - edges=[ - (robot_node, 
fetch_node), - (fetch_node, parse_node), - (parse_node, generate_answer_node) - ], - entry_point=robot_node -) - -# ************************************************ -# Execute the graph -# ************************************************ - -result, execution_info = graph.execute({ - "user_prompt": "Describe the content", - "url": "https://example.com/" -}) - -# get the answer from the result -result = result.get("answer", "No answer found.") -print(result) diff --git a/examples/fireworks/depth_search_graph_fireworks.py b/examples/fireworks/depth_search_graph_fireworks.py deleted file mode 100644 index f467be9f..00000000 --- a/examples/fireworks/depth_search_graph_fireworks.py +++ /dev/null @@ -1,30 +0,0 @@ -""" -depth_search_graph_opeani example -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import DepthSearchGraph - -load_dotenv() - -fireworks_api_key = os.getenv("FIREWORKS_APIKEY") - -graph_config = { - "llm": { - "api_key": fireworks_api_key, - "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "verbose": True, - "headless": False, - "depth": 2, - "only_inside_links": False, -} - -search_graph = DepthSearchGraph( - prompt="List me all the projects with their description", - source="https://perinim.github.io", - config=graph_config -) - -result = search_graph.run() -print(result) diff --git a/examples/fireworks/document_scraper_anthropic_fireworks.py b/examples/fireworks/document_scraper_anthropic_fireworks.py deleted file mode 100644 index 33f6c0d5..00000000 --- a/examples/fireworks/document_scraper_anthropic_fireworks.py +++ /dev/null @@ -1,44 +0,0 @@ -""" -document_scraper example -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import DocumentScraperGraph - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ -fireworks_api_key = os.getenv("FIREWORKS_APIKEY") - 
-graph_config = { - "llm": { - "api_key": fireworks_api_key, - "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "verbose": True, - "headless": False, -} - - -source = """ - The Divine Comedy, Italian La Divina Commedia, original name La commedia, long narrative poem written in Italian - circa 1308/21 by Dante. It is usually held to be one of the world s great works of literature. - Divided into three major sections—Inferno, Purgatorio, and Paradiso—the narrative traces the journey of Dante - from darkness and error to the revelation of the divine light, culminating in the Beatific Vision of God. - Dante is guided by the Roman poet Virgil, who represents the epitome of human knowledge, from the dark wood - through the descending circles of the pit of Hell (Inferno). He then climbs the mountain of Purgatory, guided - by the Roman poet Statius, who represents the fulfilment of human knowledge, and is finally led by his lifelong love, - the Beatrice of his earlier poetry, through the celestial spheres of Paradise. -""" - -pdf_scraper_graph = DocumentScraperGraph( - prompt="Summarize the text and find the main topics", - source=source, - config=graph_config, -) -result = pdf_scraper_graph.run() - -print(json.dumps(result, indent=4)) \ No newline at end of file diff --git a/examples/fireworks/inputs/books.xml b/examples/fireworks/inputs/books.xml deleted file mode 100644 index e3d1fe87..00000000 --- a/examples/fireworks/inputs/books.xml +++ /dev/null @@ -1,120 +0,0 @@ - - - - Gambardella, Matthew - XML Developer's Guide - Computer - 44.95 - 2000-10-01 - An in-depth look at creating applications - with XML. - - - Ralls, Kim - Midnight Rain - Fantasy - 5.95 - 2000-12-16 - A former architect battles corporate zombies, - an evil sorceress, and her own childhood to become queen - of the world. 
- - - Corets, Eva - Maeve Ascendant - Fantasy - 5.95 - 2000-11-17 - After the collapse of a nanotechnology - society in England, the young survivors lay the - foundation for a new society. - - - Corets, Eva - Oberon's Legacy - Fantasy - 5.95 - 2001-03-10 - In post-apocalypse England, the mysterious - agent known only as Oberon helps to create a new life - for the inhabitants of London. Sequel to Maeve - Ascendant. - - - Corets, Eva - The Sundered Grail - Fantasy - 5.95 - 2001-09-10 - The two daughters of Maeve, half-sisters, - battle one another for control of England. Sequel to - Oberon's Legacy. - - - Randall, Cynthia - Lover Birds - Romance - 4.95 - 2000-09-02 - When Carla meets Paul at an ornithology - conference, tempers fly as feathers get ruffled. - - - Thurman, Paula - Splish Splash - Romance - 4.95 - 2000-11-02 - A deep sea diver finds true love twenty - thousand leagues beneath the sea. - - - Knorr, Stefan - Creepy Crawlies - Horror - 4.95 - 2000-12-06 - An anthology of horror stories about roaches, - centipedes, scorpions and other insects. - - - Kress, Peter - Paradox Lost - Science Fiction - 6.95 - 2000-11-02 - After an inadvertant trip through a Heisenberg - Uncertainty Device, James Salway discovers the problems - of being quantum. - - - O'Brien, Tim - Microsoft .NET: The Programming Bible - Computer - 36.95 - 2000-12-09 - Microsoft's .NET initiative is explored in - detail in this deep programmer's reference. - - - O'Brien, Tim - MSXML3: A Comprehensive Guide - Computer - 36.95 - 2000-12-01 - The Microsoft MSXML3 parser is covered in - detail, with attention to XML DOM interfaces, XSLT processing, - SAX and more. - - - Galos, Mike - Visual Studio 7: A Comprehensive Guide - Computer - 49.95 - 2001-04-16 - Microsoft Visual Studio 7 is explored in depth, - looking at how Visual Basic, Visual C++, C#, and ASP+ are - integrated into a comprehensive development - environment. 
- - \ No newline at end of file diff --git a/examples/fireworks/inputs/example.json b/examples/fireworks/inputs/example.json deleted file mode 100644 index 2263184c..00000000 --- a/examples/fireworks/inputs/example.json +++ /dev/null @@ -1,182 +0,0 @@ -{ - "kind":"youtube#searchListResponse", - "etag":"q4ibjmYp1KA3RqMF4jFLl6PBwOg", - "nextPageToken":"CAUQAA", - "regionCode":"NL", - "pageInfo":{ - "totalResults":1000000, - "resultsPerPage":5 - }, - "items":[ - { - "kind":"youtube#searchResult", - "etag":"QCsHBifbaernVCbLv8Cu6rAeaDQ", - "id":{ - "kind":"youtube#video", - "videoId":"TvWDY4Mm5GM" - }, - "snippet":{ - "publishedAt":"2023-07-24T14:15:01Z", - "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", - "title":"3 Football Clubs Kylian Mbappe Should Avoid Signing ✍️❌⚽️ #football #mbappe #shorts", - "description":"", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"FC Motivate", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T14:15:01Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"0NG5QHdtIQM_V-DBJDEf-jK_Y9k", - "id":{ - "kind":"youtube#video", - "videoId":"aZM_42CcNZ4" - }, - "snippet":{ - "publishedAt":"2023-07-24T16:09:27Z", - "channelId":"UCM5gMM_HqfKHYIEJ3lstMUA", - "title":"Which Football Club Could Cristiano Ronaldo Afford To Buy? 
💰", - "description":"Sign up to Sorare and get a FREE card: https://sorare.pxf.io/NellisShorts Give Soraredata a go for FREE: ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"John Nellis", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T16:09:27Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"WbBz4oh9I5VaYj91LjeJvffrBVY", - "id":{ - "kind":"youtube#video", - "videoId":"wkP3XS3aNAY" - }, - "snippet":{ - "publishedAt":"2023-07-24T16:00:50Z", - "channelId":"UC4EP1dxFDPup_aFLt0ElsDw", - "title":"PAULO DYBALA vs THE WORLD'S LONGEST FREEKICK WALL", - "description":"Can Paulo Dybala curl a football around the World's longest free kick wall? We met up with the World Cup winner and put him to ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"Shoot for Love", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T16:00:50Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"juxv_FhT_l4qrR05S1QTrb4CGh8", - "id":{ - "kind":"youtube#video", - "videoId":"rJkDZ0WvfT8" - }, - "snippet":{ - "publishedAt":"2023-07-24T10:00:39Z", - "channelId":"UCO8qj5u80Ga7N_tP3BZWWhQ", - "title":"TOP 10 DEFENDERS 2023", - "description":"SoccerKingz https://soccerkingz.nl Use code: 'ILOVEHOF' to get 10% off. TOP 10 DEFENDERS 2023 Follow us! 
• Instagram ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"Home of Football", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T10:00:39Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"wtuknXTmI1txoULeH3aWaOuXOow", - "id":{ - "kind":"youtube#video", - "videoId":"XH0rtu4U6SE" - }, - "snippet":{ - "publishedAt":"2023-07-21T16:30:05Z", - "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", - "title":"3 Things You Didn't Know About Erling Haaland ⚽️🇳🇴 #football #haaland #shorts", - "description":"", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"FC Motivate", - "liveBroadcastContent":"none", - "publishTime":"2023-07-21T16:30:05Z" - } - } - ] -} \ No newline at end of file diff --git a/examples/fireworks/inputs/plain_html_example.txt b/examples/fireworks/inputs/plain_html_example.txt deleted file mode 100644 index 78f814ae..00000000 --- a/examples/fireworks/inputs/plain_html_example.txt +++ /dev/null @@ -1,105 +0,0 @@ - -
- - -
-
-
- - -
- \ No newline at end of file diff --git a/examples/fireworks/inputs/username.csv b/examples/fireworks/inputs/username.csv deleted file mode 100644 index 006ac8e6..00000000 --- a/examples/fireworks/inputs/username.csv +++ /dev/null @@ -1,7 +0,0 @@ -Username; Identifier;First name;Last name -booker12;9012;Rachel;Booker -grey07;2070;Laura;Grey -johnson81;4081;Craig;Johnson -jenkins46;9346;Mary;Jenkins -smith79;5079;Jamie;Smith - diff --git a/examples/fireworks/json_scraper_fireworkspy.py b/examples/fireworks/json_scraper_fireworkspy.py deleted file mode 100644 index ef1b8264..00000000 --- a/examples/fireworks/json_scraper_fireworkspy.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -Basic example of scraping pipeline using JSONScraperGraph from JSON documents -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import JSONScraperGraph -from scrapegraphai.utils import prettify_exec_info -load_dotenv() - -# ************************************************ -# Read the JSON file -# ************************************************ - -FILE_NAME = "inputs/example.json" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -fireworks_api_key = os.getenv("FIREWORKS_APIKEY") - -graph_config = { - "llm": { - "api_key": fireworks_api_key, - "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "verbose": True, - "headless": False, -} - -# ************************************************ -# Create the JSONScraperGraph instance and run it -# ************************************************ - -json_scraper_graph = JSONScraperGraph( - prompt="List me all the authors, title and genres of the books", - source=text, # Pass the content of the file, not the file object - 
config=graph_config -) - -result = json_scraper_graph.run() -print(result) diff --git a/examples/fireworks/json_scraper_multi_fireworks.py b/examples/fireworks/json_scraper_multi_fireworks.py deleted file mode 100644 index cd16c525..00000000 --- a/examples/fireworks/json_scraper_multi_fireworks.py +++ /dev/null @@ -1,39 +0,0 @@ -""" -Module for showing how PDFScraper multi works -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import JSONScraperMultiGraph - -load_dotenv() - -fireworks_api_key = os.getenv("FIREWORKS_APIKEY") - -graph_config = { - "llm": { - "api_key": fireworks_api_key, - "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "verbose": True, - "headless": False, -} - -FILE_NAME = "inputs/example.json" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -sources = [text, text] - -multiple_search_graph = JSONScraperMultiGraph( - prompt= "List me all the authors, title and genres of the books", - source= sources, - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/fireworks/rate_limit_fireworks.py b/examples/fireworks/rate_limit_fireworks.py deleted file mode 100644 index 813b6d5d..00000000 --- a/examples/fireworks/rate_limit_fireworks.py +++ /dev/null @@ -1,49 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper with a custom rate limit -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -fireworks_api_key = os.getenv("FIREWORKS_APIKEY") - -graph_config = { - "llm": { - "api_key": 
fireworks_api_key, - "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct", - "rate_limit": { - "requests_per_second": 1 - }, - }, - "verbose": True, - "headless": False, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the projects with their description", - # also accepts a string with the already downloaded HTML code - source="https://perinim.github.io/projects/", - config=graph_config, -) - -result = smart_scraper_graph.run() -print(json.dumps(result, indent=4)) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/fireworks/scrape_plain_text_fireworks.py b/examples/fireworks/scrape_plain_text_fireworks.py deleted file mode 100644 index c82bdf15..00000000 --- a/examples/fireworks/scrape_plain_text_fireworks.py +++ /dev/null @@ -1,54 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper from text -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the text file -# ************************************************ - -FILE_NAME = "inputs/plain_html_example.txt" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -# It could be also a http request using the request model -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -fireworks_api_key = 
os.getenv("FIREWORKS_APIKEY") - -graph_config = { - "llm": { - "api_key": fireworks_api_key, - "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the projects with their description.", - source=text, - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/fireworks/script_generator_fireworks.py b/examples/fireworks/script_generator_fireworks.py deleted file mode 100644 index d195cbdc..00000000 --- a/examples/fireworks/script_generator_fireworks.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -Basic example of scraping pipeline using ScriptCreatorGraph -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import ScriptCreatorGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -fireworks_api_key = os.getenv("FIREWORKS_APIKEY") - -graph_config = { - "llm": { - "api_key": fireworks_api_key, - "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "verbose": True, - "headless": False, - "library": "beautifulsoup" - -} - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -script_creator_graph = ScriptCreatorGraph( - prompt="List me all the projects with their description.", - # also accepts a string with the already downloaded HTML code - 
source="https://perinim.github.io/projects", - config=graph_config -) - -result = script_creator_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = script_creator_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/fireworks/script_generator_schema_fireworks.py b/examples/fireworks/script_generator_schema_fireworks.py deleted file mode 100644 index 20e46fb7..00000000 --- a/examples/fireworks/script_generator_schema_fireworks.py +++ /dev/null @@ -1,60 +0,0 @@ -""" -Basic example of scraping pipeline using ScriptCreatorGraph -""" -import os -from typing import List -from dotenv import load_dotenv -from pydantic import BaseModel, Field -from scrapegraphai.graphs import ScriptCreatorGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the schema for the graph -# ************************************************ - -class Project(BaseModel): - title: str = Field(description="The title of the project") - description: str = Field(description="The description of the project") - -class Projects(BaseModel): - projects: List[Project] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -fireworks_api_key = os.getenv("FIREWORKS_APIKEY") - -graph_config = { - "llm": { - "api_key": fireworks_api_key, - "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "verbose": True, - "library": "beautifulsoup", -} - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -script_creator_graph = ScriptCreatorGraph( - prompt="List me all the projects with their description.", - # also accepts a string with the already 
downloaded HTML code - source="https://perinim.github.io/projects", - config=graph_config, - schema=Projects -) - -result = script_creator_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = script_creator_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - diff --git a/examples/fireworks/script_multi_generator_fireworks.py b/examples/fireworks/script_multi_generator_fireworks.py deleted file mode 100644 index c0f474dc..00000000 --- a/examples/fireworks/script_multi_generator_fireworks.py +++ /dev/null @@ -1,52 +0,0 @@ -""" -Basic example of scraping pipeline using ScriptCreatorGraph -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import ScriptCreatorMultiGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -fireworks_api_key = os.getenv("FIREWORKS_APIKEY") - -graph_config = { - "llm": { - "api_key": fireworks_api_key, - "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "verbose": True, - "library": "beautifulsoup", -} -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ -urls=[ - "https://schultzbergagency.com/emil-raste-karlsen/", - "https://schultzbergagency.com/johanna-hedberg/", -] - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -script_creator_graph = ScriptCreatorMultiGraph( - prompt="Find information about actors", - # also accepts a string with the already downloaded HTML code - source=urls, - config=graph_config -) - -result = script_creator_graph.run() -print(result) - -# 
************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = script_creator_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/fireworks/search_graph_fireworks.py b/examples/fireworks/search_graph_fireworks.py deleted file mode 100644 index 72728a28..00000000 --- a/examples/fireworks/search_graph_fireworks.py +++ /dev/null @@ -1,36 +0,0 @@ -""" -Example of Search Graph -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SearchGraph - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -fireworks_api_key = os.getenv("FIREWORKS_APIKEY") - -graph_config = { - "llm": { - "api_key": fireworks_api_key, - "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "max_results": 2, - "verbose": True, - "headless": False, -} - -# ************************************************ -# Create the SearchGraph instance and run it -# ************************************************ - -search_graph = SearchGraph( - prompt="List me Chioggia's famous dishes", - config=graph_config -) - -result = search_graph.run() -print(result) diff --git a/examples/fireworks/search_graph_schema_fireworks.py b/examples/fireworks/search_graph_schema_fireworks.py deleted file mode 100644 index bd54a69a..00000000 --- a/examples/fireworks/search_graph_schema_fireworks.py +++ /dev/null @@ -1,62 +0,0 @@ -""" -Example of Search Graph -""" - -import os -from typing import List -from dotenv import load_dotenv -from pydantic import BaseModel, Field -from scrapegraphai.graphs import SearchGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the output schema for the graph -# 
************************************************ - -class Dish(BaseModel): - name: str = Field(description="The name of the dish") - description: str = Field(description="The description of the dish") - -class Dishes(BaseModel): - dishes: List[Dish] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -fireworks_api_key = os.getenv("FIREWORKS_APIKEY") - -graph_config = { - "llm": { - "api_key": fireworks_api_key, - "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "max_results": 2, - "verbose": True, - "headless": False, -} -# ************************************************ -# Create the SearchGraph instance and run it -# ************************************************ - -search_graph = SearchGraph( - prompt="List me Chioggia's famous dishes", - config=graph_config, - schema=Dishes -) - -result = search_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = search_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json and csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/fireworks/search_link_graph_fireworks.py b/examples/fireworks/search_link_graph_fireworks.py deleted file mode 100644 index e71e2a4f..00000000 --- a/examples/fireworks/search_link_graph_fireworks.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -Example of Search Graph -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SearchGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -load_dotenv() - -fireworks_api_key = os.getenv("FIREWORKS_APIKEY") - -graph_config = { - 
"llm": { - "api_key": fireworks_api_key, - "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "max_results": 2, - "verbose": True, - "headless": False, -} -# ************************************************ -# Create the SearchGraph instance and run it -# ************************************************ - -search_graph = SearchGraph( - prompt="List me the best escursions near Trento", - config=graph_config -) - -result = search_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = search_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json and csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/fireworks/smart_scraper_fireworks.py b/examples/fireworks/smart_scraper_fireworks.py deleted file mode 100644 index 2ccac269..00000000 --- a/examples/fireworks/smart_scraper_fireworks.py +++ /dev/null @@ -1,46 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -fireworks_api_key = os.getenv("FIREWORKS_APIKEY") - -graph_config = { - "llm": { - "api_key": fireworks_api_key, - "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "verbose": True, - "headless": False, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the projects with their description", - # also accepts a string with the already downloaded HTML 
code - source="https://perinim.github.io/projects/", - config=graph_config, -) - -result = smart_scraper_graph.run() -print(json.dumps(result, indent=4)) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/fireworks/smart_scraper_lite_fireworks.py b/examples/fireworks/smart_scraper_lite_fireworks.py deleted file mode 100644 index 6c9a7745..00000000 --- a/examples/fireworks/smart_scraper_lite_fireworks.py +++ /dev/null @@ -1,31 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperLiteGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -graph_config = { - "llm": { - "api_key": os.getenv("FIREWORKS_API_KEY"), - "model": "fireworks/llama-v2-70b-chat", - }, - "verbose": True, - "headless": False, -} - -smart_scraper_lite_graph = SmartScraperLiteGraph( - prompt="Who is Marco Perini?", - source="https://perinim.github.io/", - config=graph_config -) - -result = smart_scraper_lite_graph.run() -print(json.dumps(result, indent=4)) - -graph_exec_info = smart_scraper_lite_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/fireworks/smart_scraper_multi_concat_fireworks.py b/examples/fireworks/smart_scraper_multi_concat_fireworks.py deleted file mode 100644 index c0da49a3..00000000 --- a/examples/fireworks/smart_scraper_multi_concat_fireworks.py +++ /dev/null @@ -1,37 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiConcatGraph - -load_dotenv() - -fireworks_api_key = os.getenv("FIREWORKS_APIKEY") - -graph_config = { - "llm": { - "api_key": fireworks_api_key, - 
"model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "verbose": True, - "headless": False, -} - -# ******************************************************* -# Create the SmartScraperMultiGraph instance and run it -# ******************************************************* - -multiple_search_graph = SmartScraperMultiConcatGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/fireworks/smart_scraper_multi_fireworks.py b/examples/fireworks/smart_scraper_multi_fireworks.py deleted file mode 100644 index a75f9ab1..00000000 --- a/examples/fireworks/smart_scraper_multi_fireworks.py +++ /dev/null @@ -1,40 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiGraph - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ -fireworks_api_key = os.getenv("FIREWORKS_APIKEY") - -graph_config = { - "llm": { - "api_key": fireworks_api_key, - "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "verbose": True, - "headless": False, -} - -# ******************************************************* -# Create the SmartScraperMultiGraph instance and run it -# ******************************************************* - -multiple_search_graph = SmartScraperMultiGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/fireworks/smart_scraper_multi_lite_fireworks.py 
b/examples/fireworks/smart_scraper_multi_lite_fireworks.py deleted file mode 100644 index 4ffaf6bb..00000000 --- a/examples/fireworks/smart_scraper_multi_lite_fireworks.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiLiteGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -graph_config = { - "llm": { - "api_key": os.getenv("FIREWORKS_API_KEY"), - "model": "fireworks/llama-v2-70b-chat", - }, - "verbose": True, - "headless": False, -} - -smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - config=graph_config -) - -result = smart_scraper_multi_lite_graph.run() -print(json.dumps(result, indent=4)) - -graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - diff --git a/examples/fireworks/smart_scraper_schema_fireworks.py b/examples/fireworks/smart_scraper_schema_fireworks.py deleted file mode 100644 index b576bc7d..00000000 --- a/examples/fireworks/smart_scraper_schema_fireworks.py +++ /dev/null @@ -1,49 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper with schema -""" -import os -from typing import List -from dotenv import load_dotenv -from pydantic import BaseModel, Field -from scrapegraphai.graphs import SmartScraperGraph - -load_dotenv() - -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Project(BaseModel): - title: str = Field(description="The title of the project") - description: str = Field(description="The description of the project") - -class Projects(BaseModel): - projects: List[Project] - -# ************************************************ -# Define the configuration for 
the graph -# ************************************************ - -fireworks_api_key = os.getenv("FIREWORKS_APIKEY") - -graph_config = { - "llm": { - "api_key": fireworks_api_key, - "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "verbose": True, - "headless": False, -} -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the projects with their description", - source="https://perinim.github.io/projects/", - schema=Projects, - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) diff --git a/examples/fireworks/xml_scraper_fireworks.py b/examples/fireworks/xml_scraper_fireworks.py deleted file mode 100644 index 88673cf6..00000000 --- a/examples/fireworks/xml_scraper_fireworks.py +++ /dev/null @@ -1,58 +0,0 @@ -""" -Basic example of scraping pipeline using XMLScraperGraph from XML documents -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import XMLScraperGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the XML file -# ************************************************ - -FILE_NAME = "inputs/books.xml" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ -fireworks_api_key = os.getenv("FIREWORKS_APIKEY") - -graph_config = { - "llm": { - "api_key": fireworks_api_key, - "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "verbose": True, - "headless": False, -} - -# ************************************************ -# 
Create the XMLScraperGraph instance and run it -# ************************************************ - -xml_scraper_graph = XMLScraperGraph( - prompt="List me all the authors, title and genres of the books", - source=text, # Pass the content of the file, not the file object - config=graph_config -) - -result = xml_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = xml_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/fireworks/xml_scraper_graph_multi_fireworks.py b/examples/fireworks/xml_scraper_graph_multi_fireworks.py deleted file mode 100644 index 1744325b..00000000 --- a/examples/fireworks/xml_scraper_graph_multi_fireworks.py +++ /dev/null @@ -1,58 +0,0 @@ -""" -Basic example of scraping pipeline using XMLScraperMultiGraph from XML documents -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import XMLScraperMultiGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the XML file -# ************************************************ - -FILE_NAME = "inputs/books.xml" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -fireworks_api_key = os.getenv("FIREWORKS_APIKEY") - -graph_config = { - "llm": { - "api_key": fireworks_api_key, - "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "verbose": True, - "headless": False, -} -# 
************************************************ -# Create the XMLScraperMultiGraph instance and run it -# ************************************************ - -xml_scraper_graph = XMLScraperMultiGraph( - prompt="List me all the authors, title and genres of the books", - source=[text, text], # Pass the content of the file, not the file object - config=graph_config -) - -result = xml_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = xml_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/google_genai/.env.example b/examples/google_genai/.env.example deleted file mode 100644 index fc0dacb0..00000000 --- a/examples/google_genai/.env.example +++ /dev/null @@ -1 +0,0 @@ -GOOGLE_APIKEY="your google api key" diff --git a/examples/google_genai/code_generator_graph_gemini.py b/examples/google_genai/code_generator_graph_gemini.py deleted file mode 100644 index 48ea9833..00000000 --- a/examples/google_genai/code_generator_graph_gemini.py +++ /dev/null @@ -1,59 +0,0 @@ -""" -Basic example of scraping pipeline using Code Generator with schema -""" -import os -from typing import List -from dotenv import load_dotenv -from pydantic import BaseModel, Field -from scrapegraphai.graphs import CodeGeneratorGraph - -load_dotenv() - -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Project(BaseModel): - title: str = Field(description="The title of the project") - description: str = Field(description="The description of the project") - -class Projects(BaseModel): - projects: List[Project] - -# ************************************************ -# Define the configuration for the graph -# 
************************************************ - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_genai/gemini-pro", - }, - "verbose": True, - "headless": False, - "reduction": 2, - "max_iterations": { - "overall": 10, - "syntax": 3, - "execution": 3, - "validation": 3, - "semantic": 3 - }, - "output_file_name": "extracted_data.py" -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -code_generator_graph = CodeGeneratorGraph( - prompt="List me all the projects with their description", - source="https://perinim.github.io/projects/", - schema=Projects, - config=graph_config -) - -result = code_generator_graph.run() -print(result) diff --git a/examples/google_genai/csv_scraper_gemini.py b/examples/google_genai/csv_scraper_gemini.py deleted file mode 100644 index cb792169..00000000 --- a/examples/google_genai/csv_scraper_gemini.py +++ /dev/null @@ -1,52 +0,0 @@ -""" -Basic example of scraping pipeline using CSVScraperGraph from CSV documents -""" -import os -from dotenv import load_dotenv -import pandas as pd -from scrapegraphai.graphs import CSVScraperGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the csv file -# ************************************************ - -text = pd.read_csv("inputs/username.csv") - -# ************************************************ -# Define the configuration for the graph -# ************************************************ -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_genai/gemini-pro", - }, -} - -# ************************************************ -# Create the CSVScraperGraph instance and run it -# ************************************************ - -csv_scraper_graph = 
CSVScraperGraph( - prompt="List me all the last names", - source=str(text), # Pass the content of the file, not the file object - config=graph_config -) - -result = csv_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = csv_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/google_genai/csv_scraper_graph_multi_gemini.py b/examples/google_genai/csv_scraper_graph_multi_gemini.py deleted file mode 100644 index a7b252ee..00000000 --- a/examples/google_genai/csv_scraper_graph_multi_gemini.py +++ /dev/null @@ -1,56 +0,0 @@ -""" -Basic example of scraping pipeline using CSVScraperMultiGraph from CSV documents -""" -import os -from dotenv import load_dotenv -import pandas as pd -from scrapegraphai.graphs import CSVScraperMultiGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() -# ************************************************ -# Read the CSV file -# ************************************************ - -FILE_NAME = "inputs/username.csv" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -text = pd.read_csv(file_path) - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_genai/gemini-pro", - }, -} - -# ************************************************ -# Create the CSVScraperMultiGraph instance and run it -# ************************************************ - -csv_scraper_graph = CSVScraperMultiGraph( - prompt="List me all the last names", - source=[str(text), str(text)], - 
config=graph_config -) - -result = csv_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = csv_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/google_genai/depth_search_graph_gemini.py b/examples/google_genai/depth_search_graph_gemini.py deleted file mode 100644 index 956341f4..00000000 --- a/examples/google_genai/depth_search_graph_gemini.py +++ /dev/null @@ -1,30 +0,0 @@ -""" -depth_search_graph_opeani example -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import DepthSearchGraph - -load_dotenv() - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_genai/gemini-pro", - }, - "verbose": True, - "headless": False, - "depth": 2, - "only_inside_links": False, -} - -search_graph = DepthSearchGraph( - prompt="List me all the projects with their description", - source="https://perinim.github.io", - config=graph_config -) - -result = search_graph.run() -print(result) diff --git a/examples/google_genai/document_scraper_gemini.py b/examples/google_genai/document_scraper_gemini.py deleted file mode 100644 index efb22d68..00000000 --- a/examples/google_genai/document_scraper_gemini.py +++ /dev/null @@ -1,41 +0,0 @@ -""" -document_scraper example -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import DocumentScraperGraph - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_genai/gemini-pro", - }, -} - -source = """ - The Divine Comedy, Italian 
La Divina Commedia, original name La commedia, long narrative poem written in Italian - circa 1308/21 by Dante. It is usually held to be one of the world s great works of literature. - Divided into three major sections—Inferno, Purgatorio, and Paradiso—the narrative traces the journey of Dante - from darkness and error to the revelation of the divine light, culminating in the Beatific Vision of God. - Dante is guided by the Roman poet Virgil, who represents the epitome of human knowledge, from the dark wood - through the descending circles of the pit of Hell (Inferno). He then climbs the mountain of Purgatory, guided - by the Roman poet Statius, who represents the fulfilment of human knowledge, and is finally led by his lifelong love, - the Beatrice of his earlier poetry, through the celestial spheres of Paradise. -""" - -pdf_scraper_graph = DocumentScraperGraph( - prompt="Summarize the text and find the main topics", - source=source, - config=graph_config, -) -result = pdf_scraper_graph.run() - -print(json.dumps(result, indent=4)) \ No newline at end of file diff --git a/examples/google_genai/inputs/books.xml b/examples/google_genai/inputs/books.xml deleted file mode 100644 index e3d1fe87..00000000 --- a/examples/google_genai/inputs/books.xml +++ /dev/null @@ -1,120 +0,0 @@ - - - - Gambardella, Matthew - XML Developer's Guide - Computer - 44.95 - 2000-10-01 - An in-depth look at creating applications - with XML. - - - Ralls, Kim - Midnight Rain - Fantasy - 5.95 - 2000-12-16 - A former architect battles corporate zombies, - an evil sorceress, and her own childhood to become queen - of the world. - - - Corets, Eva - Maeve Ascendant - Fantasy - 5.95 - 2000-11-17 - After the collapse of a nanotechnology - society in England, the young survivors lay the - foundation for a new society. 
- - - Corets, Eva - Oberon's Legacy - Fantasy - 5.95 - 2001-03-10 - In post-apocalypse England, the mysterious - agent known only as Oberon helps to create a new life - for the inhabitants of London. Sequel to Maeve - Ascendant. - - - Corets, Eva - The Sundered Grail - Fantasy - 5.95 - 2001-09-10 - The two daughters of Maeve, half-sisters, - battle one another for control of England. Sequel to - Oberon's Legacy. - - - Randall, Cynthia - Lover Birds - Romance - 4.95 - 2000-09-02 - When Carla meets Paul at an ornithology - conference, tempers fly as feathers get ruffled. - - - Thurman, Paula - Splish Splash - Romance - 4.95 - 2000-11-02 - A deep sea diver finds true love twenty - thousand leagues beneath the sea. - - - Knorr, Stefan - Creepy Crawlies - Horror - 4.95 - 2000-12-06 - An anthology of horror stories about roaches, - centipedes, scorpions and other insects. - - - Kress, Peter - Paradox Lost - Science Fiction - 6.95 - 2000-11-02 - After an inadvertant trip through a Heisenberg - Uncertainty Device, James Salway discovers the problems - of being quantum. - - - O'Brien, Tim - Microsoft .NET: The Programming Bible - Computer - 36.95 - 2000-12-09 - Microsoft's .NET initiative is explored in - detail in this deep programmer's reference. - - - O'Brien, Tim - MSXML3: A Comprehensive Guide - Computer - 36.95 - 2000-12-01 - The Microsoft MSXML3 parser is covered in - detail, with attention to XML DOM interfaces, XSLT processing, - SAX and more. - - - Galos, Mike - Visual Studio 7: A Comprehensive Guide - Computer - 49.95 - 2001-04-16 - Microsoft Visual Studio 7 is explored in depth, - looking at how Visual Basic, Visual C++, C#, and ASP+ are - integrated into a comprehensive development - environment. 
- - \ No newline at end of file diff --git a/examples/google_genai/inputs/example.json b/examples/google_genai/inputs/example.json deleted file mode 100644 index 2263184c..00000000 --- a/examples/google_genai/inputs/example.json +++ /dev/null @@ -1,182 +0,0 @@ -{ - "kind":"youtube#searchListResponse", - "etag":"q4ibjmYp1KA3RqMF4jFLl6PBwOg", - "nextPageToken":"CAUQAA", - "regionCode":"NL", - "pageInfo":{ - "totalResults":1000000, - "resultsPerPage":5 - }, - "items":[ - { - "kind":"youtube#searchResult", - "etag":"QCsHBifbaernVCbLv8Cu6rAeaDQ", - "id":{ - "kind":"youtube#video", - "videoId":"TvWDY4Mm5GM" - }, - "snippet":{ - "publishedAt":"2023-07-24T14:15:01Z", - "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", - "title":"3 Football Clubs Kylian Mbappe Should Avoid Signing ✍️❌⚽️ #football #mbappe #shorts", - "description":"", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"FC Motivate", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T14:15:01Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"0NG5QHdtIQM_V-DBJDEf-jK_Y9k", - "id":{ - "kind":"youtube#video", - "videoId":"aZM_42CcNZ4" - }, - "snippet":{ - "publishedAt":"2023-07-24T16:09:27Z", - "channelId":"UCM5gMM_HqfKHYIEJ3lstMUA", - "title":"Which Football Club Could Cristiano Ronaldo Afford To Buy? 
💰", - "description":"Sign up to Sorare and get a FREE card: https://sorare.pxf.io/NellisShorts Give Soraredata a go for FREE: ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"John Nellis", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T16:09:27Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"WbBz4oh9I5VaYj91LjeJvffrBVY", - "id":{ - "kind":"youtube#video", - "videoId":"wkP3XS3aNAY" - }, - "snippet":{ - "publishedAt":"2023-07-24T16:00:50Z", - "channelId":"UC4EP1dxFDPup_aFLt0ElsDw", - "title":"PAULO DYBALA vs THE WORLD'S LONGEST FREEKICK WALL", - "description":"Can Paulo Dybala curl a football around the World's longest free kick wall? We met up with the World Cup winner and put him to ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"Shoot for Love", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T16:00:50Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"juxv_FhT_l4qrR05S1QTrb4CGh8", - "id":{ - "kind":"youtube#video", - "videoId":"rJkDZ0WvfT8" - }, - "snippet":{ - "publishedAt":"2023-07-24T10:00:39Z", - "channelId":"UCO8qj5u80Ga7N_tP3BZWWhQ", - "title":"TOP 10 DEFENDERS 2023", - "description":"SoccerKingz https://soccerkingz.nl Use code: 'ILOVEHOF' to get 10% off. TOP 10 DEFENDERS 2023 Follow us! 
• Instagram ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"Home of Football", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T10:00:39Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"wtuknXTmI1txoULeH3aWaOuXOow", - "id":{ - "kind":"youtube#video", - "videoId":"XH0rtu4U6SE" - }, - "snippet":{ - "publishedAt":"2023-07-21T16:30:05Z", - "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", - "title":"3 Things You Didn't Know About Erling Haaland ⚽️🇳🇴 #football #haaland #shorts", - "description":"", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"FC Motivate", - "liveBroadcastContent":"none", - "publishTime":"2023-07-21T16:30:05Z" - } - } - ] -} \ No newline at end of file diff --git a/examples/google_genai/inputs/plain_html_example.txt b/examples/google_genai/inputs/plain_html_example.txt deleted file mode 100644 index 78f814ae..00000000 --- a/examples/google_genai/inputs/plain_html_example.txt +++ /dev/null @@ -1,105 +0,0 @@ - -
- - -
-
-
- - -
- \ No newline at end of file diff --git a/examples/google_genai/inputs/username.csv b/examples/google_genai/inputs/username.csv deleted file mode 100644 index 006ac8e6..00000000 --- a/examples/google_genai/inputs/username.csv +++ /dev/null @@ -1,7 +0,0 @@ -Username; Identifier;First name;Last name -booker12;9012;Rachel;Booker -grey07;2070;Laura;Grey -johnson81;4081;Craig;Johnson -jenkins46;9346;Mary;Jenkins -smith79;5079;Jamie;Smith - diff --git a/examples/google_genai/json_scraper_gemini.py b/examples/google_genai/json_scraper_gemini.py deleted file mode 100644 index 343f1d42..00000000 --- a/examples/google_genai/json_scraper_gemini.py +++ /dev/null @@ -1,46 +0,0 @@ -""" -Basic example of scraping pipeline using JSONScraperGraph from JSON documents -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import JSONScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the JSON file -# ************************************************ - -FILE_NAME = "inputs/example.json" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_genai/gemini-pro", - }, -} - -# ************************************************ -# Create the JSONScraperGraph instance and run it -# ************************************************ - -json_scraper_graph = JSONScraperGraph( - prompt="List me all the authors, title and genres of the books", - source=text, # Pass the content of the file, not the file object - config=graph_config -) - -result = json_scraper_graph.run() -print(result) diff --git 
a/examples/google_genai/json_scraper_multi_gemini.py b/examples/google_genai/json_scraper_multi_gemini.py deleted file mode 100644 index 573faa97..00000000 --- a/examples/google_genai/json_scraper_multi_gemini.py +++ /dev/null @@ -1,38 +0,0 @@ -""" -Module for showing how JSONScraperMultiGraph multi works -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import JSONScraperMultiGraph - -load_dotenv() - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_genai/gemini-pro", - }, - "library": "beautifulsoup" -} - -FILE_NAME = "inputs/example.json" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -sources = [text, text] - -multiple_search_graph = JSONScraperMultiGraph( - prompt= "List me all the authors, title and genres of the books", - source= sources, - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/google_genai/rate_limit_gemini.py b/examples/google_genai/rate_limit_gemini.py deleted file mode 100644 index f3e2c555..00000000 --- a/examples/google_genai/rate_limit_gemini.py +++ /dev/null @@ -1,46 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper with a custom rate limit -""" -import os -from dotenv import load_dotenv -from scrapegraphai.utils import prettify_exec_info -from scrapegraphai.graphs import SmartScraperGraph - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_genai/gemini-pro", - "rate_limit": { - "requests_per_second": 1 - } - }, -} - -# ************************************************ -# 
Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the news with their description.", - # also accepts a string with the already downloaded HTML code - source="https://www.wired.com", - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/google_genai/readme.md b/examples/google_genai/readme.md deleted file mode 100644 index 7e06773d..00000000 --- a/examples/google_genai/readme.md +++ /dev/null @@ -1 +0,0 @@ -This folder contains an example of how to use ScrapeGraph-AI with Gemini, a large language model (LLM) from Google AI. The example shows how to extract information from a website using a natural language prompt. 
\ No newline at end of file diff --git a/examples/google_genai/scrape_plain_text_gemini.py b/examples/google_genai/scrape_plain_text_gemini.py deleted file mode 100644 index f554cede..00000000 --- a/examples/google_genai/scrape_plain_text_gemini.py +++ /dev/null @@ -1,56 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper from text -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the text file -# ************************************************ - -FILE_NAME = "inputs/plain_html_example.txt" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -# It could be also a http request using the request model -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_genai/gemini-pro", - "temperature": 0, - "streaming": True - }, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the news with their description.", - source=text, - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/google_genai/scrape_xml_gemini.py b/examples/google_genai/scrape_xml_gemini.py deleted file mode 100644 index 
af8868ea..00000000 --- a/examples/google_genai/scrape_xml_gemini.py +++ /dev/null @@ -1,56 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper from XML documents -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the XML file -# ************************************************ - -FILE_NAME = "inputs/books.xml" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_genai/gemini-pro", - "temperature": 0, - "streaming": True - }, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the authors, title and genres of the books", - source=text, # Pass the content of the file, not the file object - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/google_genai/script_generator_gemini.py b/examples/google_genai/script_generator_gemini.py deleted file mode 100644 index fdf61f87..00000000 --- a/examples/google_genai/script_generator_gemini.py +++ /dev/null @@ -1,44 +0,0 @@ -""" -Basic example of scraping pipeline using ScriptCreatorGraph -""" 
-import os -from dotenv import load_dotenv -from scrapegraphai.graphs import ScriptCreatorGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_genai/gemini-pro", - }, - "library": "beautifoulsoup" -} - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -smart_scraper_graph = ScriptCreatorGraph( - prompt="List me all the news with their description.", - # also accepts a string with the already downloaded HTML code - source="https://perinim.github.io/projects", - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/google_genai/script_multi_generator_gemini.py b/examples/google_genai/script_multi_generator_gemini.py deleted file mode 100644 index 3ef0e108..00000000 --- a/examples/google_genai/script_multi_generator_gemini.py +++ /dev/null @@ -1,53 +0,0 @@ -""" -Basic example of scraping pipeline using ScriptCreatorGraph -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import ScriptCreatorMultiGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_genai/gemini-pro", - }, - "library": 
"beautifoulsoup" -} - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -urls=[ - "https://schultzbergagency.com/emil-raste-karlsen/", - "https://schultzbergagency.com/johanna-hedberg/", -] - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -script_creator_graph = ScriptCreatorMultiGraph( - prompt="Find information about actors", - # also accepts a string with the already downloaded HTML code - source=urls, - config=graph_config -) - -result = script_creator_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = script_creator_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/google_genai/search_graph_gemini.py b/examples/google_genai/search_graph_gemini.py deleted file mode 100644 index d001b34d..00000000 --- a/examples/google_genai/search_graph_gemini.py +++ /dev/null @@ -1,41 +0,0 @@ -""" -Example of Search Graph -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SearchGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_genai/gemini-pro", - "temperature": 0, - "streaming": True - }, - "max_results": 5, - "verbose": True, -} - -# ************************************************ -# Create the SearchGraph instance and run it -# ************************************************ - -search_graph = SearchGraph( - prompt="List me all the regions of Italy.", - 
config=graph_config -) - -result = search_graph.run() -print(result) - -# Save to json and csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/google_genai/search_graph_schema_gemini.py b/examples/google_genai/search_graph_schema_gemini.py deleted file mode 100644 index c55854c5..00000000 --- a/examples/google_genai/search_graph_schema_gemini.py +++ /dev/null @@ -1,58 +0,0 @@ -""" -Example of Search Graph -""" -import os -from typing import List -from dotenv import load_dotenv -from pydantic import BaseModel, Field -from scrapegraphai.graphs import SearchGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Dish(BaseModel): - name: str = Field(description="The name of the dish") - description: str = Field(description="The description of the dish") - -class Dishes(BaseModel): - dishes: List[Dish] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_genai/gemini-pro", - }, -} - -# ************************************************ -# Create the SearchGraph instance and run it -# ************************************************ - -search_graph = SearchGraph( - prompt="List me Chioggia's famous dishes", - config=graph_config, - schema=Dishes -) - -result = search_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = search_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json and csv -convert_to_csv(result, "result") -convert_to_json(result, "result") 
diff --git a/examples/google_genai/search_link_graph_gemini.py b/examples/google_genai/search_link_graph_gemini.py deleted file mode 100644 index 084cea41..00000000 --- a/examples/google_genai/search_link_graph_gemini.py +++ /dev/null @@ -1,44 +0,0 @@ -""" -Example of Search Graph -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SearchGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -load_dotenv() - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_genai/gemini-pro", - }, -} -# ************************************************ -# Create the SearchGraph instance and run it -# ************************************************ - -search_graph = SearchGraph( - prompt="List me the best escursions near Trento", - config=graph_config -) - -result = search_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = search_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json and csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/google_genai/smart_scraper_gemini.py b/examples/google_genai/smart_scraper_gemini.py deleted file mode 100644 index cb59e34f..00000000 --- a/examples/google_genai/smart_scraper_gemini.py +++ /dev/null @@ -1,44 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.utils import prettify_exec_info -from scrapegraphai.graphs import SmartScraperGraph -load_dotenv() - - -# ************************************************ -# Define the configuration for the graph -# 
************************************************ - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_genai/gemini-pro", - }, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the news with their description.", - # also accepts a string with the already downloaded HTML code - source="https://www.wired.com", - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/google_genai/smart_scraper_lite_google_genai.py b/examples/google_genai/smart_scraper_lite_google_genai.py deleted file mode 100644 index 9b776735..00000000 --- a/examples/google_genai/smart_scraper_lite_google_genai.py +++ /dev/null @@ -1,31 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperLiteGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -graph_config = { - "llm": { - "api_key": os.getenv("GOOGLE_API_KEY"), - "model": "gemini-pro", - }, - "verbose": True, - "headless": False, -} - -smart_scraper_lite_graph = SmartScraperLiteGraph( - prompt="Who is Marco Perini?", - source="https://perinim.github.io/", - config=graph_config -) - -result = smart_scraper_lite_graph.run() -print(json.dumps(result, indent=4)) - -graph_exec_info = smart_scraper_lite_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/google_genai/smart_scraper_multi_concat_gemini.py 
b/examples/google_genai/smart_scraper_multi_concat_gemini.py deleted file mode 100644 index bf6ee544..00000000 --- a/examples/google_genai/smart_scraper_multi_concat_gemini.py +++ /dev/null @@ -1,39 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiConcatGraph - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_genai/gemini-pro", - }, -} - -# ******************************************************* -# Create the SmartScraperMultiGraph instance and run it -# ******************************************************* - -multiple_search_graph = SmartScraperMultiConcatGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/google_genai/smart_scraper_multi_gemini.py b/examples/google_genai/smart_scraper_multi_gemini.py deleted file mode 100644 index db721db9..00000000 --- a/examples/google_genai/smart_scraper_multi_gemini.py +++ /dev/null @@ -1,39 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiGraph - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_genai/gemini-pro", - }, -} - -# ******************************************************* -# Create the 
SmartScraperMultiGraph instance and run it -# ******************************************************* - -multiple_search_graph = SmartScraperMultiGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/google_genai/smart_scraper_multi_lite_gemini.py b/examples/google_genai/smart_scraper_multi_lite_gemini.py deleted file mode 100644 index e69de29b..00000000 diff --git a/examples/google_genai/smart_scraper_multi_lite_google_genai.py b/examples/google_genai/smart_scraper_multi_lite_google_genai.py deleted file mode 100644 index e14e2ceb..00000000 --- a/examples/google_genai/smart_scraper_multi_lite_google_genai.py +++ /dev/null @@ -1,34 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiLiteGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -graph_config = { - "llm": { - "api_key": os.getenv("GOOGLE_API_KEY"), - "model": "gemini-pro", - }, - "verbose": True, - "headless": False, -} - -smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - config=graph_config -) - -result = smart_scraper_multi_lite_graph.run() -print(json.dumps(result, indent=4)) - -graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/google_genai/smart_scraper_schema_gemini.py b/examples/google_genai/smart_scraper_schema_gemini.py deleted file mode 100644 index 7037dc08..00000000 --- a/examples/google_genai/smart_scraper_schema_gemini.py +++ /dev/null @@ -1,56 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper with schema -""" 
-import os -from typing import List -from pydantic import BaseModel, Field -from dotenv import load_dotenv -from scrapegraphai.utils import prettify_exec_info -from scrapegraphai.graphs import SmartScraperGraph - -load_dotenv() - -# ************************************************ -# Define the output schema for the graph -# ************************************************ -class Project(BaseModel): - title: str = Field(description="The title of the project") - description: str = Field(description="The description of the project") - -class Projects(BaseModel): - projects: List[Project] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_genai/gemini-pro", - }, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the news with their description.", - # also accepts a string with the already downloaded HTML code - source="https://www.wired.com", - schema=Projects, - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/google_genai/xml_scraper_gemini.py b/examples/google_genai/xml_scraper_gemini.py deleted file mode 100644 index 3c3dc342..00000000 --- a/examples/google_genai/xml_scraper_gemini.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -Basic example of scraping pipeline using XMLScraperGraph from XML documents -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import XMLScraperGraph -from 
scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the XML file -# ************************************************ - -FILE_NAME = "inputs/books.xml" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_genai/gemini-pro", - }, -} -# ************************************************ -# Create the XMLScraperGraph instance and run it -# ************************************************ - -xml_scraper_graph = XMLScraperGraph( - prompt="List me all the authors, title and genres of the books", - source=text, # Pass the content of the file, not the file object - config=graph_config -) - -result = xml_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = xml_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") - diff --git a/examples/google_genai/xml_scraper_graph_multi_gemini.py b/examples/google_genai/xml_scraper_graph_multi_gemini.py deleted file mode 100644 index 15bc2485..00000000 --- a/examples/google_genai/xml_scraper_graph_multi_gemini.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -Basic example of scraping pipeline using XMLScraperMultiGraph from XML documents -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import XMLScraperMultiGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, 
prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the XML file -# ************************************************ - -FILE_NAME = "inputs/books.xml" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_genai/gemini-pro", - }, -} - -# ************************************************ -# Create the XMLScraperMultiGraph instance and run it -# ************************************************ - -xml_scraper_graph = XMLScraperMultiGraph( - prompt="List me all the authors, title and genres of the books", - source=[text, text], # Pass the content of the file, not the file object - config=graph_config -) - -result = xml_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = xml_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/google_vertexai/.env.example b/examples/google_vertexai/.env.example deleted file mode 100644 index fc0dacb0..00000000 --- a/examples/google_vertexai/.env.example +++ /dev/null @@ -1 +0,0 @@ -GOOGLE_APIKEY="your google api key" diff --git a/examples/google_vertexai/code_generator_graph_vertex.py b/examples/google_vertexai/code_generator_graph_vertex.py deleted file mode 100644 index 28f40174..00000000 --- a/examples/google_vertexai/code_generator_graph_vertex.py +++ /dev/null @@ -1,60 +0,0 @@ -""" -Basic example of scraping pipeline using Code 
Generator with schema -""" - -import os, json -from typing import List -from dotenv import load_dotenv -from pydantic import BaseModel, Field -from scrapegraphai.graphs import CodeGeneratorGraph - -load_dotenv() - -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Project(BaseModel): - title: str = Field(description="The title of the project") - description: str = Field(description="The description of the project") - -class Projects(BaseModel): - projects: List[Project] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_vertexai/gemini-1.5-pro", - }, - "verbose": True, - "headless": False, - "reduction": 2, - "max_iterations": { - "overall": 10, - "syntax": 3, - "execution": 3, - "validation": 3, - "semantic": 3 - }, - "output_file_name": "extracted_data.py" -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -code_generator_graph = CodeGeneratorGraph( - prompt="List me all the projects with their description", - source="https://perinim.github.io/projects/", - schema=Projects, - config=graph_config -) - -result = code_generator_graph.run() -print(result) \ No newline at end of file diff --git a/examples/google_vertexai/csv_scraper_gemini.py b/examples/google_vertexai/csv_scraper_gemini.py deleted file mode 100644 index e5de1f17..00000000 --- a/examples/google_vertexai/csv_scraper_gemini.py +++ /dev/null @@ -1,53 +0,0 @@ -""" -Basic example of scraping pipeline using CSVScraperGraph from CSV documents -""" - -import os -from dotenv import load_dotenv -import pandas as pd -from scrapegraphai.graphs import CSVScraperGraph -from 
scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the csv file -# ************************************************ - -text = pd.read_csv("inputs/username.csv") - -# ************************************************ -# Define the configuration for the graph -# ************************************************ -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_vertexai/gemini-1.5-pro", - }, -} - -# ************************************************ -# Create the CSVScraperGraph instance and run it -# ************************************************ - -csv_scraper_graph = CSVScraperGraph( - prompt="List me all the last names", - source=str(text), # Pass the content of the file, not the file object - config=graph_config -) - -result = csv_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = csv_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/google_vertexai/csv_scraper_graph_multi_gemini.py b/examples/google_vertexai/csv_scraper_graph_multi_gemini.py deleted file mode 100644 index 1318acfb..00000000 --- a/examples/google_vertexai/csv_scraper_graph_multi_gemini.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -Basic example of scraping pipeline using CSVScraperMultiGraph from CSV documents -""" - -import os -from dotenv import load_dotenv -import pandas as pd -from scrapegraphai.graphs import CSVScraperMultiGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() -# ************************************************ -# Read the CSV file -# ************************************************ 
- -FILE_NAME = "inputs/username.csv" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -text = pd.read_csv(file_path) - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_vertexai/gemini-1.5-pro", - }, -} - -# ************************************************ -# Create the CSVScraperMultiGraph instance and run it -# ************************************************ - -csv_scraper_graph = CSVScraperMultiGraph( - prompt="List me all the last names", - source=[str(text), str(text)], - config=graph_config -) - -result = csv_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = csv_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/google_vertexai/custom_graph_gemini.py b/examples/google_vertexai/custom_graph_gemini.py deleted file mode 100644 index 7feff114..00000000 --- a/examples/google_vertexai/custom_graph_gemini.py +++ /dev/null @@ -1,84 +0,0 @@ -""" -Example of custom graph using Gemini Google model -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.models import Gemini -from scrapegraphai.graphs import BaseGraph -from scrapegraphai.nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_vertexai/gemini-1.5-pro", - 
"temperature": 0, - "streaming": True - }, -} - -# ************************************************ -# Define the graph nodes -# ************************************************ - -llm_model = Gemini(graph_config["llm"]) - -# define the nodes for the graph -fetch_node = FetchNode( - input="url | local_dir", - output=["doc"], -) -parse_node = ParseNode( - input="doc", - output=["parsed_doc"], - node_config={"chunk_size": 4096} -) -rag_node = RAGNode( - input="user_prompt & (parsed_doc | doc)", - output=["relevant_chunks"], - node_config={"llm": llm_model}, -) -generate_answer_node = GenerateAnswerNode( - input="user_prompt & (relevant_chunks | parsed_doc | doc)", - output=["answer"], - node_config={"llm": llm_model}, -) - -# ************************************************ -# Create the graph by defining the connections -# ************************************************ - -graph = BaseGraph( - nodes={ - fetch_node, - parse_node, - rag_node, - generate_answer_node, - }, - edges={ - (fetch_node, parse_node), - (parse_node, rag_node), - (rag_node, generate_answer_node) - }, - entry_point=fetch_node -) - -# ************************************************ -# Execute the graph -# ************************************************ - -result, execution_info = graph.execute({ - "user_prompt": "List me the projects with their description", - "url": "https://perinim.github.io/projects/" -}) - -# get the answer from the result -result = result.get("answer", "No answer found.") -print(result) diff --git a/examples/google_vertexai/depth_search_graph_gemini.py b/examples/google_vertexai/depth_search_graph_gemini.py deleted file mode 100644 index 13bba630..00000000 --- a/examples/google_vertexai/depth_search_graph_gemini.py +++ /dev/null @@ -1,30 +0,0 @@ -""" -depth_search_graph_opeani example -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import DepthSearchGraph - -load_dotenv() - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - 
"api_key": gemini_key, - "model": "google_vertexai/gemini-1.5-pro", - }, - "verbose": True, - "headless": False, - "depth": 2, - "only_inside_links": False, -} - -search_graph = DepthSearchGraph( - prompt="List me all the projects with their description", - source="https://perinim.github.io", - config=graph_config -) - -result = search_graph.run() -print(result) diff --git a/examples/google_vertexai/document_scraper_vertex.py b/examples/google_vertexai/document_scraper_vertex.py deleted file mode 100644 index 58f79a91..00000000 --- a/examples/google_vertexai/document_scraper_vertex.py +++ /dev/null @@ -1,41 +0,0 @@ -""" -document_scraper example -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import DocumentScraperGraph - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_vertexai/gemini-1.5-pro", - }, -} - -source = """ - The Divine Comedy, Italian La Divina Commedia, original name La commedia, long narrative poem written in Italian - circa 1308/21 by Dante. It is usually held to be one of the world s great works of literature. - Divided into three major sections—Inferno, Purgatorio, and Paradiso—the narrative traces the journey of Dante - from darkness and error to the revelation of the divine light, culminating in the Beatific Vision of God. - Dante is guided by the Roman poet Virgil, who represents the epitome of human knowledge, from the dark wood - through the descending circles of the pit of Hell (Inferno). He then climbs the mountain of Purgatory, guided - by the Roman poet Statius, who represents the fulfilment of human knowledge, and is finally led by his lifelong love, - the Beatrice of his earlier poetry, through the celestial spheres of Paradise. 
-""" - -pdf_scraper_graph = DocumentScraperGraph( - prompt="Summarize the text and find the main topics", - source=source, - config=graph_config, -) -result = pdf_scraper_graph.run() - -print(json.dumps(result, indent=4)) \ No newline at end of file diff --git a/examples/google_vertexai/inputs/books.xml b/examples/google_vertexai/inputs/books.xml deleted file mode 100644 index e3d1fe87..00000000 --- a/examples/google_vertexai/inputs/books.xml +++ /dev/null @@ -1,120 +0,0 @@ - - - - Gambardella, Matthew - XML Developer's Guide - Computer - 44.95 - 2000-10-01 - An in-depth look at creating applications - with XML. - - - Ralls, Kim - Midnight Rain - Fantasy - 5.95 - 2000-12-16 - A former architect battles corporate zombies, - an evil sorceress, and her own childhood to become queen - of the world. - - - Corets, Eva - Maeve Ascendant - Fantasy - 5.95 - 2000-11-17 - After the collapse of a nanotechnology - society in England, the young survivors lay the - foundation for a new society. - - - Corets, Eva - Oberon's Legacy - Fantasy - 5.95 - 2001-03-10 - In post-apocalypse England, the mysterious - agent known only as Oberon helps to create a new life - for the inhabitants of London. Sequel to Maeve - Ascendant. - - - Corets, Eva - The Sundered Grail - Fantasy - 5.95 - 2001-09-10 - The two daughters of Maeve, half-sisters, - battle one another for control of England. Sequel to - Oberon's Legacy. - - - Randall, Cynthia - Lover Birds - Romance - 4.95 - 2000-09-02 - When Carla meets Paul at an ornithology - conference, tempers fly as feathers get ruffled. - - - Thurman, Paula - Splish Splash - Romance - 4.95 - 2000-11-02 - A deep sea diver finds true love twenty - thousand leagues beneath the sea. - - - Knorr, Stefan - Creepy Crawlies - Horror - 4.95 - 2000-12-06 - An anthology of horror stories about roaches, - centipedes, scorpions and other insects. 
- - - Kress, Peter - Paradox Lost - Science Fiction - 6.95 - 2000-11-02 - After an inadvertant trip through a Heisenberg - Uncertainty Device, James Salway discovers the problems - of being quantum. - - - O'Brien, Tim - Microsoft .NET: The Programming Bible - Computer - 36.95 - 2000-12-09 - Microsoft's .NET initiative is explored in - detail in this deep programmer's reference. - - - O'Brien, Tim - MSXML3: A Comprehensive Guide - Computer - 36.95 - 2000-12-01 - The Microsoft MSXML3 parser is covered in - detail, with attention to XML DOM interfaces, XSLT processing, - SAX and more. - - - Galos, Mike - Visual Studio 7: A Comprehensive Guide - Computer - 49.95 - 2001-04-16 - Microsoft Visual Studio 7 is explored in depth, - looking at how Visual Basic, Visual C++, C#, and ASP+ are - integrated into a comprehensive development - environment. - - \ No newline at end of file diff --git a/examples/google_vertexai/inputs/example.json b/examples/google_vertexai/inputs/example.json deleted file mode 100644 index 2263184c..00000000 --- a/examples/google_vertexai/inputs/example.json +++ /dev/null @@ -1,182 +0,0 @@ -{ - "kind":"youtube#searchListResponse", - "etag":"q4ibjmYp1KA3RqMF4jFLl6PBwOg", - "nextPageToken":"CAUQAA", - "regionCode":"NL", - "pageInfo":{ - "totalResults":1000000, - "resultsPerPage":5 - }, - "items":[ - { - "kind":"youtube#searchResult", - "etag":"QCsHBifbaernVCbLv8Cu6rAeaDQ", - "id":{ - "kind":"youtube#video", - "videoId":"TvWDY4Mm5GM" - }, - "snippet":{ - "publishedAt":"2023-07-24T14:15:01Z", - "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", - "title":"3 Football Clubs Kylian Mbappe Should Avoid Signing ✍️❌⚽️ #football #mbappe #shorts", - "description":"", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/hqdefault.jpg", - 
"width":480, - "height":360 - } - }, - "channelTitle":"FC Motivate", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T14:15:01Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"0NG5QHdtIQM_V-DBJDEf-jK_Y9k", - "id":{ - "kind":"youtube#video", - "videoId":"aZM_42CcNZ4" - }, - "snippet":{ - "publishedAt":"2023-07-24T16:09:27Z", - "channelId":"UCM5gMM_HqfKHYIEJ3lstMUA", - "title":"Which Football Club Could Cristiano Ronaldo Afford To Buy? 💰", - "description":"Sign up to Sorare and get a FREE card: https://sorare.pxf.io/NellisShorts Give Soraredata a go for FREE: ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"John Nellis", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T16:09:27Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"WbBz4oh9I5VaYj91LjeJvffrBVY", - "id":{ - "kind":"youtube#video", - "videoId":"wkP3XS3aNAY" - }, - "snippet":{ - "publishedAt":"2023-07-24T16:00:50Z", - "channelId":"UC4EP1dxFDPup_aFLt0ElsDw", - "title":"PAULO DYBALA vs THE WORLD'S LONGEST FREEKICK WALL", - "description":"Can Paulo Dybala curl a football around the World's longest free kick wall? 
We met up with the World Cup winner and put him to ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"Shoot for Love", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T16:00:50Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"juxv_FhT_l4qrR05S1QTrb4CGh8", - "id":{ - "kind":"youtube#video", - "videoId":"rJkDZ0WvfT8" - }, - "snippet":{ - "publishedAt":"2023-07-24T10:00:39Z", - "channelId":"UCO8qj5u80Ga7N_tP3BZWWhQ", - "title":"TOP 10 DEFENDERS 2023", - "description":"SoccerKingz https://soccerkingz.nl Use code: 'ILOVEHOF' to get 10% off. TOP 10 DEFENDERS 2023 Follow us! • Instagram ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"Home of Football", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T10:00:39Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"wtuknXTmI1txoULeH3aWaOuXOow", - "id":{ - "kind":"youtube#video", - "videoId":"XH0rtu4U6SE" - }, - "snippet":{ - "publishedAt":"2023-07-21T16:30:05Z", - "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", - "title":"3 Things You Didn't Know About Erling Haaland ⚽️🇳🇴 #football #haaland #shorts", - "description":"", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - 
"url":"https://i.ytimg.com/vi/XH0rtu4U6SE/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"FC Motivate", - "liveBroadcastContent":"none", - "publishTime":"2023-07-21T16:30:05Z" - } - } - ] -} \ No newline at end of file diff --git a/examples/google_vertexai/inputs/plain_html_example.txt b/examples/google_vertexai/inputs/plain_html_example.txt deleted file mode 100644 index 78f814ae..00000000 --- a/examples/google_vertexai/inputs/plain_html_example.txt +++ /dev/null @@ -1,105 +0,0 @@ - -
- - -
-
-
- - -
- \ No newline at end of file diff --git a/examples/google_vertexai/inputs/username.csv b/examples/google_vertexai/inputs/username.csv deleted file mode 100644 index 006ac8e6..00000000 --- a/examples/google_vertexai/inputs/username.csv +++ /dev/null @@ -1,7 +0,0 @@ -Username; Identifier;First name;Last name -booker12;9012;Rachel;Booker -grey07;2070;Laura;Grey -johnson81;4081;Craig;Johnson -jenkins46;9346;Mary;Jenkins -smith79;5079;Jamie;Smith - diff --git a/examples/google_vertexai/json_scraper_gemini.py b/examples/google_vertexai/json_scraper_gemini.py deleted file mode 100644 index 8e9f5a9f..00000000 --- a/examples/google_vertexai/json_scraper_gemini.py +++ /dev/null @@ -1,46 +0,0 @@ -""" -Basic example of scraping pipeline using JSONScraperGraph from JSON documents -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import JSONScraperGraph -from scrapegraphai.utils import prettify_exec_info -load_dotenv() - -# ************************************************ -# Read the JSON file -# ************************************************ - -FILE_NAME = "inputs/example.json" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_vertexai/gemini-1.5-pro", - }, -} - -# ************************************************ -# Create the JSONScraperGraph instance and run it -# ************************************************ - -json_scraper_graph = JSONScraperGraph( - prompt="List me all the authors, title and genres of the books", - source=text, # Pass the content of the file, not the file object - config=graph_config -) - -result = json_scraper_graph.run() 
-print(result) diff --git a/examples/google_vertexai/json_scraper_multi_gemini.py b/examples/google_vertexai/json_scraper_multi_gemini.py deleted file mode 100644 index b9dc2e93..00000000 --- a/examples/google_vertexai/json_scraper_multi_gemini.py +++ /dev/null @@ -1,38 +0,0 @@ -""" -Module for showing how JSONScraperMultiGraph multi works -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import JSONScraperMultiGraph - -load_dotenv() - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_vertexai/gemini-1.5-pro", - }, - "library": "beautifulsoup" -} - -FILE_NAME = "inputs/example.json" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -sources = [text, text] - -multiple_search_graph = JSONScraperMultiGraph( - prompt= "List me all the authors, title and genres of the books", - source= sources, - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/google_vertexai/rate_limit_gemini.py b/examples/google_vertexai/rate_limit_gemini.py deleted file mode 100644 index c5f15a35..00000000 --- a/examples/google_vertexai/rate_limit_gemini.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper with a custom rate limit -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.utils import prettify_exec_info -from scrapegraphai.graphs import SmartScraperGraph -load_dotenv() - - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_vertexai/gemini-1.5-pro", - "rate_limit": { - "requests_per_second": 1 - } - }, -} 
- -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the news with their description.", - # also accepts a string with the already downloaded HTML code - source="https://www.wired.com", - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/google_vertexai/readme.md b/examples/google_vertexai/readme.md deleted file mode 100644 index 7e06773d..00000000 --- a/examples/google_vertexai/readme.md +++ /dev/null @@ -1 +0,0 @@ -This folder contains an example of how to use ScrapeGraph-AI with Gemini, a large language model (LLM) from Google AI. The example shows how to extract information from a website using a natural language prompt. 
\ No newline at end of file diff --git a/examples/google_vertexai/scrape_plain_text_gemini.py b/examples/google_vertexai/scrape_plain_text_gemini.py deleted file mode 100644 index b910330a..00000000 --- a/examples/google_vertexai/scrape_plain_text_gemini.py +++ /dev/null @@ -1,56 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper from text -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info -load_dotenv() - -# ************************************************ -# Read the text file -# ************************************************ - -FILE_NAME = "inputs/plain_html_example.txt" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -# It could be also a http request using the request model -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_vertexai/gemini-1.5-pro", - "temperature": 0, - "streaming": True - }, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the news with their description.", - source=text, - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/google_vertexai/scrape_xml_gemini.py b/examples/google_vertexai/scrape_xml_gemini.py deleted file mode 
100644 index 0b6563a4..00000000 --- a/examples/google_vertexai/scrape_xml_gemini.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper from XML documents -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the XML file -# ************************************************ - -FILE_NAME = "inputs/books.xml" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_vertexai/gemini-1.5-pro", - "temperature": 0, - "streaming": True - }, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the authors, title and genres of the books", - source=text, # Pass the content of the file, not the file object - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/google_vertexai/script_generator_gemini.py b/examples/google_vertexai/script_generator_gemini.py deleted file mode 100644 index 83bcb978..00000000 --- a/examples/google_vertexai/script_generator_gemini.py +++ /dev/null @@ -1,46 +0,0 @@ -""" -Basic example of scraping 
pipeline using ScriptCreatorGraph -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import ScriptCreatorGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_vertexai/gemini-1.5-pro", - }, - "library": "beautifoulsoup" -} - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -smart_scraper_graph = ScriptCreatorGraph( - prompt="List me all the news with their description.", - # also accepts a string with the already downloaded HTML code - source="https://perinim.github.io/projects", - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/google_vertexai/script_multi_generator_gemini.py b/examples/google_vertexai/script_multi_generator_gemini.py deleted file mode 100644 index 8ab3564e..00000000 --- a/examples/google_vertexai/script_multi_generator_gemini.py +++ /dev/null @@ -1,54 +0,0 @@ -""" -Basic example of scraping pipeline using ScriptCreatorGraph -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import ScriptCreatorMultiGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - 
"model": "google_vertexai/gemini-1.5-pro", - }, - "library": "beautifoulsoup" -} - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -urls=[ - "https://schultzbergagency.com/emil-raste-karlsen/", - "https://schultzbergagency.com/johanna-hedberg/", -] - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -script_creator_graph = ScriptCreatorMultiGraph( - prompt="Find information about actors", - # also accepts a string with the already downloaded HTML code - source=urls, - config=graph_config -) - -result = script_creator_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = script_creator_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/google_vertexai/search_graph_gemini.py b/examples/google_vertexai/search_graph_gemini.py deleted file mode 100644 index 1c86f322..00000000 --- a/examples/google_vertexai/search_graph_gemini.py +++ /dev/null @@ -1,42 +0,0 @@ -""" -Example of Search Graph -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SearchGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_vertexai/gemini-1.5-pro", - "temperature": 0, - "streaming": True - }, - "max_results": 5, - "verbose": True, -} - -# ************************************************ -# Create the SearchGraph instance and run it -# ************************************************ - 
-search_graph = SearchGraph( - prompt="List me all the regions of Italy.", - config=graph_config -) - -result = search_graph.run() -print(result) - -# Save to json and csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/google_vertexai/search_graph_schema_gemini.py b/examples/google_vertexai/search_graph_schema_gemini.py deleted file mode 100644 index 54586c7e..00000000 --- a/examples/google_vertexai/search_graph_schema_gemini.py +++ /dev/null @@ -1,61 +0,0 @@ -""" -Example of Search Graph -""" - -import os -from dotenv import load_dotenv -load_dotenv() - -from scrapegraphai.graphs import SearchGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -from pydantic import BaseModel, Field -from typing import List - -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Dish(BaseModel): - name: str = Field(description="The name of the dish") - description: str = Field(description="The description of the dish") - -class Dishes(BaseModel): - dishes: List[Dish] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_vertexai/gemini-1.5-pro", - }, -} - -# ************************************************ -# Create the SearchGraph instance and run it -# ************************************************ - -search_graph = SearchGraph( - prompt="List me Chioggia's famous dishes", - config=graph_config, - schema=Dishes -) - -result = search_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = search_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) 
- -# Save to json and csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/google_vertexai/search_link_graph_gemini.py b/examples/google_vertexai/search_link_graph_gemini.py deleted file mode 100644 index d351b843..00000000 --- a/examples/google_vertexai/search_link_graph_gemini.py +++ /dev/null @@ -1,44 +0,0 @@ -""" -Example of Search Graph -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SearchGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -load_dotenv() - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_vertexai/gemini-1.5-pro", - }, -} -# ************************************************ -# Create the SearchGraph instance and run it -# ************************************************ - -search_graph = SearchGraph( - prompt="List me the best escursions near Trento", - config=graph_config -) - -result = search_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = search_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json and csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/google_vertexai/smart_scraper_gemini.py b/examples/google_vertexai/smart_scraper_gemini.py deleted file mode 100644 index 4ed7c352..00000000 --- a/examples/google_vertexai/smart_scraper_gemini.py +++ /dev/null @@ -1,44 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.utils import prettify_exec_info -from scrapegraphai.graphs import SmartScraperGraph - -load_dotenv() - -# 
************************************************ -# Define the configuration for the graph -# ************************************************ - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_vertexai/gemini-1.5-pro", - }, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the news with their description.", - # also accepts a string with the already downloaded HTML code - source="https://www.wired.com", - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/google_vertexai/smart_scraper_lite_google_vertexai.py b/examples/google_vertexai/smart_scraper_lite_google_vertexai.py deleted file mode 100644 index eca61bbb..00000000 --- a/examples/google_vertexai/smart_scraper_lite_google_vertexai.py +++ /dev/null @@ -1,33 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperLiteGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -graph_config = { - "llm": { - "project": os.getenv("GOOGLE_CLOUD_PROJECT"), - "location": "us-central1", - "model": "text-bison@001", - }, - "verbose": True, - "headless": False, -} - -smart_scraper_lite_graph = SmartScraperLiteGraph( - prompt="Who is Marco Perini?", - source="https://perinim.github.io/", - config=graph_config -) - -result = smart_scraper_lite_graph.run() -print(json.dumps(result, indent=4)) - -graph_exec_info = smart_scraper_lite_graph.get_execution_info() 
-print(prettify_exec_info(graph_exec_info)) - diff --git a/examples/google_vertexai/smart_scraper_multi_concat_gemini.py b/examples/google_vertexai/smart_scraper_multi_concat_gemini.py deleted file mode 100644 index c6874ff6..00000000 --- a/examples/google_vertexai/smart_scraper_multi_concat_gemini.py +++ /dev/null @@ -1,36 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" - -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiConcatGraph - -load_dotenv() - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_vertexai/gemini-1.5-pro", - }, -} - -# ******************************************************* -# Create the SmartScraperMultiGraph instance and run it -# ******************************************************* - -multiple_search_graph = SmartScraperMultiConcatGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/google_vertexai/smart_scraper_multi_gemini.py b/examples/google_vertexai/smart_scraper_multi_gemini.py deleted file mode 100644 index ffbd6f47..00000000 --- a/examples/google_vertexai/smart_scraper_multi_gemini.py +++ /dev/null @@ -1,39 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" - -import os, json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiGraph - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_vertexai/gemini-1.5-pro", - }, -} - -# ******************************************************* -# Create the 
SmartScraperMultiGraph instance and run it -# ******************************************************* - -multiple_search_graph = SmartScraperMultiGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/google_vertexai/smart_scraper_multi_lite_google_vertexai.py b/examples/google_vertexai/smart_scraper_multi_lite_google_vertexai.py deleted file mode 100644 index 5c293416..00000000 --- a/examples/google_vertexai/smart_scraper_multi_lite_google_vertexai.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiLiteGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -graph_config = { - "llm": { - "project": os.getenv("GOOGLE_CLOUD_PROJECT"), - "location": "us-central1", - "model": "text-bison@001", - }, - "verbose": True, - "headless": False, -} - -smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - config=graph_config -) - -result = smart_scraper_multi_lite_graph.run() -print(json.dumps(result, indent=4)) - -graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/google_vertexai/smart_scraper_multi_lite_vertex.py b/examples/google_vertexai/smart_scraper_multi_lite_vertex.py deleted file mode 100644 index 60ff3638..00000000 --- a/examples/google_vertexai/smart_scraper_multi_lite_vertex.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import 
SmartScraperMultiLiteGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "project": os.getenv("GOOGLE_CLOUD_PROJECT"), - "location": "us-central1", - "model": "text-bison@001", - }, - "verbose": True, - "headless": False, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - config=graph_config -) - -result = smart_scraper_multi_lite_graph.run() -print(json.dumps(result, indent=4)) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/google_vertexai/smart_scraper_schema_gemini.py b/examples/google_vertexai/smart_scraper_schema_gemini.py deleted file mode 100644 index 541ce9aa..00000000 --- a/examples/google_vertexai/smart_scraper_schema_gemini.py +++ /dev/null @@ -1,56 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper with schema -""" - -import os -from typing import List -from pydantic import BaseModel, Field -from dotenv import load_dotenv -from scrapegraphai.utils import prettify_exec_info -from scrapegraphai.graphs import SmartScraperGraph -load_dotenv() - -# ************************************************ -# Define the output schema for the graph -# ************************************************ -class Project(BaseModel): - title: str = Field(description="The title of the project") - description: str = Field(description="The description of 
the project") - -class Projects(BaseModel): - projects: List[Project] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_vertexai/gemini-1.5-pro", - }, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the news with their description.", - # also accepts a string with the already downloaded HTML code - source="https://www.wired.com", - schema=Projects, - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/google_vertexai/xml_scraper_gemini.py b/examples/google_vertexai/xml_scraper_gemini.py deleted file mode 100644 index de0e084f..00000000 --- a/examples/google_vertexai/xml_scraper_gemini.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -Basic example of scraping pipeline using XMLScraperGraph from XML documents -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import XMLScraperGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info -load_dotenv() - -# ************************************************ -# Read the XML file -# ************************************************ - -FILE_NAME = "inputs/books.xml" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the 
configuration for the graph -# ************************************************ - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_vertexai/gemini-1.5-pro", - }, -} -# ************************************************ -# Create the XMLScraperGraph instance and run it -# ************************************************ - -xml_scraper_graph = XMLScraperGraph( - prompt="List me all the authors, title and genres of the books", - source=text, # Pass the content of the file, not the file object - config=graph_config -) - -result = xml_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = xml_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") - diff --git a/examples/google_vertexai/xml_scraper_graph_multi_gemini.py b/examples/google_vertexai/xml_scraper_graph_multi_gemini.py deleted file mode 100644 index 3b7562d3..00000000 --- a/examples/google_vertexai/xml_scraper_graph_multi_gemini.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -Basic example of scraping pipeline using XMLScraperMultiGraph from XML documents -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import XMLScraperMultiGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info -load_dotenv() - -# ************************************************ -# Read the XML file -# ************************************************ - -FILE_NAME = "inputs/books.xml" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# 
************************************************ - -gemini_key = os.getenv("GOOGLE_APIKEY") - -graph_config = { - "llm": { - "api_key": gemini_key, - "model": "google_vertexai/gemini-1.5-pro", - }, -} - -# ************************************************ -# Create the XMLScraperMultiGraph instance and run it -# ************************************************ - -xml_scraper_graph = XMLScraperMultiGraph( - prompt="List me all the authors, title and genres of the books", - source=[text, text], # Pass the content of the file, not the file object - config=graph_config -) - -result = xml_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = xml_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/groq/.env.example b/examples/groq/.env.example deleted file mode 100644 index c934d4fa..00000000 --- a/examples/groq/.env.example +++ /dev/null @@ -1,2 +0,0 @@ -GROQ_APIKEY= "your groq key" -OPENAI_APIKEY="your openai api key" \ No newline at end of file diff --git a/examples/groq/code_generator_graph_groq.py b/examples/groq/code_generator_graph_groq.py deleted file mode 100644 index cf03d96c..00000000 --- a/examples/groq/code_generator_graph_groq.py +++ /dev/null @@ -1,60 +0,0 @@ -""" -Basic example of scraping pipeline using Code Generator with schema -""" -import os -from typing import List -from dotenv import load_dotenv -from pydantic import BaseModel, Field -from scrapegraphai.graphs import CodeGeneratorGraph - -load_dotenv() - -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Project(BaseModel): - title: str = Field(description="The title of the project") - description: str = 
Field(description="The description of the project") - -class Projects(BaseModel): - projects: List[Project] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -groq_key = os.getenv("GROQ_APIKEY") - -graph_config = { - "llm": { - "model": "groq/gemma-7b-it", - "api_key": groq_key, - "temperature": 0 - }, - "verbose": True, - "headless": False, - "reduction": 2, - "max_iterations": { - "overall": 10, - "syntax": 3, - "execution": 3, - "validation": 3, - "semantic": 3 - }, - "output_file_name": "extracted_data.py" -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -code_generator_graph = CodeGeneratorGraph( - prompt="List me all the projects with their description", - source="https://perinim.github.io/projects/", - schema=Projects, - config=graph_config -) - -result = code_generator_graph.run() -print(result) diff --git a/examples/groq/csv_scraper_graph_multi_groq.py b/examples/groq/csv_scraper_graph_multi_groq.py deleted file mode 100644 index e0343f31..00000000 --- a/examples/groq/csv_scraper_graph_multi_groq.py +++ /dev/null @@ -1,58 +0,0 @@ -""" -Basic example of scraping pipeline using CSVScraperMultiGraph from CSV documents -""" -import os -from dotenv import load_dotenv -import pandas as pd -from scrapegraphai.graphs import CSVScraperMultiGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() -# ************************************************ -# Read the CSV file -# ************************************************ - -FILE_NAME = "inputs/username.csv" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -text = pd.read_csv(file_path) - -# ************************************************ -# Define the configuration for the graph -# 
************************************************ - -groq_key = os.getenv("GROQ_APIKEY") - -graph_config = { - "llm": { - "model": "groq/gemma-7b-it", - "api_key": groq_key, - "temperature": 0 - }, - "headless": False -} - -# ************************************************ -# Create the CSVScraperMultiGraph instance and run it -# ************************************************ - -csv_scraper_graph = CSVScraperMultiGraph( - prompt="List me all the last names", - source=[str(text), str(text)], - config=graph_config -) - -result = csv_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = csv_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/groq/csv_scraper_groq.py b/examples/groq/csv_scraper_groq.py deleted file mode 100644 index 6c36b4c4..00000000 --- a/examples/groq/csv_scraper_groq.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -Basic example of scraping pipeline using CSVScraperGraph from CSV documents -""" -import os -from dotenv import load_dotenv -import pandas as pd -from scrapegraphai.graphs import CSVScraperGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the CSV file -# ************************************************ - -FILE_NAME = "inputs/username.csv" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -text = pd.read_csv(file_path) - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -groq_key = os.getenv("GROQ_APIKEY") - -graph_config = { - "llm": { - "model": "groq/gemma-7b-it", - "api_key": groq_key, - 
"temperature": 0 - }, -} -# ************************************************ -# Create the CSVScraperGraph instance and run it -# ************************************************ - -csv_scraper_graph = CSVScraperGraph( - prompt="List me all the last names", - source=str(text), # Pass the content of the file, not the file object - config=graph_config -) - -result = csv_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = csv_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/groq/custom_graph_groq.py b/examples/groq/custom_graph_groq.py deleted file mode 100644 index ea35137f..00000000 --- a/examples/groq/custom_graph_groq.py +++ /dev/null @@ -1,99 +0,0 @@ -""" -Example of custom graph using existing nodes -""" -import os -from dotenv import load_dotenv -from langchain_openai import ChatOpenAI -from scrapegraphai.graphs import BaseGraph -from scrapegraphai.nodes import FetchNode, ParseNode, GenerateAnswerNode, RobotsNode -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ -groq_key = os.getenv("GROQ_APIKEY") - -graph_config = { - "llm": { - "model": "groq/gemma-7b-it", - "api_key": groq_key, - "temperature": 0 - }, - "verbose": True, - "headless": False -} - -# ************************************************ -# Define the graph nodes -# ************************************************ - -llm_model = ChatOpenAI(graph_config["llm"]) - -# define the nodes for the graph -robot_node = RobotsNode( - input="url", - output=["is_scrapable"], - node_config={ - "llm_model": llm_model, - "force_scraping": True, - "verbose": True, - } -) - -fetch_node = FetchNode( - input="url | local_dir", - 
output=["doc"], - node_config={ - "verbose": True, - "headless": True, - } -) -parse_node = ParseNode( - input="doc", - output=["parsed_doc"], - node_config={ - "chunk_size": 4096, - "verbose": True, - } -) - -generate_answer_node = GenerateAnswerNode( - input="user_prompt & (relevant_chunks | parsed_doc | doc)", - output=["answer"], - node_config={ - "llm_model": llm_model, - "verbose": True, - } -) - -# ************************************************ -# Create the graph by defining the connections -# ************************************************ - -graph = BaseGraph( - nodes=[ - robot_node, - fetch_node, - parse_node, - generate_answer_node, - ], - edges=[ - (robot_node, fetch_node), - (fetch_node, parse_node), - (parse_node, generate_answer_node) - ], - entry_point=robot_node -) - -# ************************************************ -# Execute the graph -# ************************************************ - -result, execution_info = graph.execute({ - "user_prompt": "Describe the content", - "url": "https://example.com/" -}) - -# get the answer from the result -result = result.get("answer", "No answer found.") -print(result) diff --git a/examples/groq/depth_search_graph_groq.py b/examples/groq/depth_search_graph_groq.py deleted file mode 100644 index 2d1ed8b1..00000000 --- a/examples/groq/depth_search_graph_groq.py +++ /dev/null @@ -1,31 +0,0 @@ -""" -depth_search_graph_opeani example -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import DepthSearchGraph - -load_dotenv() - -groq_key = os.getenv("GROQ_APIKEY") - -graph_config = { - "llm": { - "model": "groq/gemma-7b-it", - "api_key": groq_key, - "temperature": 0 - }, - "verbose": True, - "headless": False, - "depth": 2, - "only_inside_links": False, -} - -search_graph = DepthSearchGraph( - prompt="List me all the projects with their description", - source="https://perinim.github.io", - config=graph_config -) - -result = search_graph.run() -print(result) diff --git 
a/examples/groq/document_scraper_groq.py b/examples/groq/document_scraper_groq.py deleted file mode 100644 index 53c64f73..00000000 --- a/examples/groq/document_scraper_groq.py +++ /dev/null @@ -1,45 +0,0 @@ -""" -document_scraper example -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import DocumentScraperGraph - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ -groq_key = os.getenv("GROQ_APIKEY") - -graph_config = { - "llm": { - "model": "groq/gemma-7b-it", - "api_key": groq_key, - "temperature": 0 - }, - "verbose": True, - "headless": False -} - - -source = """ - The Divine Comedy, Italian La Divina Commedia, original name La commedia, long narrative poem written in Italian - circa 1308/21 by Dante. It is usually held to be one of the world s great works of literature. - Divided into three major sections—Inferno, Purgatorio, and Paradiso—the narrative traces the journey of Dante - from darkness and error to the revelation of the divine light, culminating in the Beatific Vision of God. - Dante is guided by the Roman poet Virgil, who represents the epitome of human knowledge, from the dark wood - through the descending circles of the pit of Hell (Inferno). He then climbs the mountain of Purgatory, guided - by the Roman poet Statius, who represents the fulfilment of human knowledge, and is finally led by his lifelong love, - the Beatrice of his earlier poetry, through the celestial spheres of Paradise. 
-""" - -pdf_scraper_graph = DocumentScraperGraph( - prompt="Summarize the text and find the main topics", - source=source, - config=graph_config, -) -result = pdf_scraper_graph.run() - -print(json.dumps(result, indent=4)) \ No newline at end of file diff --git a/examples/groq/inputs/books.xml b/examples/groq/inputs/books.xml deleted file mode 100644 index e3d1fe87..00000000 --- a/examples/groq/inputs/books.xml +++ /dev/null @@ -1,120 +0,0 @@ - - - - Gambardella, Matthew - XML Developer's Guide - Computer - 44.95 - 2000-10-01 - An in-depth look at creating applications - with XML. - - - Ralls, Kim - Midnight Rain - Fantasy - 5.95 - 2000-12-16 - A former architect battles corporate zombies, - an evil sorceress, and her own childhood to become queen - of the world. - - - Corets, Eva - Maeve Ascendant - Fantasy - 5.95 - 2000-11-17 - After the collapse of a nanotechnology - society in England, the young survivors lay the - foundation for a new society. - - - Corets, Eva - Oberon's Legacy - Fantasy - 5.95 - 2001-03-10 - In post-apocalypse England, the mysterious - agent known only as Oberon helps to create a new life - for the inhabitants of London. Sequel to Maeve - Ascendant. - - - Corets, Eva - The Sundered Grail - Fantasy - 5.95 - 2001-09-10 - The two daughters of Maeve, half-sisters, - battle one another for control of England. Sequel to - Oberon's Legacy. - - - Randall, Cynthia - Lover Birds - Romance - 4.95 - 2000-09-02 - When Carla meets Paul at an ornithology - conference, tempers fly as feathers get ruffled. - - - Thurman, Paula - Splish Splash - Romance - 4.95 - 2000-11-02 - A deep sea diver finds true love twenty - thousand leagues beneath the sea. - - - Knorr, Stefan - Creepy Crawlies - Horror - 4.95 - 2000-12-06 - An anthology of horror stories about roaches, - centipedes, scorpions and other insects. 
- - - Kress, Peter - Paradox Lost - Science Fiction - 6.95 - 2000-11-02 - After an inadvertant trip through a Heisenberg - Uncertainty Device, James Salway discovers the problems - of being quantum. - - - O'Brien, Tim - Microsoft .NET: The Programming Bible - Computer - 36.95 - 2000-12-09 - Microsoft's .NET initiative is explored in - detail in this deep programmer's reference. - - - O'Brien, Tim - MSXML3: A Comprehensive Guide - Computer - 36.95 - 2000-12-01 - The Microsoft MSXML3 parser is covered in - detail, with attention to XML DOM interfaces, XSLT processing, - SAX and more. - - - Galos, Mike - Visual Studio 7: A Comprehensive Guide - Computer - 49.95 - 2001-04-16 - Microsoft Visual Studio 7 is explored in depth, - looking at how Visual Basic, Visual C++, C#, and ASP+ are - integrated into a comprehensive development - environment. - - \ No newline at end of file diff --git a/examples/groq/inputs/example.json b/examples/groq/inputs/example.json deleted file mode 100644 index 2263184c..00000000 --- a/examples/groq/inputs/example.json +++ /dev/null @@ -1,182 +0,0 @@ -{ - "kind":"youtube#searchListResponse", - "etag":"q4ibjmYp1KA3RqMF4jFLl6PBwOg", - "nextPageToken":"CAUQAA", - "regionCode":"NL", - "pageInfo":{ - "totalResults":1000000, - "resultsPerPage":5 - }, - "items":[ - { - "kind":"youtube#searchResult", - "etag":"QCsHBifbaernVCbLv8Cu6rAeaDQ", - "id":{ - "kind":"youtube#video", - "videoId":"TvWDY4Mm5GM" - }, - "snippet":{ - "publishedAt":"2023-07-24T14:15:01Z", - "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", - "title":"3 Football Clubs Kylian Mbappe Should Avoid Signing ✍️❌⚽️ #football #mbappe #shorts", - "description":"", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/hqdefault.jpg", - "width":480, - "height":360 - } - }, - 
"channelTitle":"FC Motivate", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T14:15:01Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"0NG5QHdtIQM_V-DBJDEf-jK_Y9k", - "id":{ - "kind":"youtube#video", - "videoId":"aZM_42CcNZ4" - }, - "snippet":{ - "publishedAt":"2023-07-24T16:09:27Z", - "channelId":"UCM5gMM_HqfKHYIEJ3lstMUA", - "title":"Which Football Club Could Cristiano Ronaldo Afford To Buy? 💰", - "description":"Sign up to Sorare and get a FREE card: https://sorare.pxf.io/NellisShorts Give Soraredata a go for FREE: ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"John Nellis", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T16:09:27Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"WbBz4oh9I5VaYj91LjeJvffrBVY", - "id":{ - "kind":"youtube#video", - "videoId":"wkP3XS3aNAY" - }, - "snippet":{ - "publishedAt":"2023-07-24T16:00:50Z", - "channelId":"UC4EP1dxFDPup_aFLt0ElsDw", - "title":"PAULO DYBALA vs THE WORLD'S LONGEST FREEKICK WALL", - "description":"Can Paulo Dybala curl a football around the World's longest free kick wall? 
We met up with the World Cup winner and put him to ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"Shoot for Love", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T16:00:50Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"juxv_FhT_l4qrR05S1QTrb4CGh8", - "id":{ - "kind":"youtube#video", - "videoId":"rJkDZ0WvfT8" - }, - "snippet":{ - "publishedAt":"2023-07-24T10:00:39Z", - "channelId":"UCO8qj5u80Ga7N_tP3BZWWhQ", - "title":"TOP 10 DEFENDERS 2023", - "description":"SoccerKingz https://soccerkingz.nl Use code: 'ILOVEHOF' to get 10% off. TOP 10 DEFENDERS 2023 Follow us! • Instagram ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"Home of Football", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T10:00:39Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"wtuknXTmI1txoULeH3aWaOuXOow", - "id":{ - "kind":"youtube#video", - "videoId":"XH0rtu4U6SE" - }, - "snippet":{ - "publishedAt":"2023-07-21T16:30:05Z", - "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", - "title":"3 Things You Didn't Know About Erling Haaland ⚽️🇳🇴 #football #haaland #shorts", - "description":"", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - 
"url":"https://i.ytimg.com/vi/XH0rtu4U6SE/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"FC Motivate", - "liveBroadcastContent":"none", - "publishTime":"2023-07-21T16:30:05Z" - } - } - ] -} \ No newline at end of file diff --git a/examples/groq/inputs/plain_html_example.txt b/examples/groq/inputs/plain_html_example.txt deleted file mode 100644 index 78f814ae..00000000 --- a/examples/groq/inputs/plain_html_example.txt +++ /dev/null @@ -1,105 +0,0 @@ - -
- - -
-
-
- - -
- \ No newline at end of file diff --git a/examples/groq/inputs/username.csv b/examples/groq/inputs/username.csv deleted file mode 100644 index 006ac8e6..00000000 --- a/examples/groq/inputs/username.csv +++ /dev/null @@ -1,7 +0,0 @@ -Username; Identifier;First name;Last name -booker12;9012;Rachel;Booker -grey07;2070;Laura;Grey -johnson81;4081;Craig;Johnson -jenkins46;9346;Mary;Jenkins -smith79;5079;Jamie;Smith - diff --git a/examples/groq/json_scraper_groq.py b/examples/groq/json_scraper_groq.py deleted file mode 100644 index cac0f10d..00000000 --- a/examples/groq/json_scraper_groq.py +++ /dev/null @@ -1,49 +0,0 @@ -""" -Basic example of scraping pipeline using JSONScraperGraph from JSON documents -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import JSONScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the JSON file -# ************************************************ - -FILE_NAME = "inputs/example.json" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -groq_key = os.getenv("GROQ_APIKEY") - -graph_config = { - "llm": { - "model": "groq/gemma-7b-it", - "api_key": groq_key, - "temperature": 0 - }, - "verbose": True, - "headless": False -} - -# ************************************************ -# Create the JSONScraperGraph instance and run it -# ************************************************ - -json_scraper_graph = JSONScraperGraph( - prompt="List me all the authors, title and genres of the books", - source=text, # Pass the content of the file, not the file object - config=graph_config -) - -result = json_scraper_graph.run() -print(result) diff --git 
a/examples/groq/json_scraper_multi_groq.py b/examples/groq/json_scraper_multi_groq.py deleted file mode 100644 index df3b9276..00000000 --- a/examples/groq/json_scraper_multi_groq.py +++ /dev/null @@ -1,38 +0,0 @@ -""" -Module for showing how JSONScraperMultiGraph multi works -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import JSONScraperMultiGraph - -load_dotenv() - -groq_key = os.getenv("GROQ_APIKEY") - -graph_config = { - "llm": { - "model": "groq/gemma-7b-it", - "api_key": groq_key, - "temperature": 0 - }, - "library": "beautifulsoup" -} -FILE_NAME = "inputs/example.json" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -sources = [text, text] - -multiple_search_graph = JSONScraperMultiGraph( - prompt= "List me all the authors, title and genres of the books", - source= sources, - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/groq/rate_limit_groq.py b/examples/groq/rate_limit_groq.py deleted file mode 100644 index 8e59115f..00000000 --- a/examples/groq/rate_limit_groq.py +++ /dev/null @@ -1,48 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -groq_key = os.getenv("GROQ_APIKEY") - -graph_config = { - "llm": { - "model": "groq/gemma-7b-it", - "api_key": groq_key, - "temperature": 0, - "rate_limit": { - "requests_per_second": 1 - } - }, - "headless": False -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# 
************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the projects with their description.", - # also accepts a string with the already downloaded HTML code - source="https://perinim.github.io/projects/", - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/groq/scrape_plain_text_groq.py b/examples/groq/scrape_plain_text_groq.py deleted file mode 100644 index c4e4065d..00000000 --- a/examples/groq/scrape_plain_text_groq.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper from text -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the text file -# ************************************************ - -FILE_NAME = "inputs/plain_html_example.txt" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -# It could be also a http request using the request model -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -groq_key = os.getenv("GROQ_APIKEY") - -graph_config = { - "llm": { - "model": "groq/gemma-7b-it", - "api_key": groq_key, - "temperature": 0 - }, - "verbose": True, - "headless": False -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( 
- prompt="List me all the projects with their description.", - source=text, - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/groq/script_generator_groq.py b/examples/groq/script_generator_groq.py deleted file mode 100644 index 08550044..00000000 --- a/examples/groq/script_generator_groq.py +++ /dev/null @@ -1,44 +0,0 @@ -""" -Basic example of scraping pipeline using ScriptCreatorGraph -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import ScriptCreatorGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ -groq_key = os.getenv("GROQ_APIKEY") - -graph_config = { - "llm": { - "model": "groq/gemma-7b-it", - "api_key": groq_key, - "temperature": 0 - }, - "library": "beautifulsoup" -} -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -script_creator_graph = ScriptCreatorGraph( - prompt="List me all the projects with their description.", - # also accepts a string with the already downloaded HTML code - source="https://perinim.github.io/projects", - config=graph_config -) - -result = script_creator_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = script_creator_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - diff --git a/examples/groq/script_multi_generator_groq.py b/examples/groq/script_multi_generator_groq.py deleted file mode 
100644 index 31f4041e..00000000 --- a/examples/groq/script_multi_generator_groq.py +++ /dev/null @@ -1,55 +0,0 @@ -""" -Basic example of scraping pipeline using ScriptCreatorGraph -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import ScriptCreatorMultiGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -groq_key = os.getenv("GROQ_APIKEY") - -graph_config = { - "llm": { - "model": "groq/gemma-7b-it", - "api_key": groq_key, - "temperature": 0 - }, - "library": "beautifulsoup" -} - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -urls=[ - "https://schultzbergagency.com/emil-raste-karlsen/", - "https://schultzbergagency.com/johanna-hedberg/", -] - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -script_creator_graph = ScriptCreatorMultiGraph( - prompt="Find information about actors", - # also accepts a string with the already downloaded HTML code - source=urls, - config=graph_config -) - -result = script_creator_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = script_creator_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/groq/search_graph_groq.py b/examples/groq/search_graph_groq.py deleted file mode 100644 index ec971e37..00000000 --- a/examples/groq/search_graph_groq.py +++ /dev/null @@ -1,40 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SearchGraph -from 
scrapegraphai.utils import prettify_exec_info - -load_dotenv() - - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -groq_key = os.getenv("GROQ_APIKEY") - -graph_config = { - "llm": { - "model": "groq/gemma-7b-it", - "api_key": groq_key, - "temperature": 0 - }, - "headless": False -} - -search_graph = SearchGraph( - prompt="List me the best escursions near Trento", - config=graph_config -) - -result = search_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = search_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/groq/search_graph_schema_groq.py b/examples/groq/search_graph_schema_groq.py deleted file mode 100644 index ae0de3ee..00000000 --- a/examples/groq/search_graph_schema_groq.py +++ /dev/null @@ -1,62 +0,0 @@ -""" -Example of Search Graph -""" -import os -from typing import List -from dotenv import load_dotenv -from pydantic import BaseModel, Field -from scrapegraphai.graphs import SearchGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Dish(BaseModel): - name: str = Field(description="The name of the dish") - description: str = Field(description="The description of the dish") - -class Dishes(BaseModel): - dishes: List[Dish] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -groq_key = os.getenv("GROQ_APIKEY") - -graph_config = { - "llm": { - "model": "groq/gemma-7b-it", - "api_key": groq_key, - "temperature": 0 - }, - "headless": False -} - - -# 
************************************************ -# Create the SearchGraph instance and run it -# ************************************************ - -search_graph = SearchGraph( - prompt="List me Chioggia's famous dishes", - config=graph_config, - schema=Dishes -) - -result = search_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = search_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json and csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/groq/search_link_graph_groq.py b/examples/groq/search_link_graph_groq.py deleted file mode 100644 index 5d82f37f..00000000 --- a/examples/groq/search_link_graph_groq.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -Example of Search Graph -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SearchGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -load_dotenv() - -groq_key = os.getenv("GROQ_APIKEY") - -graph_config = { - "llm": { - "model": "groq/gemma-7b-it", - "api_key": groq_key, - "temperature": 0 - }, - "headless": False -} - -# ************************************************ -# Create the SearchGraph instance and run it -# ************************************************ - -search_graph = SearchGraph( - prompt="List me the best escursions near Trento", - config=graph_config -) - -result = search_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = search_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json and csv -convert_to_csv(result, 
"result") -convert_to_json(result, "result") diff --git a/examples/groq/smart_scraper_groq.py b/examples/groq/smart_scraper_groq.py deleted file mode 100644 index 4ac32678..00000000 --- a/examples/groq/smart_scraper_groq.py +++ /dev/null @@ -1,45 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -groq_key = os.getenv("GROQ_APIKEY") - -graph_config = { - "llm": { - "model": "groq/gemma-7b-it", - "api_key": groq_key, - "temperature": 0 - }, - "headless": False, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the projects with their description.", - # also accepts a string with the already downloaded HTML code - source="https://perinim.github.io/projects/", - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/groq/smart_scraper_lite_groq.py b/examples/groq/smart_scraper_lite_groq.py deleted file mode 100644 index 5fe6022f..00000000 --- a/examples/groq/smart_scraper_lite_groq.py +++ /dev/null @@ -1,31 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperLiteGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -graph_config = { - "llm": { 
- "api_key": os.getenv("GROQ_API_KEY"), - "model": "mixtral-8x7b-32768", - }, - "verbose": True, - "headless": False, -} - -smart_scraper_lite_graph = SmartScraperLiteGraph( - prompt="Who is Marco Perini?", - source="https://perinim.github.io/", - config=graph_config -) - -result = smart_scraper_lite_graph.run() -print(json.dumps(result, indent=4)) - -graph_exec_info = smart_scraper_lite_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/groq/smart_scraper_multi_concat_groq.py b/examples/groq/smart_scraper_multi_concat_groq.py deleted file mode 100644 index 79c262a1..00000000 --- a/examples/groq/smart_scraper_multi_concat_groq.py +++ /dev/null @@ -1,41 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiConcatGraph - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -groq_key = os.getenv("GROQ_APIKEY") - -graph_config = { - "llm": { - "model": "groq/gemma-7b-it", - "api_key": groq_key, - "temperature": 0 - }, - "headless": False -} - -# ******************************************************* -# Create the SmartScraperMultiGraph instance and run it -# ******************************************************* - -multiple_search_graph = SmartScraperMultiConcatGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/groq/smart_scraper_multi_groq.py b/examples/groq/smart_scraper_multi_groq.py deleted file mode 100644 index fec8fbb5..00000000 --- a/examples/groq/smart_scraper_multi_groq.py +++ /dev/null @@ -1,41 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import 
os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiGraph - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -groq_key = os.getenv("GROQ_APIKEY") - -graph_config = { - "llm": { - "model": "groq/gemma-7b-it", - "api_key": groq_key, - "temperature": 0 - }, - "verbose": True, - "headless": False -} -# ******************************************************* -# Create the SmartScraperMultiGraph instance and run it -# ******************************************************* - -multiple_search_graph = SmartScraperMultiGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/groq/smart_scraper_multi_lite_groq.py b/examples/groq/smart_scraper_multi_lite_groq.py deleted file mode 100644 index 9c8e4d1d..00000000 --- a/examples/groq/smart_scraper_multi_lite_groq.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiLiteGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -graph_config = { - "llm": { - "api_key": os.getenv("GROQ_API_KEY"), - "model": "mixtral-8x7b-32768", - }, - "verbose": True, - "headless": False, -} - -smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - config=graph_config -) - -result = smart_scraper_multi_lite_graph.run() -print(json.dumps(result, indent=4)) - -graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - diff --git 
a/examples/groq/smart_scraper_schema_groq.py b/examples/groq/smart_scraper_schema_groq.py deleted file mode 100644 index bfa7ed3b..00000000 --- a/examples/groq/smart_scraper_schema_groq.py +++ /dev/null @@ -1,59 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper with schema -""" -import os -from typing import List -from pydantic import BaseModel, Field -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Project(BaseModel): - title: str = Field(description="The title of the project") - description: str = Field(description="The description of the project") - -class Projects(BaseModel): - projects: List[Project] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -groq_key = os.getenv("GROQ_APIKEY") - -graph_config = { - "llm": { - "model": "groq/gemma-7b-it", - "api_key": groq_key, - "temperature": 0 - }, - "headless": False -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the projects with their description.", - # also accepts a string with the already downloaded HTML code - source="https://perinim.github.io/projects/", - schema=Projects, - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/groq/xml_scraper_graph_multi_groq.py 
b/examples/groq/xml_scraper_graph_multi_groq.py deleted file mode 100644 index 09c7483f..00000000 --- a/examples/groq/xml_scraper_graph_multi_groq.py +++ /dev/null @@ -1,59 +0,0 @@ -""" -Basic example of scraping pipeline using XMLScraperMultiGraph from XML documents -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import XMLScraperMultiGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the XML file -# ************************************************ - -FILE_NAME = "inputs/books.xml" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -groq_key = os.getenv("GROQ_APIKEY") - -graph_config = { - "llm": { - "model": "groq/gemma-7b-it", - "api_key": groq_key, - "temperature": 0 - }, - "headless": False -} - -# ************************************************ -# Create the XMLScraperMultiGraph instance and run it -# ************************************************ - -xml_scraper_graph = XMLScraperMultiGraph( - prompt="List me all the authors, title and genres of the books", - source=[text, text], # Pass the content of the file, not the file object - config=graph_config -) - -result = xml_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = xml_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/groq/xml_scraper_groq.py b/examples/groq/xml_scraper_groq.py deleted file mode 100644 
index cb1ca8d7..00000000 --- a/examples/groq/xml_scraper_groq.py +++ /dev/null @@ -1,60 +0,0 @@ -""" -Basic example of scraping pipeline using XMLScraperGraph from XML documents -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import XMLScraperGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the XML file -# ************************************************ - -FILE_NAME = "inputs/books.xml" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -groq_key = os.getenv("GROQ_APIKEY") - -graph_config = { - "llm": { - "model": "groq/gemma-7b-it", - "api_key": groq_key, - "temperature": 0 - }, - "verbose": True, - "headless": False -} -# ************************************************ -# Create the XMLScraperGraph instance and run it -# ************************************************ - -xml_scraper_graph = XMLScraperGraph( - prompt="List me all the authors, title and genres of the books", - source=text, # Pass the content of the file, not the file object - config=graph_config -) - -result = xml_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = xml_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") - diff --git a/examples/huggingfacehub/code_generator_graph_huggingfacehub.py b/examples/huggingfacehub/code_generator_graph_huggingfacehub.py deleted file mode 100644 index 4ff0d67e..00000000 --- 
a/examples/huggingfacehub/code_generator_graph_huggingfacehub.py +++ /dev/null @@ -1,71 +0,0 @@ -""" -Basic example of scraping pipeline using Code Generator with schema -""" - -import os, json -from typing import List -from dotenv import load_dotenv -from pydantic import BaseModel, Field -from scrapegraphai.graphs import CodeGeneratorGraph -from langchain_community.llms import HuggingFaceEndpoint -from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings - -load_dotenv() - -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Project(BaseModel): - title: str = Field(description="The title of the project") - description: str = Field(description="The description of the project") - -class Projects(BaseModel): - projects: List[Project] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN') - -repo_id = "mistralai/Mistral-7B-Instruct-v0.2" - -llm_model_instance = HuggingFaceEndpoint( - repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN -) - -embedder_model_instance = HuggingFaceInferenceAPIEmbeddings( - api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2" -) - -graph_config = { - "llm": { - "model_instance": llm_model_instance - }, - "verbose": True, - "headless": False, - "reduction": 2, - "max_iterations": { - "overall": 10, - "syntax": 3, - "execution": 3, - "validation": 3, - "semantic": 3 - }, - "output_file_name": "extracted_data.py" -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -code_generator_graph = CodeGeneratorGraph( - prompt="List me all the projects with their description", - 
source="https://perinim.github.io/projects/", - schema=Projects, - config=graph_config -) - -result = code_generator_graph.run() -print(result) \ No newline at end of file diff --git a/examples/huggingfacehub/csv_scraper_graph_multi_huggingfacehub.py b/examples/huggingfacehub/csv_scraper_graph_multi_huggingfacehub.py deleted file mode 100644 index 48b04dab..00000000 --- a/examples/huggingfacehub/csv_scraper_graph_multi_huggingfacehub.py +++ /dev/null @@ -1,68 +0,0 @@ -""" -Basic example of scraping pipeline using CSVScraperMultiGraph from CSV documents -""" - -import os -import pandas as pd -from scrapegraphai.graphs import CSVScraperMultiGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info -from langchain_community.llms import HuggingFaceEndpoint -from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings - -# ************************************************ -# Read the CSV file -# ************************************************ - -FILE_NAME = "inputs/username.csv" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -text = pd.read_csv(file_path) - -# ************************************************ -# Define the configuration for the graph -# ************************************************ -HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN') - -repo_id = "mistralai/Mistral-7B-Instruct-v0.2" - -llm_model_instance = HuggingFaceEndpoint( - repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN -) - -embedder_model_instance = HuggingFaceInferenceAPIEmbeddings( - api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2" -) - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -graph_config = { - "llm": {"model_instance": llm_model_instance}, -} - - -# 
************************************************ -# Create the CSVScraperMultiGraph instance and run it -# ************************************************ - -csv_scraper_graph = CSVScraperMultiGraph( - prompt="List me all the last names", - source=[str(text), str(text)], - config=graph_config -) - -result = csv_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = csv_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/huggingfacehub/csv_scraper_huggingfacehub.py b/examples/huggingfacehub/csv_scraper_huggingfacehub.py deleted file mode 100644 index 18ce1194..00000000 --- a/examples/huggingfacehub/csv_scraper_huggingfacehub.py +++ /dev/null @@ -1,70 +0,0 @@ -""" -Basic example of scraping pipeline using CSVScraperGraph from CSV documents -""" - -import os -from dotenv import load_dotenv -import pandas as pd -from scrapegraphai.graphs import CSVScraperGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info -from langchain_community.llms import HuggingFaceEndpoint -from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings - -load_dotenv() - -# ************************************************ -# Read the CSV file -# ************************************************ - -FILE_NAME = "inputs/username.csv" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -text = pd.read_csv(file_path) - -# ************************************************ -# Define the configuration for the graph -# ************************************************ -HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN') - -repo_id = "mistralai/Mistral-7B-Instruct-v0.2" - -llm_model_instance = HuggingFaceEndpoint( - 
repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN -) - -embedder_model_instance = HuggingFaceInferenceAPIEmbeddings( - api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2" -) - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -graph_config = { - "llm": {"model_instance": llm_model_instance}, -} - -# ************************************************ -# Create the CSVScraperGraph instance and run it -# ************************************************ - -csv_scraper_graph = CSVScraperGraph( - prompt="List me all the last names", - source=str(text), # Pass the content of the file, not the file object - config=graph_config -) - -result = csv_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = csv_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/huggingfacehub/custom_graph_huggingfacehub.py b/examples/huggingfacehub/custom_graph_huggingfacehub.py deleted file mode 100644 index 06b2f089..00000000 --- a/examples/huggingfacehub/custom_graph_huggingfacehub.py +++ /dev/null @@ -1,121 +0,0 @@ -""" -Example of custom graph using existing nodes -""" - -import os -from dotenv import load_dotenv -from langchain_openai import OpenAIEmbeddings -from langchain_openai import ChatOpenAI -from scrapegraphai.graphs import BaseGraph -from scrapegraphai.nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode, RobotsNode -from langchain_community.llms import HuggingFaceEndpoint -from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings - -load_dotenv() - -# ************************************************ -# Define 
the configuration for the graph -# ************************************************ - - -HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN') - -repo_id = "mistralai/Mistral-7B-Instruct-v0.2" - -llm_model_instance = HuggingFaceEndpoint( - repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN -) - -embedder_model_instance = HuggingFaceInferenceAPIEmbeddings( - api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2" -) - -graph_config = { - "llm": {"model_instance": llm_model_instance}, -} - -# ************************************************ -# Define the graph nodes -# ************************************************ - -llm_model = OpenAI(graph_config["llm"]) -embedder = OpenAIEmbeddings(api_key=llm_model.openai_api_key) - -# define the nodes for the graph -robot_node = RobotsNode( - input="url", - output=["is_scrapable"], - node_config={ - "llm_model": llm_model, - "force_scraping": True, - "verbose": True, - } -) - -fetch_node = FetchNode( - input="url | local_dir", - output=["doc"], - node_config={ - "verbose": True, - "headless": True, - } -) -parse_node = ParseNode( - input="doc", - output=["parsed_doc"], - node_config={ - "chunk_size": 4096, - "verbose": True, - } -) -rag_node = RAGNode( - input="user_prompt & (parsed_doc | doc)", - output=["relevant_chunks"], - node_config={ - "llm_model": llm_model, - "embedder_model": embedder, - "verbose": True, - } -) -generate_answer_node = GenerateAnswerNode( - input="user_prompt & (relevant_chunks | parsed_doc | doc)", - output=["answer"], - node_config={ - "llm_model": llm_model, - "verbose": True, - } -) - -# ************************************************ -# Create the graph by defining the connections -# ************************************************ - -graph = BaseGraph( - nodes=[ - robot_node, - fetch_node, - parse_node, - rag_node, - generate_answer_node, - ], - edges=[ - (robot_node, fetch_node), - (fetch_node, parse_node), - (parse_node, 
rag_node), - (rag_node, generate_answer_node) - ], - entry_point=robot_node -) - -# ************************************************ -# Execute the graph -# ************************************************ - -result, execution_info = graph.execute({ - "user_prompt": "Describe the content", - "url": "https://example.com/" -}) - -# get the answer from the result -result = result.get("answer", "No answer found.") -print(result) diff --git a/examples/huggingfacehub/depth_search_graph_huggingfacehub.py b/examples/huggingfacehub/depth_search_graph_huggingfacehub.py deleted file mode 100644 index 48df3e37..00000000 --- a/examples/huggingfacehub/depth_search_graph_huggingfacehub.py +++ /dev/null @@ -1,38 +0,0 @@ -""" -depth_search_graph_opeani example -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import DepthSearchGraph -from langchain_community.llms import HuggingFaceEndpoint -from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings - -load_dotenv() -HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN') - -repo_id = "mistralai/Mistral-7B-Instruct-v0.2" - -llm_model_instance = HuggingFaceEndpoint( - repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN -) - -embedder_model_instance = HuggingFaceInferenceAPIEmbeddings( - api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2" -) - -graph_config = { - "llm": {"model_instance": llm_model_instance}, - "verbose": True, - "headless": False, - "depth": 2, - "only_inside_links": False, -} - -search_graph = DepthSearchGraph( - prompt="List me all the projects with their description", - source="https://perinim.github.io", - config=graph_config -) - -result = search_graph.run() -print(result) diff --git a/examples/huggingfacehub/document_scraper_huggingfacehub.py b/examples/huggingfacehub/document_scraper_huggingfacehub.py deleted file mode 100644 index 5992f077..00000000 --- 
a/examples/huggingfacehub/document_scraper_huggingfacehub.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -document_scraper example -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import DocumentScraperGraph -from langchain_community.llms import HuggingFaceEndpoint -from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ -# ************************************************ -# Define the configuration for the graph -# ************************************************ -HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN') - -repo_id = "mistralai/Mistral-7B-Instruct-v0.2" - -llm_model_instance = HuggingFaceEndpoint( - repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN -) - -embedder_model_instance = HuggingFaceInferenceAPIEmbeddings( - api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2" -) - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -graph_config = { - "llm": {"model_instance": llm_model_instance}, -} - -source = """ - The Divine Comedy, Italian La Divina Commedia, original name La commedia, long narrative poem written in Italian - circa 1308/21 by Dante. It is usually held to be one of the world s great works of literature. - Divided into three major sections—Inferno, Purgatorio, and Paradiso—the narrative traces the journey of Dante - from darkness and error to the revelation of the divine light, culminating in the Beatific Vision of God. - Dante is guided by the Roman poet Virgil, who represents the epitome of human knowledge, from the dark wood - through the descending circles of the pit of Hell (Inferno). 
He then climbs the mountain of Purgatory, guided - by the Roman poet Statius, who represents the fulfilment of human knowledge, and is finally led by his lifelong love, - the Beatrice of his earlier poetry, through the celestial spheres of Paradise. -""" - -pdf_scraper_graph = DocumentScraperGraph( - prompt="Summarize the text and find the main topics", - source=source, - config=graph_config, -) -result = pdf_scraper_graph.run() - -print(json.dumps(result, indent=4)) \ No newline at end of file diff --git a/examples/huggingfacehub/inputs/books.xml b/examples/huggingfacehub/inputs/books.xml deleted file mode 100644 index e3d1fe87..00000000 --- a/examples/huggingfacehub/inputs/books.xml +++ /dev/null @@ -1,120 +0,0 @@ - - - - Gambardella, Matthew - XML Developer's Guide - Computer - 44.95 - 2000-10-01 - An in-depth look at creating applications - with XML. - - - Ralls, Kim - Midnight Rain - Fantasy - 5.95 - 2000-12-16 - A former architect battles corporate zombies, - an evil sorceress, and her own childhood to become queen - of the world. - - - Corets, Eva - Maeve Ascendant - Fantasy - 5.95 - 2000-11-17 - After the collapse of a nanotechnology - society in England, the young survivors lay the - foundation for a new society. - - - Corets, Eva - Oberon's Legacy - Fantasy - 5.95 - 2001-03-10 - In post-apocalypse England, the mysterious - agent known only as Oberon helps to create a new life - for the inhabitants of London. Sequel to Maeve - Ascendant. - - - Corets, Eva - The Sundered Grail - Fantasy - 5.95 - 2001-09-10 - The two daughters of Maeve, half-sisters, - battle one another for control of England. Sequel to - Oberon's Legacy. - - - Randall, Cynthia - Lover Birds - Romance - 4.95 - 2000-09-02 - When Carla meets Paul at an ornithology - conference, tempers fly as feathers get ruffled. - - - Thurman, Paula - Splish Splash - Romance - 4.95 - 2000-11-02 - A deep sea diver finds true love twenty - thousand leagues beneath the sea. 
- - - Knorr, Stefan - Creepy Crawlies - Horror - 4.95 - 2000-12-06 - An anthology of horror stories about roaches, - centipedes, scorpions and other insects. - - - Kress, Peter - Paradox Lost - Science Fiction - 6.95 - 2000-11-02 - After an inadvertant trip through a Heisenberg - Uncertainty Device, James Salway discovers the problems - of being quantum. - - - O'Brien, Tim - Microsoft .NET: The Programming Bible - Computer - 36.95 - 2000-12-09 - Microsoft's .NET initiative is explored in - detail in this deep programmer's reference. - - - O'Brien, Tim - MSXML3: A Comprehensive Guide - Computer - 36.95 - 2000-12-01 - The Microsoft MSXML3 parser is covered in - detail, with attention to XML DOM interfaces, XSLT processing, - SAX and more. - - - Galos, Mike - Visual Studio 7: A Comprehensive Guide - Computer - 49.95 - 2001-04-16 - Microsoft Visual Studio 7 is explored in depth, - looking at how Visual Basic, Visual C++, C#, and ASP+ are - integrated into a comprehensive development - environment. 
- - \ No newline at end of file diff --git a/examples/huggingfacehub/inputs/example.json b/examples/huggingfacehub/inputs/example.json deleted file mode 100644 index 2263184c..00000000 --- a/examples/huggingfacehub/inputs/example.json +++ /dev/null @@ -1,182 +0,0 @@ -{ - "kind":"youtube#searchListResponse", - "etag":"q4ibjmYp1KA3RqMF4jFLl6PBwOg", - "nextPageToken":"CAUQAA", - "regionCode":"NL", - "pageInfo":{ - "totalResults":1000000, - "resultsPerPage":5 - }, - "items":[ - { - "kind":"youtube#searchResult", - "etag":"QCsHBifbaernVCbLv8Cu6rAeaDQ", - "id":{ - "kind":"youtube#video", - "videoId":"TvWDY4Mm5GM" - }, - "snippet":{ - "publishedAt":"2023-07-24T14:15:01Z", - "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", - "title":"3 Football Clubs Kylian Mbappe Should Avoid Signing ✍️❌⚽️ #football #mbappe #shorts", - "description":"", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"FC Motivate", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T14:15:01Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"0NG5QHdtIQM_V-DBJDEf-jK_Y9k", - "id":{ - "kind":"youtube#video", - "videoId":"aZM_42CcNZ4" - }, - "snippet":{ - "publishedAt":"2023-07-24T16:09:27Z", - "channelId":"UCM5gMM_HqfKHYIEJ3lstMUA", - "title":"Which Football Club Could Cristiano Ronaldo Afford To Buy? 
💰", - "description":"Sign up to Sorare and get a FREE card: https://sorare.pxf.io/NellisShorts Give Soraredata a go for FREE: ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"John Nellis", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T16:09:27Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"WbBz4oh9I5VaYj91LjeJvffrBVY", - "id":{ - "kind":"youtube#video", - "videoId":"wkP3XS3aNAY" - }, - "snippet":{ - "publishedAt":"2023-07-24T16:00:50Z", - "channelId":"UC4EP1dxFDPup_aFLt0ElsDw", - "title":"PAULO DYBALA vs THE WORLD'S LONGEST FREEKICK WALL", - "description":"Can Paulo Dybala curl a football around the World's longest free kick wall? We met up with the World Cup winner and put him to ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"Shoot for Love", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T16:00:50Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"juxv_FhT_l4qrR05S1QTrb4CGh8", - "id":{ - "kind":"youtube#video", - "videoId":"rJkDZ0WvfT8" - }, - "snippet":{ - "publishedAt":"2023-07-24T10:00:39Z", - "channelId":"UCO8qj5u80Ga7N_tP3BZWWhQ", - "title":"TOP 10 DEFENDERS 2023", - "description":"SoccerKingz https://soccerkingz.nl Use code: 'ILOVEHOF' to get 10% off. TOP 10 DEFENDERS 2023 Follow us! 
• Instagram ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"Home of Football", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T10:00:39Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"wtuknXTmI1txoULeH3aWaOuXOow", - "id":{ - "kind":"youtube#video", - "videoId":"XH0rtu4U6SE" - }, - "snippet":{ - "publishedAt":"2023-07-21T16:30:05Z", - "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", - "title":"3 Things You Didn't Know About Erling Haaland ⚽️🇳🇴 #football #haaland #shorts", - "description":"", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"FC Motivate", - "liveBroadcastContent":"none", - "publishTime":"2023-07-21T16:30:05Z" - } - } - ] -} \ No newline at end of file diff --git a/examples/huggingfacehub/inputs/plain_html_example.txt b/examples/huggingfacehub/inputs/plain_html_example.txt deleted file mode 100644 index 78f814ae..00000000 --- a/examples/huggingfacehub/inputs/plain_html_example.txt +++ /dev/null @@ -1,105 +0,0 @@ - -
- - -
-
-
- - -
- \ No newline at end of file diff --git a/examples/huggingfacehub/inputs/username.csv b/examples/huggingfacehub/inputs/username.csv deleted file mode 100644 index 006ac8e6..00000000 --- a/examples/huggingfacehub/inputs/username.csv +++ /dev/null @@ -1,7 +0,0 @@ -Username; Identifier;First name;Last name -booker12;9012;Rachel;Booker -grey07;2070;Laura;Grey -johnson81;4081;Craig;Johnson -jenkins46;9346;Mary;Jenkins -smith79;5079;Jamie;Smith - diff --git a/examples/huggingfacehub/json_scraper_huggingfacehub.py b/examples/huggingfacehub/json_scraper_huggingfacehub.py deleted file mode 100644 index f8223711..00000000 --- a/examples/huggingfacehub/json_scraper_huggingfacehub.py +++ /dev/null @@ -1,59 +0,0 @@ -""" -Basic example of scraping pipeline using JSONScraperGraph from JSON documents -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import JSONScraperGraph -from scrapegraphai.utils import prettify_exec_info -from langchain_community.llms import HuggingFaceEndpoint -from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings - -load_dotenv() - -# ************************************************ -# Read the JSON file -# ************************************************ - -FILE_NAME = "inputs/example.json" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ -HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN') - -repo_id = "mistralai/Mistral-7B-Instruct-v0.2" - -llm_model_instance = HuggingFaceEndpoint( - repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN -) - -embedder_model_instance = HuggingFaceInferenceAPIEmbeddings( - api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2" -) - 
-# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -graph_config = { - "llm": {"model_instance": llm_model_instance}, -} - -# ************************************************ -# Create the JSONScraperGraph instance and run it -# ************************************************ - -json_scraper_graph = JSONScraperGraph( - prompt="List me all the authors, title and genres of the books", - source=text, # Pass the content of the file, not the file object - config=graph_config -) - -result = json_scraper_graph.run() -print(result) diff --git a/examples/huggingfacehub/json_scraper_multi_huggingfacehub.py b/examples/huggingfacehub/json_scraper_multi_huggingfacehub.py deleted file mode 100644 index c029431e..00000000 --- a/examples/huggingfacehub/json_scraper_multi_huggingfacehub.py +++ /dev/null @@ -1,45 +0,0 @@ -""" -Module for showing how PDFScraper multi works -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import JSONScraperMultiGraph -from langchain_community.llms import HuggingFaceEndpoint -from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings - -load_dotenv() - -HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN') - -repo_id = "mistralai/Mistral-7B-Instruct-v0.2" - -llm_model_instance = HuggingFaceEndpoint( - repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN -) - -embedder_model_instance = HuggingFaceInferenceAPIEmbeddings( - api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2" -) - -graph_config = { - "llm": {"model_instance": llm_model_instance}, -} -FILE_NAME = "inputs/example.json" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -sources = [text, text] - -multiple_search_graph = 
JSONScraperMultiGraph( - prompt= "List me all the authors, title and genres of the books", - source= sources, - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/huggingfacehub/scrape_plain_text_huggingfacehub.py b/examples/huggingfacehub/scrape_plain_text_huggingfacehub.py deleted file mode 100644 index 76d32cda..00000000 --- a/examples/huggingfacehub/scrape_plain_text_huggingfacehub.py +++ /dev/null @@ -1,68 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper from text -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info -from langchain_community.llms import HuggingFaceEndpoint -from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings - -load_dotenv() - -# ************************************************ -# Read the text file -# ************************************************ - -FILE_NAME = "inputs/plain_html_example.txt" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -# It could be also a http request using the request model -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN') - -repo_id = "mistralai/Mistral-7B-Instruct-v0.2" - -llm_model_instance = HuggingFaceEndpoint( - repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN -) - -embedder_model_instance = HuggingFaceInferenceAPIEmbeddings( - api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2" -) - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# 
************************************************ - -graph_config = { - "llm": {"model_instance": llm_model_instance}, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the projects with their description.", - source=text, - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/huggingfacehub/script_generator_huggingfacehub.py b/examples/huggingfacehub/script_generator_huggingfacehub.py deleted file mode 100644 index a3fcaaf4..00000000 --- a/examples/huggingfacehub/script_generator_huggingfacehub.py +++ /dev/null @@ -1,60 +0,0 @@ -""" -Basic example of scraping pipeline using ScriptCreatorGraph -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import ScriptCreatorGraph -from scrapegraphai.utils import prettify_exec_info -from langchain_community.llms import HuggingFaceEndpoint -from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN') -# ************************************************ -# Initialize the model instances -# ************************************************ - -repo_id = "mistralai/Mistral-7B-Instruct-v0.2" - -llm_model_instance = HuggingFaceEndpoint( - repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN -) - -embedder_model_instance = HuggingFaceInferenceAPIEmbeddings( - 
api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2" -) - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -graph_config = { - "llm": {"model_instance": llm_model_instance}, -} -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -script_creator_graph = ScriptCreatorGraph( - prompt="List me all the projects with their description.", - # also accepts a string with the already downloaded HTML code - source="https://perinim.github.io/projects", - config=graph_config -) - -result = script_creator_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = script_creator_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - diff --git a/examples/huggingfacehub/script_multi_generator_huggingfacehub.py b/examples/huggingfacehub/script_multi_generator_huggingfacehub.py deleted file mode 100644 index 0ee89189..00000000 --- a/examples/huggingfacehub/script_multi_generator_huggingfacehub.py +++ /dev/null @@ -1,66 +0,0 @@ -""" -Basic example of scraping pipeline using ScriptCreatorGraph -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import ScriptCreatorMultiGraph -from scrapegraphai.utils import prettify_exec_info -from langchain_community.llms import HuggingFaceEndpoint -from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN') - -repo_id = "mistralai/Mistral-7B-Instruct-v0.2" - -llm_model_instance = 
HuggingFaceEndpoint( - repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN -) - -embedder_model_instance = HuggingFaceInferenceAPIEmbeddings( - api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2" -) - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -graph_config = { - "llm": {"model_instance": llm_model_instance}, -} - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -urls=[ - "https://schultzbergagency.com/emil-raste-karlsen/", - "https://schultzbergagency.com/johanna-hedberg/", -] - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -script_creator_graph = ScriptCreatorMultiGraph( - prompt="Find information about actors", - # also accepts a string with the already downloaded HTML code - source=urls, - config=graph_config -) - -result = script_creator_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = script_creator_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/huggingfacehub/search_graph_huggingfacehub.py b/examples/huggingfacehub/search_graph_huggingfacehub.py deleted file mode 100644 index 7c4a0c43..00000000 --- a/examples/huggingfacehub/search_graph_huggingfacehub.py +++ /dev/null @@ -1,55 +0,0 @@ -""" -Example of Search Graph -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SearchGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info -from langchain_community.llms import HuggingFaceEndpoint -from langchain_community.embeddings 
import HuggingFaceInferenceAPIEmbeddings - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN') - -repo_id = "mistralai/Mistral-7B-Instruct-v0.2" - -llm_model_instance = HuggingFaceEndpoint( - repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN -) - -embedder_model_instance = HuggingFaceInferenceAPIEmbeddings( - api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2" -) - -graph_config = { - "llm": {"model_instance": llm_model_instance}, -} - -# ************************************************ -# Create the SearchGraph instance and run it -# ************************************************ - -search_graph = SearchGraph( - prompt="List me Chioggia's famous dishes", - config=graph_config -) - -result = search_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = search_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json and csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/huggingfacehub/search_link_graph_huggingfacehub.py b/examples/huggingfacehub/search_link_graph_huggingfacehub.py deleted file mode 100644 index 75b41282..00000000 --- a/examples/huggingfacehub/search_link_graph_huggingfacehub.py +++ /dev/null @@ -1,53 +0,0 @@ -""" -Example of Search Graph -""" -import os -from scrapegraphai.graphs import SearchGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info -from langchain_community.llms import HuggingFaceEndpoint -from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings - -# ************************************************ -# Define the configuration for the graph 
-# ************************************************ -# ************************************************ - -HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN') - -repo_id = "mistralai/Mistral-7B-Instruct-v0.2" - -llm_model_instance = HuggingFaceEndpoint( - repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN -) - -embedder_model_instance = HuggingFaceInferenceAPIEmbeddings( - api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2" -) - -graph_config = { - "llm": {"model_instance": llm_model_instance}, -} - - -# ************************************************ -# Create the SearchGraph instance and run it -# ************************************************ - -search_graph = SearchGraph( - prompt="List me the best escursions near Trento", - config=graph_config -) - -result = search_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = search_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json and csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/huggingfacehub/smart_scraper_huggingfacehub.py b/examples/huggingfacehub/smart_scraper_huggingfacehub.py deleted file mode 100644 index a50b574e..00000000 --- a/examples/huggingfacehub/smart_scraper_huggingfacehub.py +++ /dev/null @@ -1,58 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper using Azure OpenAI Key -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info -from langchain_community.llms import HuggingFaceEndpoint -from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings - - -## required environment variable in .env -#HUGGINGFACEHUB_API_TOKEN -load_dotenv() - -HUGGINGFACEHUB_API_TOKEN = 
os.getenv('HUGGINGFACEHUB_API_TOKEN') -# ************************************************ -# Initialize the model instances -# ************************************************ - -repo_id = "mistralai/Mistral-7B-Instruct-v0.2" - -llm_model_instance = HuggingFaceEndpoint( - repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN -) - - -embedder_model_instance = HuggingFaceInferenceAPIEmbeddings( - api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2" -) - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -graph_config = { - "llm": {"model_instance": llm_model_instance}, -} - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the events, with the following fields: company_name, event_name, event_start_date, event_start_time, event_end_date, event_end_time, location, event_mode, event_category, third_party_redirect, no_of_days, time_in_hours, hosted_or_attending, refreshments_type, registration_available, registration_link", - # also accepts a string with the already downloaded HTML code - source="https://www.hmhco.com/event", - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - - diff --git a/examples/huggingfacehub/smart_scraper_lite_huggingfacehub.py b/examples/huggingfacehub/smart_scraper_lite_huggingfacehub.py deleted file mode 100644 index 4faa8a47..00000000 --- a/examples/huggingfacehub/smart_scraper_lite_huggingfacehub.py +++ /dev/null @@ -1,31 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import 
SmartScraperLiteGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -graph_config = { - "llm": { - "api_key": os.getenv("HUGGINGFACEHUB_API_TOKEN"), - "model": "huggingfacehub/meta-llama/Llama-2-70b-chat-hf", - }, - "verbose": True, - "headless": False, -} - -smart_scraper_lite_graph = SmartScraperLiteGraph( - prompt="Who is Marco Perini?", - source="https://perinim.github.io/", - config=graph_config -) - -result = smart_scraper_lite_graph.run() -print(json.dumps(result, indent=4)) - -graph_exec_info = smart_scraper_lite_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/huggingfacehub/smart_scraper_multi_concat_huggingfacehub.py b/examples/huggingfacehub/smart_scraper_multi_concat_huggingfacehub.py deleted file mode 100644 index 3f2d7135..00000000 --- a/examples/huggingfacehub/smart_scraper_multi_concat_huggingfacehub.py +++ /dev/null @@ -1,53 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" - -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiConcatGraph -from langchain_community.llms import HuggingFaceEndpoint -from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings - -load_dotenv() - -HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN') -# ************************************************ -# Initialize the model instances -# ************************************************ - -repo_id = "mistralai/Mistral-7B-Instruct-v0.2" - -llm_model_instance = HuggingFaceEndpoint( - repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN -) - - -embedder_model_instance = HuggingFaceInferenceAPIEmbeddings( - api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2" -) - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -graph_config = { - 
"llm": {"model_instance": llm_model_instance}, -} - -# ******************************************************* -# Create the SmartScraperMultiGraph instance and run it -# ******************************************************* - -multiple_search_graph = SmartScraperMultiConcatGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/huggingfacehub/smart_scraper_multi_huggingfacehub.py b/examples/huggingfacehub/smart_scraper_multi_huggingfacehub.py deleted file mode 100644 index 046883a2..00000000 --- a/examples/huggingfacehub/smart_scraper_multi_huggingfacehub.py +++ /dev/null @@ -1,48 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" - -import os, json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiGraph -from langchain_community.llms import HuggingFaceEndpoint -from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN') - -repo_id = "mistralai/Mistral-7B-Instruct-v0.2" - -llm_model_instance = HuggingFaceEndpoint( - repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN -) - -embedder_model_instance = HuggingFaceInferenceAPIEmbeddings( - api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2" -) - -graph_config = { - "llm": {"model_instance": llm_model_instance}, -} - -# ******************************************************* -# Create the SmartScraperMultiGraph instance and run it -# ******************************************************* - -multiple_search_graph = SmartScraperMultiGraph( - prompt="Who is 
Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/huggingfacehub/smart_scraper_multi_lite_huggingfacehub.py b/examples/huggingfacehub/smart_scraper_multi_lite_huggingfacehub.py deleted file mode 100644 index 2d7a3a45..00000000 --- a/examples/huggingfacehub/smart_scraper_multi_lite_huggingfacehub.py +++ /dev/null @@ -1,34 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiLiteGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -graph_config = { - "llm": { - "api_key": os.getenv("HUGGINGFACEHUB_API_TOKEN"), - "model": "huggingfacehub/meta-llama/Llama-2-70b-chat-hf", - }, - "verbose": True, - "headless": False, -} - -smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - config=graph_config -) - -result = smart_scraper_multi_lite_graph.run() -print(json.dumps(result, indent=4)) - -graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/huggingfacehub/smart_scraper_multi_lite_uhggingfacehub.py b/examples/huggingfacehub/smart_scraper_multi_lite_uhggingfacehub.py deleted file mode 100644 index e69de29b..00000000 diff --git a/examples/huggingfacehub/smart_scraper_schema_huggingfacehub.py b/examples/huggingfacehub/smart_scraper_schema_huggingfacehub.py deleted file mode 100644 index 31719697..00000000 --- a/examples/huggingfacehub/smart_scraper_schema_huggingfacehub.py +++ /dev/null @@ -1,67 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper using Azure OpenAI Key -""" - -import os -from dotenv import load_dotenv -from 
typing import Dict - -from pydantic import BaseModel -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info -from langchain_community.llms import HuggingFaceEndpoint -from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings - -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Project(BaseModel): - title: str - description: str - -class Projects(BaseModel): - Projects: Dict[str, Project] - -## required environment variable in .env -#HUGGINGFACEHUB_API_TOKEN -load_dotenv() - -HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN') -# ************************************************ -# Initialize the model instances -# ************************************************ - -repo_id = "mistralai/Mistral-7B-Instruct-v0.2" - -llm_model_instance = HuggingFaceEndpoint( - repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN -) - -embedder_model_instance = HuggingFaceInferenceAPIEmbeddings( - api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2" -) - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -graph_config = { - "llm": {"model_instance": llm_model_instance}, -} - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the projects with their description", - source="https://perinim.github.io/projects/", - schema=Projects, - config=graph_config -) -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/huggingfacehub/xml_scraper_graph_multi_huggingfacehub.py 
b/examples/huggingfacehub/xml_scraper_graph_multi_huggingfacehub.py deleted file mode 100644 index 1a244b86..00000000 --- a/examples/huggingfacehub/xml_scraper_graph_multi_huggingfacehub.py +++ /dev/null @@ -1,67 +0,0 @@ -""" -Basic example of scraping pipeline using XMLScraperMultiGraph from XML documents -""" - -import os -from scrapegraphai.graphs import XMLScraperMultiGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info -from langchain_community.llms import HuggingFaceEndpoint -from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings - -# ************************************************ -# Read the XML file -# ************************************************ - -FILE_NAME = "inputs/books.xml" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ -HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN') - -repo_id = "mistralai/Mistral-7B-Instruct-v0.2" - -llm_model_instance = HuggingFaceEndpoint( - repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN -) - -embedder_model_instance = HuggingFaceInferenceAPIEmbeddings( - api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2" -) - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -graph_config = { - "llm": {"model_instance": llm_model_instance}, -} - -# ************************************************ -# Create the XMLScraperMultiGraph instance and run it -# ************************************************ - -xml_scraper_graph = XMLScraperMultiGraph( - prompt="List me all the authors, title and genres of the books", - 
source=[text, text], # Pass the content of the file, not the file object - config=graph_config -) - -result = xml_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = xml_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/huggingfacehub/xml_scraper_huggingfacehub.py b/examples/huggingfacehub/xml_scraper_huggingfacehub.py deleted file mode 100644 index ddd73b5f..00000000 --- a/examples/huggingfacehub/xml_scraper_huggingfacehub.py +++ /dev/null @@ -1,68 +0,0 @@ -""" -Basic example of scraping pipeline using XMLScraperGraph from XML documents -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import XMLScraperGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info -from langchain_community.llms import HuggingFaceEndpoint -from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings - -load_dotenv() - -# ************************************************ -# Read the XML file -# ************************************************ - -FILE_NAME = "inputs/books.xml" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN') - -repo_id = "mistralai/Mistral-7B-Instruct-v0.2" - -llm_model_instance = HuggingFaceEndpoint( - repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN -) - -embedder_model_instance = HuggingFaceInferenceAPIEmbeddings( - 
api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2" -) - -graph_config = { - "llm": {"model_instance": llm_model_instance}, -} - -# ************************************************ -# Create the XMLScraperGraph instance and run it -# ************************************************ - -xml_scraper_graph = XMLScraperGraph( - prompt="List me all the authors, title and genres of the books", - source=text, # Pass the content of the file, not the file object - config=graph_config -) - -result = xml_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = xml_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") - diff --git a/examples/integrations/indexify_node_example.py b/examples/integrations/indexify_node_example.py deleted file mode 100644 index 61db52d2..00000000 --- a/examples/integrations/indexify_node_example.py +++ /dev/null @@ -1,72 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper with schema -""" - -import os, json -from typing import List - -from dotenv import load_dotenv -load_dotenv() - -from pydantic import BaseModel, Field -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.integrations import IndexifyNode - - -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Image(BaseModel): - url: str = Field(description="The url of the image") - -class Images(BaseModel): - images: List[Image] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -openai_key = os.getenv("OPENAI_APIKEY") - -graph_config = { - "llm": { - "api_key":openai_key, - "model": 
"openai/gpt-3.5-turbo", - }, - "verbose": True, - "headless": False, -} - -# ************************************************ -# Define the custom nodes for the graph -# ************************************************ - -indexify_node = IndexifyNode( - input="answer & img_urls", - output=["is_indexed"], - node_config={ - "verbose": True - } -) - -# ************************************************ -# Create the SmartScraperGraph instance -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the images with their url", - source="https://giphy.com/", - schema=Images, - config=graph_config -) - -# Add the custom node to the graph -smart_scraper_graph.append_node(indexify_node) - -# ************************************************ -# Run the SmartScraperGraph -# ************************************************ - -result = smart_scraper_graph.run() -print(json.dumps(result, indent=2)) diff --git a/examples/json_scraper_graph/.env.example b/examples/json_scraper_graph/.env.example new file mode 100644 index 00000000..f1862149 --- /dev/null +++ b/examples/json_scraper_graph/.env.example @@ -0,0 +1,11 @@ +# OpenAI API Configuration +OPENAI_API_KEY=your-openai-api-key-here + +# Optional Configurations +MAX_TOKENS=4000 +MODEL_NAME=gpt-4-1106-preview +TEMPERATURE=0.7 + +# JSON Scraper Settings +MAX_DEPTH=3 +TIMEOUT=30 \ No newline at end of file diff --git a/examples/json_scraper_graph/README.md b/examples/json_scraper_graph/README.md new file mode 100644 index 00000000..217875ff --- /dev/null +++ b/examples/json_scraper_graph/README.md @@ -0,0 +1,30 @@ +# JSON Scraper Graph Example + +This example demonstrates how to use Scrapegraph-ai to extract and process JSON data from web sources. + +## Features + +- JSON data extraction +- Schema validation +- Data transformation +- Structured output + +## Setup + +1. Install required dependencies +2. Copy `.env.example` to `.env` +3. 
Configure your API keys in the `.env` file + +## Usage + +```python +from scrapegraphai.graphs import JsonScraperGraph + +graph = JsonScraperGraph() +json_data = graph.scrape("https://api.example.com/data") +``` + +## Environment Variables + +Required environment variables: +- `OPENAI_API_KEY`: Your OpenAI API key \ No newline at end of file diff --git a/examples/anthropic/inputs/example.json b/examples/json_scraper_graph/ollama/inputs/example.json similarity index 100% rename from examples/anthropic/inputs/example.json rename to examples/json_scraper_graph/ollama/inputs/example.json diff --git a/examples/local_models/json_scraper_multi_ollama.py b/examples/json_scraper_graph/ollama/json_scraper_multi_ollama.py similarity index 100% rename from examples/local_models/json_scraper_multi_ollama.py rename to examples/json_scraper_graph/ollama/json_scraper_multi_ollama.py diff --git a/examples/local_models/json_scraper_ollama.py b/examples/json_scraper_graph/ollama/json_scraper_ollama.py similarity index 100% rename from examples/local_models/json_scraper_ollama.py rename to examples/json_scraper_graph/ollama/json_scraper_ollama.py diff --git a/examples/azure/inputs/example.json b/examples/json_scraper_graph/openai/inputs/example.json similarity index 100% rename from examples/azure/inputs/example.json rename to examples/json_scraper_graph/openai/inputs/example.json diff --git a/examples/openai/json_scraper_multi_openai.py b/examples/json_scraper_graph/openai/json_scraper_multi_openai.py similarity index 100% rename from examples/openai/json_scraper_multi_openai.py rename to examples/json_scraper_graph/openai/json_scraper_multi_openai.py diff --git a/examples/openai/json_scraper_openai.py b/examples/json_scraper_graph/openai/json_scraper_openai.py similarity index 100% rename from examples/openai/json_scraper_openai.py rename to examples/json_scraper_graph/openai/json_scraper_openai.py diff --git a/examples/openai/md_scraper_openai.py 
b/examples/json_scraper_graph/openai/md_scraper_openai.py similarity index 100% rename from examples/openai/md_scraper_openai.py rename to examples/json_scraper_graph/openai/md_scraper_openai.py diff --git a/examples/openai/omni_scraper_openai.py b/examples/json_scraper_graph/openai/omni_scraper_openai.py similarity index 100% rename from examples/openai/omni_scraper_openai.py rename to examples/json_scraper_graph/openai/omni_scraper_openai.py diff --git a/examples/local_models/inputs/books.xml b/examples/local_models/inputs/books.xml deleted file mode 100644 index e3d1fe87..00000000 --- a/examples/local_models/inputs/books.xml +++ /dev/null @@ -1,120 +0,0 @@ - - - - Gambardella, Matthew - XML Developer's Guide - Computer - 44.95 - 2000-10-01 - An in-depth look at creating applications - with XML. - - - Ralls, Kim - Midnight Rain - Fantasy - 5.95 - 2000-12-16 - A former architect battles corporate zombies, - an evil sorceress, and her own childhood to become queen - of the world. - - - Corets, Eva - Maeve Ascendant - Fantasy - 5.95 - 2000-11-17 - After the collapse of a nanotechnology - society in England, the young survivors lay the - foundation for a new society. - - - Corets, Eva - Oberon's Legacy - Fantasy - 5.95 - 2001-03-10 - In post-apocalypse England, the mysterious - agent known only as Oberon helps to create a new life - for the inhabitants of London. Sequel to Maeve - Ascendant. - - - Corets, Eva - The Sundered Grail - Fantasy - 5.95 - 2001-09-10 - The two daughters of Maeve, half-sisters, - battle one another for control of England. Sequel to - Oberon's Legacy. - - - Randall, Cynthia - Lover Birds - Romance - 4.95 - 2000-09-02 - When Carla meets Paul at an ornithology - conference, tempers fly as feathers get ruffled. - - - Thurman, Paula - Splish Splash - Romance - 4.95 - 2000-11-02 - A deep sea diver finds true love twenty - thousand leagues beneath the sea. 
- - - Knorr, Stefan - Creepy Crawlies - Horror - 4.95 - 2000-12-06 - An anthology of horror stories about roaches, - centipedes, scorpions and other insects. - - - Kress, Peter - Paradox Lost - Science Fiction - 6.95 - 2000-11-02 - After an inadvertant trip through a Heisenberg - Uncertainty Device, James Salway discovers the problems - of being quantum. - - - O'Brien, Tim - Microsoft .NET: The Programming Bible - Computer - 36.95 - 2000-12-09 - Microsoft's .NET initiative is explored in - detail in this deep programmer's reference. - - - O'Brien, Tim - MSXML3: A Comprehensive Guide - Computer - 36.95 - 2000-12-01 - The Microsoft MSXML3 parser is covered in - detail, with attention to XML DOM interfaces, XSLT processing, - SAX and more. - - - Galos, Mike - Visual Studio 7: A Comprehensive Guide - Computer - 49.95 - 2001-04-16 - Microsoft Visual Studio 7 is explored in depth, - looking at how Visual Basic, Visual C++, C#, and ASP+ are - integrated into a comprehensive development - environment. 
- - \ No newline at end of file diff --git a/examples/local_models/inputs/example.json b/examples/local_models/inputs/example.json deleted file mode 100644 index 2263184c..00000000 --- a/examples/local_models/inputs/example.json +++ /dev/null @@ -1,182 +0,0 @@ -{ - "kind":"youtube#searchListResponse", - "etag":"q4ibjmYp1KA3RqMF4jFLl6PBwOg", - "nextPageToken":"CAUQAA", - "regionCode":"NL", - "pageInfo":{ - "totalResults":1000000, - "resultsPerPage":5 - }, - "items":[ - { - "kind":"youtube#searchResult", - "etag":"QCsHBifbaernVCbLv8Cu6rAeaDQ", - "id":{ - "kind":"youtube#video", - "videoId":"TvWDY4Mm5GM" - }, - "snippet":{ - "publishedAt":"2023-07-24T14:15:01Z", - "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", - "title":"3 Football Clubs Kylian Mbappe Should Avoid Signing ✍️❌⚽️ #football #mbappe #shorts", - "description":"", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"FC Motivate", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T14:15:01Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"0NG5QHdtIQM_V-DBJDEf-jK_Y9k", - "id":{ - "kind":"youtube#video", - "videoId":"aZM_42CcNZ4" - }, - "snippet":{ - "publishedAt":"2023-07-24T16:09:27Z", - "channelId":"UCM5gMM_HqfKHYIEJ3lstMUA", - "title":"Which Football Club Could Cristiano Ronaldo Afford To Buy? 
💰", - "description":"Sign up to Sorare and get a FREE card: https://sorare.pxf.io/NellisShorts Give Soraredata a go for FREE: ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"John Nellis", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T16:09:27Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"WbBz4oh9I5VaYj91LjeJvffrBVY", - "id":{ - "kind":"youtube#video", - "videoId":"wkP3XS3aNAY" - }, - "snippet":{ - "publishedAt":"2023-07-24T16:00:50Z", - "channelId":"UC4EP1dxFDPup_aFLt0ElsDw", - "title":"PAULO DYBALA vs THE WORLD'S LONGEST FREEKICK WALL", - "description":"Can Paulo Dybala curl a football around the World's longest free kick wall? We met up with the World Cup winner and put him to ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"Shoot for Love", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T16:00:50Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"juxv_FhT_l4qrR05S1QTrb4CGh8", - "id":{ - "kind":"youtube#video", - "videoId":"rJkDZ0WvfT8" - }, - "snippet":{ - "publishedAt":"2023-07-24T10:00:39Z", - "channelId":"UCO8qj5u80Ga7N_tP3BZWWhQ", - "title":"TOP 10 DEFENDERS 2023", - "description":"SoccerKingz https://soccerkingz.nl Use code: 'ILOVEHOF' to get 10% off. TOP 10 DEFENDERS 2023 Follow us! 
• Instagram ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"Home of Football", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T10:00:39Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"wtuknXTmI1txoULeH3aWaOuXOow", - "id":{ - "kind":"youtube#video", - "videoId":"XH0rtu4U6SE" - }, - "snippet":{ - "publishedAt":"2023-07-21T16:30:05Z", - "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", - "title":"3 Things You Didn't Know About Erling Haaland ⚽️🇳🇴 #football #haaland #shorts", - "description":"", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"FC Motivate", - "liveBroadcastContent":"none", - "publishTime":"2023-07-21T16:30:05Z" - } - } - ] -} \ No newline at end of file diff --git a/examples/local_models/inputs/plain_html_example.txt b/examples/local_models/inputs/plain_html_example.txt deleted file mode 100644 index 78f814ae..00000000 --- a/examples/local_models/inputs/plain_html_example.txt +++ /dev/null @@ -1,105 +0,0 @@ - -
- - -
-
-
- - -
- \ No newline at end of file diff --git a/examples/local_models/inputs/username.csv b/examples/local_models/inputs/username.csv deleted file mode 100644 index 006ac8e6..00000000 --- a/examples/local_models/inputs/username.csv +++ /dev/null @@ -1,7 +0,0 @@ -Username; Identifier;First name;Last name -booker12;9012;Rachel;Booker -grey07;2070;Laura;Grey -johnson81;4081;Craig;Johnson -jenkins46;9346;Mary;Jenkins -smith79;5079;Jamie;Smith - diff --git a/examples/local_models/package-lock.json b/examples/local_models/package-lock.json deleted file mode 100644 index 4159e5cf..00000000 --- a/examples/local_models/package-lock.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "name": "local_models", - "lockfileVersion": 3, - "requires": true, - "packages": {} -} diff --git a/examples/local_models/package.json b/examples/local_models/package.json deleted file mode 100644 index 0967ef42..00000000 --- a/examples/local_models/package.json +++ /dev/null @@ -1 +0,0 @@ -{} diff --git a/examples/local_models/scrape_plain_text_ollama.py b/examples/local_models/scrape_plain_text_ollama.py deleted file mode 100644 index fe24c2a9..00000000 --- a/examples/local_models/scrape_plain_text_ollama.py +++ /dev/null @@ -1,54 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper from text -""" - -import os -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -# ************************************************ -# Read the text file -# ************************************************ - -FILE_NAME = "inputs/plain_html_example.txt" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -# It could be also a http request using the request model -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { 
- "model": "ollama/mistral", - "temperature": 0, - "format": "json", # Ollama needs the format to be specified explicitly - # "model_tokens": 2000, # set context length arbitrarily - "base_url": "http://localhost:11434", - }, - "verbose": True, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the projects", - source=text, - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/local_models/search_link_graph_ollama.py b/examples/local_models/search_link_graph_ollama.py deleted file mode 100644 index 885b65e9..00000000 --- a/examples/local_models/search_link_graph_ollama.py +++ /dev/null @@ -1,51 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -from scrapegraphai.graphs import SearchLinkGraph -from scrapegraphai.utils import prettify_exec_info -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "model": "ollama/llama3.1:8b", - "temperature": 0, - "format": "json", # Ollama needs the format to be specified explicitly - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily - }, - - "verbose": True, - "headless": False, - "filter_config": { - "diff_domain_filter": True, - # "img_exts": ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp', '.ico'], - # "lang_indicators": ['lang=', '/fr', '/pt', '/es', '/de', '/jp', '/it'], - # "irrelevant_keywords": [ - # '/login', '/signup', '/register', '/contact', 'facebook.com', 'twitter.com', - # 
'linkedin.com', 'instagram.com', '.js', '.css', '/wp-content/', '/wp-admin/', - # '/wp-includes/', '/wp-json/', '/wp-comments-post.php', ';amp', '/about', - # '/careers', '/jobs', '/privacy', '/terms', '/legal', '/faq', '/help', - # '.pdf', '.zip', '/news', '/files', '/downloads' - # ] - }, -} - -# ************************************************ -# Create the SearchLinkGraph instance and run it -# ************************************************ - -smart_scraper_graph = SearchLinkGraph( - source="https://sport.sky.it/nba?gr=www", - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/local_models/smart_scraper_multi_concat_ollama.py b/examples/local_models/smart_scraper_multi_concat_ollama.py deleted file mode 100644 index 665b5db4..00000000 --- a/examples/local_models/smart_scraper_multi_concat_ollama.py +++ /dev/null @@ -1,42 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" - -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiConcatGraph - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "model": "ollama/llama3.1", - "temperature": 0, - "format": "json", # Ollama needs the format to be specified explicitly - "base_url": "http://localhost:11434", # set ollama URL arbitrarily - }, - "verbose": True, - "headless": False -} - -# ******************************************************* -# Create the SmartScraperMultiGraph instance and run it -# ******************************************************* - -multiple_search_graph = SmartScraperMultiConcatGraph( - 
prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/mistral/.env.example b/examples/mistral/.env.example deleted file mode 100644 index cca63d1d..00000000 --- a/examples/mistral/.env.example +++ /dev/null @@ -1 +0,0 @@ -MISTRAL_API_KEY="YOUR MISTRAL API KEY" diff --git a/examples/mistral/code_generator_graph_mistral.py b/examples/mistral/code_generator_graph_mistral.py deleted file mode 100644 index 19af9aef..00000000 --- a/examples/mistral/code_generator_graph_mistral.py +++ /dev/null @@ -1,59 +0,0 @@ -""" -Basic example of scraping pipeline using Code Generator with schema -""" -import os -from typing import List -from dotenv import load_dotenv -from pydantic import BaseModel, Field -from scrapegraphai.graphs import CodeGeneratorGraph - -load_dotenv() - -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Project(BaseModel): - title: str = Field(description="The title of the project") - description: str = Field(description="The description of the project") - -class Projects(BaseModel): - projects: List[Project] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -mistral_key = os.getenv("MISTRAL_API_KEY") - -graph_config = { - "llm": { - "api_key": mistral_key, - "model": "mistralai/open-mistral-nemo", - }, - "verbose": True, - "headless": False, - "reduction": 2, - "max_iterations": { - "overall": 10, - "syntax": 3, - "execution": 3, - "validation": 3, - "semantic": 3 - }, - "output_file_name": "extracted_data.py" -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# 
************************************************ - -code_generator_graph = CodeGeneratorGraph( - prompt="List me all the projects with their description", - source="https://perinim.github.io/projects/", - schema=Projects, - config=graph_config -) - -result = code_generator_graph.run() -print(result) diff --git a/examples/mistral/csv_scraper_graph_multi_mistral.py b/examples/mistral/csv_scraper_graph_multi_mistral.py deleted file mode 100644 index 608a8851..00000000 --- a/examples/mistral/csv_scraper_graph_multi_mistral.py +++ /dev/null @@ -1,56 +0,0 @@ -""" -Basic example of scraping pipeline using CSVScraperMultiGraph from CSV documents -""" -import os -from dotenv import load_dotenv -import pandas as pd -from scrapegraphai.graphs import CSVScraperMultiGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the CSV file -# ************************************************ - -FILE_NAME = "inputs/username.csv" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -text = pd.read_csv(file_path) - -# ************************************************ -# Define the configuration for the graph -# ************************************************ -mistral_key = os.getenv("MISTRAL_API_KEY") - -graph_config = { - "llm": { - "api_key": mistral_key, - "model": "mistralai/open-mistral-nemo", - }, -} - -# ************************************************ -# Create the CSVScraperMultiGraph instance and run it -# ************************************************ - -csv_scraper_graph = CSVScraperMultiGraph( - prompt="List me all the last names", - source=[str(text), str(text)], - config=graph_config -) - -result = csv_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = 
csv_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/mistral/csv_scraper_mistral.py b/examples/mistral/csv_scraper_mistral.py deleted file mode 100644 index 6daa216c..00000000 --- a/examples/mistral/csv_scraper_mistral.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -Basic example of scraping pipeline using CSVScraperGraph from CSV documents -""" -import os -from dotenv import load_dotenv -import pandas as pd -from scrapegraphai.graphs import CSVScraperGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the CSV file -# ************************************************ - -FILE_NAME = "inputs/username.csv" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -text = pd.read_csv(file_path) - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -mistral_key = os.getenv("MISTRAL_API_KEY") - -graph_config = { - "llm": { - "api_key": mistral_key, - "model": "mistralai/open-mistral-nemo", - }, -} - -# ************************************************ -# Create the CSVScraperGraph instance and run it -# ************************************************ - -csv_scraper_graph = CSVScraperGraph( - prompt="List me all the last names", - source=str(text), # Pass the content of the file, not the file object - config=graph_config -) - -result = csv_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = csv_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") 
-convert_to_json(result, "result") diff --git a/examples/mistral/custom_graph_mistral.py b/examples/mistral/custom_graph_mistral.py deleted file mode 100644 index bac1cd30..00000000 --- a/examples/mistral/custom_graph_mistral.py +++ /dev/null @@ -1,108 +0,0 @@ -""" -Example of custom graph using existing nodes -""" -import os -from dotenv import load_dotenv -from langchain_mistralai import ChatMistralAI, MistralAIEmbeddings -from scrapegraphai.graphs import BaseGraph -from scrapegraphai.nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode, RobotsNode - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -mistral_key = os.getenv("MISTRAL_API_KEY") -graph_config = { - "llm": { - "api_key": mistral_key, - "model": "mistralai/open-mistral-nemo", - }, -} - -# ************************************************ -# Define the graph nodes -# ************************************************ - -llm_model = ChatMistralAI(**graph_config["llm"]) -embedder = MistralAIEmbeddings(api_key=llm_model.mistral_api_key) - -# define the nodes for the graph -robot_node = RobotsNode( - input="url", - output=["is_scrapable"], - node_config={ - "llm_model": llm_model, - "force_scraping": True, - "verbose": True, - } -) - -fetch_node = FetchNode( - input="url | local_dir", - output=["doc"], - node_config={ - "verbose": True, - "headless": True, - } -) -parse_node = ParseNode( - input="doc", - output=["parsed_doc"], - node_config={ - "chunk_size": 4096, - "verbose": True, - } -) -rag_node = RAGNode( - input="user_prompt & (parsed_doc | doc)", - output=["relevant_chunks"], - node_config={ - "llm_model": llm_model, - "embedder_model": embedder, - "verbose": True, - } -) -generate_answer_node = GenerateAnswerNode( - input="user_prompt & (relevant_chunks | parsed_doc | doc)", - output=["answer"], - node_config={ - "llm_model": llm_model, - "verbose": True, - } -) - -# 
************************************************ -# Create the graph by defining the connections -# ************************************************ - -graph = BaseGraph( - nodes=[ - robot_node, - fetch_node, - parse_node, - rag_node, - generate_answer_node, - ], - edges=[ - (robot_node, fetch_node), - (fetch_node, parse_node), - (parse_node, rag_node), - (rag_node, generate_answer_node) - ], - entry_point=robot_node -) - -# ************************************************ -# Execute the graph -# ************************************************ - -result, execution_info = graph.execute({ - "user_prompt": "Describe the content", - "url": "https://example.com/" -}) - -# get the answer from the result -result = result.get("answer", "No answer found.") -print(result) diff --git a/examples/mistral/depth_search_graph_mistral.py b/examples/mistral/depth_search_graph_mistral.py deleted file mode 100644 index ae18ffba..00000000 --- a/examples/mistral/depth_search_graph_mistral.py +++ /dev/null @@ -1,30 +0,0 @@ -""" -depth_search_graph_opeani example -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import DepthSearchGraph - -load_dotenv() - -mistral_key = os.getenv("MISTRAL_API_KEY") - -graph_config = { - "llm": { - "api_key": mistral_key, - "model": "mistralai/open-mistral-nemo", - }, - "verbose": True, - "headless": False, - "depth": 2, - "only_inside_links": False, -} - -search_graph = DepthSearchGraph( - prompt="List me all the projects with their description", - source="https://perinim.github.io", - config=graph_config -) - -result = search_graph.run() -print(result) diff --git a/examples/mistral/document_scraper_mistral.py b/examples/mistral/document_scraper_mistral.py deleted file mode 100644 index aa75e9c4..00000000 --- a/examples/mistral/document_scraper_mistral.py +++ /dev/null @@ -1,43 +0,0 @@ -""" -document_scraper example -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import DocumentScraperGraph 
- -load_dotenv() - - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -mistral_key = os.getenv("MISTRAL_API_KEY") - -graph_config = { - "llm": { - "api_key": mistral_key, - "model": "mistralai/open-mistral-nemo", - }, -} - -source = """ - The Divine Comedy, Italian La Divina Commedia, original name La commedia, long narrative poem written in Italian - circa 1308/21 by Dante. It is usually held to be one of the world s great works of literature. - Divided into three major sections—Inferno, Purgatorio, and Paradiso—the narrative traces the journey of Dante - from darkness and error to the revelation of the divine light, culminating in the Beatific Vision of God. - Dante is guided by the Roman poet Virgil, who represents the epitome of human knowledge, from the dark wood - through the descending circles of the pit of Hell (Inferno). He then climbs the mountain of Purgatory, guided - by the Roman poet Statius, who represents the fulfilment of human knowledge, and is finally led by his lifelong love, - the Beatrice of his earlier poetry, through the celestial spheres of Paradise. -""" - -pdf_scraper_graph = DocumentScraperGraph( - prompt="Summarize the text and find the main topics", - source=source, - config=graph_config, -) -result = pdf_scraper_graph.run() - -print(json.dumps(result, indent=4)) diff --git a/examples/mistral/inputs/books.xml b/examples/mistral/inputs/books.xml deleted file mode 100644 index e3d1fe87..00000000 --- a/examples/mistral/inputs/books.xml +++ /dev/null @@ -1,120 +0,0 @@ - - - - Gambardella, Matthew - XML Developer's Guide - Computer - 44.95 - 2000-10-01 - An in-depth look at creating applications - with XML. - - - Ralls, Kim - Midnight Rain - Fantasy - 5.95 - 2000-12-16 - A former architect battles corporate zombies, - an evil sorceress, and her own childhood to become queen - of the world. 
- - - Corets, Eva - Maeve Ascendant - Fantasy - 5.95 - 2000-11-17 - After the collapse of a nanotechnology - society in England, the young survivors lay the - foundation for a new society. - - - Corets, Eva - Oberon's Legacy - Fantasy - 5.95 - 2001-03-10 - In post-apocalypse England, the mysterious - agent known only as Oberon helps to create a new life - for the inhabitants of London. Sequel to Maeve - Ascendant. - - - Corets, Eva - The Sundered Grail - Fantasy - 5.95 - 2001-09-10 - The two daughters of Maeve, half-sisters, - battle one another for control of England. Sequel to - Oberon's Legacy. - - - Randall, Cynthia - Lover Birds - Romance - 4.95 - 2000-09-02 - When Carla meets Paul at an ornithology - conference, tempers fly as feathers get ruffled. - - - Thurman, Paula - Splish Splash - Romance - 4.95 - 2000-11-02 - A deep sea diver finds true love twenty - thousand leagues beneath the sea. - - - Knorr, Stefan - Creepy Crawlies - Horror - 4.95 - 2000-12-06 - An anthology of horror stories about roaches, - centipedes, scorpions and other insects. - - - Kress, Peter - Paradox Lost - Science Fiction - 6.95 - 2000-11-02 - After an inadvertant trip through a Heisenberg - Uncertainty Device, James Salway discovers the problems - of being quantum. - - - O'Brien, Tim - Microsoft .NET: The Programming Bible - Computer - 36.95 - 2000-12-09 - Microsoft's .NET initiative is explored in - detail in this deep programmer's reference. - - - O'Brien, Tim - MSXML3: A Comprehensive Guide - Computer - 36.95 - 2000-12-01 - The Microsoft MSXML3 parser is covered in - detail, with attention to XML DOM interfaces, XSLT processing, - SAX and more. - - - Galos, Mike - Visual Studio 7: A Comprehensive Guide - Computer - 49.95 - 2001-04-16 - Microsoft Visual Studio 7 is explored in depth, - looking at how Visual Basic, Visual C++, C#, and ASP+ are - integrated into a comprehensive development - environment. 
- - \ No newline at end of file diff --git a/examples/mistral/inputs/example.json b/examples/mistral/inputs/example.json deleted file mode 100644 index 2263184c..00000000 --- a/examples/mistral/inputs/example.json +++ /dev/null @@ -1,182 +0,0 @@ -{ - "kind":"youtube#searchListResponse", - "etag":"q4ibjmYp1KA3RqMF4jFLl6PBwOg", - "nextPageToken":"CAUQAA", - "regionCode":"NL", - "pageInfo":{ - "totalResults":1000000, - "resultsPerPage":5 - }, - "items":[ - { - "kind":"youtube#searchResult", - "etag":"QCsHBifbaernVCbLv8Cu6rAeaDQ", - "id":{ - "kind":"youtube#video", - "videoId":"TvWDY4Mm5GM" - }, - "snippet":{ - "publishedAt":"2023-07-24T14:15:01Z", - "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", - "title":"3 Football Clubs Kylian Mbappe Should Avoid Signing ✍️❌⚽️ #football #mbappe #shorts", - "description":"", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"FC Motivate", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T14:15:01Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"0NG5QHdtIQM_V-DBJDEf-jK_Y9k", - "id":{ - "kind":"youtube#video", - "videoId":"aZM_42CcNZ4" - }, - "snippet":{ - "publishedAt":"2023-07-24T16:09:27Z", - "channelId":"UCM5gMM_HqfKHYIEJ3lstMUA", - "title":"Which Football Club Could Cristiano Ronaldo Afford To Buy? 
💰", - "description":"Sign up to Sorare and get a FREE card: https://sorare.pxf.io/NellisShorts Give Soraredata a go for FREE: ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"John Nellis", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T16:09:27Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"WbBz4oh9I5VaYj91LjeJvffrBVY", - "id":{ - "kind":"youtube#video", - "videoId":"wkP3XS3aNAY" - }, - "snippet":{ - "publishedAt":"2023-07-24T16:00:50Z", - "channelId":"UC4EP1dxFDPup_aFLt0ElsDw", - "title":"PAULO DYBALA vs THE WORLD'S LONGEST FREEKICK WALL", - "description":"Can Paulo Dybala curl a football around the World's longest free kick wall? We met up with the World Cup winner and put him to ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"Shoot for Love", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T16:00:50Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"juxv_FhT_l4qrR05S1QTrb4CGh8", - "id":{ - "kind":"youtube#video", - "videoId":"rJkDZ0WvfT8" - }, - "snippet":{ - "publishedAt":"2023-07-24T10:00:39Z", - "channelId":"UCO8qj5u80Ga7N_tP3BZWWhQ", - "title":"TOP 10 DEFENDERS 2023", - "description":"SoccerKingz https://soccerkingz.nl Use code: 'ILOVEHOF' to get 10% off. TOP 10 DEFENDERS 2023 Follow us! 
• Instagram ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"Home of Football", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T10:00:39Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"wtuknXTmI1txoULeH3aWaOuXOow", - "id":{ - "kind":"youtube#video", - "videoId":"XH0rtu4U6SE" - }, - "snippet":{ - "publishedAt":"2023-07-21T16:30:05Z", - "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", - "title":"3 Things You Didn't Know About Erling Haaland ⚽️🇳🇴 #football #haaland #shorts", - "description":"", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"FC Motivate", - "liveBroadcastContent":"none", - "publishTime":"2023-07-21T16:30:05Z" - } - } - ] -} \ No newline at end of file diff --git a/examples/mistral/inputs/plain_html_example.txt b/examples/mistral/inputs/plain_html_example.txt deleted file mode 100644 index 78f814ae..00000000 --- a/examples/mistral/inputs/plain_html_example.txt +++ /dev/null @@ -1,105 +0,0 @@ - -
- - -
-
-
- - -
- \ No newline at end of file diff --git a/examples/mistral/inputs/username.csv b/examples/mistral/inputs/username.csv deleted file mode 100644 index 006ac8e6..00000000 --- a/examples/mistral/inputs/username.csv +++ /dev/null @@ -1,7 +0,0 @@ -Username; Identifier;First name;Last name -booker12;9012;Rachel;Booker -grey07;2070;Laura;Grey -johnson81;4081;Craig;Johnson -jenkins46;9346;Mary;Jenkins -smith79;5079;Jamie;Smith - diff --git a/examples/mistral/json_scraper_mistral.py b/examples/mistral/json_scraper_mistral.py deleted file mode 100644 index 0b9be3ec..00000000 --- a/examples/mistral/json_scraper_mistral.py +++ /dev/null @@ -1,46 +0,0 @@ -""" -Basic example of scraping pipeline using JSONScraperGraph from JSON documents -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import JSONScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the JSON file -# ************************************************ - -FILE_NAME = "inputs/example.json" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -mistral_key = os.getenv("MISTRAL_API_KEY") - -graph_config = { - "llm": { - "api_key": mistral_key, - "model": "mistralai/open-mistral-nemo", - }, -} - -# ************************************************ -# Create the JSONScraperGraph instance and run it -# ************************************************ - -json_scraper_graph = JSONScraperGraph( - prompt="List me all the authors, title and genres of the books", - source=text, # Pass the content of the file, not the file object - config=graph_config -) - -result = json_scraper_graph.run() -print(result) diff --git 
a/examples/mistral/json_scraper_multi_mistral.py b/examples/mistral/json_scraper_multi_mistral.py deleted file mode 100644 index 1369eda7..00000000 --- a/examples/mistral/json_scraper_multi_mistral.py +++ /dev/null @@ -1,37 +0,0 @@ -""" -Module for showing how PDFScraper multi works -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import JSONScraperMultiGraph - -load_dotenv() - -mistral_key = os.getenv("MISTRAL_API_KEY") - -graph_config = { - "llm": { - "api_key": mistral_key, - "model": "mistralai/open-mistral-nemo", - } -} - -FILE_NAME = "inputs/example.json" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -sources = [text, text] - -multiple_search_graph = JSONScraperMultiGraph( - prompt= "List me all the authors, title and genres of the books", - source= sources, - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/mistral/md_scraper_mistral.py b/examples/mistral/md_scraper_mistral.py deleted file mode 100644 index 135f08ba..00000000 --- a/examples/mistral/md_scraper_mistral.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -Basic example of scraping pipeline using DocumentScraperGraph from MD documents -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import DocumentScraperGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the MD file -# ************************************************ - -FILE_NAME = "inputs/markdown_example.md" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the 
configuration for the graph -# ************************************************ - -mistral_key = os.getenv("MISTRAL_API_KEY") - -graph_config = { - "llm": { - "api_key": mistral_key, - "model": "mistralai/open-mistral-nemo", - }, -} - -# ************************************************ -# Create the DocumentScraperGraph instance and run it -# ************************************************ - -md_scraper_graph = DocumentScraperGraph( - prompt="List me all the authors, title and genres of the books", - source=text, # Pass the content of the file, not the file object - config=graph_config -) - -result = md_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = md_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/mistral/rate_limit_mistral.py b/examples/mistral/rate_limit_mistral.py deleted file mode 100644 index 4bc0f6fb..00000000 --- a/examples/mistral/rate_limit_mistral.py +++ /dev/null @@ -1,39 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper with a custom rate limit -""" -import os -import json -from scrapegraphai.graphs import SmartScraperGraph -from dotenv import load_dotenv - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - - -graph_config = { - "llm": { - "api_key": os.getenv("MISTRAL_API_KEY"), - "model": "mistralai/open-mistral-nemo", - "rate_limit": { - "requests_per_second": 1 - } - }, - "verbose": True, - "headless": False, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me what 
does the company do, the name and a contact email.", - source="https://scrapegraphai.com/", - config=graph_config -) - -result = smart_scraper_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/mistral/readme.md b/examples/mistral/readme.md deleted file mode 100644 index 6e13a97b..00000000 --- a/examples/mistral/readme.md +++ /dev/null @@ -1 +0,0 @@ -This folder contains examples of how to use ScrapeGraph-AI with Mistral, an LLM provider. The examples show how to extract information from a website using a natural language prompt. \ No newline at end of file diff --git a/examples/mistral/scrape_plain_text_mistral.py b/examples/mistral/scrape_plain_text_mistral.py deleted file mode 100644 index 131747c6..00000000 --- a/examples/mistral/scrape_plain_text_mistral.py +++ /dev/null @@ -1,54 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper from text -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the text file -# ************************************************ - -FILE_NAME = "inputs/plain_html_example.txt" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -# It could be also a http request using the request model -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -mistral_key = os.getenv("MISTRAL_API_KEY") - -graph_config = { - "llm": { - "api_key": mistral_key, - "model": "mistralai/open-mistral-nemo", - }, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me 
all the projects with their description.", - source=text, - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/mistral/script_generator_mistral.py b/examples/mistral/script_generator_mistral.py deleted file mode 100644 index 74a81b46..00000000 --- a/examples/mistral/script_generator_mistral.py +++ /dev/null @@ -1,45 +0,0 @@ -""" -Basic example of scraping pipeline using ScriptCreatorGraph -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import ScriptCreatorGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -mistral_key = os.getenv("MISTRAL_API_KEY") - -graph_config = { - "llm": { - "api_key": mistral_key, - "model": "mistralai/open-mistral-nemo", - }, - "library": "beautifulsoup" -} - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -script_creator_graph = ScriptCreatorGraph( - prompt="List me all the projects with their description.", - # also accepts a string with the already downloaded HTML code - source="https://perinim.github.io/projects", - config=graph_config -) - -result = script_creator_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = script_creator_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - diff --git a/examples/mistral/script_generator_schema_mistral.py b/examples/mistral/script_generator_schema_mistral.py 
deleted file mode 100644 index 3ad46685..00000000 --- a/examples/mistral/script_generator_schema_mistral.py +++ /dev/null @@ -1,61 +0,0 @@ -""" -Basic example of scraping pipeline using ScriptCreatorGraph -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import ScriptCreatorGraph -from scrapegraphai.utils import prettify_exec_info -from pydantic import BaseModel, Field -from typing import List - -load_dotenv() - -# ************************************************ -# Define the schema for the graph -# ************************************************ - -class Project(BaseModel): - title: str = Field(description="The title of the project") - description: str = Field(description="The description of the project") - -class Projects(BaseModel): - projects: List[Project] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -mistral_key = os.getenv("MISTRAL_API_KEY") - -graph_config = { - "llm": { - "api_key": mistral_key, - "model": "mistralai/open-mistral-nemo", - }, - "library": "beautifulsoup", - "verbose": True, -} - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -script_creator_graph = ScriptCreatorGraph( - prompt="List me all the projects with their description.", - # also accepts a string with the already downloaded HTML code - source="https://perinim.github.io/projects", - config=graph_config, - schema=Projects -) - -result = script_creator_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = script_creator_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - diff --git a/examples/mistral/script_multi_generator_mistral.py b/examples/mistral/script_multi_generator_mistral.py deleted file 
mode 100644 index d5869c53..00000000 --- a/examples/mistral/script_multi_generator_mistral.py +++ /dev/null @@ -1,54 +0,0 @@ -""" -Basic example of scraping pipeline using ScriptCreatorGraph -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import ScriptCreatorMultiGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -mistral_key = os.getenv("MISTRAL_API_KEY") - -graph_config = { - "llm": { - "api_key": mistral_key, - "model": "mistralai/open-mistral-nemo", - }, - "library": "beautifulsoup", - "verbose": True, -} - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -urls=[ - "https://schultzbergagency.com/emil-raste-karlsen/", - "https://schultzbergagency.com/johanna-hedberg/", -] - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -script_creator_graph = ScriptCreatorMultiGraph( - prompt="Find information about actors", - # also accepts a string with the already downloaded HTML code - source=urls, - config=graph_config -) - -result = script_creator_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = script_creator_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/mistral/search_graph_mistral.py b/examples/mistral/search_graph_mistral.py deleted file mode 100644 index 983733e0..00000000 --- a/examples/mistral/search_graph_mistral.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -Example of Search Graph -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SearchGraph - 
-load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -mistral_key = os.getenv("MISTRAL_API_KEY") - -graph_config = { - "llm": { - "api_key": mistral_key, - "model": "mistralai/open-mistral-nemo", - }, - "max_results": 2, - "verbose": True, -} - -# ************************************************ -# Create the SearchGraph instance and run it -# ************************************************ - -search_graph = SearchGraph( - prompt="List me Chioggia's famous dishes", - config=graph_config -) - -result = search_graph.run() -print(result) diff --git a/examples/mistral/search_graph_schema_mistral.py b/examples/mistral/search_graph_schema_mistral.py deleted file mode 100644 index 06a88ff7..00000000 --- a/examples/mistral/search_graph_schema_mistral.py +++ /dev/null @@ -1,61 +0,0 @@ -""" -Example of Search Graph -""" -import os -from typing import List -from dotenv import load_dotenv -from pydantic import BaseModel, Field -from scrapegraphai.graphs import SearchGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Dish(BaseModel): - name: str = Field(description="The name of the dish") - description: str = Field(description="The description of the dish") - -class Dishes(BaseModel): - dishes: List[Dish] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -mistral_key = os.getenv("MISTRAL_API_KEY") - -graph_config = { - "llm": { - "api_key": mistral_key, - "model": "mistralai/open-mistral-nemo", - }, - "max_results": 2, - "verbose": True, -} - -# ************************************************ -# Create the SearchGraph instance and run it -# 
************************************************ - -search_graph = SearchGraph( - prompt="List me Chioggia's famous dishes", - config=graph_config, - schema=Dishes -) - -result = search_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = search_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json and csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/mistral/search_link_graph_mistral.py b/examples/mistral/search_link_graph_mistral.py deleted file mode 100644 index 45d0c5f0..00000000 --- a/examples/mistral/search_link_graph_mistral.py +++ /dev/null @@ -1,42 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SearchLinkGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -mistral_key = os.getenv("MISTRAL_API_KEY") - -graph_config = { - "llm": { - "api_key": mistral_key, - "model": "mistralai/open-mistral-nemo", - }, - "verbose": True, - "headless": False, -} - -# ************************************************ -# Create the SearchLinkGraph instance and run it -# ************************************************ - -smart_scraper_graph = SearchLinkGraph( - source="https://sport.sky.it/nba?gr=www", - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/mistral/smart_scraper_lite_mistral.py 
b/examples/mistral/smart_scraper_lite_mistral.py deleted file mode 100644 index 390371f9..00000000 --- a/examples/mistral/smart_scraper_lite_mistral.py +++ /dev/null @@ -1,31 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperLiteGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -graph_config = { - "llm": { - "api_key": os.getenv("MISTRAL_API_KEY"), - "model": "mistral/mistral-medium", - }, - "verbose": True, - "headless": False, -} - -smart_scraper_lite_graph = SmartScraperLiteGraph( - prompt="Who is Marco Perini?", - source="https://perinim.github.io/", - config=graph_config -) - -result = smart_scraper_lite_graph.run() -print(json.dumps(result, indent=4)) - -graph_exec_info = smart_scraper_lite_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/mistral/smart_scraper_mistral.py b/examples/mistral/smart_scraper_mistral.py deleted file mode 100644 index a2f82504..00000000 --- a/examples/mistral/smart_scraper_mistral.py +++ /dev/null @@ -1,36 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - - -graph_config = { - "llm": { - "api_key": os.getenv("MISTRAL_API_KEY"), - "model": "mistralai/open-mistral-nemo", - }, - "verbose": True, - "headless": False, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me what does the company do, the name and a contact email.", - source="https://scrapegraphai.com/", - config=graph_config 
-) - -result = smart_scraper_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/mistral/smart_scraper_multi_lite_mistral.py b/examples/mistral/smart_scraper_multi_lite_mistral.py deleted file mode 100644 index ce2d19bf..00000000 --- a/examples/mistral/smart_scraper_multi_lite_mistral.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiLiteGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -graph_config = { - "llm": { - "api_key": os.getenv("MISTRAL_API_KEY"), - "model": "mistral/mistral-medium", - }, - "verbose": True, - "headless": False, -} - -smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - config=graph_config -) - -result = smart_scraper_multi_lite_graph.run() -print(json.dumps(result, indent=4)) - -graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - diff --git a/examples/mistral/smart_scraper_multi_mistral.py b/examples/mistral/smart_scraper_multi_mistral.py deleted file mode 100644 index 7929f9cc..00000000 --- a/examples/mistral/smart_scraper_multi_mistral.py +++ /dev/null @@ -1,41 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiGraph - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -mistral_key = os.getenv("MISTRAL_API_KEY") - -graph_config = { - "llm": { - "api_key": mistral_key, - "model": "mistralai/open-mistral-nemo", - }, - "verbose": True, - "headless": False, -} - -# 
******************************************************* -# Create the SmartScraperMultiGraph instance and run it -# ******************************************************* - -multiple_search_graph = SmartScraperMultiGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/mistral/smart_scraper_schema_mistral.py b/examples/mistral/smart_scraper_schema_mistral.py deleted file mode 100644 index 3b129a89..00000000 --- a/examples/mistral/smart_scraper_schema_mistral.py +++ /dev/null @@ -1,50 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper with schema -""" -import os -from typing import List -from dotenv import load_dotenv -from pydantic import BaseModel, Field -from scrapegraphai.graphs import SmartScraperGraph - -load_dotenv() - -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Project(BaseModel): - title: str = Field(description="The title of the project") - description: str = Field(description="The description of the project") - -class Projects(BaseModel): - projects: List[Project] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -mistral_key = os.getenv("MISTRAL_API_KEY") - -graph_config = { - "llm": { - "api_key":mistral_key, - "model": "mistralai/open-mistral-nemo", - }, - "verbose": True, - "headless": False, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the projects with their description", - source="https://perinim.github.io/projects/", - schema=Projects, - 
config=graph_config -) - -result = smart_scraper_graph.run() -print(result) diff --git a/examples/mistral/xml_scraper_graph_multi_mistral.py b/examples/mistral/xml_scraper_graph_multi_mistral.py deleted file mode 100644 index 6db20ebf..00000000 --- a/examples/mistral/xml_scraper_graph_multi_mistral.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -Basic example of scraping pipeline using XMLScraperMultiGraph from XML documents -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import XMLScraperMultiGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info -load_dotenv() - -# ************************************************ -# Read the XML file -# ************************************************ - -FILE_NAME = "inputs/books.xml" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -mistral_key = os.getenv("MISTRAL_API_KEY") - -graph_config = { - "llm": { - "api_key":mistral_key, - "model": "mistralai/open-mistral-nemo", - }, - "verbose": True, - "headless": False, -} -# ************************************************ -# Create the XMLScraperMultiGraph instance and run it -# ************************************************ - -xml_scraper_graph = XMLScraperMultiGraph( - prompt="List me all the authors, title and genres of the books", - source=[text, text], # Pass the content of the file, not the file object - config=graph_config -) - -result = xml_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = xml_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv 
-convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/mistral/xml_scraper_mistral.py b/examples/mistral/xml_scraper_mistral.py deleted file mode 100644 index 6d551c22..00000000 --- a/examples/mistral/xml_scraper_mistral.py +++ /dev/null @@ -1,58 +0,0 @@ -""" -Basic example of scraping pipeline using XMLScraperGraph from XML documents -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import XMLScraperGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the XML file -# ************************************************ - -FILE_NAME = "inputs/books.xml" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -mistral_key = os.getenv("MISTRAL_API_KEY") - -graph_config = { - "llm": { - "api_key": mistral_key, - "model": "mistralai/open-mistral-nemo", - }, - "verbose":False, -} - -# ************************************************ -# Create the XMLScraperGraph instance and run it -# ************************************************ - -xml_scraper_graph = XMLScraperGraph( - prompt="List me all the authors, title and genres of the books", - source=text, # Pass the content of the file, not the file object - config=graph_config -) - -result = xml_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = xml_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git 
a/examples/model_instance/.env.example b/examples/model_instance/.env.example deleted file mode 100644 index c5a7ed85..00000000 --- a/examples/model_instance/.env.example +++ /dev/null @@ -1 +0,0 @@ -MOONLIGHT_API_KEY="YOUR MOONLIGHT API KEY" \ No newline at end of file diff --git a/examples/model_instance/smart_scraper_with_model_instace.py b/examples/model_instance/smart_scraper_with_model_instace.py deleted file mode 100644 index b362414f..00000000 --- a/examples/model_instance/smart_scraper_with_model_instace.py +++ /dev/null @@ -1,53 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper and model_instace -""" - -import os, json -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info -from langchain_community.chat_models.moonshot import MoonshotChat -from dotenv import load_dotenv -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - - -llm_instance_config = { - "model": "moonshot-v1-8k", - "base_url": "https://api.moonshot.cn/v1", - "moonshot_api_key": os.getenv("MOONLIGHT_API_KEY"), -} - - -llm_model_instance = MoonshotChat(**llm_instance_config) - -graph_config = { - "llm": { - "model_instance": llm_model_instance, - "model_tokens": 10000 - }, - "verbose": True, - "headless": True, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me what does the company do, the name and a contact email.", - source="https://scrapegraphai.com/", - config=graph_config -) - -result = smart_scraper_graph.run() -print(json.dumps(result, indent=4)) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() 
-print(prettify_exec_info(graph_exec_info)) diff --git a/examples/moonshot/.env.example b/examples/moonshot/.env.example deleted file mode 100644 index c5a7ed85..00000000 --- a/examples/moonshot/.env.example +++ /dev/null @@ -1 +0,0 @@ -MOONLIGHT_API_KEY="YOUR MOONLIGHT API KEY" \ No newline at end of file diff --git a/examples/moonshot/code_generator_graph_moonshot.py b/examples/moonshot/code_generator_graph_moonshot.py deleted file mode 100644 index 58e6182b..00000000 --- a/examples/moonshot/code_generator_graph_moonshot.py +++ /dev/null @@ -1,67 +0,0 @@ -""" -Basic example of scraping pipeline using Code Generator with schema -""" - -import os, json -from typing import List -from dotenv import load_dotenv -from pydantic import BaseModel, Field -from langchain_community.chat_models.moonshot import MoonshotChat -from scrapegraphai.graphs import CodeGeneratorGraph - -load_dotenv() - -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Project(BaseModel): - title: str = Field(description="The title of the project") - description: str = Field(description="The description of the project") - -class Projects(BaseModel): - projects: List[Project] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -llm_instance_config = { - "model": "moonshot-v1-8k", - "base_url": "https://api.moonshot.cn/v1", - "moonshot_api_key": os.getenv("MOONLIGHT_API_KEY"), -} - -llm_model_instance = MoonshotChat(**llm_instance_config) - -graph_config = { - "llm": { - "model_instance": llm_model_instance, - "model_tokens": 10000 - }, - "verbose": True, - "headless": False, - "reduction": 2, - "max_iterations": { - "overall": 10, - "syntax": 3, - "execution": 3, - "validation": 3, - "semantic": 3 - }, - "output_file_name": "extracted_data.py" -} - -# ************************************************ -# 
Create the SmartScraperGraph instance and run it -# ************************************************ - -code_generator_graph = CodeGeneratorGraph( - prompt="List me all the projects with their description", - source="https://perinim.github.io/projects/", - schema=Projects, - config=graph_config -) - -result = code_generator_graph.run() -print(result) \ No newline at end of file diff --git a/examples/moonshot/document_scraper_moonshot.py b/examples/moonshot/document_scraper_moonshot.py deleted file mode 100644 index aa75e9c4..00000000 --- a/examples/moonshot/document_scraper_moonshot.py +++ /dev/null @@ -1,43 +0,0 @@ -""" -document_scraper example -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import DocumentScraperGraph - -load_dotenv() - - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -mistral_key = os.getenv("MISTRAL_API_KEY") - -graph_config = { - "llm": { - "api_key": mistral_key, - "model": "mistralai/open-mistral-nemo", - }, -} - -source = """ - The Divine Comedy, Italian La Divina Commedia, original name La commedia, long narrative poem written in Italian - circa 1308/21 by Dante. It is usually held to be one of the world s great works of literature. - Divided into three major sections—Inferno, Purgatorio, and Paradiso—the narrative traces the journey of Dante - from darkness and error to the revelation of the divine light, culminating in the Beatific Vision of God. - Dante is guided by the Roman poet Virgil, who represents the epitome of human knowledge, from the dark wood - through the descending circles of the pit of Hell (Inferno). He then climbs the mountain of Purgatory, guided - by the Roman poet Statius, who represents the fulfilment of human knowledge, and is finally led by his lifelong love, - the Beatrice of his earlier poetry, through the celestial spheres of Paradise. 
-""" - -pdf_scraper_graph = DocumentScraperGraph( - prompt="Summarize the text and find the main topics", - source=source, - config=graph_config, -) -result = pdf_scraper_graph.run() - -print(json.dumps(result, indent=4)) diff --git a/examples/moonshot/readme.md b/examples/moonshot/readme.md deleted file mode 100644 index 6b9b2f21..00000000 --- a/examples/moonshot/readme.md +++ /dev/null @@ -1 +0,0 @@ -This folder offer an example of how to use ScrapeGraph-AI with Moonshot and SmartScraperGraph. More usage examples can refer to openai exapmles. \ No newline at end of file diff --git a/examples/moonshot/smart_scraper_lite_moonshot.py b/examples/moonshot/smart_scraper_lite_moonshot.py deleted file mode 100644 index 509027fb..00000000 --- a/examples/moonshot/smart_scraper_lite_moonshot.py +++ /dev/null @@ -1,31 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperLiteGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -graph_config = { - "llm": { - "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "anthropic/claude-3-haiku-20240307", - }, - "verbose": True, - "headless": False, -} - -smart_scraper_lite_graph = SmartScraperLiteGraph( - prompt="Who is Marco Perini?", - source="https://perinim.github.io/", - config=graph_config -) - -result = smart_scraper_lite_graph.run() -print(json.dumps(result, indent=4)) - -graph_exec_info = smart_scraper_lite_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/moonshot/smart_scraper_multi_concat_moonshot.py b/examples/moonshot/smart_scraper_multi_concat_moonshot.py deleted file mode 100644 index 1e652db4..00000000 --- a/examples/moonshot/smart_scraper_multi_concat_moonshot.py +++ /dev/null @@ -1,52 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from langchain_community.chat_models.moonshot 
import MoonshotChat -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiConcatGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - - -llm_instance_config = { - "model": "moonshot-v1-8k", - "base_url": "https://api.moonshot.cn/v1", - "moonshot_api_key": os.getenv("MOONLIGHT_API_KEY"), -} - - -llm_model_instance = MoonshotChat(**llm_instance_config) - -graph_config = { - "llm": { - "model_instance": llm_model_instance, - "model_tokens": 10000 - }, - "verbose": True, - "headless": True, -} - - -# ******************************************************* -# Create the SmartScraperMultiGraph instance and run it -# ******************************************************* - -multiple_search_graph = SmartScraperMultiConcatGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/moonshot/smart_scraper_multi_lite_moonshot.py b/examples/moonshot/smart_scraper_multi_lite_moonshot.py deleted file mode 100644 index b3e2b7be..00000000 --- a/examples/moonshot/smart_scraper_multi_lite_moonshot.py +++ /dev/null @@ -1,34 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiLiteGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -graph_config = { - "llm": { - "api_key": os.getenv("MOONSHOT_API_KEY"), - "model": "moonshot/moonshot-v1-8b", - }, - "verbose": True, - "headless": False, -} - -smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - 
"https://perinim.github.io/cv/" - ], - config=graph_config -) - -result = smart_scraper_multi_lite_graph.run() -print(json.dumps(result, indent=4)) - -graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/moonshot/smart_scraper_with_moonshot.py b/examples/moonshot/smart_scraper_with_moonshot.py deleted file mode 100644 index 28635ba3..00000000 --- a/examples/moonshot/smart_scraper_with_moonshot.py +++ /dev/null @@ -1,54 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper and model_instace -""" -import os -import json -from langchain_community.chat_models.moonshot import MoonshotChat -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - - -llm_instance_config = { - "model": "moonshot-v1-8k", - "base_url": "https://api.moonshot.cn/v1", - "moonshot_api_key": os.getenv("MOONLIGHT_API_KEY"), -} - - -llm_model_instance = MoonshotChat(**llm_instance_config) - -graph_config = { - "llm": { - "model_instance": llm_model_instance, - "model_tokens": 10000 - }, - "verbose": True, - "headless": True, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me what does the company do, the name and a contact email.", - source="https://scrapegraphai.com/", - config=graph_config -) - -result = smart_scraper_graph.run() -print(json.dumps(result, indent=4)) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() 
-print(prettify_exec_info(graph_exec_info)) diff --git a/examples/nemotron/code_generator_graph_nemotron.py b/examples/nemotron/code_generator_graph_nemotron.py deleted file mode 100644 index 5ccd9d9f..00000000 --- a/examples/nemotron/code_generator_graph_nemotron.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -Basic example of scraping pipeline using Code Generator with schema -""" -import os -from typing import List -from dotenv import load_dotenv -from pydantic import BaseModel, Field -from scrapegraphai.graphs import CodeGeneratorGraph - -load_dotenv() - -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Project(BaseModel): - title: str = Field(description="The title of the project") - description: str = Field(description="The description of the project") - -class Projects(BaseModel): - projects: List[Project] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "api_key": os.getenv("NEMOTRON_APIKEY"), - "model": "nvidia/meta/llama3-70b-instruct", - }, - "verbose": True, - "headless": False, - "reduction": 2, - "max_iterations": { - "overall": 10, - "syntax": 3, - "execution": 3, - "validation": 3, - "semantic": 3 - }, - "output_file_name": "extracted_data.py" -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -code_generator_graph = CodeGeneratorGraph( - prompt="List me all the projects with their description", - source="https://perinim.github.io/projects/", - schema=Projects, - config=graph_config -) - -result = code_generator_graph.run() -print(result) diff --git a/examples/nemotron/csv_scraper_graph_multi_nemotron.py b/examples/nemotron/csv_scraper_graph_multi_nemotron.py deleted file mode 100644 index d5de6039..00000000 --- 
a/examples/nemotron/csv_scraper_graph_multi_nemotron.py +++ /dev/null @@ -1,54 +0,0 @@ -""" -Basic example of scraping pipeline using CSVScraperMultiGraph from CSV documents -""" -import os -import pandas as pd -from dotenv import load_dotenv -from scrapegraphai.graphs import CSVScraperMultiGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() -# ************************************************ -# Read the CSV file -# ************************************************ - -FILE_NAME = "inputs/username.csv" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -text = pd.read_csv(file_path) - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "api_key": os.getenv("NEMOTRON_APIKEY"), - "model": "nvidia/meta/llama3-70b-instruct", - } -} - -# ************************************************ -# Create the CSVScraperMultiGraph instance and run it -# ************************************************ - -csv_scraper_graph = CSVScraperMultiGraph( - prompt="List me all the last names", - source=[str(text), str(text)], - config=graph_config -) - -result = csv_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = csv_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/nemotron/csv_scraper_nemotron.py b/examples/nemotron/csv_scraper_nemotron.py deleted file mode 100644 index 2d527450..00000000 --- a/examples/nemotron/csv_scraper_nemotron.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -Basic example of scraping pipeline using CSVScraperGraph from CSV documents -""" -import os -from dotenv 
import load_dotenv -import pandas as pd -from scrapegraphai.graphs import CSVScraperGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the CSV file -# ************************************************ - -FILE_NAME = "inputs/username.csv" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -text = pd.read_csv(file_path) - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -nemotron_key = os.getenv("NEMOTRON_APIKEY") - -graph_config = { - "llm": { - "api_key": nemotron_key, - "model": "nvidia/meta/llama3-70b-instruct", - }, -} - -# ************************************************ -# Create the CSVScraperGraph instance and run it -# ************************************************ - -csv_scraper_graph = CSVScraperGraph( - prompt="List me all the last names", - source=str(text), # Pass the content of the file, not the file object - config=graph_config -) - -result = csv_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = csv_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/nemotron/depth_search_graph_nemotron.py b/examples/nemotron/depth_search_graph_nemotron.py deleted file mode 100644 index edd80463..00000000 --- a/examples/nemotron/depth_search_graph_nemotron.py +++ /dev/null @@ -1,30 +0,0 @@ -""" -depth_search_graph_opeani example -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import DepthSearchGraph - -load_dotenv() - -openai_key = os.getenv("OPENAI_APIKEY") - -graph_config = { - "llm": { - 
"api_key": os.getenv("NEMOTRON_KEY"), - "model": "claude-3-haiku-20240307", - }, - "verbose": True, - "headless": False, - "depth": 2, - "only_inside_links": False, -} - -search_graph = DepthSearchGraph( - prompt="List me all the projects with their description", - source="https://perinim.github.io", - config=graph_config -) - -result = search_graph.run() -print(result) diff --git a/examples/nemotron/document_scraper_nemotron.py b/examples/nemotron/document_scraper_nemotron.py deleted file mode 100644 index 618047ee..00000000 --- a/examples/nemotron/document_scraper_nemotron.py +++ /dev/null @@ -1,44 +0,0 @@ -""" -document_scraper example -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import DocumentScraperGraph - -load_dotenv() - - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - - -nemotron_key = os.getenv("NEMOTRON_APIKEY") - -graph_config = { - "llm": { - "api_key": nemotron_key, - "model": "nvidia/meta/llama3-70b-instruct", - }, -} - -source = """ - The Divine Comedy, Italian La Divina Commedia, original name La commedia, long narrative poem written in Italian - circa 1308/21 by Dante. It is usually held to be one of the world s great works of literature. - Divided into three major sections—Inferno, Purgatorio, and Paradiso—the narrative traces the journey of Dante - from darkness and error to the revelation of the divine light, culminating in the Beatific Vision of God. - Dante is guided by the Roman poet Virgil, who represents the epitome of human knowledge, from the dark wood - through the descending circles of the pit of Hell (Inferno). He then climbs the mountain of Purgatory, guided - by the Roman poet Statius, who represents the fulfilment of human knowledge, and is finally led by his lifelong love, - the Beatrice of his earlier poetry, through the celestial spheres of Paradise. 
-""" - -pdf_scraper_graph = DocumentScraperGraph( - prompt="Summarize the text and find the main topics", - source=source, - config=graph_config, -) -result = pdf_scraper_graph.run() - -print(json.dumps(result, indent=4)) \ No newline at end of file diff --git a/examples/nemotron/inputs/books.xml b/examples/nemotron/inputs/books.xml deleted file mode 100644 index e3d1fe87..00000000 --- a/examples/nemotron/inputs/books.xml +++ /dev/null @@ -1,120 +0,0 @@ - - - - Gambardella, Matthew - XML Developer's Guide - Computer - 44.95 - 2000-10-01 - An in-depth look at creating applications - with XML. - - - Ralls, Kim - Midnight Rain - Fantasy - 5.95 - 2000-12-16 - A former architect battles corporate zombies, - an evil sorceress, and her own childhood to become queen - of the world. - - - Corets, Eva - Maeve Ascendant - Fantasy - 5.95 - 2000-11-17 - After the collapse of a nanotechnology - society in England, the young survivors lay the - foundation for a new society. - - - Corets, Eva - Oberon's Legacy - Fantasy - 5.95 - 2001-03-10 - In post-apocalypse England, the mysterious - agent known only as Oberon helps to create a new life - for the inhabitants of London. Sequel to Maeve - Ascendant. - - - Corets, Eva - The Sundered Grail - Fantasy - 5.95 - 2001-09-10 - The two daughters of Maeve, half-sisters, - battle one another for control of England. Sequel to - Oberon's Legacy. - - - Randall, Cynthia - Lover Birds - Romance - 4.95 - 2000-09-02 - When Carla meets Paul at an ornithology - conference, tempers fly as feathers get ruffled. - - - Thurman, Paula - Splish Splash - Romance - 4.95 - 2000-11-02 - A deep sea diver finds true love twenty - thousand leagues beneath the sea. - - - Knorr, Stefan - Creepy Crawlies - Horror - 4.95 - 2000-12-06 - An anthology of horror stories about roaches, - centipedes, scorpions and other insects. 
- - - Kress, Peter - Paradox Lost - Science Fiction - 6.95 - 2000-11-02 - After an inadvertant trip through a Heisenberg - Uncertainty Device, James Salway discovers the problems - of being quantum. - - - O'Brien, Tim - Microsoft .NET: The Programming Bible - Computer - 36.95 - 2000-12-09 - Microsoft's .NET initiative is explored in - detail in this deep programmer's reference. - - - O'Brien, Tim - MSXML3: A Comprehensive Guide - Computer - 36.95 - 2000-12-01 - The Microsoft MSXML3 parser is covered in - detail, with attention to XML DOM interfaces, XSLT processing, - SAX and more. - - - Galos, Mike - Visual Studio 7: A Comprehensive Guide - Computer - 49.95 - 2001-04-16 - Microsoft Visual Studio 7 is explored in depth, - looking at how Visual Basic, Visual C++, C#, and ASP+ are - integrated into a comprehensive development - environment. - - \ No newline at end of file diff --git a/examples/nemotron/inputs/example.json b/examples/nemotron/inputs/example.json deleted file mode 100644 index 2263184c..00000000 --- a/examples/nemotron/inputs/example.json +++ /dev/null @@ -1,182 +0,0 @@ -{ - "kind":"youtube#searchListResponse", - "etag":"q4ibjmYp1KA3RqMF4jFLl6PBwOg", - "nextPageToken":"CAUQAA", - "regionCode":"NL", - "pageInfo":{ - "totalResults":1000000, - "resultsPerPage":5 - }, - "items":[ - { - "kind":"youtube#searchResult", - "etag":"QCsHBifbaernVCbLv8Cu6rAeaDQ", - "id":{ - "kind":"youtube#video", - "videoId":"TvWDY4Mm5GM" - }, - "snippet":{ - "publishedAt":"2023-07-24T14:15:01Z", - "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", - "title":"3 Football Clubs Kylian Mbappe Should Avoid Signing ✍️❌⚽️ #football #mbappe #shorts", - "description":"", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/hqdefault.jpg", - "width":480, - "height":360 - 
} - }, - "channelTitle":"FC Motivate", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T14:15:01Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"0NG5QHdtIQM_V-DBJDEf-jK_Y9k", - "id":{ - "kind":"youtube#video", - "videoId":"aZM_42CcNZ4" - }, - "snippet":{ - "publishedAt":"2023-07-24T16:09:27Z", - "channelId":"UCM5gMM_HqfKHYIEJ3lstMUA", - "title":"Which Football Club Could Cristiano Ronaldo Afford To Buy? 💰", - "description":"Sign up to Sorare and get a FREE card: https://sorare.pxf.io/NellisShorts Give Soraredata a go for FREE: ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"John Nellis", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T16:09:27Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"WbBz4oh9I5VaYj91LjeJvffrBVY", - "id":{ - "kind":"youtube#video", - "videoId":"wkP3XS3aNAY" - }, - "snippet":{ - "publishedAt":"2023-07-24T16:00:50Z", - "channelId":"UC4EP1dxFDPup_aFLt0ElsDw", - "title":"PAULO DYBALA vs THE WORLD'S LONGEST FREEKICK WALL", - "description":"Can Paulo Dybala curl a football around the World's longest free kick wall? 
We met up with the World Cup winner and put him to ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"Shoot for Love", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T16:00:50Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"juxv_FhT_l4qrR05S1QTrb4CGh8", - "id":{ - "kind":"youtube#video", - "videoId":"rJkDZ0WvfT8" - }, - "snippet":{ - "publishedAt":"2023-07-24T10:00:39Z", - "channelId":"UCO8qj5u80Ga7N_tP3BZWWhQ", - "title":"TOP 10 DEFENDERS 2023", - "description":"SoccerKingz https://soccerkingz.nl Use code: 'ILOVEHOF' to get 10% off. TOP 10 DEFENDERS 2023 Follow us! • Instagram ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"Home of Football", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T10:00:39Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"wtuknXTmI1txoULeH3aWaOuXOow", - "id":{ - "kind":"youtube#video", - "videoId":"XH0rtu4U6SE" - }, - "snippet":{ - "publishedAt":"2023-07-21T16:30:05Z", - "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", - "title":"3 Things You Didn't Know About Erling Haaland ⚽️🇳🇴 #football #haaland #shorts", - "description":"", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - 
"url":"https://i.ytimg.com/vi/XH0rtu4U6SE/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"FC Motivate", - "liveBroadcastContent":"none", - "publishTime":"2023-07-21T16:30:05Z" - } - } - ] -} \ No newline at end of file diff --git a/examples/nemotron/inputs/markdown_example.md b/examples/nemotron/inputs/markdown_example.md deleted file mode 100644 index 85088f29..00000000 --- a/examples/nemotron/inputs/markdown_example.md +++ /dev/null @@ -1,35 +0,0 @@ -Marco Perini Toggle navigation - - * About - * Projects(current) - -Projects - -Competitions - - * CV - * ____ - -# Projects - - ![project thumbnail Rotary Pendulum RL -Open Source project aimed at controlling a real life rotary pendulum using RL -algorithms ](/projects/rotary-pendulum-rl/) - - ![project thumbnail DQN -Implementation from scratch Developed a Deep Q-Network algorithm to train a -simple and double pendulum ](https://github.com/PeriniM/DQN-SwingUp) - - ![project thumbnail Multi Agents HAED -University project which focuses on simulating a multi-agent system to perform -environment mapping. Agents, equipped with sensors, explore and record their -surroundings, considering uncertainties in their readings. -](https://github.com/PeriniM/Multi-Agents-HAED) - - ![project thumbnail Wireless ESC for Modular -Drones Modular drone architecture proposal and proof of concept. The project -received maximum grade. ](/projects/wireless-esc-drone/) - -© Copyright 2023 Marco Perini. Powered by Jekyll with -al-folio theme. Hosted by [GitHub -Pages](https://pages.github.com/). \ No newline at end of file diff --git a/examples/nemotron/inputs/plain_html_example.txt b/examples/nemotron/inputs/plain_html_example.txt deleted file mode 100644 index 78f814ae..00000000 --- a/examples/nemotron/inputs/plain_html_example.txt +++ /dev/null @@ -1,105 +0,0 @@ - -
- - -
-
-
- - -
- \ No newline at end of file diff --git a/examples/nemotron/inputs/username.csv b/examples/nemotron/inputs/username.csv deleted file mode 100644 index 006ac8e6..00000000 --- a/examples/nemotron/inputs/username.csv +++ /dev/null @@ -1,7 +0,0 @@ -Username; Identifier;First name;Last name -booker12;9012;Rachel;Booker -grey07;2070;Laura;Grey -johnson81;4081;Craig;Johnson -jenkins46;9346;Mary;Jenkins -smith79;5079;Jamie;Smith - diff --git a/examples/nemotron/json_scraper_multi_nemotron.py b/examples/nemotron/json_scraper_multi_nemotron.py deleted file mode 100644 index 846c7f48..00000000 --- a/examples/nemotron/json_scraper_multi_nemotron.py +++ /dev/null @@ -1,37 +0,0 @@ -""" -Module for showing how PDFScraper multi works -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import JSONScraperMultiGraph - -load_dotenv() - -nemotron_key = os.getenv("NEMOTRON_APIKEY") - -graph_config = { - "llm": { - "api_key": nemotron_key, - "model": "nvidia/meta/llama3-70b-instruct", - }, -} - -FILE_NAME = "inputs/example.json" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -sources = [text, text] - -multiple_search_graph = JSONScraperMultiGraph( - prompt= "List me all the authors, title and genres of the books", - source= sources, - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/nemotron/json_scraper_nemotron.py b/examples/nemotron/json_scraper_nemotron.py deleted file mode 100644 index a5479ca7..00000000 --- a/examples/nemotron/json_scraper_nemotron.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -Basic example of scraping pipeline using JSONScraperGraph from JSON documents -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import JSONScraperGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, 
prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the JSON file -# ************************************************ - -FILE_NAME = "inputs/example.json" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -nemotron_key = os.getenv("NEMOTRON_APIKEY") - -graph_config = { - "llm": { - "api_key": nemotron_key, - "model": "nvidia/meta/llama3-70b-instruct", - }, -} - -# ************************************************ -# Create the JSONScraperGraph instance and run it -# ************************************************ - -json_scraper_graph = JSONScraperGraph( - prompt="List me all the authors, title and genres of the books", - source=text, # Pass the content of the file, not the file object - config=graph_config -) - -result = json_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = json_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/nemotron/md_scraper_nemotron.py b/examples/nemotron/md_scraper_nemotron.py deleted file mode 100644 index 8e925c03..00000000 --- a/examples/nemotron/md_scraper_nemotron.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -Basic example of scraping pipeline using DocumentScraperGraph from XML documents -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import DocumentScraperGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# 
************************************************ -# Read the XML file -# ************************************************ - -FILE_NAME = "inputs/markdown_example.md" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -nemotron_key = os.getenv("NEMOTRON_APIKEY") - -graph_config = { - "llm": { - "api_key": nemotron_key, - "model": "nvidia/meta/llama3-70b-instruct", - }, -} - -# ************************************************ -# Create the DocumentScraperGraph instance and run it -# ************************************************ - -md_scraper_graph = DocumentScraperGraph( - prompt="List me all the authors, title and genres of the books", - source=text, # Pass the content of the file, not the file object - config=graph_config -) - -result = md_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = md_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/nemotron/rate_limit_nemotron.py b/examples/nemotron/rate_limit_nemotron.py deleted file mode 100644 index 934c2036..00000000 --- a/examples/nemotron/rate_limit_nemotron.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper with a custom rate limit -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# 
************************************************ - - -graph_config = { - "llm": { - "api_key": os.getenv("NEMOTRON_KEY"), - "model": "nvidia/meta/llama3-70b-instruct", - "rate_limit": { - "requests_per_second": 1 - } - }, - "verbose": True, - "headless": False, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="Extract me the python code inside the page", - source="https://www.exploit-db.com/exploits/51447", - config=graph_config -) - -result = smart_scraper_graph.run() -print(json.dumps(result, indent=4)) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/nemotron/scrape_plain_text_nemotron.py b/examples/nemotron/scrape_plain_text_nemotron.py deleted file mode 100644 index 315bae8e..00000000 --- a/examples/nemotron/scrape_plain_text_nemotron.py +++ /dev/null @@ -1,54 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper from text -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the text file -# ************************************************ - -FILE_NAME = "inputs/plain_html_example.txt" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -# It could be also a http request using the request model -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -nemotron_key = 
os.getenv("NEMOTRON_APIKEY") - -graph_config = { - "llm": { - "api_key": nemotron_key, - "model": "nvidia/meta/llama3-70b-instruct", - }, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the projects with their description.", - source=text, - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/nemotron/script_generator_nemotron.py b/examples/nemotron/script_generator_nemotron.py deleted file mode 100644 index 2ff8176a..00000000 --- a/examples/nemotron/script_generator_nemotron.py +++ /dev/null @@ -1,44 +0,0 @@ -""" -Basic example of scraping pipeline using ScriptCreatorGraph -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import ScriptCreatorGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -nemotron_key = os.getenv("NEMOTRON_APIKEY") - -graph_config = { - "llm": { - "api_key": nemotron_key, - "model": "nvidia/meta/llama3-70b-instruct", - }, - "library": "beautifulsoup" -} - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -script_creator_graph = ScriptCreatorGraph( - prompt="List me all the projects with their description.", - # also accepts a string with the already downloaded HTML code - source="https://perinim.github.io/projects", - config=graph_config -) - -result = script_creator_graph.run() 
-print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = script_creator_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/nemotron/script_generator_schema_nemotron.py b/examples/nemotron/script_generator_schema_nemotron.py deleted file mode 100644 index 9516521a..00000000 --- a/examples/nemotron/script_generator_schema_nemotron.py +++ /dev/null @@ -1,59 +0,0 @@ -""" -Basic example of scraping pipeline using ScriptCreatorGraph -""" -import os -from dotenv import load_dotenv -from pydantic import BaseModel, Field -from typing import List -from scrapegraphai.graphs import ScriptCreatorGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the schema for the graph -# ************************************************ - -class Project(BaseModel): - title: str = Field(description="The title of the project") - description: str = Field(description="The description of the project") - -class Projects(BaseModel): - projects: List[Project] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -nemotron_key = os.getenv("NEMOTRON_APIKEY") - -graph_config = { - "llm": { - "api_key": nemotron_key, - "model": "nvidia/meta/llama3-70b-instruct", - }, - "library": "beautifulsoup", - "verbose": True, -} - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -script_creator_graph = ScriptCreatorGraph( - prompt="List me all the projects with their description.", - # also accepts a string with the already downloaded HTML code - source="https://perinim.github.io/projects", - config=graph_config, - schema=Projects -) - -result = script_creator_graph.run() 
-print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = script_creator_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/nemotron/script_multi_generator_nemotron.py b/examples/nemotron/script_multi_generator_nemotron.py deleted file mode 100644 index 730fab8d..00000000 --- a/examples/nemotron/script_multi_generator_nemotron.py +++ /dev/null @@ -1,54 +0,0 @@ -""" -Basic example of scraping pipeline using ScriptCreatorGraph -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import ScriptCreatorMultiGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -nemotron_key = os.getenv("NEMOTRON_APIKEY") - -graph_config = { - "llm": { - "api_key": nemotron_key, - "model": "nvidia/meta/llama3-70b-instruct", - }, - "library": "beautifulsoup", - "verbose": True, -} - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -urls=[ - "https://schultzbergagency.com/emil-raste-karlsen/", - "https://schultzbergagency.com/johanna-hedberg/", -] - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -script_creator_graph = ScriptCreatorMultiGraph( - prompt="Find information about actors", - # also accepts a string with the already downloaded HTML code - source=urls, - config=graph_config -) - -result = script_creator_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = script_creator_graph.get_execution_info() 
-print(prettify_exec_info(graph_exec_info)) diff --git a/examples/nemotron/search_graph_nemotron.py b/examples/nemotron/search_graph_nemotron.py deleted file mode 100644 index e57e9642..00000000 --- a/examples/nemotron/search_graph_nemotron.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -Example of Search Graph -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SearchGraph - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -nemotron_key = os.getenv("NEMOTRON_APIKEY") - -graph_config = { - "llm": { - "api_key": nemotron_key, - "model": "nvidia/meta/llama3-70b-instruct", - }, - "max_results": 2, - "verbose": True, -} - -# ************************************************ -# Create the SearchGraph instance and run it -# ************************************************ - -search_graph = SearchGraph( - prompt="List me Chioggia's famous dishes", - config=graph_config -) - -result = search_graph.run() -print(result) diff --git a/examples/nemotron/search_graph_schema_nemotron.py b/examples/nemotron/search_graph_schema_nemotron.py deleted file mode 100644 index 64fbf047..00000000 --- a/examples/nemotron/search_graph_schema_nemotron.py +++ /dev/null @@ -1,62 +0,0 @@ -""" -Example of Search Graph -""" - -import os -from typing import List -from dotenv import load_dotenv -from pydantic import BaseModel, Field -from scrapegraphai.graphs import SearchGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Dish(BaseModel): - name: str = Field(description="The name of the dish") - description: str = Field(description="The description of the dish") - -class Dishes(BaseModel): - dishes: List[Dish] - -# 
************************************************ -# Define the configuration for the graph -# ************************************************ - -nemotron_key = os.getenv("NEMOTRON_APIKEY") - -graph_config = { - "llm": { - "api_key": nemotron_key, - "model": "nvidia/meta/llama3-70b-instruct", - }, - "max_results": 2, - "verbose": True, -} - -# ************************************************ -# Create the SearchGraph instance and run it -# ************************************************ - -search_graph = SearchGraph( - prompt="List me Chioggia's famous dishes", - config=graph_config, - schema=Dishes -) - -result = search_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = search_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json and csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/nemotron/search_link_graph_nemotron.py b/examples/nemotron/search_link_graph_nemotron.py deleted file mode 100644 index 50dce11b..00000000 --- a/examples/nemotron/search_link_graph_nemotron.py +++ /dev/null @@ -1,41 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SearchLinkGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "api_key": os.getenv("NEMOTRON_APIKEY"), - "model": "nvidia/meta/llama3-70b-instruct", - }, - "verbose": True, - "headless": False, -} - -# ************************************************ -# Create the SearchLinkGraph instance and run it -# ************************************************ - -smart_scraper_graph = SearchLinkGraph( - 
source="https://sport.sky.it/nba?gr=www", - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/nemotron/smart_scraper_lite_nemotron.py b/examples/nemotron/smart_scraper_lite_nemotron.py deleted file mode 100644 index 6c1d8528..00000000 --- a/examples/nemotron/smart_scraper_lite_nemotron.py +++ /dev/null @@ -1,32 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperLiteGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -graph_config = { - "llm": { - "api_key": os.getenv("NEMOTRON_API_KEY"), - "model": "nemotron/nemotron-3.5-turbo", - "base_url": "http://127.0.0.1:3000/v1", - }, - "verbose": True, - "headless": False, -} - -smart_scraper_lite_graph = SmartScraperLiteGraph( - prompt="Who is Marco Perini?", - source="https://perinim.github.io/", - config=graph_config -) - -result = smart_scraper_lite_graph.run() -print(json.dumps(result, indent=4)) - -graph_exec_info = smart_scraper_lite_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/nemotron/smart_scraper_multi_concat_nemotron.py b/examples/nemotron/smart_scraper_multi_concat_nemotron.py deleted file mode 100644 index 3297fcbf..00000000 --- a/examples/nemotron/smart_scraper_multi_concat_nemotron.py +++ /dev/null @@ -1,38 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiConcatGraph - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# 
************************************************ - -graph_config = { - "llm": { - "api_key": os.getenv("NEMOTRON_APIKEY"), - "model": "nvidia/meta/llama3-70b-instruct", - }, - "verbose": True, - "headless": False, -} -# ******************************************************* -# Create the SmartScraperMultiGraph instance and run it -# ******************************************************* - -multiple_search_graph = SmartScraperMultiConcatGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/nemotron/smart_scraper_multi_lite_nemotron.py b/examples/nemotron/smart_scraper_multi_lite_nemotron.py deleted file mode 100644 index 7639d820..00000000 --- a/examples/nemotron/smart_scraper_multi_lite_nemotron.py +++ /dev/null @@ -1,46 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiLiteGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "api_key": os.getenv("NEMOTRON_API_KEY"), - "model": "nemotron/nemotron-3-8b-chat", - }, - "verbose": True, - "headless": False, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - config=graph_config -) - -result = smart_scraper_multi_lite_graph.run() -print(json.dumps(result, indent=4)) - -# 
************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/nemotron/smart_scraper_multi_nemotron.py b/examples/nemotron/smart_scraper_multi_nemotron.py deleted file mode 100644 index 00306a96..00000000 --- a/examples/nemotron/smart_scraper_multi_nemotron.py +++ /dev/null @@ -1,41 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiGraph - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -nemotron_key = os.getenv("NEMOTRON_APIKEY") - -graph_config = { - "llm": { - "api_key": nemotron_key, - "model": "nvidia/meta/llama3-70b-instruct", - }, - "verbose": True, - "headless": False, -} - -# ******************************************************* -# Create the SmartScraperMultiGraph instance and run it -# ******************************************************* - -multiple_search_graph = SmartScraperMultiGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/nemotron/smart_scraper_nemotron.py b/examples/nemotron/smart_scraper_nemotron.py deleted file mode 100644 index 10ad42b7..00000000 --- a/examples/nemotron/smart_scraper_nemotron.py +++ /dev/null @@ -1,45 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" - -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# 
************************************************ -# Define the configuration for the graph -# ************************************************ - - -graph_config = { - "llm": { - "api_key": os.getenv("NEMOTRON_KEY"), - "model": "nvidia/meta/llama3-70b-instruct", - }, - "verbose": True, - "headless": False, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="Extract me the python code inside the page", - source="https://www.exploit-db.com/exploits/51447", - config=graph_config -) - -result = smart_scraper_graph.run() -print(json.dumps(result, indent=4)) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/nemotron/smart_scraper_schema_nemotron.py b/examples/nemotron/smart_scraper_schema_nemotron.py deleted file mode 100644 index 54dbce1f..00000000 --- a/examples/nemotron/smart_scraper_schema_nemotron.py +++ /dev/null @@ -1,50 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper with schema -""" -import os -from typing import List -from dotenv import load_dotenv -from pydantic import BaseModel, Field -from scrapegraphai.graphs import SmartScraperGraph - -load_dotenv() - -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Project(BaseModel): - title: str = Field(description="The title of the project") - description: str = Field(description="The description of the project") - -class Projects(BaseModel): - projects: List[Project] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -nemotron_key 
= os.getenv("NEMOTRON_APIKEY") - -graph_config = { - "llm": { - "api_key":nemotron_key, - "model": "nvidia/meta/llama3-70b-instruct", - }, - "verbose": True, - "headless": False, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the projects with their description", - source="https://perinim.github.io/projects/", - schema=Projects, - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) diff --git a/examples/nemotron/speech_graph_nemotron.py b/examples/nemotron/speech_graph_nemotron.py deleted file mode 100644 index 21f0d2b1..00000000 --- a/examples/nemotron/speech_graph_nemotron.py +++ /dev/null @@ -1,50 +0,0 @@ -""" -Basic example of scraping pipeline using SpeechSummaryGraph -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SpeechGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define audio output path -# ************************************************ - -FILE_NAME = "website_summary.mp3" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -output_path = os.path.join(curr_dir, FILE_NAME) - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -nemotron_key = os.getenv("NEMOTRON_APIKEY") - -graph_config = { - "llm": { - "api_key": nemotron_key, - "model": "nvidia/meta/llama3-70b-instruct", - }, -} - -# ************************************************ -# Create the SpeechGraph instance and run it -# ************************************************ - -speech_graph = SpeechGraph( - prompt="Make a detailed audio summary of the projects.", - source="https://perinim.github.io/projects/", - config=graph_config, -) - -result = speech_graph.run() -print(result) - -# 
************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = speech_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/nemotron/xml_scraper_graph_nemotron.py b/examples/nemotron/xml_scraper_graph_nemotron.py deleted file mode 100644 index 753b0be5..00000000 --- a/examples/nemotron/xml_scraper_graph_nemotron.py +++ /dev/null @@ -1,60 +0,0 @@ -""" - -Basic example of scraping pipeline using XMLScraperMultiGraph from XML documents -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import XMLScraperMultiGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the XML file -# ************************************************ - -FILE_NAME = "inputs/books.xml" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - - -nemotron_key = os.getenv("NEMOTRON_APIKEY") - -graph_config = { - "llm": { - "api_key":nemotron_key, - "model": "nvidia/meta/llama3-70b-instruct", - }, - "verbose": True, - "headless": False, -} -# ************************************************ -# Create the XMLScraperMultiGraph instance and run it -# ************************************************ - -xml_scraper_graph = XMLScraperMultiGraph( - prompt="List me all the authors, title and genres of the books", - source=[text, text], # Pass the content of the file, not the file object - config=graph_config -) - -result = xml_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# 
************************************************ - -graph_exec_info = xml_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/nemotron/xml_scraper_nemotron.py b/examples/nemotron/xml_scraper_nemotron.py deleted file mode 100644 index 5f7cb7d6..00000000 --- a/examples/nemotron/xml_scraper_nemotron.py +++ /dev/null @@ -1,59 +0,0 @@ -""" -Basic example of scraping pipeline using XMLScraperGraph from XML documents -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import XMLScraperGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the XML file -# ************************************************ - -FILE_NAME = "inputs/books.xml" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -nemotron_key = os.getenv("NEMOTRON_APIKEY") - -graph_config = { - "llm": { - "api_key": nemotron_key, - "model": "nvidia/meta/llama3-70b-instruct", - }, - "verbose":False, -} - -# ************************************************ -# Create the XMLScraperGraph instance and run it -# ************************************************ - -xml_scraper_graph = XMLScraperGraph( - prompt="List me all the authors, title and genres of the books", - source=text, # Pass the content of the file, not the file object - config=graph_config -) - -result = xml_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = 
xml_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") - diff --git a/examples/omni_scraper_graph/.env.example b/examples/omni_scraper_graph/.env.example new file mode 100644 index 00000000..d5bb1b4d --- /dev/null +++ b/examples/omni_scraper_graph/.env.example @@ -0,0 +1,13 @@ +# OpenAI API Configuration +OPENAI_API_KEY=your-openai-api-key-here + +# Optional Configurations +MAX_TOKENS=4000 +MODEL_NAME=gpt-4-1106-preview +TEMPERATURE=0.7 + +# Omni Scraper Settings +DEFAULT_FORMAT=auto +TIMEOUT=60 +MAX_RETRIES=3 +USER_AGENT=Mozilla/5.0 \ No newline at end of file diff --git a/examples/omni_scraper_graph/README.md b/examples/omni_scraper_graph/README.md new file mode 100644 index 00000000..da5ab652 --- /dev/null +++ b/examples/omni_scraper_graph/README.md @@ -0,0 +1,30 @@ +# Omni Scraper Graph Example + +This example demonstrates how to use Scrapegraph-ai for universal web scraping across multiple data formats. + +## Features + +- Multi-format data extraction (JSON, XML, HTML, CSV) +- Automatic format detection +- Unified data output +- Content transformation + +## Setup + +1. Install required dependencies +2. Copy `.env.example` to `.env` +3. 
Configure your API keys in the `.env` file + +## Usage + +```python +from scrapegraphai.graphs import OmniScraperGraph + +graph = OmniScraperGraph() +data = graph.scrape("https://example.com/data") +``` + +## Environment Variables + +Required environment variables: +- `OPENAI_API_KEY`: Your OpenAI API key \ No newline at end of file diff --git a/examples/openai/omni_search_openai.py b/examples/omni_scraper_graph/omni_search_openai.py similarity index 100% rename from examples/openai/omni_search_openai.py rename to examples/omni_scraper_graph/omni_search_openai.py diff --git a/examples/oneapi/code_generator_graph_oneapi.py b/examples/oneapi/code_generator_graph_oneapi.py deleted file mode 100644 index 5f9808a3..00000000 --- a/examples/oneapi/code_generator_graph_oneapi.py +++ /dev/null @@ -1,60 +0,0 @@ -""" -Basic example of scraping pipeline using Code Generator with schema -""" -import os -from typing import List -from dotenv import load_dotenv -from pydantic import BaseModel, Field -from scrapegraphai.graphs import CodeGeneratorGraph - -load_dotenv() - -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Project(BaseModel): - title: str = Field(description="The title of the project") - description: str = Field(description="The description of the project") - -class Projects(BaseModel): - projects: List[Project] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -openai_key = os.getenv("OPENAI_APIKEY") - -graph_config = { - "llm": { - "api_key": "***************************", - "model": "oneapi/qwen-turbo", - "base_url": "http://127.0.0.1:3000/v1", # 设置 OneAPI URL - }, - "verbose": True, - "headless": False, - "reduction": 2, - "max_iterations": { - "overall": 10, - "syntax": 3, - "execution": 3, - "validation": 3, - "semantic": 3 - }, - "output_file_name": 
"extracted_data.py" -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -code_generator_graph = CodeGeneratorGraph( - prompt="List me all the projects with their description", - source="https://perinim.github.io/projects/", - schema=Projects, - config=graph_config -) - -result = code_generator_graph.run() -print(result) \ No newline at end of file diff --git a/examples/oneapi/csv_scraper_graph_multi_oneapi.py b/examples/oneapi/csv_scraper_graph_multi_oneapi.py deleted file mode 100644 index 7b5d8abd..00000000 --- a/examples/oneapi/csv_scraper_graph_multi_oneapi.py +++ /dev/null @@ -1,56 +0,0 @@ -""" -Basic example of scraping pipeline using CSVScraperMultiGraph from CSV documents -""" -import os -from dotenv import load_dotenv -import pandas as pd -from scrapegraphai.graphs import CSVScraperMultiGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the CSV file -# ************************************************ - -FILE_NAME = "inputs/username.csv" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -text = pd.read_csv(file_path) - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "api_key": "***************************", - "model": "oneapi/qwen-turbo", - "base_url": "http://127.0.0.1:3000/v1", # 设置 OneAPI URL - } -} - -# ************************************************ -# Create the CSVScraperMultiGraph instance and run it -# ************************************************ - -csv_scraper_graph = CSVScraperMultiGraph( - prompt="List me all the last names", - source=[str(text), str(text)], - config=graph_config -) - -result = csv_scraper_graph.run() -print(result) 
- -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = csv_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/oneapi/csv_scraper_oneapi.py b/examples/oneapi/csv_scraper_oneapi.py deleted file mode 100644 index a9fda090..00000000 --- a/examples/oneapi/csv_scraper_oneapi.py +++ /dev/null @@ -1,55 +0,0 @@ -""" -Basic example of scraping pipeline using CSVScraperGraph from CSV documents -""" -import os -from dotenv import load_dotenv -import pandas as pd -from scrapegraphai.graphs import CSVScraperGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info -load_dotenv() - -# ************************************************ -# Read the CSV file -# ************************************************ - -FILE_NAME = "inputs/username.csv" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -text = pd.read_csv(file_path) - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "api_key": "***************************", - "model": "oneapi/qwen-turbo", - "base_url": "http://127.0.0.1:3000/v1", # 设置 OneAPI URL - } -} - -# ************************************************ -# Create the CSVScraperGraph instance and run it -# ************************************************ - -csv_scraper_graph = CSVScraperGraph( - prompt="List me all the last names", - source=str(text), # Pass the content of the file, not the file object - config=graph_config -) - -result = csv_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - 
-graph_exec_info = csv_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/oneapi/custom_graph_oneapi.py b/examples/oneapi/custom_graph_oneapi.py deleted file mode 100644 index 1e27dcf9..00000000 --- a/examples/oneapi/custom_graph_oneapi.py +++ /dev/null @@ -1,105 +0,0 @@ -""" -Example of custom graph using existing nodes -""" -from langchain_openai import OpenAIEmbeddings -from langchain_openai import ChatOpenAI -from scrapegraphai.graphs import BaseGraph -from scrapegraphai.nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode, RobotsNode - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "api_key": "***************************", - "model": "oneapi/qwen-turbo", - "base_url": "http://127.0.0.1:3000/v1", # 设置 OneAPI URL - } -} - -# ************************************************ -# Define the graph nodes -# ************************************************ - -llm_model = ChatOpenAI(graph_config["llm"]) -embedder = OpenAIEmbeddings(api_key=llm_model.openai_api_key) - -# define the nodes for the graph -robot_node = RobotsNode( - input="url", - output=["is_scrapable"], - node_config={ - "llm_model": llm_model, - "force_scraping": True, - "verbose": True, - } -) - -fetch_node = FetchNode( - input="url | local_dir", - output=["doc"], - node_config={ - "verbose": True, - "headless": True, - } -) -parse_node = ParseNode( - input="doc", - output=["parsed_doc"], - node_config={ - "chunk_size": 4096, - "verbose": True, - } -) -rag_node = RAGNode( - input="user_prompt & (parsed_doc | doc)", - output=["relevant_chunks"], - node_config={ - "llm_model": llm_model, - "embedder_model": embedder, - "verbose": True, - } -) -generate_answer_node = GenerateAnswerNode( - input="user_prompt & (relevant_chunks 
| parsed_doc | doc)", - output=["answer"], - node_config={ - "llm_model": llm_model, - "verbose": True, - } -) - -# ************************************************ -# Create the graph by defining the connections -# ************************************************ - -graph = BaseGraph( - nodes=[ - robot_node, - fetch_node, - parse_node, - rag_node, - generate_answer_node, - ], - edges=[ - (robot_node, fetch_node), - (fetch_node, parse_node), - (parse_node, rag_node), - (rag_node, generate_answer_node) - ], - entry_point=robot_node -) - -# ************************************************ -# Execute the graph -# ************************************************ - -result, execution_info = graph.execute({ - "user_prompt": "Describe the content", - "url": "https://example.com/" -}) - -# get the answer from the result -result = result.get("answer", "No answer found.") -print(result) diff --git a/examples/oneapi/depth_search_graph_onenapi.py b/examples/oneapi/depth_search_graph_onenapi.py deleted file mode 100644 index 7a2e7f3e..00000000 --- a/examples/oneapi/depth_search_graph_onenapi.py +++ /dev/null @@ -1,31 +0,0 @@ -""" -depth_search_graph_opeani example -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import DepthSearchGraph - -load_dotenv() - -openai_key = os.getenv("OPENAI_APIKEY") - -graph_config = { - "llm": { - "api_key": "***************************", - "model": "oneapi/qwen-turbo", - "base_url": "http://127.0.0.1:3000/v1", # 设置 OneAPI URL - }, - "verbose": True, - "headless": False, - "depth": 2, - "only_inside_links": False, -} - -search_graph = DepthSearchGraph( - prompt="List me all the projects with their description", - source="https://perinim.github.io", - config=graph_config -) - -result = search_graph.run() -print(result) diff --git a/examples/oneapi/document_scraper_oneapi.py b/examples/oneapi/document_scraper_oneapi.py deleted file mode 100644 index 99ffe295..00000000 --- a/examples/oneapi/document_scraper_oneapi.py +++ 
/dev/null @@ -1,42 +0,0 @@ -""" -document_scraper example -""" -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import DocumentScraperGraph - -load_dotenv() - - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "api_key": "***************************", - "model": "oneapi/qwen-turbo", - "base_url": "http://127.0.0.1:3000/v1", # 设置 OneAPI URL - } -} - - -source = """ - The Divine Comedy, Italian La Divina Commedia, original name La commedia, long narrative poem written in Italian - circa 1308/21 by Dante. It is usually held to be one of the world s great works of literature. - Divided into three major sections—Inferno, Purgatorio, and Paradiso—the narrative traces the journey of Dante - from darkness and error to the revelation of the divine light, culminating in the Beatific Vision of God. - Dante is guided by the Roman poet Virgil, who represents the epitome of human knowledge, from the dark wood - through the descending circles of the pit of Hell (Inferno). He then climbs the mountain of Purgatory, guided - by the Roman poet Statius, who represents the fulfilment of human knowledge, and is finally led by his lifelong love, - the Beatrice of his earlier poetry, through the celestial spheres of Paradise. -""" - -pdf_scraper_graph = DocumentScraperGraph( - prompt="Summarize the text and find the main topics", - source=source, - config=graph_config, -) -result = pdf_scraper_graph.run() - -print(json.dumps(result, indent=4)) \ No newline at end of file diff --git a/examples/oneapi/inputs/books.xml b/examples/oneapi/inputs/books.xml deleted file mode 100644 index e3d1fe87..00000000 --- a/examples/oneapi/inputs/books.xml +++ /dev/null @@ -1,120 +0,0 @@ - - - - Gambardella, Matthew - XML Developer's Guide - Computer - 44.95 - 2000-10-01 - An in-depth look at creating applications - with XML. 
- - - Ralls, Kim - Midnight Rain - Fantasy - 5.95 - 2000-12-16 - A former architect battles corporate zombies, - an evil sorceress, and her own childhood to become queen - of the world. - - - Corets, Eva - Maeve Ascendant - Fantasy - 5.95 - 2000-11-17 - After the collapse of a nanotechnology - society in England, the young survivors lay the - foundation for a new society. - - - Corets, Eva - Oberon's Legacy - Fantasy - 5.95 - 2001-03-10 - In post-apocalypse England, the mysterious - agent known only as Oberon helps to create a new life - for the inhabitants of London. Sequel to Maeve - Ascendant. - - - Corets, Eva - The Sundered Grail - Fantasy - 5.95 - 2001-09-10 - The two daughters of Maeve, half-sisters, - battle one another for control of England. Sequel to - Oberon's Legacy. - - - Randall, Cynthia - Lover Birds - Romance - 4.95 - 2000-09-02 - When Carla meets Paul at an ornithology - conference, tempers fly as feathers get ruffled. - - - Thurman, Paula - Splish Splash - Romance - 4.95 - 2000-11-02 - A deep sea diver finds true love twenty - thousand leagues beneath the sea. - - - Knorr, Stefan - Creepy Crawlies - Horror - 4.95 - 2000-12-06 - An anthology of horror stories about roaches, - centipedes, scorpions and other insects. - - - Kress, Peter - Paradox Lost - Science Fiction - 6.95 - 2000-11-02 - After an inadvertant trip through a Heisenberg - Uncertainty Device, James Salway discovers the problems - of being quantum. - - - O'Brien, Tim - Microsoft .NET: The Programming Bible - Computer - 36.95 - 2000-12-09 - Microsoft's .NET initiative is explored in - detail in this deep programmer's reference. - - - O'Brien, Tim - MSXML3: A Comprehensive Guide - Computer - 36.95 - 2000-12-01 - The Microsoft MSXML3 parser is covered in - detail, with attention to XML DOM interfaces, XSLT processing, - SAX and more. 
- - - Galos, Mike - Visual Studio 7: A Comprehensive Guide - Computer - 49.95 - 2001-04-16 - Microsoft Visual Studio 7 is explored in depth, - looking at how Visual Basic, Visual C++, C#, and ASP+ are - integrated into a comprehensive development - environment. - - \ No newline at end of file diff --git a/examples/oneapi/inputs/example.json b/examples/oneapi/inputs/example.json deleted file mode 100644 index 2263184c..00000000 --- a/examples/oneapi/inputs/example.json +++ /dev/null @@ -1,182 +0,0 @@ -{ - "kind":"youtube#searchListResponse", - "etag":"q4ibjmYp1KA3RqMF4jFLl6PBwOg", - "nextPageToken":"CAUQAA", - "regionCode":"NL", - "pageInfo":{ - "totalResults":1000000, - "resultsPerPage":5 - }, - "items":[ - { - "kind":"youtube#searchResult", - "etag":"QCsHBifbaernVCbLv8Cu6rAeaDQ", - "id":{ - "kind":"youtube#video", - "videoId":"TvWDY4Mm5GM" - }, - "snippet":{ - "publishedAt":"2023-07-24T14:15:01Z", - "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", - "title":"3 Football Clubs Kylian Mbappe Should Avoid Signing ✍️❌⚽️ #football #mbappe #shorts", - "description":"", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"FC Motivate", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T14:15:01Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"0NG5QHdtIQM_V-DBJDEf-jK_Y9k", - "id":{ - "kind":"youtube#video", - "videoId":"aZM_42CcNZ4" - }, - "snippet":{ - "publishedAt":"2023-07-24T16:09:27Z", - "channelId":"UCM5gMM_HqfKHYIEJ3lstMUA", - "title":"Which Football Club Could Cristiano Ronaldo Afford To Buy? 
💰", - "description":"Sign up to Sorare and get a FREE card: https://sorare.pxf.io/NellisShorts Give Soraredata a go for FREE: ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"John Nellis", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T16:09:27Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"WbBz4oh9I5VaYj91LjeJvffrBVY", - "id":{ - "kind":"youtube#video", - "videoId":"wkP3XS3aNAY" - }, - "snippet":{ - "publishedAt":"2023-07-24T16:00:50Z", - "channelId":"UC4EP1dxFDPup_aFLt0ElsDw", - "title":"PAULO DYBALA vs THE WORLD'S LONGEST FREEKICK WALL", - "description":"Can Paulo Dybala curl a football around the World's longest free kick wall? We met up with the World Cup winner and put him to ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"Shoot for Love", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T16:00:50Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"juxv_FhT_l4qrR05S1QTrb4CGh8", - "id":{ - "kind":"youtube#video", - "videoId":"rJkDZ0WvfT8" - }, - "snippet":{ - "publishedAt":"2023-07-24T10:00:39Z", - "channelId":"UCO8qj5u80Ga7N_tP3BZWWhQ", - "title":"TOP 10 DEFENDERS 2023", - "description":"SoccerKingz https://soccerkingz.nl Use code: 'ILOVEHOF' to get 10% off. TOP 10 DEFENDERS 2023 Follow us! 
• Instagram ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"Home of Football", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T10:00:39Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"wtuknXTmI1txoULeH3aWaOuXOow", - "id":{ - "kind":"youtube#video", - "videoId":"XH0rtu4U6SE" - }, - "snippet":{ - "publishedAt":"2023-07-21T16:30:05Z", - "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", - "title":"3 Things You Didn't Know About Erling Haaland ⚽️🇳🇴 #football #haaland #shorts", - "description":"", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"FC Motivate", - "liveBroadcastContent":"none", - "publishTime":"2023-07-21T16:30:05Z" - } - } - ] -} \ No newline at end of file diff --git a/examples/oneapi/inputs/plain_html_example copy.txt b/examples/oneapi/inputs/plain_html_example copy.txt deleted file mode 100644 index 78f814ae..00000000 --- a/examples/oneapi/inputs/plain_html_example copy.txt +++ /dev/null @@ -1,105 +0,0 @@ - -
- - -
-
-
- - -
- \ No newline at end of file diff --git a/examples/oneapi/inputs/plain_html_example.txt b/examples/oneapi/inputs/plain_html_example.txt deleted file mode 100644 index 78f814ae..00000000 --- a/examples/oneapi/inputs/plain_html_example.txt +++ /dev/null @@ -1,105 +0,0 @@ - -
- - -
-
-
- - -
- \ No newline at end of file diff --git a/examples/oneapi/inputs/username.csv b/examples/oneapi/inputs/username.csv deleted file mode 100644 index 006ac8e6..00000000 --- a/examples/oneapi/inputs/username.csv +++ /dev/null @@ -1,7 +0,0 @@ -Username; Identifier;First name;Last name -booker12;9012;Rachel;Booker -grey07;2070;Laura;Grey -johnson81;4081;Craig;Johnson -jenkins46;9346;Mary;Jenkins -smith79;5079;Jamie;Smith - diff --git a/examples/oneapi/json_scraper_multi_oneapi.py b/examples/oneapi/json_scraper_multi_oneapi.py deleted file mode 100644 index fc1c4555..00000000 --- a/examples/oneapi/json_scraper_multi_oneapi.py +++ /dev/null @@ -1,33 +0,0 @@ -""" -Module for showing how PDFScraper multi works -""" -import os -import json -from scrapegraphai.graphs import JSONScraperMultiGraph - -graph_config = { - "llm": { - "api_key": "***************************", - "model": "oneapi/qwen-turbo", - "base_url": "http://127.0.0.1:3000/v1", # 设置 OneAPI URL - } -} - -FILE_NAME = "inputs/example.json" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -sources = [text, text] - -multiple_search_graph = JSONScraperMultiGraph( - prompt= "List me all the authors, title and genres of the books", - source= sources, - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/oneapi/json_scraper_oneapi.py b/examples/oneapi/json_scraper_oneapi.py deleted file mode 100644 index 2f89fc50..00000000 --- a/examples/oneapi/json_scraper_oneapi.py +++ /dev/null @@ -1,53 +0,0 @@ -""" -Basic example of scraping pipeline using JSONScraperGraph from JSON documents -""" -import os -from scrapegraphai.graphs import JSONScraperGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -# ************************************************ -# Read the JSON file -# 
************************************************ - -FILE_NAME = "inputs/example.json" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "api_key": "***************************", - "model": "oneapi/qwen-turbo", - "base_url": "http://127.0.0.1:3000/v1", # 设置 OneAPI URL - } -} - -# ************************************************ -# Create the JSONScraperGraph instance and run it -# ************************************************ - -json_scraper_graph = JSONScraperGraph( - prompt="List me all the authors, title and genres of the books", - source=text, # Pass the content of the file, not the file object - config=graph_config -) - -result = json_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = json_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/oneapi/rate_limit_oneapi.py b/examples/oneapi/rate_limit_oneapi.py deleted file mode 100644 index abd2f9c7..00000000 --- a/examples/oneapi/rate_limit_oneapi.py +++ /dev/null @@ -1,41 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper with a custom rate limit -""" -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "api_key": "***************************", - "model": "oneapi/qwen-turbo", - 
"base_url": "http://127.0.0.1:3000/v1", # 设置 OneAPI URL - "rate_limit": { - "requests_per_second": 1 - } - } -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the titles", - # also accepts a string with the already downloaded HTML code - source="https://www.wired.com/", - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/oneapi/scrape_plain_text_oneapi.py b/examples/oneapi/scrape_plain_text_oneapi.py deleted file mode 100644 index 268d2b0d..00000000 --- a/examples/oneapi/scrape_plain_text_oneapi.py +++ /dev/null @@ -1,53 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper from text -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the text file -# ************************************************ - -FILE_NAME = "inputs/plain_html_example.txt" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -# It could be also a http request using the request model -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "api_key": "***************************", - "model": "oneapi/qwen-turbo", - "base_url": "http://127.0.0.1:3000/v1", # 设置 OneAPI URL - } -} - -# 
************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the projects with their description.", - source=text, - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/oneapi/script_generator_oneapi.py b/examples/oneapi/script_generator_oneapi.py deleted file mode 100644 index 3876eb34..00000000 --- a/examples/oneapi/script_generator_oneapi.py +++ /dev/null @@ -1,43 +0,0 @@ -""" -Basic example of scraping pipeline using ScriptCreatorGraph -""" -from dotenv import load_dotenv -from scrapegraphai.graphs import ScriptCreatorGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "api_key": "***************************", - "model": "oneapi/qwen-turbo", - "base_url": "http://127.0.0.1:3000/v1", # 设置 OneAPI URL - }, - "library": "beautifulsoup" -} - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -script_creator_graph = ScriptCreatorGraph( - prompt="List me all the projects with their description.", - # also accepts a string with the already downloaded HTML code - source="https://perinim.github.io/projects", - config=graph_config -) - -result = script_creator_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - 
-graph_exec_info = script_creator_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - diff --git a/examples/oneapi/script_multi_generator_oneapi.py b/examples/oneapi/script_multi_generator_oneapi.py deleted file mode 100644 index 42328744..00000000 --- a/examples/oneapi/script_multi_generator_oneapi.py +++ /dev/null @@ -1,48 +0,0 @@ -""" -Basic example of scraping pipeline using ScriptCreatorGraph -""" -from scrapegraphai.graphs import ScriptCreatorMultiGraph -from scrapegraphai.utils import prettify_exec_info - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "api_key": "***************************", - "model": "oneapi/qwen-turbo", - "base_url": "http://127.0.0.1:3000/v1", # 设置 OneAPI URL - }, - "library": "beautifulsoup" -} - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -urls=[ - "https://schultzbergagency.com/emil-raste-karlsen/", - "https://schultzbergagency.com/johanna-hedberg/", -] - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -script_creator_graph = ScriptCreatorMultiGraph( - prompt="Find information about actors", - # also accepts a string with the already downloaded HTML code - source=urls, - config=graph_config -) - -result = script_creator_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = script_creator_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/oneapi/search_graph_oneapi.py b/examples/oneapi/search_graph_oneapi.py deleted file mode 100644 index b25cbfa6..00000000 --- 
a/examples/oneapi/search_graph_oneapi.py +++ /dev/null @@ -1,29 +0,0 @@ -""" -Example of Search Graph -""" -from scrapegraphai.graphs import SearchGraph - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "api_key": "***************************", - "model": "oneapi/qwen-turbo", - "base_url": "http://127.0.0.1:3000/v1", # 设置 OneAPI URL - } -} - - -# ************************************************ -# Create the SearchGraph instance and run it -# ************************************************ - -search_graph = SearchGraph( - prompt="List me Chioggia's famous dishes", - config=graph_config -) - -result = search_graph.run() -print(result) diff --git a/examples/oneapi/search_graph_schema_oneapi.py b/examples/oneapi/search_graph_schema_oneapi.py deleted file mode 100644 index 7fc44539..00000000 --- a/examples/oneapi/search_graph_schema_oneapi.py +++ /dev/null @@ -1,55 +0,0 @@ -""" -Example of Search Graph -""" -from scrapegraphai.graphs import SearchGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -from pydantic import BaseModel, Field -from typing import List - -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Dish(BaseModel): - name: str = Field(description="The name of the dish") - description: str = Field(description="The description of the dish") - -class Dishes(BaseModel): - dishes: List[Dish] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "api_key": "***************************", - "model": "oneapi/qwen-turbo", - "base_url": "http://127.0.0.1:3000/v1", # 设置 OneAPI URL - } -} - -# ************************************************ -# Create the SearchGraph 
instance and run it -# ************************************************ - -search_graph = SearchGraph( - prompt="List me Chioggia's famous dishes", - config=graph_config, - schema=Dishes -) - -result = search_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = search_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json and csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/oneapi/smart_scraper_lite_oneapi.py b/examples/oneapi/smart_scraper_lite_oneapi.py deleted file mode 100644 index b271acb3..00000000 --- a/examples/oneapi/smart_scraper_lite_oneapi.py +++ /dev/null @@ -1,32 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperLiteGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -graph_config = { - "llm": { - "api_key": os.getenv("ONEAPI_API_KEY"), - "model": "oneapi/gpt-3.5-turbo", - "base_url": "http://127.0.0.1:3000/v1", - }, - "verbose": True, - "headless": False, -} - -smart_scraper_lite_graph = SmartScraperLiteGraph( - prompt="Who is Marco Perini?", - source="https://perinim.github.io/", - config=graph_config -) - -result = smart_scraper_lite_graph.run() -print(json.dumps(result, indent=4)) - -graph_exec_info = smart_scraper_lite_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/oneapi/smart_scraper_multi_concat_oneapi.py b/examples/oneapi/smart_scraper_multi_concat_oneapi.py deleted file mode 100644 index bbadbcfd..00000000 --- a/examples/oneapi/smart_scraper_multi_concat_oneapi.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" - -import json -from scrapegraphai.graphs import SmartScraperMultiConcatGraph - -# 
************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "api_key": "***************************", - "model": "oneapi/qwen-turbo", - "base_url": "http://127.0.0.1:3000/v1", # 设置 OneAPI URL - } -} - -# ******************************************************* -# Create the SmartScraperMultiGraph instance and run it -# ******************************************************* - -multiple_search_graph = SmartScraperMultiConcatGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/oneapi/smart_scraper_multi_lite_oneapi.py b/examples/oneapi/smart_scraper_multi_lite_oneapi.py deleted file mode 100644 index 8cf66dea..00000000 --- a/examples/oneapi/smart_scraper_multi_lite_oneapi.py +++ /dev/null @@ -1,43 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiLiteGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -together_key = os.getenv("TOGETHER_APIKEY") - -graph_config = { - "llm": { - "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", - "api_key": together_key, - }, - "verbose": True, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - config=graph_config -) - -result = smart_scraper_multi_lite_graph.run() -print(json.dumps(result, indent=4)) - -# ************************************************ -# Get 
graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/oneapi/smart_scraper_multi_oneapi.py b/examples/oneapi/smart_scraper_multi_oneapi.py deleted file mode 100644 index 37b7b6e8..00000000 --- a/examples/oneapi/smart_scraper_multi_oneapi.py +++ /dev/null @@ -1,38 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" - -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiGraph - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "api_key": "***************************", - "model": "oneapi/qwen-turbo", - "base_url": "http://127.0.0.1:3000/v1", # 设置 OneAPI URL - } -} - -# ******************************************************* -# Create the SmartScraperMultiGraph instance and run it -# ******************************************************* - -multiple_search_graph = SmartScraperMultiGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/oneapi/smart_scraper_oneapi.py b/examples/oneapi/smart_scraper_oneapi.py deleted file mode 100644 index 30b12aa3..00000000 --- a/examples/oneapi/smart_scraper_oneapi.py +++ /dev/null @@ -1,38 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "api_key": 
"***************************", - "model": "oneapi/qwen-turbo", - "base_url": "http://127.0.0.1:3000/v1", # 设置 OneAPI URL - } -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the titles", - # also accepts a string with the already downloaded HTML code - source="https://www.wired.com/", - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/oneapi/smart_scraper_schema_oneapi.py b/examples/oneapi/smart_scraper_schema_oneapi.py deleted file mode 100644 index 0c011bb6..00000000 --- a/examples/oneapi/smart_scraper_schema_oneapi.py +++ /dev/null @@ -1,48 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper and OneAPI -""" -from typing import List -from pydantic import BaseModel, Field -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -# ************************************************ -# Define the configuration for the graph -# ************************************************ -class Project(BaseModel): - title: str = Field(description="The title of the project") - description: str = Field(description="The description of the project") - -class Projects(BaseModel): - projects: List[Project] - -# ************************************************ -# Define the configuration for the graph -# ********************************************* - -graph_config = { - "llm": { - "api_key": "***************************", - "model": "oneapi/qwen-turbo", - "base_url": "http://127.0.0.1:3000/v1", # 设置 OneAPI URL - } -} - -# 
************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the projects with their description", - # also accepts a string with the already downloaded HTML code - source="https://perinim.github.io/projects/", - config=graph_config, - schema=Projects -) - -# ************************************************ -# Get graph execution info -# ************************************************ -result = smart_scraper_graph.run() -print(result) -print(prettify_exec_info(result)) diff --git a/examples/oneapi/smartscraper_oneapi.py b/examples/oneapi/smartscraper_oneapi.py deleted file mode 100644 index f0783782..00000000 --- a/examples/oneapi/smartscraper_oneapi.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -# ************************************************ -# Define the configuration for the graph -# ********************************************* - -graph_config = { - "llm": { - "api_key": "***************************", - "model": "oneapi/qwen-turbo", - "base_url": "http://127.0.0.1:3000/v1", # 设置 OneAPI URL - } -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="该网站为XXXXX,请提取出标题、发布时间、发布来源以及内容摘要,并以中文回答。", - # 也可以使用已下载的 HTML 代码的字符串 - source="http://XXXX", - config=graph_config -) - -# ************************************************ -# Get graph execution info -# ************************************************ -result = smart_scraper_graph.run() -print(result) -print(prettify_exec_info(result)) diff --git a/examples/oneapi/xml_scraper_graph_multi_oneapi.py 
b/examples/oneapi/xml_scraper_graph_multi_oneapi.py deleted file mode 100644 index b459fdd3..00000000 --- a/examples/oneapi/xml_scraper_graph_multi_oneapi.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -Basic example of scraping pipeline using XMLScraperMultiGraph from XML documents -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import XMLScraperMultiGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info -load_dotenv() - -# ************************************************ -# Read the XML file -# ************************************************ - -FILE_NAME = "inputs/books.xml" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -openai_key = os.getenv("OPENAI_APIKEY") - -graph_config = { - "llm": { - "api_key": openai_key, - "model": "openai/gpt-3.5-turbo", - }, -} - -# ************************************************ -# Create the XMLScraperMultiGraph instance and run it -# ************************************************ - -xml_scraper_graph = XMLScraperMultiGraph( - prompt="List me all the authors, title and genres of the books", - source=[text, text], # Pass the content of the file, not the file object - config=graph_config -) - -result = xml_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = xml_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/oneapi/xml_scraper_oneapi.py b/examples/oneapi/xml_scraper_oneapi.py deleted file mode 100644 index 
7ea7fad5..00000000 --- a/examples/oneapi/xml_scraper_oneapi.py +++ /dev/null @@ -1,58 +0,0 @@ -""" -Basic example of scraping pipeline using XMLScraperGraph from XML documents -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import XMLScraperGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the XML file -# ************************************************ - -FILE_NAME = "inputs/books.xml" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -openai_key = os.getenv("ONEAPI_KEY") - -graph_config = { - "llm": { - "api_key": openai_key, - "model": "openai/gpt-3.5-turbo", - }, - "verbose":False, -} - -# ************************************************ -# Create the XMLScraperGraph instance and run it -# ************************************************ - -xml_scraper_graph = XMLScraperGraph( - prompt="List me all the authors, title and genres of the books", - source=text, # Pass the content of the file, not the file object - config=graph_config -) - -result = xml_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = xml_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/openai/.env.example b/examples/openai/.env.example deleted file mode 100644 index afa13602..00000000 --- a/examples/openai/.env.example +++ /dev/null @@ -1 +0,0 @@ -OPENAI_API_KEY="YOUR OPENAI API KEY" \ No newline 
at end of file diff --git a/examples/openai/inputs/books.xml b/examples/openai/inputs/books.xml deleted file mode 100644 index e3d1fe87..00000000 --- a/examples/openai/inputs/books.xml +++ /dev/null @@ -1,120 +0,0 @@ - - - - Gambardella, Matthew - XML Developer's Guide - Computer - 44.95 - 2000-10-01 - An in-depth look at creating applications - with XML. - - - Ralls, Kim - Midnight Rain - Fantasy - 5.95 - 2000-12-16 - A former architect battles corporate zombies, - an evil sorceress, and her own childhood to become queen - of the world. - - - Corets, Eva - Maeve Ascendant - Fantasy - 5.95 - 2000-11-17 - After the collapse of a nanotechnology - society in England, the young survivors lay the - foundation for a new society. - - - Corets, Eva - Oberon's Legacy - Fantasy - 5.95 - 2001-03-10 - In post-apocalypse England, the mysterious - agent known only as Oberon helps to create a new life - for the inhabitants of London. Sequel to Maeve - Ascendant. - - - Corets, Eva - The Sundered Grail - Fantasy - 5.95 - 2001-09-10 - The two daughters of Maeve, half-sisters, - battle one another for control of England. Sequel to - Oberon's Legacy. - - - Randall, Cynthia - Lover Birds - Romance - 4.95 - 2000-09-02 - When Carla meets Paul at an ornithology - conference, tempers fly as feathers get ruffled. - - - Thurman, Paula - Splish Splash - Romance - 4.95 - 2000-11-02 - A deep sea diver finds true love twenty - thousand leagues beneath the sea. - - - Knorr, Stefan - Creepy Crawlies - Horror - 4.95 - 2000-12-06 - An anthology of horror stories about roaches, - centipedes, scorpions and other insects. - - - Kress, Peter - Paradox Lost - Science Fiction - 6.95 - 2000-11-02 - After an inadvertant trip through a Heisenberg - Uncertainty Device, James Salway discovers the problems - of being quantum. - - - O'Brien, Tim - Microsoft .NET: The Programming Bible - Computer - 36.95 - 2000-12-09 - Microsoft's .NET initiative is explored in - detail in this deep programmer's reference. 
- - - O'Brien, Tim - MSXML3: A Comprehensive Guide - Computer - 36.95 - 2000-12-01 - The Microsoft MSXML3 parser is covered in - detail, with attention to XML DOM interfaces, XSLT processing, - SAX and more. - - - Galos, Mike - Visual Studio 7: A Comprehensive Guide - Computer - 49.95 - 2001-04-16 - Microsoft Visual Studio 7 is explored in depth, - looking at how Visual Basic, Visual C++, C#, and ASP+ are - integrated into a comprehensive development - environment. - - \ No newline at end of file diff --git a/examples/openai/inputs/example.json b/examples/openai/inputs/example.json deleted file mode 100644 index 2263184c..00000000 --- a/examples/openai/inputs/example.json +++ /dev/null @@ -1,182 +0,0 @@ -{ - "kind":"youtube#searchListResponse", - "etag":"q4ibjmYp1KA3RqMF4jFLl6PBwOg", - "nextPageToken":"CAUQAA", - "regionCode":"NL", - "pageInfo":{ - "totalResults":1000000, - "resultsPerPage":5 - }, - "items":[ - { - "kind":"youtube#searchResult", - "etag":"QCsHBifbaernVCbLv8Cu6rAeaDQ", - "id":{ - "kind":"youtube#video", - "videoId":"TvWDY4Mm5GM" - }, - "snippet":{ - "publishedAt":"2023-07-24T14:15:01Z", - "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", - "title":"3 Football Clubs Kylian Mbappe Should Avoid Signing ✍️❌⚽️ #football #mbappe #shorts", - "description":"", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"FC Motivate", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T14:15:01Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"0NG5QHdtIQM_V-DBJDEf-jK_Y9k", - "id":{ - "kind":"youtube#video", - "videoId":"aZM_42CcNZ4" - }, - "snippet":{ - "publishedAt":"2023-07-24T16:09:27Z", - "channelId":"UCM5gMM_HqfKHYIEJ3lstMUA", - "title":"Which Football 
Club Could Cristiano Ronaldo Afford To Buy? 💰", - "description":"Sign up to Sorare and get a FREE card: https://sorare.pxf.io/NellisShorts Give Soraredata a go for FREE: ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"John Nellis", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T16:09:27Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"WbBz4oh9I5VaYj91LjeJvffrBVY", - "id":{ - "kind":"youtube#video", - "videoId":"wkP3XS3aNAY" - }, - "snippet":{ - "publishedAt":"2023-07-24T16:00:50Z", - "channelId":"UC4EP1dxFDPup_aFLt0ElsDw", - "title":"PAULO DYBALA vs THE WORLD'S LONGEST FREEKICK WALL", - "description":"Can Paulo Dybala curl a football around the World's longest free kick wall? We met up with the World Cup winner and put him to ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"Shoot for Love", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T16:00:50Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"juxv_FhT_l4qrR05S1QTrb4CGh8", - "id":{ - "kind":"youtube#video", - "videoId":"rJkDZ0WvfT8" - }, - "snippet":{ - "publishedAt":"2023-07-24T10:00:39Z", - "channelId":"UCO8qj5u80Ga7N_tP3BZWWhQ", - "title":"TOP 10 DEFENDERS 2023", - "description":"SoccerKingz https://soccerkingz.nl Use code: 'ILOVEHOF' to get 10% off. TOP 10 DEFENDERS 2023 Follow us! 
• Instagram ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"Home of Football", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T10:00:39Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"wtuknXTmI1txoULeH3aWaOuXOow", - "id":{ - "kind":"youtube#video", - "videoId":"XH0rtu4U6SE" - }, - "snippet":{ - "publishedAt":"2023-07-21T16:30:05Z", - "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", - "title":"3 Things You Didn't Know About Erling Haaland ⚽️🇳🇴 #football #haaland #shorts", - "description":"", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"FC Motivate", - "liveBroadcastContent":"none", - "publishTime":"2023-07-21T16:30:05Z" - } - } - ] -} \ No newline at end of file diff --git a/examples/openai/inputs/markdown_example.md b/examples/openai/inputs/markdown_example.md deleted file mode 100644 index 85088f29..00000000 --- a/examples/openai/inputs/markdown_example.md +++ /dev/null @@ -1,35 +0,0 @@ -Marco Perini Toggle navigation - - * About - * Projects(current) - -Projects - -Competitions - - * CV - * ____ - -# Projects - - ![project thumbnail Rotary Pendulum RL -Open Source project aimed at controlling a real life rotary pendulum using RL -algorithms ](/projects/rotary-pendulum-rl/) - - ![project thumbnail DQN -Implementation from scratch Developed a Deep Q-Network algorithm to train a -simple and double pendulum 
](https://github.com/PeriniM/DQN-SwingUp) - - ![project thumbnail Multi Agents HAED -University project which focuses on simulating a multi-agent system to perform -environment mapping. Agents, equipped with sensors, explore and record their -surroundings, considering uncertainties in their readings. -](https://github.com/PeriniM/Multi-Agents-HAED) - - ![project thumbnail Wireless ESC for Modular -Drones Modular drone architecture proposal and proof of concept. The project -received maximum grade. ](/projects/wireless-esc-drone/) - -© Copyright 2023 Marco Perini. Powered by Jekyll with -al-folio theme. Hosted by [GitHub -Pages](https://pages.github.com/). \ No newline at end of file diff --git a/examples/openai/inputs/plain_html_example.txt b/examples/openai/inputs/plain_html_example.txt deleted file mode 100644 index 78f814ae..00000000 --- a/examples/openai/inputs/plain_html_example.txt +++ /dev/null @@ -1,105 +0,0 @@ - -
- - -
-
-
- - -
- \ No newline at end of file diff --git a/examples/openai/inputs/username.csv b/examples/openai/inputs/username.csv deleted file mode 100644 index 006ac8e6..00000000 --- a/examples/openai/inputs/username.csv +++ /dev/null @@ -1,7 +0,0 @@ -Username; Identifier;First name;Last name -booker12;9012;Rachel;Booker -grey07;2070;Laura;Grey -johnson81;4081;Craig;Johnson -jenkins46;9346;Mary;Jenkins -smith79;5079;Jamie;Smith - diff --git a/examples/openai/rate_limit_openai.py b/examples/openai/rate_limit_openai.py deleted file mode 100644 index 9455e798..00000000 --- a/examples/openai/rate_limit_openai.py +++ /dev/null @@ -1,48 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper with a custom rate limit -""" - -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - - -graph_config = { - "llm": { - "api_key": os.getenv("OPENAI_API_KEY"), - "model": "openai/gpt-4o", - "rate_limit": { - "requests_per_second": 1 - } - }, - "verbose": True, - "headless": False, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me what does the company do, the name and a contact email.", - source="https://scrapegraphai.com/", - config=graph_config -) - -result = smart_scraper_graph.run() -print(json.dumps(result, indent=4)) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/openai/readme.md b/examples/openai/readme.md deleted file mode 
100644 index 9a517ac6..00000000 --- a/examples/openai/readme.md +++ /dev/null @@ -1 +0,0 @@ -This folder contains an example of how to use ScrapeGraph-AI with OpenAI, an artificial intelligence platform. The examples show how to extract information from a website using a natural language prompt. \ No newline at end of file diff --git a/examples/openai/scrape_plain_text_openai.py b/examples/openai/scrape_plain_text_openai.py deleted file mode 100644 index 27a65663..00000000 --- a/examples/openai/scrape_plain_text_openai.py +++ /dev/null @@ -1,54 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper from text -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the text file -# ************************************************ - -FILE_NAME = "inputs/plain_html_example.txt" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -# It could be also a http request using the request model -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -openai_key = os.getenv("OPENAI_APIKEY") - -graph_config = { - "llm": { - "api_key": openai_key, - "model": "openai/gpt-4o", - }, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the projects with their description.", - source=text, - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - 
-graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/openai/screenshot_scraper.py b/examples/openai/screenshot_scraper.py deleted file mode 100644 index f5576b64..00000000 --- a/examples/openai/screenshot_scraper.py +++ /dev/null @@ -1,37 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import ScreenshotScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - - -graph_config = { - "llm": { - "api_key": os.getenv("OPENAI_API_KEY"), - "model": "openai/gpt-4o", - }, - "verbose": True, - "headless": False, -} - -# ************************************************ -# Create the ScreenshotScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = ScreenshotScraperGraph( - prompt="List me all the projects", - source="https://perinim.github.io/projects/", - config=graph_config -) - -result = smart_scraper_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/readme.md b/examples/readme.md index 3d3501fb..b750ccf8 100644 --- a/examples/readme.md +++ b/examples/readme.md @@ -1,41 +1,66 @@ -# Benchmark analysis -# Local models -The two websites benchmark are: -- Example 1: https://perinim.github.io/projects -- Example 2: https://www.wired.com (at 17/4/2024) +# 🕷️ Scrapegraph-ai Examples -Both are strored locally as txt file in .txt format because in this way we do not have to think about the internet connection +This directory contains various example implementations of Scrapegraph-ai for different use cases. Each example demonstrates how to leverage the power of Scrapegraph-ai for specific scenarios. 
-The time is measured in seconds +> **Note:** While these examples showcase implementations using OpenAI and Ollama, Scrapegraph-ai supports many other LLM providers! Check out our [documentation](https://docs-oss.scrapegraphai.com/examples) for the full list of supported providers. -The model runned for this benchmark is Mistral on Ollama with nomic-embed-text +## 📚 Available Examples -| Hardware | Example 1 | Example 2 | -| ----------------------- | --------- | --------- | -| Macbook pro 14' m1 | 11.60s | 26.61s | -| Macbook pro 16' m2 max | 8.05s | 12.17s | +- 🧠 `smart_scraper/` - Advanced web scraping with intelligent content extraction +- 🔎 `search_graph/` - Web search and data retrieval +- ⚙️ `script_generator_graph/` - Automated script generation +- 🌐 `depth_search_graph/` - Deep web crawling and content exploration +- 📊 `csv_scraper_graph/` - Scraping and processing data into CSV format +- 📑 `xml_scraper_graph/` - XML data extraction and processing +- 🎤 `speech_graph/` - Speech processing and analysis +- 🔄 `omni_scraper_graph/` - Universal web scraping for multiple data types +- 🔍 `omni_search_graph/` - Comprehensive search across multiple sources +- 📄 `document_scraper_graph/` - Document parsing and data extraction +- 🛠️ `custom_graph/` - Custom graph implementation examples +- 💻 `code_generator_graph/` - Code generation utilities +- 📋 `json_scraper_graph/` - JSON data extraction and processing +- 📋 `colab example`: + + Open In Colab + -**Note**: the examples on Docker are not runned on other devices than the Macbook because the performance are to slow (10 times slower than Ollama). Indeed the results are the following: +## 🚀 Getting Started -| Hardware | Example 1 | Example 2 | -| ------------------ | --------- | --------- | -| Macbook 14' m1 pro | 139.89 | Too long | -# Performance on APIs services -### Example 1: personal portfolio -**URL**: https://perinim.github.io/projects -**Task**: List me all the projects with their description. +1. 
Choose the example that best fits your use case +2. Navigate to the corresponding directory +3. Follow the README instructions in each directory +4. Configure any required environment variables using the provided `.env.example` files -| Name | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD | -| ------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- | -| gpt-3.5-turbo | 25.22 | 445 | 272 | 173 | 1 | 0.000754 | -| gpt-4-turbo-preview | 9.53 | 449 | 272 | 177 | 1 | 0.00803 | +## ⚡ Quick Setup -### Example 2: Wired -**URL**: https://www.wired.com -**Task**: List me all the articles with their description. +```bash +pip install scrapegraphai -| Name | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD | -| ------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- | -| gpt-3.5-turbo | 25.89 | 445 | 272 | 173 | 1 | 0.000754 | -| gpt-4-turbo-preview | 64.70 | 3573 | 2199 | 1374 | 1 | 0.06321 | +playwright install +# choose an example +cd examples/smart_scraper_graph/openai + +# run the example +python smart_scraper_openai.py +``` + +## 📋 Requirements + +Each example may have its own specific requirements. Please refer to the individual README files in each directory for detailed setup instructions. + +## 📚 Additional Resources + +- 📖 [Full Documentation](https://docs-oss.scrapegraphai.com/examples) +- 💡 [Examples Repository](https://github.com/ScrapeGraphAI/ScrapegraphLib-Examples) +- 🤝 [Community Support](https://github.com/ScrapeGraphAI/scrapegraph-ai/discussions) + +## 🤔 Need Help? 
+ +- Check out our [documentation](https://docs-oss.scrapegraphai.com) +- Join our [Discord community](https://discord.gg/scrapegraphai) +- Open an [issue](https://github.com/ScrapeGraphAI/scrapegraph-ai/issues) + +--- + +⭐ Don't forget to star our repository if you find these examples helpful! diff --git a/examples/scrapegraph-api/smart_scraper_api.py b/examples/scrapegraph-api/smart_scraper_api.py deleted file mode 100644 index 8a292ee9..00000000 --- a/examples/scrapegraph-api/smart_scraper_api.py +++ /dev/null @@ -1,44 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - - -graph_config = { - "llm": { - "model": "scrapegraphai/smart-scraper", - "api_key": os.getenv("SCRAPEGRAPH_API_KEY") - }, - "verbose": True, - "headless": False, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="Extract me all the articles", - source="https://www.wired.com", - config=graph_config -) - -result = smart_scraper_graph.run() -print(json.dumps(result, indent=4)) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/script_generator_graph/.env.example b/examples/script_generator_graph/.env.example new file mode 100644 index 00000000..216ab8a7 --- /dev/null +++ b/examples/script_generator_graph/.env.example @@ -0,0 +1,13 @@ +# OpenAI API Configuration 
+OPENAI_API_KEY=your-openai-api-key-here + +# Optional Configurations +MAX_TOKENS=4000 +MODEL_NAME=gpt-4-1106-preview +TEMPERATURE=0.7 + +# Script Generator Settings +DEFAULT_LANGUAGE=python +INCLUDE_COMMENTS=true +ADD_TYPE_HINTS=true +CODE_STYLE=pep8 \ No newline at end of file diff --git a/examples/script_generator_graph/README.md b/examples/script_generator_graph/README.md new file mode 100644 index 00000000..7d1495c6 --- /dev/null +++ b/examples/script_generator_graph/README.md @@ -0,0 +1,30 @@ +# Script Generator Graph Example + +This example demonstrates how to use Scrapegraph-ai to generate automation scripts based on data analysis. + +## Features + +- Automated script generation +- Task automation +- Code optimization +- Multiple language support + +## Setup + +1. Install required dependencies +2. Copy `.env.example` to `.env` +3. Configure your API keys in the `.env` file + +## Usage + +```python +from scrapegraphai.graphs import ScriptGeneratorGraph + +graph = ScriptGeneratorGraph() +script = graph.generate("task description") +``` + +## Environment Variables + +Required environment variables: +- `OPENAI_API_KEY`: Your OpenAI API key \ No newline at end of file diff --git a/examples/local_models/script_generator_ollama.py b/examples/script_generator_graph/ollama/script_generator_ollama.py similarity index 100% rename from examples/local_models/script_generator_ollama.py rename to examples/script_generator_graph/ollama/script_generator_ollama.py diff --git a/examples/local_models/script_multi_generator_ollama.py b/examples/script_generator_graph/ollama/script_multi_generator_ollama.py similarity index 100% rename from examples/local_models/script_multi_generator_ollama.py rename to examples/script_generator_graph/ollama/script_multi_generator_ollama.py diff --git a/examples/openai/script_multi_generator_openai.py b/examples/script_generator_graph/openai/script_generator_multi_openai.py similarity index 100% rename from 
examples/openai/script_multi_generator_openai.py rename to examples/script_generator_graph/openai/script_generator_multi_openai.py diff --git a/examples/openai/script_generator_openai.py b/examples/script_generator_graph/openai/script_generator_openai.py similarity index 100% rename from examples/openai/script_generator_openai.py rename to examples/script_generator_graph/openai/script_generator_openai.py diff --git a/examples/openai/script_generator_schema_openai.py b/examples/script_generator_graph/openai/script_generator_schema_openai.py similarity index 100% rename from examples/openai/script_generator_schema_openai.py rename to examples/script_generator_graph/openai/script_generator_schema_openai.py diff --git a/examples/search_graph/.env.example b/examples/search_graph/.env.example new file mode 100644 index 00000000..a4b25c88 --- /dev/null +++ b/examples/search_graph/.env.example @@ -0,0 +1,11 @@ +# OpenAI API Configuration +OPENAI_API_KEY=your-openai-api-key-here + +# Search API Configuration +SERP_API_KEY=your-serp-api-key-here + +# Optional Configurations +MAX_SEARCH_RESULTS=10 +MAX_TOKENS=4000 +MODEL_NAME=gpt-4-1106-preview +TEMPERATURE=0.7 \ No newline at end of file diff --git a/examples/search_graph/README.md b/examples/search_graph/README.md new file mode 100644 index 00000000..084c6ea2 --- /dev/null +++ b/examples/search_graph/README.md @@ -0,0 +1,31 @@ +# Search Graph Example + +This example shows how to implement a search graph for web content retrieval and analysis using Scrapegraph-ai. + +## Features + +- Web search integration +- Content relevance scoring +- Result filtering +- Data aggregation + +## Setup + +1. Install required dependencies +2. Copy `.env.example` to `.env` +3. 
Configure your API keys in the `.env` file + +## Usage + +```python +from scrapegraphai.graphs import SearchGraph + +graph = SearchGraph() +results = graph.search("your search query") +``` + +## Environment Variables + +Required environment variables: +- `OPENAI_API_KEY`: Your OpenAI API key +- `SERP_API_KEY`: Your SERP API key (optional) \ No newline at end of file diff --git a/examples/local_models/search_graph_ollama.py b/examples/search_graph/ollama/search_graph_ollama.py similarity index 100% rename from examples/local_models/search_graph_ollama.py rename to examples/search_graph/ollama/search_graph_ollama.py diff --git a/examples/local_models/search_graph_schema_ollama.py b/examples/search_graph/ollama/search_graph_schema_ollama.py similarity index 100% rename from examples/local_models/search_graph_schema_ollama.py rename to examples/search_graph/ollama/search_graph_schema_ollama.py diff --git a/examples/openai/search_graph_openai.py b/examples/search_graph/openai/search_graph_openai.py similarity index 100% rename from examples/openai/search_graph_openai.py rename to examples/search_graph/openai/search_graph_openai.py diff --git a/examples/openai/search_graph_schema_openai.py b/examples/search_graph/openai/search_graph_schema_openai.py similarity index 100% rename from examples/openai/search_graph_schema_openai.py rename to examples/search_graph/openai/search_graph_schema_openai.py diff --git a/examples/openai/search_link_graph_openai.py b/examples/search_graph/openai/search_link_graph_openai.py similarity index 100% rename from examples/openai/search_link_graph_openai.py rename to examples/search_graph/openai/search_link_graph_openai.py diff --git a/examples/single_node/fetch_node.py b/examples/single_node/fetch_node.py deleted file mode 100644 index ed2de2e0..00000000 --- a/examples/single_node/fetch_node.py +++ /dev/null @@ -1,30 +0,0 @@ -""" -Example of custom graph using existing nodes -""" - -from scrapegraphai.nodes import FetchNode - -# 
************************************************ -# Define the node -# ************************************************ - - -fetch_node = FetchNode( - input="url | local_dir", - output=["doc"], - node_config={ - "headless": False - } -) - -# ************************************************ -# Test the node -# ************************************************ - -state = { - "url": "https://twitter.com/home" -} - -result = fetch_node.execute(state) - -print(result) diff --git a/examples/single_node/image2text_node.py b/examples/single_node/image2text_node.py deleted file mode 100644 index 0f691e8a..00000000 --- a/examples/single_node/image2text_node.py +++ /dev/null @@ -1,54 +0,0 @@ -""" -Example of ImageToTextNode -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.nodes import ImageToTextNode -from scrapegraphai.models import OpenAIImageToText - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -openai_key = os.getenv("OPENAI_APIKEY") - -graph_config = { - "llm": { - "api_key": openai_key, - "model": "gpt-4o", - "temperature": 0, - }, -} - -# ************************************************ -# Define the node -# ************************************************ - -llm_model = OpenAIImageToText(graph_config["llm"]) - -image_to_text_node = ImageToTextNode( - input="img_url", - output=["img_desc"], - node_config={ - "llm_model": llm_model, - "headless": False - } -) - -# ************************************************ -# Test the node -# ************************************************ - -state = { - "img_url": [ - "https://perinim.github.io/assets/img/rotary_pybullet.jpg", - "https://perinim.github.io/assets/img/value-policy-heatmaps.jpg", - ], -} - -result = image_to_text_node.execute(state) - -print(result) diff --git a/examples/single_node/kg_node.py b/examples/single_node/kg_node.py deleted file mode 100644 index 37d1d9a4..00000000 
--- a/examples/single_node/kg_node.py +++ /dev/null @@ -1,79 +0,0 @@ -""" -Example of knowledge graph node -""" - -import os -from langchain_openai import ChatOpenAI -from scrapegraphai.nodes import KnowledgeGraphNode - -job_postings = { - "Job Postings": { - "Company A": [ - { - "title": "Software Engineer", - "description": "Develop and maintain software applications.", - "location": "New York, NY", - "date_posted": "2024-05-01", - "requirements": ["Python", "Django", "REST APIs"] - }, - { - "title": "Data Scientist", - "description": "Analyze and interpret complex data.", - "location": "San Francisco, CA", - "date_posted": "2024-05-05", - "requirements": ["Python", "Machine Learning", "SQL"] - } - ], - "Company B": [ - { - "title": "Project Manager", - "description": "Manage software development projects.", - "location": "Boston, MA", - "date_posted": "2024-04-20", - "requirements": ["Project Management", "Agile", "Scrum"] - } - ] - } -} - - - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -openai_key = os.getenv("OPENAI_APIKEY") - -graph_config = { - "llm": { - "api_key": openai_key, - "model": "gpt-4o", - "temperature": 0, - }, - "verbose": True, -} - -# ************************************************ -# Define the node -# ************************************************ - -llm_model = ChatOpenAI(graph_config["llm"]) - -robots_node = KnowledgeGraphNode( - input="user_prompt & answer_dict", - output=["is_scrapable"], - node_config={"llm_model": llm_model} -) - -# ************************************************ -# Test the node -# ************************************************ - -state = { - "user_prompt": "What are the job postings?", - "answer_dict": job_postings -} - -result = robots_node.execute(state) - -print(result) diff --git a/examples/single_node/robot_node.py b/examples/single_node/robot_node.py deleted file mode 100644 index dcb70e3d..00000000 --- 
a/examples/single_node/robot_node.py +++ /dev/null @@ -1,49 +0,0 @@ -""" -Example of custom graph using existing nodes -""" - -from langchain_community.chat_models import ChatOllama -from scrapegraphai.nodes import RobotsNode - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "model": "llama3", - "temperature": 0, - "streaming": True - }, - "embeddings": { - "model": "nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily - } -} - -# ************************************************ -# Define the node -# ************************************************ - -llm_model = ChatOllama(graph_config["llm"]) - -robots_node = RobotsNode( - input="url", - output=["is_scrapable"], - node_config={"llm_model": llm_model, - "headless": False - } -) - -# ************************************************ -# Test the node -# ************************************************ - -state = { - "url": "https://twitter.com/home" -} - -result = robots_node.execute(state) - -print(result) diff --git a/examples/single_node/search_internet_node.py b/examples/single_node/search_internet_node.py deleted file mode 100644 index c998cdd1..00000000 --- a/examples/single_node/search_internet_node.py +++ /dev/null @@ -1,50 +0,0 @@ -""" -Example of custom graph using existing nodes -""" - -from langchain_community.chat_models import ChatOllama -from scrapegraphai.nodes import SearchInternetNode - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "model": "llama3", - "temperature": 0, - "streaming": True - }, - "search_engine": "google", - "max_results": 3, - "verbose": True -} - -# ************************************************ -# Define the node -# 
************************************************ - -llm_model = ChatOllama(graph_config["llm"]) - -search_node = SearchInternetNode( - input="user_input", - output=["search_results"], - node_config={ - "llm_model": llm_model, - "search_engine": graph_config["search_engine"], - "max_results": graph_config["max_results"], - "verbose": graph_config["verbose"] - } -) - -# ************************************************ -# Test the node -# ************************************************ - -state = { - "user_input": "What is the capital of France?" -} - -result = search_node.execute(state) - -print(result) diff --git a/examples/smart_scraper_graph/.env.example b/examples/smart_scraper_graph/.env.example new file mode 100644 index 00000000..0c8d0b86 --- /dev/null +++ b/examples/smart_scraper_graph/.env.example @@ -0,0 +1,7 @@ +# OpenAI API Configuration +OPENAI_API_KEY=your-openai-api-key-here + +# Optional Configurations +MAX_TOKENS=4000 +MODEL_NAME=gpt-4-1106-preview +TEMPERATURE=0.7 diff --git a/examples/smart_scraper_graph/README.md b/examples/smart_scraper_graph/README.md new file mode 100644 index 00000000..d5f0d564 --- /dev/null +++ b/examples/smart_scraper_graph/README.md @@ -0,0 +1,30 @@ +# Smart Scraper Example + +This example demonstrates how to use Scrapegraph-ai for intelligent web scraping with automatic content detection and extraction. + +## Features + +- Intelligent content detection +- Automatic data extraction +- Content classification +- Clean data output + +## Setup + +1. Install required dependencies +2. Copy `.env.example` to `.env` +3. 
Configure your OpenAI API key in the `.env` file + +## Usage + +```python +from scrapegraphai.graphs import SmartScraperGraph + +graph = SmartScraperGraph() +results = graph.scrape("https://example.com") +``` + +## Environment Variables + +Required environment variables: +- `OPENAI_API_KEY`: Your OpenAI API key diff --git a/examples/local_models/smart_scraper_lite_ollama.py b/examples/smart_scraper_graph/ollama/smart_scraper_lite_ollama.py similarity index 89% rename from examples/local_models/smart_scraper_lite_ollama.py rename to examples/smart_scraper_graph/ollama/smart_scraper_lite_ollama.py index 2cf6c402..9c4b9a69 100644 --- a/examples/local_models/smart_scraper_lite_ollama.py +++ b/examples/smart_scraper_graph/ollama/smart_scraper_lite_ollama.py @@ -1,8 +1,10 @@ -""" +""" Basic example of scraping pipeline using SmartScraper """ + import json + from scrapegraphai.graphs import SmartScraperLiteGraph from scrapegraphai.utils import prettify_exec_info @@ -10,17 +12,16 @@ "llm": { "model": "ollama/llama3.1", "temperature": 0, - "format": "json", "base_url": "http://localhost:11434", }, "verbose": True, - "headless": False + "headless": False, } smart_scraper_lite_graph = SmartScraperLiteGraph( prompt="Who is Marco Perini?", source="https://perinim.github.io/", - config=graph_config + config=graph_config, ) result = smart_scraper_lite_graph.run() diff --git a/examples/mistral/smart_scraper_multi_concat_mistral.py b/examples/smart_scraper_graph/ollama/smart_scraper_multi_concat_ollama.py similarity index 75% rename from examples/mistral/smart_scraper_multi_concat_mistral.py rename to examples/smart_scraper_graph/ollama/smart_scraper_multi_concat_ollama.py index 9cef8a16..a29ac3fc 100644 --- a/examples/mistral/smart_scraper_multi_concat_mistral.py +++ b/examples/smart_scraper_graph/ollama/smart_scraper_multi_concat_ollama.py @@ -1,9 +1,11 @@ -""" +""" Basic example of scraping pipeline using SmartScraper """ -import os + import json + from dotenv import load_dotenv 
+ from scrapegraphai.graphs import SmartScraperMultiConcatGraph load_dotenv() @@ -11,10 +13,12 @@ # ************************************************ # Define the configuration for the graph # ************************************************ + graph_config = { "llm": { - "api_key": os.getenv("MISTRAL_API_KEY"), - "model": "mistralai/open-mistral-nemo", + "model": "ollama/llama3.1", + "temperature": 0, + "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "verbose": True, "headless": False, @@ -26,12 +30,9 @@ multiple_search_graph = SmartScraperMultiConcatGraph( prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], + source=["https://perinim.github.io/", "https://perinim.github.io/cv/"], schema=None, - config=graph_config + config=graph_config, ) result = multiple_search_graph.run() diff --git a/examples/local_models/smart_scraper_multi_lite_ollama.py b/examples/smart_scraper_graph/ollama/smart_scraper_multi_lite_ollama.py similarity index 82% rename from examples/local_models/smart_scraper_multi_lite_ollama.py rename to examples/smart_scraper_graph/ollama/smart_scraper_multi_lite_ollama.py index f09c4cb4..15055f96 100644 --- a/examples/local_models/smart_scraper_multi_lite_ollama.py +++ b/examples/smart_scraper_graph/ollama/smart_scraper_multi_lite_ollama.py @@ -1,7 +1,9 @@ -""" +""" Basic example of scraping pipeline using SmartScraper """ + import json + from scrapegraphai.graphs import SmartScraperMultiLiteGraph from scrapegraphai.utils import prettify_exec_info @@ -13,11 +15,10 @@ "llm": { "model": "ollama/llama3.1", "temperature": 0, - "format": "json", # Ollama needs the format to be specified explicitly "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "verbose": True, - "headless": False + "headless": False, } # ************************************************ @@ -26,11 +27,8 @@ smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( prompt="Who is Marco 
Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - config=graph_config + source=["https://perinim.github.io/", "https://perinim.github.io/cv/"], + config=graph_config, ) result = smart_scraper_multi_lite_graph.run() @@ -42,4 +40,3 @@ graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() print(prettify_exec_info(graph_exec_info)) - diff --git a/examples/local_models/smart_scraper_multi_ollama.py b/examples/smart_scraper_graph/ollama/smart_scraper_multi_ollama.py similarity index 77% rename from examples/local_models/smart_scraper_multi_ollama.py rename to examples/smart_scraper_graph/ollama/smart_scraper_multi_ollama.py index c9d49793..04eb0e67 100644 --- a/examples/local_models/smart_scraper_multi_ollama.py +++ b/examples/smart_scraper_graph/ollama/smart_scraper_multi_ollama.py @@ -1,8 +1,9 @@ -""" +""" Basic example of scraping pipeline using SmartScraper """ import json + from scrapegraphai.graphs import SmartScraperMultiGraph # ************************************************ @@ -12,12 +13,10 @@ "llm": { "model": "ollama/llama3.1", "temperature": 0, - "format": "json", # Ollama needs the format to be specified explicitly # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, - "verbose": True, - "headless": False + "headless": False, } @@ -27,12 +26,9 @@ multiple_search_graph = SmartScraperMultiGraph( prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], + source=["https://perinim.github.io/", "https://perinim.github.io/cv/"], schema=None, - config=graph_config + config=graph_config, ) result = multiple_search_graph.run() diff --git a/examples/local_models/smart_scraper_ollama.py b/examples/smart_scraper_graph/ollama/smart_scraper_ollama.py similarity index 93% rename from examples/local_models/smart_scraper_ollama.py rename to examples/smart_scraper_graph/ollama/smart_scraper_ollama.py index b08dceb9..9642b3f8 100644 --- 
a/examples/local_models/smart_scraper_ollama.py +++ b/examples/smart_scraper_graph/ollama/smart_scraper_ollama.py @@ -13,7 +13,6 @@ "llm": { "model": "ollama/llama3.2:3b", "temperature": 0, - "format": "json", # Ollama needs the format to be specified explicitly # "base_url": "http://localhost:11434", # set ollama URL arbitrarily "model_tokens": 4096, }, diff --git a/examples/local_models/smart_scraper_schema_ollama.py b/examples/smart_scraper_graph/ollama/smart_scraper_schema_ollama.py similarity index 100% rename from examples/local_models/smart_scraper_schema_ollama.py rename to examples/smart_scraper_graph/ollama/smart_scraper_schema_ollama.py diff --git a/examples/openai/smart_scraper_lite_openai.py b/examples/smart_scraper_graph/openai/smart_scraper_lite_openai.py similarity index 95% rename from examples/openai/smart_scraper_lite_openai.py rename to examples/smart_scraper_graph/openai/smart_scraper_lite_openai.py index 5de725bb..3d768548 100644 --- a/examples/openai/smart_scraper_lite_openai.py +++ b/examples/smart_scraper_graph/openai/smart_scraper_lite_openai.py @@ -1,9 +1,12 @@ -""" +""" Basic example of scraping pipeline using SmartScraper """ -import os + import json +import os + from dotenv import load_dotenv + from scrapegraphai.graphs import SmartScraperLiteGraph from scrapegraphai.utils import prettify_exec_info @@ -21,7 +24,7 @@ smart_scraper_lite_graph = SmartScraperLiteGraph( prompt="Who is Marco Perini?", source="https://perinim.github.io/", - config=graph_config + config=graph_config, ) result = smart_scraper_lite_graph.run() @@ -29,4 +32,3 @@ graph_exec_info = smart_scraper_lite_graph.get_execution_info() print(prettify_exec_info(graph_exec_info)) - diff --git a/examples/openai/smart_scraper_multi_concat_openai.py b/examples/smart_scraper_graph/openai/smart_scraper_multi_concat_openai.py similarity index 86% rename from examples/openai/smart_scraper_multi_concat_openai.py rename to 
examples/smart_scraper_graph/openai/smart_scraper_multi_concat_openai.py index 650971f1..4774e620 100644 --- a/examples/openai/smart_scraper_multi_concat_openai.py +++ b/examples/smart_scraper_graph/openai/smart_scraper_multi_concat_openai.py @@ -1,9 +1,12 @@ -""" +""" Basic example of scraping pipeline using SmartScraper """ -import os + import json +import os + from dotenv import load_dotenv + from scrapegraphai.graphs import SmartScraperMultiConcatGraph load_dotenv() @@ -28,12 +31,9 @@ multiple_search_graph = SmartScraperMultiConcatGraph( prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], + source=["https://perinim.github.io/", "https://perinim.github.io/cv/"], schema=None, - config=graph_config + config=graph_config, ) result = multiple_search_graph.run() diff --git a/examples/openai/smart_scraper_multi_lite_openai.py b/examples/smart_scraper_graph/openai/smart_scraper_multi_lite_openai.py similarity index 89% rename from examples/openai/smart_scraper_multi_lite_openai.py rename to examples/smart_scraper_graph/openai/smart_scraper_multi_lite_openai.py index 69eeafc7..acc970be 100644 --- a/examples/openai/smart_scraper_multi_lite_openai.py +++ b/examples/smart_scraper_graph/openai/smart_scraper_multi_lite_openai.py @@ -1,9 +1,12 @@ -""" +""" Basic example of scraping pipeline using SmartScraper """ -import os + import json +import os + from dotenv import load_dotenv + from scrapegraphai.graphs import SmartScraperMultiLiteGraph from scrapegraphai.utils import prettify_exec_info @@ -29,11 +32,8 @@ smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - config=graph_config + source=["https://perinim.github.io/", "https://perinim.github.io/cv/"], + config=graph_config, ) result = smart_scraper_multi_lite_graph.run() diff --git a/examples/openai/smart_scraper_multi_openai.py 
b/examples/smart_scraper_graph/openai/smart_scraper_multi_openai.py similarity index 86% rename from examples/openai/smart_scraper_multi_openai.py rename to examples/smart_scraper_graph/openai/smart_scraper_multi_openai.py index ba889c96..ec510fc2 100644 --- a/examples/openai/smart_scraper_multi_openai.py +++ b/examples/smart_scraper_graph/openai/smart_scraper_multi_openai.py @@ -1,9 +1,12 @@ -""" +""" Basic example of scraping pipeline using SmartScraper """ -import os + import json +import os + from dotenv import load_dotenv + from scrapegraphai.graphs import SmartScraperMultiGraph load_dotenv() @@ -29,12 +32,9 @@ multiple_search_graph = SmartScraperMultiGraph( prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], + source=["https://perinim.github.io/", "https://perinim.github.io/cv/"], schema=None, - config=graph_config + config=graph_config, ) result = multiple_search_graph.run() diff --git a/examples/openai/smart_scraper_openai.py b/examples/smart_scraper_graph/openai/smart_scraper_openai.py similarity index 100% rename from examples/openai/smart_scraper_openai.py rename to examples/smart_scraper_graph/openai/smart_scraper_openai.py diff --git a/examples/openai/smart_scraper_schema_openai.py b/examples/smart_scraper_graph/openai/smart_scraper_schema_openai.py similarity index 97% rename from examples/openai/smart_scraper_schema_openai.py rename to examples/smart_scraper_graph/openai/smart_scraper_schema_openai.py index 32e8891a..3a75bd5a 100644 --- a/examples/openai/smart_scraper_schema_openai.py +++ b/examples/smart_scraper_graph/openai/smart_scraper_schema_openai.py @@ -1,10 +1,13 @@ -""" +""" Basic example of scraping pipeline using SmartScraper with schema """ + import os from typing import List + from dotenv import load_dotenv from pydantic import BaseModel, Field + from scrapegraphai.graphs import SmartScraperGraph load_dotenv() @@ -13,13 +16,16 @@ # Define the output schema for the graph # 
************************************************ + class Project(BaseModel): title: str = Field(description="The title of the project") description: str = Field(description="The description of the project") + class Projects(BaseModel): projects: List[Project] + # ************************************************ # Define the configuration for the graph # ************************************************ @@ -43,7 +49,7 @@ class Projects(BaseModel): prompt="List me all the projects with their description", source="https://perinim.github.io/projects/", schema=Projects, - config=graph_config + config=graph_config, ) result = smart_scraper_graph.run() diff --git a/examples/speech_graph/.env.example b/examples/speech_graph/.env.example new file mode 100644 index 00000000..93aa505d --- /dev/null +++ b/examples/speech_graph/.env.example @@ -0,0 +1,14 @@ +# OpenAI API Configuration +OPENAI_API_KEY=your-openai-api-key-here + +# Whisper API Configuration (Optional) +WHISPER_API_KEY=your-whisper-api-key-here + +# Optional Configurations +MAX_TOKENS=4000 +MODEL_NAME=gpt-4-1106-preview +TEMPERATURE=0.7 + +# Speech Settings +AUDIO_FORMAT=mp3 +SAMPLE_RATE=16000 \ No newline at end of file diff --git a/examples/speech_graph/README.md b/examples/speech_graph/README.md new file mode 100644 index 00000000..35be155f --- /dev/null +++ b/examples/speech_graph/README.md @@ -0,0 +1,31 @@ +# Speech Graph Example + +This example demonstrates how to use Scrapegraph-ai for speech processing and analysis. + +## Features + +- Speech-to-text conversion +- Audio processing +- Text analysis +- Sentiment analysis + +## Setup + +1. Install required dependencies +2. Copy `.env.example` to `.env` +3. 
Configure your API keys in the `.env` file + +## Usage + +```python +from scrapegraphai.graphs import SpeechGraph + +graph = SpeechGraph() +text = graph.process("audio_file.mp3") +``` + +## Environment Variables + +Required environment variables: +- `OPENAI_API_KEY`: Your OpenAI API key +- `WHISPER_API_KEY`: Your Whisper API key (optional) \ No newline at end of file diff --git a/examples/openai/speech_graph_openai.py b/examples/speech_graph/speech_graph_openai.py similarity index 100% rename from examples/openai/speech_graph_openai.py rename to examples/speech_graph/speech_graph_openai.py diff --git a/examples/together/.env.example b/examples/together/.env.example deleted file mode 100644 index 7004713a..00000000 --- a/examples/together/.env.example +++ /dev/null @@ -1 +0,0 @@ -TOGETHER_APIKEY="your api key" \ No newline at end of file diff --git a/examples/together/code_generator_graph_togehter.py b/examples/together/code_generator_graph_togehter.py deleted file mode 100644 index aefbeba4..00000000 --- a/examples/together/code_generator_graph_togehter.py +++ /dev/null @@ -1,60 +0,0 @@ -""" -Basic example of scraping pipeline using Code Generator with schema -""" - -import os, json -from typing import List -from dotenv import load_dotenv -from pydantic import BaseModel, Field -from scrapegraphai.graphs import CodeGeneratorGraph - -load_dotenv() - -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Project(BaseModel): - title: str = Field(description="The title of the project") - description: str = Field(description="The description of the project") - -class Projects(BaseModel): - projects: List[Project] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -together_key = os.getenv("TOGETHER_KEY") - -graph_config = { - "llm": { - "model": 
"togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", - "api_key": together_key, - }, - "verbose": True, - "headless": False, - "reduction": 2, - "max_iterations": { - "overall": 10, - "syntax": 3, - "execution": 3, - "validation": 3, - "semantic": 3 - }, - "output_file_name": "extracted_data.py" -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -code_generator_graph = CodeGeneratorGraph( - prompt="List me all the projects with their description", - source="https://perinim.github.io/projects/", - schema=Projects, - config=graph_config -) - -result = code_generator_graph.run() -print(result) diff --git a/examples/together/csv_scraper_graph_multi_together.py b/examples/together/csv_scraper_graph_multi_together.py deleted file mode 100644 index beee56c1..00000000 --- a/examples/together/csv_scraper_graph_multi_together.py +++ /dev/null @@ -1,56 +0,0 @@ -""" -Basic example of scraping pipeline using CSVScraperMultiGraph from CSV documents -""" -import os -from dotenv import load_dotenv -import pandas as pd -from scrapegraphai.graphs import CSVScraperMultiGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() -# ************************************************ -# Read the CSV file -# ************************************************ - -FILE_NAME = "inputs/username.csv" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -text = pd.read_csv(file_path) - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -together_key = os.getenv("TOGETHER_APIKEY") - -graph_config = { - "llm": { - "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", - "api_key": together_key, - }, - "verbose": True, -} -# ************************************************ -# 
Create the CSVScraperMultiGraph instance and run it -# ************************************************ - -csv_scraper_graph = CSVScraperMultiGraph( - prompt="List me all the last names", - source=[str(text), str(text)], - config=graph_config -) - -result = csv_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = csv_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/together/csv_scraper_together.py b/examples/together/csv_scraper_together.py deleted file mode 100644 index 5d1a3474..00000000 --- a/examples/together/csv_scraper_together.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -Basic example of scraping pipeline using CSVScraperGraph from CSV documents -""" -import os -from dotenv import load_dotenv -import pandas as pd -from scrapegraphai.graphs import CSVScraperGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the CSV file -# ************************************************ - -FILE_NAME = "inputs/username.csv" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -text = pd.read_csv(file_path) - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -together_key = os.getenv("TOGETHER_APIKEY") - -graph_config = { - "llm": { - "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", - "api_key": together_key, - }, - "verbose": True, -} -# ************************************************ -# Create the CSVScraperGraph instance and run it -# ************************************************ - -csv_scraper_graph = 
CSVScraperGraph( - prompt="List me all the last names", - source=str(text), # Pass the content of the file, not the file object - config=graph_config -) - -result = csv_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = csv_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/together/depth_search_graph_together.py b/examples/together/depth_search_graph_together.py deleted file mode 100644 index fb7b4d9e..00000000 --- a/examples/together/depth_search_graph_together.py +++ /dev/null @@ -1,30 +0,0 @@ -""" -depth_search_graph_opeani example -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import DepthSearchGraph - -load_dotenv() - -together_key = os.getenv("TOGETHER_APIKEY") - -graph_config = { - "llm": { - "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", - "api_key": together_key, - }, - "verbose": True, - "headless": False, - "depth": 2, - "only_inside_links": False, -} - -search_graph = DepthSearchGraph( - prompt="List me all the projects with their description", - source="https://perinim.github.io", - config=graph_config -) - -result = search_graph.run() -print(result) diff --git a/examples/together/document_scraper_together.py b/examples/together/document_scraper_together.py deleted file mode 100644 index c3324330..00000000 --- a/examples/together/document_scraper_together.py +++ /dev/null @@ -1,39 +0,0 @@ -""" -document_scraper example -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import DocumentScraperGraph - -load_dotenv() - -together_key = os.getenv("TOGETHER_APIKEY") - -graph_config = { - "llm": { - "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", - "api_key": together_key, - }, - 
"verbose": True, -} - -source = """ - The Divine Comedy, Italian La Divina Commedia, original name La commedia, long narrative poem written in Italian - circa 1308/21 by Dante. It is usually held to be one of the world s great works of literature. - Divided into three major sections—Inferno, Purgatorio, and Paradiso—the narrative traces the journey of Dante - from darkness and error to the revelation of the divine light, culminating in the Beatific Vision of God. - Dante is guided by the Roman poet Virgil, who represents the epitome of human knowledge, from the dark wood - through the descending circles of the pit of Hell (Inferno). He then climbs the mountain of Purgatory, guided - by the Roman poet Statius, who represents the fulfilment of human knowledge, and is finally led by his lifelong love, - the Beatrice of his earlier poetry, through the celestial spheres of Paradise. -""" - -pdf_scraper_graph = DocumentScraperGraph( - prompt="Summarize the text and find the main topics", - source=source, - config=graph_config, -) -result = pdf_scraper_graph.run() - -print(json.dumps(result, indent=4)) \ No newline at end of file diff --git a/examples/together/inputs/books.xml b/examples/together/inputs/books.xml deleted file mode 100644 index e3d1fe87..00000000 --- a/examples/together/inputs/books.xml +++ /dev/null @@ -1,120 +0,0 @@ - - - - Gambardella, Matthew - XML Developer's Guide - Computer - 44.95 - 2000-10-01 - An in-depth look at creating applications - with XML. - - - Ralls, Kim - Midnight Rain - Fantasy - 5.95 - 2000-12-16 - A former architect battles corporate zombies, - an evil sorceress, and her own childhood to become queen - of the world. - - - Corets, Eva - Maeve Ascendant - Fantasy - 5.95 - 2000-11-17 - After the collapse of a nanotechnology - society in England, the young survivors lay the - foundation for a new society. 
- - - Corets, Eva - Oberon's Legacy - Fantasy - 5.95 - 2001-03-10 - In post-apocalypse England, the mysterious - agent known only as Oberon helps to create a new life - for the inhabitants of London. Sequel to Maeve - Ascendant. - - - Corets, Eva - The Sundered Grail - Fantasy - 5.95 - 2001-09-10 - The two daughters of Maeve, half-sisters, - battle one another for control of England. Sequel to - Oberon's Legacy. - - - Randall, Cynthia - Lover Birds - Romance - 4.95 - 2000-09-02 - When Carla meets Paul at an ornithology - conference, tempers fly as feathers get ruffled. - - - Thurman, Paula - Splish Splash - Romance - 4.95 - 2000-11-02 - A deep sea diver finds true love twenty - thousand leagues beneath the sea. - - - Knorr, Stefan - Creepy Crawlies - Horror - 4.95 - 2000-12-06 - An anthology of horror stories about roaches, - centipedes, scorpions and other insects. - - - Kress, Peter - Paradox Lost - Science Fiction - 6.95 - 2000-11-02 - After an inadvertant trip through a Heisenberg - Uncertainty Device, James Salway discovers the problems - of being quantum. - - - O'Brien, Tim - Microsoft .NET: The Programming Bible - Computer - 36.95 - 2000-12-09 - Microsoft's .NET initiative is explored in - detail in this deep programmer's reference. - - - O'Brien, Tim - MSXML3: A Comprehensive Guide - Computer - 36.95 - 2000-12-01 - The Microsoft MSXML3 parser is covered in - detail, with attention to XML DOM interfaces, XSLT processing, - SAX and more. - - - Galos, Mike - Visual Studio 7: A Comprehensive Guide - Computer - 49.95 - 2001-04-16 - Microsoft Visual Studio 7 is explored in depth, - looking at how Visual Basic, Visual C++, C#, and ASP+ are - integrated into a comprehensive development - environment. 
- - \ No newline at end of file diff --git a/examples/together/inputs/example.json b/examples/together/inputs/example.json deleted file mode 100644 index 2263184c..00000000 --- a/examples/together/inputs/example.json +++ /dev/null @@ -1,182 +0,0 @@ -{ - "kind":"youtube#searchListResponse", - "etag":"q4ibjmYp1KA3RqMF4jFLl6PBwOg", - "nextPageToken":"CAUQAA", - "regionCode":"NL", - "pageInfo":{ - "totalResults":1000000, - "resultsPerPage":5 - }, - "items":[ - { - "kind":"youtube#searchResult", - "etag":"QCsHBifbaernVCbLv8Cu6rAeaDQ", - "id":{ - "kind":"youtube#video", - "videoId":"TvWDY4Mm5GM" - }, - "snippet":{ - "publishedAt":"2023-07-24T14:15:01Z", - "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", - "title":"3 Football Clubs Kylian Mbappe Should Avoid Signing ✍️❌⚽️ #football #mbappe #shorts", - "description":"", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"FC Motivate", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T14:15:01Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"0NG5QHdtIQM_V-DBJDEf-jK_Y9k", - "id":{ - "kind":"youtube#video", - "videoId":"aZM_42CcNZ4" - }, - "snippet":{ - "publishedAt":"2023-07-24T16:09:27Z", - "channelId":"UCM5gMM_HqfKHYIEJ3lstMUA", - "title":"Which Football Club Could Cristiano Ronaldo Afford To Buy? 
💰", - "description":"Sign up to Sorare and get a FREE card: https://sorare.pxf.io/NellisShorts Give Soraredata a go for FREE: ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"John Nellis", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T16:09:27Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"WbBz4oh9I5VaYj91LjeJvffrBVY", - "id":{ - "kind":"youtube#video", - "videoId":"wkP3XS3aNAY" - }, - "snippet":{ - "publishedAt":"2023-07-24T16:00:50Z", - "channelId":"UC4EP1dxFDPup_aFLt0ElsDw", - "title":"PAULO DYBALA vs THE WORLD'S LONGEST FREEKICK WALL", - "description":"Can Paulo Dybala curl a football around the World's longest free kick wall? We met up with the World Cup winner and put him to ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"Shoot for Love", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T16:00:50Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"juxv_FhT_l4qrR05S1QTrb4CGh8", - "id":{ - "kind":"youtube#video", - "videoId":"rJkDZ0WvfT8" - }, - "snippet":{ - "publishedAt":"2023-07-24T10:00:39Z", - "channelId":"UCO8qj5u80Ga7N_tP3BZWWhQ", - "title":"TOP 10 DEFENDERS 2023", - "description":"SoccerKingz https://soccerkingz.nl Use code: 'ILOVEHOF' to get 10% off. TOP 10 DEFENDERS 2023 Follow us! 
• Instagram ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"Home of Football", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T10:00:39Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"wtuknXTmI1txoULeH3aWaOuXOow", - "id":{ - "kind":"youtube#video", - "videoId":"XH0rtu4U6SE" - }, - "snippet":{ - "publishedAt":"2023-07-21T16:30:05Z", - "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", - "title":"3 Things You Didn't Know About Erling Haaland ⚽️🇳🇴 #football #haaland #shorts", - "description":"", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"FC Motivate", - "liveBroadcastContent":"none", - "publishTime":"2023-07-21T16:30:05Z" - } - } - ] -} \ No newline at end of file diff --git a/examples/together/inputs/username.csv b/examples/together/inputs/username.csv deleted file mode 100644 index 006ac8e6..00000000 --- a/examples/together/inputs/username.csv +++ /dev/null @@ -1,7 +0,0 @@ -Username; Identifier;First name;Last name -booker12;9012;Rachel;Booker -grey07;2070;Laura;Grey -johnson81;4081;Craig;Johnson -jenkins46;9346;Mary;Jenkins -smith79;5079;Jamie;Smith - diff --git a/examples/together/json_scraper_multi_together.py b/examples/together/json_scraper_multi_together.py deleted file mode 100644 index 0d9ac293..00000000 --- a/examples/together/json_scraper_multi_together.py +++ /dev/null @@ -1,38 +0,0 @@ -""" -Module for showing how 
JSONScraperMultiGraph multi works -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import JSONScraperMultiGraph - -load_dotenv() - -together_key = os.getenv("TOGETHER_APIKEY") - -graph_config = { - "llm": { - "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", - "api_key": together_key, - }, - "verbose": True, -} - -FILE_NAME = "inputs/example.json" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -sources = [text, text] - -multiple_search_graph = JSONScraperMultiGraph( - prompt= "List me all the authors, title and genres of the books", - source= sources, - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/together/json_scraper_together.py b/examples/together/json_scraper_together.py deleted file mode 100644 index a39c6ce4..00000000 --- a/examples/together/json_scraper_together.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -Basic example of scraping pipeline using JSONScraperGraph from JSON documents -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import JSONScraperGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the JSON file -# ************************************************ -together_key = os.getenv("TOGETHER_APIKEY") - -FILE_NAME = "inputs/example.json" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "model": 
"togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", - "api_key": together_key, - }, - "verbose": True, -} - -# ************************************************ -# Create the JSONScraperGraph instance and run it -# ************************************************ - -json_scraper_graph = JSONScraperGraph( - prompt="List me all the authors, title and genres of the books", - source=text, # Pass the content of the file, not the file object - config=graph_config -) - -result = json_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = json_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/together/rate_limit_together.py b/examples/together/rate_limit_together.py deleted file mode 100644 index 89e3f89f..00000000 --- a/examples/together/rate_limit_together.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper with a custom rate limit -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -together_key = os.getenv("TOGETHER_APIKEY") - -graph_config = { - "llm": { - "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", - "api_key": together_key, - "rate_limit": { - "requests_per_second": 1 - } - }, - "verbose": True, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the projects with their description.", - # 
also accepts a string with the already downloaded HTML code - source="https://perinim.github.io/projects/", - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/together/scrape_plain_text_together.py b/examples/together/scrape_plain_text_together.py deleted file mode 100644 index feff1e3a..00000000 --- a/examples/together/scrape_plain_text_together.py +++ /dev/null @@ -1,55 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper from text -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the text file -# ************************************************ - -FILE_NAME = "inputs/plain_html_example.txt" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -# It could be also a http request using the request model -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -together_key = os.getenv("TOGETHER_APIKEY") - -graph_config = { - "llm": { - "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", - "api_key": together_key, - }, - "verbose": True, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the news with their description.", - source=text, - config=graph_config -) - -result = 
smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/together/script_generator_together.py b/examples/together/script_generator_together.py deleted file mode 100644 index cfe46c83..00000000 --- a/examples/together/script_generator_together.py +++ /dev/null @@ -1,44 +0,0 @@ -""" -Basic example of scraping pipeline using ScriptCreatorGraph -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import ScriptCreatorGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -together_key = os.getenv("TOGETHER_APIKEY") - -graph_config = { - "llm": { - "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", - "api_key": together_key, - }, - "verbose": True, -} - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -script_creator_graph = ScriptCreatorGraph( - prompt="List me all the projects with their description.", - # also accepts a string with the already downloaded HTML code - source="https://perinim.github.io/projects", - config=graph_config -) - -result = script_creator_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = script_creator_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/together/script_multi_generator_together.py b/examples/together/script_multi_generator_together.py deleted file mode 100644 index 0596f1e2..00000000 --- 
a/examples/together/script_multi_generator_together.py +++ /dev/null @@ -1,53 +0,0 @@ -""" -Basic example of scraping pipeline using ScriptCreatorGraph -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import ScriptCreatorMultiGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -together_key = os.getenv("TOGETHER_APIKEY") - -graph_config = { - "llm": { - "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", - "api_key": together_key, - }, - "library": "beautifulsoup" -} - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -urls=[ - "https://schultzbergagency.com/emil-raste-karlsen/", - "https://schultzbergagency.com/johanna-hedberg/", -] - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -script_creator_graph = ScriptCreatorMultiGraph( - prompt="Find information about actors", - # also accepts a string with the already downloaded HTML code - source=urls, - config=graph_config -) - -result = script_creator_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = script_creator_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/together/search_graph_schema_together.py b/examples/together/search_graph_schema_together.py deleted file mode 100644 index c5954294..00000000 --- a/examples/together/search_graph_schema_together.py +++ /dev/null @@ -1,61 +0,0 @@ -""" -Example of Search Graph -""" - -import os -from typing import List -from pydantic import BaseModel, Field -from dotenv import 
load_dotenv -from scrapegraphai.graphs import SearchGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Dish(BaseModel): - name: str = Field(description="The name of the dish") - description: str = Field(description="The description of the dish") - -class Dishes(BaseModel): - dishes: List[Dish] - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -together_key = os.getenv("TOGETHER_APIKEY") - -graph_config = { - "llm": { - "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", - "api_key": together_key, - }, - "verbose": True, -} - -# ************************************************ -# Create the SearchGraph instance and run it -# ************************************************ - -search_graph = SearchGraph( - prompt="List me Chioggia's famous dishes", - config=graph_config, - schema=Dishes -) - -result = search_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = search_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json and csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/together/search_graph_together.py b/examples/together/search_graph_together.py deleted file mode 100644 index e4c442c4..00000000 --- a/examples/together/search_graph_together.py +++ /dev/null @@ -1,34 +0,0 @@ -""" -Example of Search Graph -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SearchGraph -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# 
************************************************ - -together_key = os.getenv("TOGETHER_APIKEY") - -graph_config = { - "llm": { - "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", - "api_key": together_key, - }, - "max_results": 2, - "verbose": True, -} - -# ************************************************ -# Create the SearchGraph instance and run it -# ************************************************ - -search_graph = SearchGraph( - prompt="List me the best escursions near Trento", - config=graph_config -) - -result = search_graph.run() -print(result) diff --git a/examples/together/search_link_graph_together.py b/examples/together/search_link_graph_together.py deleted file mode 100644 index 46c86d5c..00000000 --- a/examples/together/search_link_graph_together.py +++ /dev/null @@ -1,46 +0,0 @@ -""" -Example of Search Graph -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SearchGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -load_dotenv() - -together_key = os.getenv("TOGETHER_APIKEY") - -graph_config = { - "llm": { - "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", - "api_key": together_key, - }, - "verbose": True, -} - -# ************************************************ -# Create the SearchGraph instance and run it -# ************************************************ - -search_graph = SearchGraph( - prompt="List me the best escursions near Trento", - config=graph_config -) - -result = search_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = search_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json and csv -convert_to_csv(result, "result") 
-convert_to_json(result, "result") diff --git a/examples/together/smart_scraper_lite_together.py b/examples/together/smart_scraper_lite_together.py deleted file mode 100644 index 0519ecba..00000000 --- a/examples/together/smart_scraper_lite_together.py +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/examples/together/smart_scraper_multi_lite_together.py b/examples/together/smart_scraper_multi_lite_together.py deleted file mode 100644 index 8cf66dea..00000000 --- a/examples/together/smart_scraper_multi_lite_together.py +++ /dev/null @@ -1,43 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiLiteGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -together_key = os.getenv("TOGETHER_APIKEY") - -graph_config = { - "llm": { - "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", - "api_key": together_key, - }, - "verbose": True, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - config=graph_config -) - -result = smart_scraper_multi_lite_graph.run() -print(json.dumps(result, indent=4)) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/together/smart_scraper_multi_together.py b/examples/together/smart_scraper_multi_together.py deleted file mode 100644 index a2da7b8f..00000000 --- a/examples/together/smart_scraper_multi_together.py +++ /dev/null @@ -1,40 +0,0 @@ -""" -Basic 
example of scraping pipeline using SmartScraper -""" -import os -import json -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiGraph - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -together_key = os.getenv("TOGETHER_APIKEY") - -graph_config = { - "llm": { - "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", - "api_key": together_key, - }, - "verbose": True, -} - -# ******************************************************* -# Create the SmartScraperMultiGraph instance and run it -# ******************************************************* - -multiple_search_graph = SmartScraperMultiGraph( - prompt="Who is Marco Perini?", - source= [ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" - ], - schema=None, - config=graph_config -) - -result = multiple_search_graph.run() -print(json.dumps(result, indent=4)) diff --git a/examples/together/smart_scraper_schema_together.py b/examples/together/smart_scraper_schema_together.py deleted file mode 100644 index 45883ff0..00000000 --- a/examples/together/smart_scraper_schema_together.py +++ /dev/null @@ -1,58 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -from typing import List -from pydantic import BaseModel, Field -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the output schema for the graph -# ************************************************ - -class Project(BaseModel): - title: str = Field(description="The title of the project") - description: str = Field(description="The description of the project") - -class Projects(BaseModel): - projects: List[Project] - -# ************************************************ -# Define the configuration for the 
graph -# ************************************************ - -together_key = os.getenv("TOGETHER_APIKEY") - -graph_config = { - "llm": { - "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", - "api_key": together_key, - }, - "verbose": True, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the projects with their description.", - # also accepts a string with the already downloaded HTML code - source="https://perinim.github.io/projects/", - schema=Projects, - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/together/smart_scraper_together.py b/examples/together/smart_scraper_together.py deleted file mode 100644 index c60656f2..00000000 --- a/examples/together/smart_scraper_together.py +++ /dev/null @@ -1,44 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -together_key = os.getenv("TOGETHER_APIKEY") - -graph_config = { - "llm": { - "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", - "api_key": together_key, - }, - "verbose": True, -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List 
me all the projects with their description.", - # also accepts a string with the already downloaded HTML code - source="https://perinim.github.io/projects/", - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/together/xml_scraper_graph_multi_together.py b/examples/together/xml_scraper_graph_multi_together.py deleted file mode 100644 index d6d98a0d..00000000 --- a/examples/together/xml_scraper_graph_multi_together.py +++ /dev/null @@ -1,58 +0,0 @@ -""" -Basic example of scraping pipeline using XMLScraperMultiGraph from XML documents -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import XMLScraperMultiGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the XML file -# ************************************************ - -FILE_NAME = "inputs/books.xml" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -together_key = os.getenv("TOGETHER_APIKEY") - -graph_config = { - "llm": { - "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", - "api_key": together_key, - }, - "verbose": True, -} - -# ************************************************ -# Create the XMLScraperMultiGraph instance and run it -# ************************************************ - -xml_scraper_graph = XMLScraperMultiGraph( - prompt="List me all the authors, title and genres of the books", - 
source=[text, text], # Pass the content of the file, not the file object - config=graph_config -) - -result = xml_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = xml_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/together/xml_scraper_together.py b/examples/together/xml_scraper_together.py deleted file mode 100644 index b1d39e2e..00000000 --- a/examples/together/xml_scraper_together.py +++ /dev/null @@ -1,59 +0,0 @@ -""" -Basic example of scraping pipeline using XMLScraperGraph from XML documents -""" -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import XMLScraperGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -load_dotenv() - -# ************************************************ -# Read the XML file -# ************************************************ - -FILE_NAME = "inputs/books.xml" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - - -together_key = os.getenv("TOGETHER_APIKEY") - -graph_config = { - "llm": { - "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", - "api_key": together_key, - }, - "verbose": True, -} - -# ************************************************ -# Create the XMLScraperGraph instance and run it -# ************************************************ - -xml_scraper_graph = XMLScraperGraph( - prompt="List me all the authors, title and genres of the books", - source=text, # Pass the content of the file, not 
the file object - config=graph_config -) - -result = xml_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = xml_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/xml_scraper_graph/.env.example b/examples/xml_scraper_graph/.env.example new file mode 100644 index 00000000..9f0d484b --- /dev/null +++ b/examples/xml_scraper_graph/.env.example @@ -0,0 +1,11 @@ +# OpenAI API Configuration +OPENAI_API_KEY=your-openai-api-key-here + +# Optional Configurations +MAX_TOKENS=4000 +MODEL_NAME=gpt-4-1106-preview +TEMPERATURE=0.7 + +# XML Scraper Settings +XPATH_TIMEOUT=30 +VALIDATE_XML=true \ No newline at end of file diff --git a/examples/xml_scraper_graph/README.md b/examples/xml_scraper_graph/README.md new file mode 100644 index 00000000..ed7eaf30 --- /dev/null +++ b/examples/xml_scraper_graph/README.md @@ -0,0 +1,30 @@ +# XML Scraper Graph Example + +This example demonstrates how to use Scrapegraph-ai to extract and process XML data from web sources. + +## Features + +- XML data extraction +- XPath querying +- Data transformation +- Schema validation + +## Setup + +1. Install required dependencies +2. Copy `.env.example` to `.env` +3. 
Configure your API keys in the `.env` file + +## Usage + +```python +from scrapegraphai.graphs import XMLScraperGraph +# xml_text: the XML document as a string; graph_config: dict with an "llm" entry (model, api_key) +graph = XMLScraperGraph(prompt="List me all the authors, title and genres of the books", source=xml_text, config=graph_config) +result = graph.run() +``` + +## Environment Variables + +Required environment variables: +- `OPENAI_API_KEY`: Your OpenAI API key \ No newline at end of file diff --git a/examples/anthropic/inputs/books.xml b/examples/xml_scraper_graph/ollama/inputs/books.xml similarity index 100% rename from examples/anthropic/inputs/books.xml rename to examples/xml_scraper_graph/ollama/inputs/books.xml diff --git a/examples/local_models/xml_scraper_graph_multi_ollama.py b/examples/xml_scraper_graph/ollama/xml_scraper_graph_multi_ollama.py similarity index 100% rename from examples/local_models/xml_scraper_graph_multi_ollama.py rename to examples/xml_scraper_graph/ollama/xml_scraper_graph_multi_ollama.py diff --git a/examples/local_models/xml_scraper_ollama.py b/examples/xml_scraper_graph/ollama/xml_scraper_ollama.py similarity index 100% rename from examples/local_models/xml_scraper_ollama.py rename to examples/xml_scraper_graph/ollama/xml_scraper_ollama.py diff --git a/examples/azure/inputs/books.xml b/examples/xml_scraper_graph/openai/inputs/books.xml similarity index 100% rename from examples/azure/inputs/books.xml rename to examples/xml_scraper_graph/openai/inputs/books.xml diff --git a/examples/openai/xml_scraper_graph_multi_openai.py b/examples/xml_scraper_graph/openai/xml_scraper_graph_multi_openai.py similarity index 100% rename from examples/openai/xml_scraper_graph_multi_openai.py rename to examples/xml_scraper_graph/openai/xml_scraper_graph_multi_openai.py diff --git a/examples/openai/xml_scraper_openai.py b/examples/xml_scraper_graph/openai/xml_scraper_openai.py similarity index 100% rename from examples/openai/xml_scraper_openai.py rename to examples/xml_scraper_graph/openai/xml_scraper_openai.py diff --git a/scrapegraphai/nodes/merge_answers_node.py 
b/scrapegraphai/nodes/merge_answers_node.py index 77eb1587..b867b3e0 100644 --- a/scrapegraphai/nodes/merge_answers_node.py +++ b/scrapegraphai/nodes/merge_answers_node.py @@ -5,6 +5,7 @@ from typing import List, Optional from langchain.prompts import PromptTemplate +from langchain_community.chat_models import ChatOllama from langchain_core.output_parsers import JsonOutputParser from langchain_mistralai import ChatMistralAI from langchain_openai import ChatOpenAI @@ -42,6 +43,13 @@ def __init__( super().__init__(node_name, "node", input, output, 2, node_config) self.llm_model = node_config["llm_model"] + + if isinstance(self.llm_model, ChatOllama): + if self.node_config.get("schema", None) is None: + self.llm_model.format = "json" + else: + self.llm_model.format = self.node_config["schema"].model_json_schema() + self.verbose = ( False if node_config is None else node_config.get("verbose", False) )