From 224ff07032d006d75160a7094366fac17023aca1 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Sun, 12 Jan 2025 16:29:21 +0100 Subject: [PATCH 01/11] feat: add integration for search on web --- scrapegraphai/utils/research_web.py | 44 ++++++++++++++++++++++++++--- uv.lock | 2 +- 2 files changed, 41 insertions(+), 5 deletions(-) diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py index 9db6a5fe..4a3bacea 100644 --- a/scrapegraphai/utils/research_web.py +++ b/scrapegraphai/utils/research_web.py @@ -19,8 +19,22 @@ def search_on_web( timeout: int = 10, proxy: str | dict = None, serper_api_key: str = None, + region: str = None, + language: str = None, ) -> List[str]: - """Search web function with improved error handling and validation""" + """Search web function with improved error handling and validation + + Args: + query (str): Search query + search_engine (str): Search engine to use + max_results (int): Maximum number of results to return + port (int): Port for SearXNG + timeout (int): Request timeout in seconds + proxy (str | dict): Proxy configuration + serper_api_key (str): API key for Serper + region (str): Country/region code (e.g., 'mx' for Mexico) + language (str): Language code (e.g., 'es' for Spanish) + """ # Input validation if not query or not isinstance(query, str): @@ -39,9 +53,31 @@ def search_on_web( try: results = [] if search_engine == "google": - results = list( - google_search(query, num_results=max_results, proxy=formatted_proxy) - ) + + if region is not None and language is not None: + results = list( + google_search( + query, num_results=max_results, proxy=formatted_proxy, + lang= language, region=region) + ) + elif region is not None: + results = list( + google_search( + query, num_results=max_results, proxy=formatted_proxy, + region=region) + ) + elif language is not None: + results = list( + google_search( + query, num_results=max_results, proxy=formatted_proxy, + lang=language) + ) + else: + results = list( + google_search( + query, num_results=max_results, proxy=formatted_proxy) + ) + print(results) elif search_engine == "duckduckgo": research = DuckDuckGoSearchResults(max_results=max_results) diff --git a/uv.lock b/uv.lock index ef0623ed..b5d953f1 100644 --- a/uv.lock +++ b/uv.lock @@ -3429,7 +3429,7 @@ wheels = [ [[package]] name = "scrapegraphai" -version = "1.35.0" +version = "1.36.0" source = { editable = "." 
} dependencies = [ { name = "async-timeout", version = "4.0.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" }, From c3cdff77ae98594c27f9b976e76dd994a9c94cc2 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Sun, 12 Jan 2025 16:30:28 +0100 Subject: [PATCH 02/11] Update research_web.py --- scrapegraphai/utils/research_web.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py index 4a3bacea..b711eb37 100644 --- a/scrapegraphai/utils/research_web.py +++ b/scrapegraphai/utils/research_web.py @@ -72,7 +72,7 @@ def search_on_web( query, num_results=max_results, proxy=formatted_proxy, lang=language) ) - else: + else: results = list( google_search( query, num_results=max_results, proxy=formatted_proxy) From 8762a01f8b6e5ca9458b5eca95d788d2ad5014cf Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Sun, 12 Jan 2025 16:32:28 +0100 Subject: [PATCH 03/11] Update research_web.py --- scrapegraphai/utils/research_web.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py index b711eb37..673319b2 100644 --- a/scrapegraphai/utils/research_web.py +++ b/scrapegraphai/utils/research_web.py @@ -54,19 +54,19 @@ def search_on_web( results = [] if search_engine == "google": - if region is not None and language is not None: + if region and language: results = list( google_search( query, num_results=max_results, proxy=formatted_proxy, lang= language, region=region) ) - elif region is not None: + elif region: results = list( google_search( query, num_results=max_results, proxy=formatted_proxy, region=region) ) - elif language is not None: + elif language: results = list( google_search( query, num_results=max_results, proxy=formatted_proxy, @@ -77,7 +77,6 @@ def search_on_web( google_search( query, num_results=max_results, proxy=formatted_proxy) ) - print(results) elif search_engine == "duckduckgo": research = DuckDuckGoSearchResults(max_results=max_results) From dc0a138a7e5b2b2eae8fe5542627a298733f3064 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Sun, 12 Jan 2025 16:35:31 +0100 Subject: [PATCH 04/11] run pre commit --- .github/FUNDING.yml | 2 +- .github/ISSUE_TEMPLATE/custom.md | 2 - .github/workflows/release.yml | 26 +-- .releaserc.yml | 1 - Dockerfile | 2 +- LICENSE | 2 +- README.md | 2 +- SECURITY.md | 1 - docs/README.md | 2 +- docs/russian.md | 2 +- docs/source/conf.py | 19 +-- docs/source/getting_started/examples.rst | 2 +- docs/source/getting_started/installation.rst | 6 +- docs/source/index.rst | 2 +- docs/source/introduction/overview.rst | 10 +- docs/source/modules/modules.rst | 1 - .../scrapegraphai.helpers.models_tokens.rst | 2 +- docs/source/scrapers/llm.rst | 11 +- examples/ScrapegraphAI_cookbook.ipynb | 2 +- examples/code_generator_graph/.env.example | 2 +- examples/code_generator_graph/README.md | 2 +- .../ollama/code_generator_graph_ollama.py | 15 +- .../openai/code_generator_graph_openai.py | 16 +- examples/csv_scraper_graph/.env.example | 2 +- examples/csv_scraper_graph/README.md | 2 +- .../ollama/inputs/username.csv | 1 - .../openai/inputs/username.csv | 1 - examples/custom_graph/.env.example | 2 +- examples/custom_graph/README.md | 2 +- .../ollama/custom_graph_ollama.py | 33 ++-- .../openai/custom_graph_openai.py | 37 +++-- examples/depth_search_graph/.env.example | 2 +- examples/depth_search_graph/README.md | 2 +- 
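A brief usage sketch of the region/language support added to search_on_web in PATCH 01-03 (illustrative only, not part of the patch series): the module path, function name, and parameter names are taken from the diff above, while the query string and the 'mx'/'es' values are assumptions made for the example, and defaults not visible in the hunk are passed explicitly.

from scrapegraphai.utils.research_web import search_on_web

# Region and language together: takes the branch that forwards both values
# to google_search(..., lang=language, region=region).
urls = search_on_web(
    query="best restaurants in mexico city",
    search_engine="google",
    max_results=5,
    region="mx",      # country/region code, e.g. 'mx' for Mexico
    language="es",    # language code, e.g. 'es' for Spanish
)
print(urls)

# Omitting region and language keeps the original behaviour:
# a plain google_search(query, num_results=..., proxy=...) call.
urls_default = search_on_web(
    query="scrapegraphai documentation",
    search_engine="google",
    max_results=5,
)
print(urls_default)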
.../ollama/depth_search_graph_ollama.py | 5 +- .../openai/depth_search_graph_openai.py | 5 +- examples/document_scraper_graph/.env.example | 2 +- examples/document_scraper_graph/README.md | 2 +- .../ollama/document_scraper_ollama.py | 17 +- .../ollama/inputs/plain_html_example.txt | 12 +- .../openai/document_scraper_openai.py | 21 ++- .../openai/inputs/markdown_example.md | 70 ++++---- .../openai/inputs/plain_html_example.txt | 12 +- examples/extras/.env.example | 2 +- examples/extras/authenticated_playwright.py | 1 + examples/extras/browser_base_integration.py | 8 +- examples/extras/chromium_selenium.py | 58 +++++-- examples/extras/cond_smartscraper_usage.py | 8 +- examples/extras/conditional_usage.py | 12 +- examples/extras/custom_prompt.py | 7 +- examples/extras/example.yml | 6 +- examples/extras/force_mode.py | 10 +- examples/extras/html_mode.py | 10 +- examples/extras/load_yml.py | 8 +- examples/extras/no_cut.py | 9 +- examples/extras/proxy_rotation.py | 9 +- examples/extras/rag_caching.py | 10 +- examples/extras/reasoning.py | 8 +- examples/extras/scrape_do.py | 8 +- examples/extras/screenshot_scaping.py | 31 ++-- examples/extras/serch_graph_scehma.py | 14 +- examples/extras/slow_mo.py | 14 +- examples/extras/undected_playwright.py | 14 +- examples/json_scraper_graph/.env.example | 2 +- examples/json_scraper_graph/README.md | 2 +- .../ollama/inputs/example.json | 2 +- .../ollama/json_scraper_multi_ollama.py | 12 +- .../ollama/json_scraper_ollama.py | 7 +- .../openai/inputs/example.json | 2 +- .../openai/json_scraper_multi_openai.py | 13 +- .../openai/json_scraper_openai.py | 7 +- .../openai/md_scraper_openai.py | 7 +- .../openai/omni_scraper_openai.py | 11 +- examples/omni_scraper_graph/.env.example | 2 +- examples/omni_scraper_graph/README.md | 2 +- .../omni_scraper_graph/omni_search_openai.py | 7 +- examples/readme.md | 2 +- examples/script_generator_graph/.env.example | 2 +- examples/script_generator_graph/README.md | 2 +- .../ollama/script_generator_ollama.py | 4 +- .../ollama/script_multi_generator_ollama.py | 8 +- .../openai/script_generator_multi_openai.py | 9 +- .../openai/script_generator_openai.py | 9 +- .../openai/script_generator_schema_openai.py | 16 +- examples/search_graph/.env.example | 2 +- examples/search_graph/README.md | 2 +- .../ollama/search_graph_ollama.py | 4 +- .../ollama/search_graph_schema_ollama.py | 17 +- .../openai/search_graph_openai.py | 6 +- .../openai/search_graph_schema_openai.py | 15 +- .../openai/search_link_graph_openai.py | 8 +- examples/speech_graph/.env.example | 2 +- examples/speech_graph/README.md | 2 +- examples/speech_graph/speech_graph_openai.py | 11 +- examples/xml_scraper_graph/.env.example | 2 +- examples/xml_scraper_graph/README.md | 2 +- .../xml_scraper_graph/ollama/inputs/books.xml | 38 ++--- .../ollama/xml_scraper_graph_multi_ollama.py | 6 +- .../ollama/xml_scraper_ollama.py | 7 +- .../xml_scraper_graph/openai/inputs/books.xml | 38 ++--- .../openai/xml_scraper_graph_multi_openai.py | 9 +- .../openai/xml_scraper_openai.py | 10 +- scrapegraphai/utils/research_web.py | 27 ++- tests/Readme.md | 4 +- tests/graphs/.env.example | 2 +- tests/graphs/abstract_graph_test.py | 155 ++++++++++++------ .../code_generator_graph_openai_test.py | 15 +- .../graphs/depth_search_graph_openai_test.py | 9 +- tests/graphs/inputs/books.xml | 38 ++--- tests/graphs/inputs/example.json | 2 +- tests/graphs/inputs/plain_html_example.txt | 12 +- tests/graphs/inputs/username.csv | 1 - tests/graphs/scrape_graph_test.py | 5 + 
.../graphs/scrape_plain_text_mistral_test.py | 10 +- tests/graphs/scrape_xml_ollama_test.py | 9 +- tests/graphs/screenshot_scraper_test.py | 10 +- tests/graphs/script_generator_test.py | 52 +++--- tests/graphs/search_graph_openai_test.py | 12 +- tests/graphs/search_link_ollama.py | 6 +- tests/graphs/smart_scraper_ernie_test.py | 10 +- tests/graphs/smart_scraper_fireworks_test.py | 11 +- ...rt_scraper_multi_lite_graph_openai_test.py | 17 +- tests/graphs/smart_scraper_ollama_test.py | 9 +- tests/graphs/smart_scraper_openai_test.py | 7 +- tests/graphs/xml_scraper_openai_test.py | 17 +- tests/inputs/books.xml | 38 ++--- tests/inputs/example.json | 2 +- tests/inputs/plain_html_example.txt | 12 +- tests/inputs/username.csv | 1 - tests/nodes/fetch_node_test.py | 9 +- tests/nodes/inputs/books.xml | 38 ++--- tests/nodes/inputs/example.json | 2 +- tests/nodes/inputs/plain_html_example.txt | 12 +- tests/nodes/inputs/username.csv | 1 - tests/nodes/robot_node_test.py | 40 +++-- tests/nodes/search_internet_node_test.py | 24 ++- tests/nodes/search_link_node_test.py | 25 +-- tests/utils/convert_to_md_test.py | 14 +- tests/utils/copy_utils_test.py | 27 +-- tests/utils/parse_state_keys_test.py | 4 +- tests/utils/research_web_test.py | 6 +- 140 files changed, 952 insertions(+), 661 deletions(-) diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml index 3dbf9273..8868e7e6 100644 --- a/.github/FUNDING.yml +++ b/.github/FUNDING.yml @@ -12,4 +12,4 @@ lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cl polar: # Replace with a single Polar username buy_me_a_coffee: # Replace with a single Buy Me a Coffee username thanks_dev: # Replace with a single thanks.dev username -custom: +custom: diff --git a/.github/ISSUE_TEMPLATE/custom.md b/.github/ISSUE_TEMPLATE/custom.md index 48d5f81f..b894315f 100644 --- a/.github/ISSUE_TEMPLATE/custom.md +++ b/.github/ISSUE_TEMPLATE/custom.md @@ -6,5 +6,3 @@ labels: '' assignees: '' --- - - diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 0ad43347..125c557c 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -19,21 +19,21 @@ jobs: uses: actions/setup-python@v5 with: python-version: '3.10' - + - name: Install uv uses: astral-sh/setup-uv@v3 - + - name: Install Node Env uses: actions/setup-node@v4 with: node-version: 20 - + - name: Checkout uses: actions/checkout@v4.1.1 with: fetch-depth: 0 persist-credentials: false - + - name: Build and validate package run: | uv venv @@ -44,10 +44,10 @@ jobs: uv build uv pip install --upgrade pkginfo==1.12.0 twine==6.0.1 # Upgrade pkginfo and install twine python -m twine check dist/* - + - name: Debug Dist Directory run: ls -al dist - + - name: Cache build uses: actions/cache@v3 with: @@ -59,7 +59,7 @@ jobs: runs-on: ubuntu-latest needs: build environment: development - if: > + if: > github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref == 'refs/heads/pre/beta') || (github.event_name == 'pull_request' && github.event.action == 'closed' && github.event.pull_request.merged && (github.event.pull_request.base.ref == 'main' || github.event.pull_request.base.ref == 'pre/beta')) @@ -74,23 +74,23 @@ jobs: with: fetch-depth: 0 persist-credentials: false - + - name: Restore build artifacts uses: actions/cache@v3 with: path: ./dist key: ${{ runner.os }}-build-${{ github.sha }} - + - name: Semantic Release uses: cycjimmy/semantic-release-action@v4.1.0 with: semantic_version: 23 extra_plugins: | semantic-release-pypi@3 - @semantic-release/git - 
@semantic-release/commit-analyzer@12 - @semantic-release/release-notes-generator@13 - @semantic-release/github@10 + @semantic-release/git + @semantic-release/commit-analyzer@12 + @semantic-release/release-notes-generator@13 + @semantic-release/github@10 @semantic-release/changelog@6 conventional-changelog-conventionalcommits@7 env: diff --git a/.releaserc.yml b/.releaserc.yml index 65d589fa..574eb74e 100644 --- a/.releaserc.yml +++ b/.releaserc.yml @@ -53,4 +53,3 @@ branches: channel: "dev" prerelease: "beta" debug: true - diff --git a/Dockerfile b/Dockerfile index a5f71732..d3246bc7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,4 +6,4 @@ RUN pip install --no-cache-dir scrapegraphai RUN pip install --no-cache-dir scrapegraphai[burr] RUN python3 -m playwright install-deps -RUN python3 -m playwright install \ No newline at end of file +RUN python3 -m playwright install diff --git a/LICENSE b/LICENSE index 70e38342..404d6c0c 100644 --- a/LICENSE +++ b/LICENSE @@ -4,4 +4,4 @@ Permission is hereby granted, free of charge, to any person obtaining a copy of The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file +THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/README.md b/README.md index df75fbd9..00a169cb 100644 --- a/README.md +++ b/README.md @@ -182,7 +182,7 @@ The Official API Documentation can be found [here](https://docs.scrapegraphai.co -## 📈 Telemetry +## 📈 Telemetry We collect anonymous usage metrics to enhance our package's quality and user experience. The data helps us prioritize improvements and ensure compatibility. If you wish to opt-out, set the environment variable SCRAPEGRAPHAI_TELEMETRY_ENABLED=false. For more information, please refer to the documentation [here](https://scrapegraph-ai.readthedocs.io/en/latest/scrapers/telemetry.html). diff --git a/SECURITY.md b/SECURITY.md index e9de9357..36eb39b9 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -3,4 +3,3 @@ ## Reporting a Vulnerability For reporting a vulnerability contact directly mvincig11@gmail.com - diff --git a/docs/README.md b/docs/README.md index 037f0117..598580fc 100644 --- a/docs/README.md +++ b/docs/README.md @@ -55,7 +55,7 @@ markmap: - Use Selenium or Playwright to take screenshots - Use LLM to asses if it is a block-like page, paragraph-like page, etc. 
- [Issue #88](https://github.com/VinciGit00/Scrapegraph-ai/issues/88) - + ## **Long-Term Goals** - Automatic generation of scraping pipelines from a given prompt diff --git a/docs/russian.md b/docs/russian.md index 84da9796..ac16ef41 100644 --- a/docs/russian.md +++ b/docs/russian.md @@ -228,4 +228,4 @@ ScrapeGraphAI лицензирован под MIT License. Подробнее с ## Благодарности - Мы хотели бы поблагодарить всех участников проекта и сообщество с открытым исходным кодом за их поддержку. -- ScrapeGraphAI предназначен только для исследования данных и научных целей. Мы не несем ответственности за неправильное использование библиотеки. \ No newline at end of file +- ScrapeGraphAI предназначен только для исследования данных и научных целей. Мы не несем ответственности за неправильное использование библиотеки. diff --git a/docs/source/conf.py b/docs/source/conf.py index 9fc3aec7..f7d44113 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -12,31 +12,30 @@ import sys # import all the modules -sys.path.insert(0, os.path.abspath('../../')) +sys.path.insert(0, os.path.abspath("../../")) -project = 'ScrapeGraphAI' -copyright = '2024, ScrapeGraphAI' -author = 'Marco Vinciguerra, Marco Perini, Lorenzo Padoan' +project = "ScrapeGraphAI" +copyright = "2024, ScrapeGraphAI" +author = "Marco Vinciguerra, Marco Perini, Lorenzo Padoan" html_last_updated_fmt = "%b %d, %Y" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration -extensions = ['sphinx.ext.autodoc', 'sphinx.ext.napoleon'] +extensions = ["sphinx.ext.autodoc", "sphinx.ext.napoleon"] -templates_path = ['_templates'] +templates_path = ["_templates"] exclude_patterns = [] # -- Options for HTML output ------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output -html_theme = 'furo' +html_theme = "furo" html_theme_options = { "source_repository": "https://github.com/VinciGit00/Scrapegraph-ai/", "source_branch": "main", "source_directory": "docs/source/", - 'navigation_with_keys': True, - 'sidebar_hide_name': False, + "navigation_with_keys": True, + "sidebar_hide_name": False, } - diff --git a/docs/source/getting_started/examples.rst b/docs/source/getting_started/examples.rst index 1bed0a6e..5696e468 100644 --- a/docs/source/getting_started/examples.rst +++ b/docs/source/getting_started/examples.rst @@ -84,4 +84,4 @@ After that, you can run the following code, using only your machine resources br result = smart_scraper_graph.run() print(result) -To find out how you can customize the `graph_config` dictionary, by using different LLM and adding new parameters, check the `Scrapers` section! \ No newline at end of file +To find out how you can customize the `graph_config` dictionary, by using different LLM and adding new parameters, check the `Scrapers` section! diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst index 30acfb5a..a9fd7626 100644 --- a/docs/source/getting_started/installation.rst +++ b/docs/source/getting_started/installation.rst @@ -22,7 +22,7 @@ The library is available on PyPI, so it can be installed using the following com pip install scrapegraphai .. important:: - + It is higly recommended to install the library in a virtual environment (conda, venv, etc.) If your clone the repository, it is recommended to use a package manager like `uv `_. 
@@ -35,7 +35,7 @@ To install the library using uv, you can run the following command: uv build .. caution:: - + **Rye** must be installed first by following the instructions on the `official website `_. Additionally on Windows when using WSL @@ -46,5 +46,3 @@ If you are using Windows Subsystem for Linux (WSL) and you are facing issues wit .. code-block:: bash sudo apt-get -y install libnss3 libnspr4 libgbm1 libasound2 - - diff --git a/docs/source/index.rst b/docs/source/index.rst index acc0db73..9d0b5a15 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -43,4 +43,4 @@ Indices and tables * :ref:`genindex` * :ref:`modindex` -* :ref:`search` \ No newline at end of file +* :ref:`search` diff --git a/docs/source/introduction/overview.rst b/docs/source/introduction/overview.rst index 4dcaadbe..e2bdf961 100644 --- a/docs/source/introduction/overview.rst +++ b/docs/source/introduction/overview.rst @@ -3,11 +3,11 @@ :width: 50% :alt: ScrapegraphAI -Overview +Overview ======== ScrapeGraphAI is an **open-source** Python library designed to revolutionize **scraping** tools. -In today's data-intensive digital landscape, this library stands out by integrating **Large Language Models** (LLMs) +In today's data-intensive digital landscape, this library stands out by integrating **Large Language Models** (LLMs) and modular **graph-based** pipelines to automate the scraping of data from various sources (e.g., websites, local files etc.). Simply specify the information you need to extract, and ScrapeGraphAI handles the rest, providing a more **flexible** and **low-maintenance** solution compared to traditional scraping tools. @@ -16,7 +16,7 @@ Why ScrapegraphAI? ================== Traditional web scraping tools often rely on fixed patterns or manual configuration to extract data from web pages. -ScrapegraphAI, leveraging the power of LLMs, adapts to changes in website structures, reducing the need for constant developer intervention. +ScrapegraphAI, leveraging the power of LLMs, adapts to changes in website structures, reducing the need for constant developer intervention. This flexibility ensures that scrapers remain functional even when website layouts change. We support many LLMs including **GPT, Gemini, Groq, Azure, Hugging Face** etc. @@ -161,13 +161,13 @@ FAQ - Check your internet connection. Low speed or unstable connection can cause the HTML to not load properly. - Try using a proxy server to mask your IP address. Check out the :ref:`Proxy` section for more information on how to configure proxy settings. - + - Use a different LLM model. Some models might perform better on certain websites than others. - Set the `verbose` parameter to `True` in the graph_config to see more detailed logs. - Visualize the pipeline graphically using :ref:`Burr`. - + If the issue persists, please report it on the GitHub repository. 6. 
**How does ScrapeGraphAI handle the context window limit of LLMs?** diff --git a/docs/source/modules/modules.rst b/docs/source/modules/modules.rst index 7551ea96..d3237dcd 100644 --- a/docs/source/modules/modules.rst +++ b/docs/source/modules/modules.rst @@ -7,4 +7,3 @@ scrapegraphai scrapegraphai scrapegraphai.helpers.models_tokens - diff --git a/docs/source/modules/scrapegraphai.helpers.models_tokens.rst b/docs/source/modules/scrapegraphai.helpers.models_tokens.rst index 82615b3b..6df4f086 100644 --- a/docs/source/modules/scrapegraphai.helpers.models_tokens.rst +++ b/docs/source/modules/scrapegraphai.helpers.models_tokens.rst @@ -25,4 +25,4 @@ Example usage: else: print(f"{model_name} not found in the models list") -This information is crucial for users to understand the capabilities and limitations of different AI models when designing their scraping pipelines. \ No newline at end of file +This information is crucial for users to understand the capabilities and limitations of different AI models when designing their scraping pipelines. diff --git a/docs/source/scrapers/llm.rst b/docs/source/scrapers/llm.rst index 7b1df30e..bc0ed9bb 100644 --- a/docs/source/scrapers/llm.rst +++ b/docs/source/scrapers/llm.rst @@ -133,11 +133,11 @@ We can also pass a model instance for the chat model and the embedding model. Fo openai_api_version="AZURE_OPENAI_API_VERSION", ) # Supposing model_tokens are 100K - model_tokens_count = 100000 + model_tokens_count = 100000 graph_config = { "llm": { "model_instance": llm_model_instance, - "model_tokens": model_tokens_count, + "model_tokens": model_tokens_count, }, "embeddings": { "model_instance": embedder_model_instance @@ -198,7 +198,7 @@ We can also pass a model instance for the chat model and the embedding model. Fo Other LLM models ^^^^^^^^^^^^^^^^ -We can also pass a model instance for the chat model and the embedding model through the **model_instance** parameter. +We can also pass a model instance for the chat model and the embedding model through the **model_instance** parameter. This feature enables you to utilize a Langchain model instance. You will discover the model you require within the provided list: @@ -208,7 +208,7 @@ You will discover the model you require within the provided list: For instance, consider **chat model** Moonshot. We can integrate it in the following manner: .. code-block:: python - + from langchain_community.chat_models.moonshot import MoonshotChat # The configuration parameters are contingent upon the specific model you select @@ -221,8 +221,7 @@ For instance, consider **chat model** Moonshot. 
We can integrate it in the follo llm_model_instance = MoonshotChat(**llm_instance_config) graph_config = { "llm": { - "model_instance": llm_model_instance, + "model_instance": llm_model_instance, "model_tokens": 5000 }, } - \ No newline at end of file diff --git a/examples/ScrapegraphAI_cookbook.ipynb b/examples/ScrapegraphAI_cookbook.ipynb index b58bf0ea..5016c8c7 100644 --- a/examples/ScrapegraphAI_cookbook.ipynb +++ b/examples/ScrapegraphAI_cookbook.ipynb @@ -912,4 +912,4 @@ }, "nbformat": 4, "nbformat_minor": 0 -} \ No newline at end of file +} diff --git a/examples/code_generator_graph/.env.example b/examples/code_generator_graph/.env.example index a93912e4..93a1b024 100644 --- a/examples/code_generator_graph/.env.example +++ b/examples/code_generator_graph/.env.example @@ -11,4 +11,4 @@ DEFAULT_LANGUAGE=python GENERATE_TESTS=true ADD_DOCUMENTATION=true CODE_STYLE=pep8 -TYPE_CHECKING=true \ No newline at end of file +TYPE_CHECKING=true diff --git a/examples/code_generator_graph/README.md b/examples/code_generator_graph/README.md index bc4b5dec..13d90f5a 100644 --- a/examples/code_generator_graph/README.md +++ b/examples/code_generator_graph/README.md @@ -27,4 +27,4 @@ code = graph.generate("code specification") ## Environment Variables Required environment variables: -- `OPENAI_API_KEY`: Your OpenAI API key \ No newline at end of file +- `OPENAI_API_KEY`: Your OpenAI API key diff --git a/examples/code_generator_graph/ollama/code_generator_graph_ollama.py b/examples/code_generator_graph/ollama/code_generator_graph_ollama.py index 46ab8ab3..339bb03c 100644 --- a/examples/code_generator_graph/ollama/code_generator_graph_ollama.py +++ b/examples/code_generator_graph/ollama/code_generator_graph_ollama.py @@ -1,11 +1,13 @@ -""" +""" Basic example of scraping pipeline using Code Generator with schema """ import json from typing import List + from dotenv import load_dotenv from pydantic import BaseModel, Field + from scrapegraphai.graphs import CodeGeneratorGraph load_dotenv() @@ -14,13 +16,16 @@ # Define the output schema for the graph # ************************************************ + class Project(BaseModel): title: str = Field(description="The title of the project") description: str = Field(description="The description of the project") + class Projects(BaseModel): projects: List[Project] + # ************************************************ # Define the configuration for the graph # ************************************************ @@ -41,9 +46,9 @@ class Projects(BaseModel): "syntax": 3, "execution": 3, "validation": 3, - "semantic": 3 + "semantic": 3, }, - "output_file_name": "extracted_data.py" + "output_file_name": "extracted_data.py", } # ************************************************ @@ -54,8 +59,8 @@ class Projects(BaseModel): prompt="List me all the projects with their description", source="https://perinim.github.io/projects/", schema=Projects, - config=graph_config + config=graph_config, ) result = code_generator_graph.run() -print(result) \ No newline at end of file +print(result) diff --git a/examples/code_generator_graph/openai/code_generator_graph_openai.py b/examples/code_generator_graph/openai/code_generator_graph_openai.py index a9a2ea56..0fe01ebe 100644 --- a/examples/code_generator_graph/openai/code_generator_graph_openai.py +++ b/examples/code_generator_graph/openai/code_generator_graph_openai.py @@ -1,10 +1,13 @@ -""" +""" Basic example of scraping pipeline using Code Generator with schema """ + import os from typing import List + from dotenv import load_dotenv from 
pydantic import BaseModel, Field + from scrapegraphai.graphs import CodeGeneratorGraph load_dotenv() @@ -13,13 +16,16 @@ # Define the output schema for the graph # ************************************************ + class Project(BaseModel): title: str = Field(description="The title of the project") description: str = Field(description="The description of the project") + class Projects(BaseModel): projects: List[Project] + # ************************************************ # Define the configuration for the graph # ************************************************ @@ -28,7 +34,7 @@ class Projects(BaseModel): graph_config = { "llm": { - "api_key":openai_key, + "api_key": openai_key, "model": "openai/gpt-4o-mini", }, "verbose": True, @@ -39,9 +45,9 @@ class Projects(BaseModel): "syntax": 3, "execution": 3, "validation": 3, - "semantic": 3 + "semantic": 3, }, - "output_file_name": "extracted_data.py" + "output_file_name": "extracted_data.py", } # ************************************************ @@ -52,7 +58,7 @@ class Projects(BaseModel): prompt="List me all the projects with their description", source="https://perinim.github.io/projects/", schema=Projects, - config=graph_config + config=graph_config, ) result = code_generator_graph.run() diff --git a/examples/csv_scraper_graph/.env.example b/examples/csv_scraper_graph/.env.example index 1917f9aa..1a7559ea 100644 --- a/examples/csv_scraper_graph/.env.example +++ b/examples/csv_scraper_graph/.env.example @@ -8,4 +8,4 @@ TEMPERATURE=0.7 # CSV Scraper Settings CSV_DELIMITER=, -MAX_ROWS=1000 \ No newline at end of file +MAX_ROWS=1000 diff --git a/examples/csv_scraper_graph/README.md b/examples/csv_scraper_graph/README.md index d39858b0..17a49665 100644 --- a/examples/csv_scraper_graph/README.md +++ b/examples/csv_scraper_graph/README.md @@ -27,4 +27,4 @@ csv_data = graph.scrape("https://example.com/table") ## Environment Variables Required environment variables: -- `OPENAI_API_KEY`: Your OpenAI API key \ No newline at end of file +- `OPENAI_API_KEY`: Your OpenAI API key diff --git a/examples/csv_scraper_graph/ollama/inputs/username.csv b/examples/csv_scraper_graph/ollama/inputs/username.csv index 006ac8e6..0d32afd0 100644 --- a/examples/csv_scraper_graph/ollama/inputs/username.csv +++ b/examples/csv_scraper_graph/ollama/inputs/username.csv @@ -4,4 +4,3 @@ grey07;2070;Laura;Grey johnson81;4081;Craig;Johnson jenkins46;9346;Mary;Jenkins smith79;5079;Jamie;Smith - diff --git a/examples/csv_scraper_graph/openai/inputs/username.csv b/examples/csv_scraper_graph/openai/inputs/username.csv index 006ac8e6..0d32afd0 100644 --- a/examples/csv_scraper_graph/openai/inputs/username.csv +++ b/examples/csv_scraper_graph/openai/inputs/username.csv @@ -4,4 +4,3 @@ grey07;2070;Laura;Grey johnson81;4081;Craig;Johnson jenkins46;9346;Mary;Jenkins smith79;5079;Jamie;Smith - diff --git a/examples/custom_graph/.env.example b/examples/custom_graph/.env.example index 9eac4cb8..bb0a8eed 100644 --- a/examples/custom_graph/.env.example +++ b/examples/custom_graph/.env.example @@ -10,4 +10,4 @@ TEMPERATURE=0.7 CUSTOM_NODE_TIMEOUT=30 MAX_NODES=10 DEBUG_MODE=false -LOG_LEVEL=info \ No newline at end of file +LOG_LEVEL=info diff --git a/examples/custom_graph/README.md b/examples/custom_graph/README.md index e6d3b88a..2b4ffbbf 100644 --- a/examples/custom_graph/README.md +++ b/examples/custom_graph/README.md @@ -28,4 +28,4 @@ results = graph.process() ## Environment Variables Required environment variables: -- `OPENAI_API_KEY`: Your OpenAI API key \ No newline at end of file +- 
`OPENAI_API_KEY`: Your OpenAI API key diff --git a/examples/custom_graph/ollama/custom_graph_ollama.py b/examples/custom_graph/ollama/custom_graph_ollama.py index c505d068..f7aebd3d 100644 --- a/examples/custom_graph/ollama/custom_graph_ollama.py +++ b/examples/custom_graph/ollama/custom_graph_ollama.py @@ -3,10 +3,17 @@ """ import os -from langchain_openai import OpenAIEmbeddings -from langchain_openai import ChatOpenAI + +from langchain_openai import ChatOpenAI, OpenAIEmbeddings + from scrapegraphai.graphs import BaseGraph -from scrapegraphai.nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode, RobotsNode +from scrapegraphai.nodes import ( + FetchNode, + GenerateAnswerNode, + ParseNode, + RAGNode, + RobotsNode, +) # ************************************************ # Define the configuration for the graph @@ -20,7 +27,6 @@ # "model_tokens": 2000, # set context length arbitrarily "base_url": "http://localhost:11434", }, - "verbose": True, } @@ -39,7 +45,7 @@ "llm_model": llm_model, "force_scraping": True, "verbose": True, - } + }, ) fetch_node = FetchNode( @@ -48,7 +54,7 @@ node_config={ "verbose": True, "headless": True, - } + }, ) parse_node = ParseNode( input="doc", @@ -56,7 +62,7 @@ node_config={ "chunk_size": 4096, "verbose": True, - } + }, ) generate_answer_node = GenerateAnswerNode( @@ -65,7 +71,7 @@ node_config={ "llm_model": llm_model, "verbose": True, - } + }, ) # ************************************************ @@ -82,19 +88,18 @@ edges=[ (robot_node, fetch_node), (fetch_node, parse_node), - (parse_node, generate_answer_node) + (parse_node, generate_answer_node), ], - entry_point=robot_node + entry_point=robot_node, ) # ************************************************ # Execute the graph # ************************************************ -result, execution_info = graph.execute({ - "user_prompt": "Describe the content", - "url": "https://example.com/" -}) +result, execution_info = graph.execute( + {"user_prompt": "Describe the content", "url": "https://example.com/"} +) # get the answer from the result result = result.get("answer", "No answer found.") diff --git a/examples/custom_graph/openai/custom_graph_openai.py b/examples/custom_graph/openai/custom_graph_openai.py index 00fecfdd..ead1617c 100644 --- a/examples/custom_graph/openai/custom_graph_openai.py +++ b/examples/custom_graph/openai/custom_graph_openai.py @@ -1,12 +1,20 @@ """ Example of custom graph using existing nodes """ + import os + from dotenv import load_dotenv -from langchain_openai import OpenAIEmbeddings -from langchain_openai import ChatOpenAI +from langchain_openai import ChatOpenAI, OpenAIEmbeddings + from scrapegraphai.graphs import BaseGraph -from scrapegraphai.nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode, RobotsNode +from scrapegraphai.nodes import ( + FetchNode, + GenerateAnswerNode, + ParseNode, + RAGNode, + RobotsNode, +) load_dotenv() @@ -16,7 +24,7 @@ openai_key = os.getenv("OPENAI_APIKEY") graph_config = { - "llm": { + "llm": { "api_key": openai_key, "model": "gpt-4o", }, @@ -37,7 +45,7 @@ "llm_model": llm_model, "force_scraping": True, "verbose": True, - } + }, ) fetch_node = FetchNode( @@ -46,7 +54,7 @@ node_config={ "verbose": True, "headless": True, - } + }, ) parse_node = ParseNode( input="doc", @@ -54,7 +62,7 @@ node_config={ "chunk_size": 4096, "verbose": True, - } + }, ) rag_node = RAGNode( input="user_prompt & (parsed_doc | doc)", @@ -63,7 +71,7 @@ "llm_model": llm_model, "embedder_model": embedder, "verbose": True, - } + }, ) generate_answer_node = 
GenerateAnswerNode( input="user_prompt & (relevant_chunks | parsed_doc | doc)", @@ -71,7 +79,7 @@ node_config={ "llm_model": llm_model, "verbose": True, - } + }, ) # ************************************************ @@ -90,19 +98,18 @@ (robot_node, fetch_node), (fetch_node, parse_node), (parse_node, rag_node), - (rag_node, generate_answer_node) + (rag_node, generate_answer_node), ], - entry_point=robot_node + entry_point=robot_node, ) # ************************************************ # Execute the graph # ************************************************ -result, execution_info = graph.execute({ - "user_prompt": "Describe the content", - "url": "https://example.com/" -}) +result, execution_info = graph.execute( + {"user_prompt": "Describe the content", "url": "https://example.com/"} +) # get the answer from the result result = result.get("answer", "No answer found.") diff --git a/examples/depth_search_graph/.env.example b/examples/depth_search_graph/.env.example index 8c10cfbb..5498eb26 100644 --- a/examples/depth_search_graph/.env.example +++ b/examples/depth_search_graph/.env.example @@ -11,4 +11,4 @@ MAX_DEPTH=5 CRAWL_DELAY=1 RESPECT_ROBOTS_TXT=true MAX_PAGES_PER_DOMAIN=100 -USER_AGENT=Mozilla/5.0 \ No newline at end of file +USER_AGENT=Mozilla/5.0 diff --git a/examples/depth_search_graph/README.md b/examples/depth_search_graph/README.md index c4ce05df..453f21e1 100644 --- a/examples/depth_search_graph/README.md +++ b/examples/depth_search_graph/README.md @@ -27,4 +27,4 @@ results = graph.search("https://example.com", depth=3) ## Environment Variables Required environment variables: -- `OPENAI_API_KEY`: Your OpenAI API key \ No newline at end of file +- `OPENAI_API_KEY`: Your OpenAI API key diff --git a/examples/depth_search_graph/ollama/depth_search_graph_ollama.py b/examples/depth_search_graph/ollama/depth_search_graph_ollama.py index d0f960b5..c232e2f0 100644 --- a/examples/depth_search_graph/ollama/depth_search_graph_ollama.py +++ b/examples/depth_search_graph/ollama/depth_search_graph_ollama.py @@ -1,8 +1,11 @@ """ depth_search_graph_opeani example """ + import os + from dotenv import load_dotenv + from scrapegraphai.graphs import DepthSearchGraph load_dotenv() @@ -25,7 +28,7 @@ search_graph = DepthSearchGraph( prompt="List me all the projects with their description", source="https://perinim.github.io", - config=graph_config + config=graph_config, ) result = search_graph.run() diff --git a/examples/depth_search_graph/openai/depth_search_graph_openai.py b/examples/depth_search_graph/openai/depth_search_graph_openai.py index 799d733a..589fc7c7 100644 --- a/examples/depth_search_graph/openai/depth_search_graph_openai.py +++ b/examples/depth_search_graph/openai/depth_search_graph_openai.py @@ -1,8 +1,11 @@ """ depth_search_graph_opeani example """ + import os + from dotenv import load_dotenv + from scrapegraphai.graphs import DepthSearchGraph load_dotenv() @@ -23,7 +26,7 @@ search_graph = DepthSearchGraph( prompt="List me all the projects with their description", source="https://perinim.github.io", - config=graph_config + config=graph_config, ) result = search_graph.run() diff --git a/examples/document_scraper_graph/.env.example b/examples/document_scraper_graph/.env.example index 2e7bab46..2a1625bb 100644 --- a/examples/document_scraper_graph/.env.example +++ b/examples/document_scraper_graph/.env.example @@ -10,4 +10,4 @@ TEMPERATURE=0.7 OCR_ENABLED=true EXTRACT_METADATA=true MAX_FILE_SIZE=10485760 # 10MB -SUPPORTED_FORMATS=pdf,doc,docx,txt \ No newline at end of file 
+SUPPORTED_FORMATS=pdf,doc,docx,txt diff --git a/examples/document_scraper_graph/README.md b/examples/document_scraper_graph/README.md index f8561ee7..731ec7fa 100644 --- a/examples/document_scraper_graph/README.md +++ b/examples/document_scraper_graph/README.md @@ -27,4 +27,4 @@ content = graph.scrape("document.pdf") ## Environment Variables Required environment variables: -- `OPENAI_API_KEY`: Your OpenAI API key \ No newline at end of file +- `OPENAI_API_KEY`: Your OpenAI API key diff --git a/examples/document_scraper_graph/ollama/document_scraper_ollama.py b/examples/document_scraper_graph/ollama/document_scraper_ollama.py index 6853a549..12e4a8ea 100644 --- a/examples/document_scraper_graph/ollama/document_scraper_ollama.py +++ b/examples/document_scraper_graph/ollama/document_scraper_ollama.py @@ -1,8 +1,11 @@ """ document_scraper example """ + import json + from dotenv import load_dotenv + from scrapegraphai.graphs import DocumentScraperGraph load_dotenv() @@ -22,13 +25,13 @@ } source = """ - The Divine Comedy, Italian La Divina Commedia, original name La commedia, long narrative poem written in Italian - circa 1308/21 by Dante. It is usually held to be one of the world s great works of literature. - Divided into three major sections—Inferno, Purgatorio, and Paradiso—the narrative traces the journey of Dante - from darkness and error to the revelation of the divine light, culminating in the Beatific Vision of God. - Dante is guided by the Roman poet Virgil, who represents the epitome of human knowledge, from the dark wood - through the descending circles of the pit of Hell (Inferno). He then climbs the mountain of Purgatory, guided - by the Roman poet Statius, who represents the fulfilment of human knowledge, and is finally led by his lifelong love, + The Divine Comedy, Italian La Divina Commedia, original name La commedia, long narrative poem written in Italian + circa 1308/21 by Dante. It is usually held to be one of the world s great works of literature. + Divided into three major sections—Inferno, Purgatorio, and Paradiso—the narrative traces the journey of Dante + from darkness and error to the revelation of the divine light, culminating in the Beatific Vision of God. + Dante is guided by the Roman poet Virgil, who represents the epitome of human knowledge, from the dark wood + through the descending circles of the pit of Hell (Inferno). He then climbs the mountain of Purgatory, guided + by the Roman poet Statius, who represents the fulfilment of human knowledge, and is finally led by his lifelong love, the Beatrice of his earlier poetry, through the celestial spheres of Paradise. """ diff --git a/examples/document_scraper_graph/ollama/inputs/plain_html_example.txt b/examples/document_scraper_graph/ollama/inputs/plain_html_example.txt index 78f814ae..2476565e 100644 --- a/examples/document_scraper_graph/ollama/inputs/plain_html_example.txt +++ b/examples/document_scraper_graph/ollama/inputs/plain_html_example.txt @@ -2,16 +2,16 @@