diff --git a/CHANGELOG.md b/CHANGELOG.md index d46e1127..ef1e2f5b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,17 +1,122 @@ -## [1.18.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.18.2...v1.18.3) (2024-09-11) +## [1.19.0-beta.8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.19.0-beta.7...v1.19.0-beta.8) (2024-09-12) + + +### Features + +* refactoring of the tokenization function ([ec6b164](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ec6b164653250fdf01fd4db1454ea7534822f9cf)) + +## [1.19.0-beta.7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.19.0-beta.6...v1.19.0-beta.7) (2024-09-12) + + +### Bug Fixes + +* pyproject.toml dependencies ([b805aea](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b805aea1deb227e213bb9a027924d49058fefcc1)) + +## [1.19.0-beta.6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.19.0-beta.5...v1.19.0-beta.6) (2024-09-12) ### Bug Fixes * models tokens ([039fe3c](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/039fe3c6d91978f70baedfef407bda912a285aed)) -## [1.18.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.18.1...v1.18.2) (2024-09-10) +### Docs + +* Updated the graph_config in the documentation. 
([57a58e1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/57a58e162e254828d890e1a110cb5d3d4beb03df)) + + +### CI + +* **release:** 1.18.2 [skip ci] ([e1a9caa](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e1a9caa905f2a62d5b245a0abbcf4d304bd24de3)) +* **release:** 1.18.3 [skip ci] ([4bd4659](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/4bd4659dc15ae5c7f71702ad6acab200c2a64921)) ### Bug Fixes + +* models tokens ([039fe3c](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/039fe3c6d91978f70baedfef407bda912a285aed)) + +## [1.18.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.18.1...v1.18.2) (2024-09-10) + * models tokens ([b2be6b7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b2be6b739e0a6b71e16867f751012bc2d95f72c9)) +## [1.19.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.19.0-beta.3...v1.19.0-beta.4) (2024-09-10) + + +### Features + +* removed semchunk and used tikton ([1a7f21f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/1a7f21fbf34dc9ef17bca683e2139a88eed70b16)) + +## [1.19.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.19.0-beta.2...v1.19.0-beta.3) (2024-09-10) + + +### Bug Fixes + + +* parse node ([947ebd2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/947ebd2895408c5ebd00b9a3da1b220937553c4a)) + +## [1.19.0-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.19.0-beta.1...v1.19.0-beta.2) (2024-09-09) + + +### Features + +* return urls in searchgraph ([afb6eb7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/afb6eb7e4796ab208a050ad04ad96a83406f7fa1)) + + +### Bug Fixes + +* temporary fix for parse_node ([f2bb22d](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f2bb22d8e9b3ac5c1560793a6ec09f9ae4f257d3)) + +## [1.19.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.18.1...v1.19.0-beta.1) (2024-09-08) + + +### Features + +* **AbstractGraph:** add adjustable rate limit 
([2859fb7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/2859fb72d699f26b617ed2f949cdcfca1671c5c8)) +* add scrape_do_integration ([94e69a0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/94e69a051591aeec1e7268bf0d5e0338f90e9539)) +* add togheterai ([8f615ad](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/8f615adef320dacdd214a184981384dd05df8171)) +* ConcatNode.py added for heavy merge operations ([bd4b26d](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bd4b26d7d7c1a7953d1bc9d78b436007880028c9)) +* fetch_node improved ([167f970](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/167f97040f081867cecff542c3af8aa122499ce8)) + + +### Bug Fixes + +* **AbstractGraph:** Bedrock init issues ([63a5d18](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/63a5d18486789ce1b4a8f5ea661fc83779fceca2)), closes [#633](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/633) +* correctly parsing output when using structured_output ([8e74ac5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/8e74ac55a16ca012b52affbc754e4b04130e65db)) +* **ScreenshotScraper:** impose dynamic imports ([b8ef937](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b8ef93738ec4ae48c361fe5650df5194e845a2b1)) +* **Ollama:** instance model from correct package ([398b2c5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/398b2c556faf518ca28ccc284bc8761a16281cf7)) +* Parse Node scraping link and img urls allowing OmniScraper to work ([66a3b6d](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/66a3b6d6a3efdf1ee72b802fc9bf8175482c45bd)) +* **SmartScraper:** pass llm_model to ParseNode ([5242166](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/52421665759032bcfad80ce540efebe5f47310f6)) +* **DeepSeek:** proper model initialization ([74dfc69](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/74dfc693f6e487d20da58704284fe9f492d2b2aa)) +* Removed link_urls and img_ulrs from FetchNode output 
([57337a0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/57337a0a8c86fb28c9ccbd70d41acfc9abea11f0)) +* screenshot scraper ([388630c](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/388630c0ffa2850c3d5ea47e62b71b41795203d8)) +* screenshot_scraper ([ef7a589](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ef7a5891dcb1b4ed8a97947f5563fa78af917ecb)) +* **ScreenShotScraper:** static import of optional dependencies ([52fe441](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/52fe441c5af9c728983a2c3cd880fe9afcb5d428)) +* update generate answernode ([c348f67](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/c348f674ad0caae4f4dc04e194fae9634e01b621)) + + +### chore + +* **examples:** create Together AI examples ([34942de](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/34942deca514df53e8aa1c7f96f812ee78b994bf)) + + +### CI + +* **release:** 1.16.0-beta.1 [skip ci] ([d7f6036](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/d7f6036f907eda8d1faa0944da4d1d168ca4c40e)) +* **release:** 1.16.0-beta.2 [skip ci] ([1c37d5d](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/1c37d5db1c637f791133df254838a0deade6d6be)) +* **release:** 1.16.0-beta.3 [skip ci] ([886c987](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/886c987172bb57fb59863e4d7b494797bba16980)) +* **release:** 1.16.0-beta.4 [skip ci] ([ba5c7ad](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ba5c7adcea138d993005377f4cfe438795e1b124)) +* **release:** 1.17.0-beta.1 [skip ci] ([13efd4e](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/13efd4e3a4175e85e7c41f5d575a249c27ecbf1d)) +* **release:** 1.17.0-beta.10 [skip ci] ([af28885](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/af2888539e4ce83ab5f52b5c605ecc3472b14aff)) +* **release:** 1.17.0-beta.11 [skip ci] ([a73fec5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/a73fec5a98f5e646dd8f7d08dfe2dd0dbe067a94)) +* **release:** 1.17.0-beta.2 [skip ci] 
([08afc92](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/08afc9292ea8ae227b75f640db3d4dd097265482)) +* **release:** 1.17.0-beta.3 [skip ci] ([fc55418](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/fc55418a4511389d053e8c6b9a28878a3bc91fe6)) +* **release:** 1.17.0-beta.4 [skip ci] ([5e99071](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/5e990719cfc9e063fc2253fc70b3da14fae49360)) +* **release:** 1.17.0-beta.5 [skip ci] ([16ab1bf](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/16ab1bf3d920ae8e3dbac372f075e4853200a0e9)) +* **release:** 1.17.0-beta.6 [skip ci] ([50c9c6b](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/50c9c6bd8ca67d3d4d83ca3717085042e8a51bc5)) +* **release:** 1.17.0-beta.7 [skip ci] ([4347afb](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/4347afb8d4d93f600221d8f77c2701361f0f96a2)), closes [#633](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/633) +* **release:** 1.17.0-beta.8 [skip ci] ([85c374e](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/85c374e4b38f825af20e9e3d095c3a467025fdca)) +* **release:** 1.17.0-beta.9 [skip ci] ([77d0fd3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/77d0fd3dba8d52aff8321ab5ff1a1cc8b92b0837)) + ## [1.18.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.18.0...v1.18.1) (2024-09-08) @@ -22,6 +127,7 @@ ## [1.18.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.17.0...v1.18.0) (2024-09-08) + ### Features * **browser_base_fetch:** add async_mode to support both synchronous and asynchronous execution ([d56253d](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/d56253d183969584cacc0cb164daa0152462f21c)) @@ -29,6 +135,7 @@ ## [1.17.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.16.0...v1.17.0) (2024-09-08) + ### Features * **docloaders:** Enhance browser_base_fetch function flexibility ([57fd01f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/57fd01f9a76ea8ea69ec04b7238ab58ca72ac8f4)) @@ -38,9 +145,84 @@ * 
**sponsor:** 🅱️ Browserbase sponsor 🅱️ ([a540139](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/a5401394cc939d9a5fc58b8a9145141c2f047bab)) +* **AbstractGraph:** add adjustable rate limit ([2859fb7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/2859fb72d699f26b617ed2f949cdcfca1671c5c8)) + +## [1.17.0-beta.7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.17.0-beta.6...v1.17.0-beta.7) (2024-09-05) + + +### Bug Fixes + +* **AbstractGraph:** Bedrock init issues ([63a5d18](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/63a5d18486789ce1b4a8f5ea661fc83779fceca2)), closes [#633](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/633) + +## [1.17.0-beta.6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.17.0-beta.5...v1.17.0-beta.6) (2024-09-04) + + +### Bug Fixes + +* **ScreenShotScraper:** static import of optional dependencies ([52fe441](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/52fe441c5af9c728983a2c3cd880fe9afcb5d428)) + +## [1.17.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.17.0-beta.4...v1.17.0-beta.5) (2024-09-02) + + +### Bug Fixes + +* correctly parsing output when using structured_output ([8e74ac5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/8e74ac55a16ca012b52affbc754e4b04130e65db)) + +## [1.17.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.17.0-beta.3...v1.17.0-beta.4) (2024-09-02) + + +### Bug Fixes + +* Parse Node scraping link and img urls allowing OmniScraper to work ([66a3b6d](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/66a3b6d6a3efdf1ee72b802fc9bf8175482c45bd)) +* Removed link_urls and img_ulrs from FetchNode output ([57337a0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/57337a0a8c86fb28c9ccbd70d41acfc9abea11f0)) + +## [1.17.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.17.0-beta.2...v1.17.0-beta.3) (2024-09-02) + + +### Bug Fixes + +* **ScreenshotScraper:** impose dynamic imports 
([b8ef937](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b8ef93738ec4ae48c361fe5650df5194e845a2b1)) +* **SmartScraper:** pass llm_model to ParseNode ([5242166](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/52421665759032bcfad80ce540efebe5f47310f6)) + +## [1.17.0-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.17.0-beta.1...v1.17.0-beta.2) (2024-09-02) + + +### Bug Fixes + +* **Ollama:** instance model from correct package ([398b2c5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/398b2c556faf518ca28ccc284bc8761a16281cf7)) +* **DeepSeek:** proper model initialization ([74dfc69](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/74dfc693f6e487d20da58704284fe9f492d2b2aa)) +* screenshot scraper ([388630c](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/388630c0ffa2850c3d5ea47e62b71b41795203d8)) + +## [1.17.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.16.0...v1.17.0-beta.1) (2024-09-02) + + +### Features + +* add togheterai ([8f615ad](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/8f615adef320dacdd214a184981384dd05df8171)) + + +### Bug Fixes + +* update generate answernode ([c348f67](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/c348f674ad0caae4f4dc04e194fae9634e01b621)) + + +### chore + +* **examples:** create Together AI examples ([34942de](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/34942deca514df53e8aa1c7f96f812ee78b994bf)) + + +### CI + +* **release:** 1.16.0-beta.1 [skip ci] ([d7f6036](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/d7f6036f907eda8d1faa0944da4d1d168ca4c40e)) +* **release:** 1.16.0-beta.2 [skip ci] ([1c37d5d](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/1c37d5db1c637f791133df254838a0deade6d6be)) +* **release:** 1.16.0-beta.3 [skip ci] ([886c987](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/886c987172bb57fb59863e4d7b494797bba16980)) +* **release:** 1.16.0-beta.4 [skip ci] 
([ba5c7ad](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ba5c7adcea138d993005377f4cfe438795e1b124)) + + ## [1.16.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.15.2...v1.16.0) (2024-09-01) + ### Features * add deepcopy error ([71b22d4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/71b22d48804c462798109bb47ec792a5a3c70b6e)) @@ -54,10 +236,28 @@ ## [1.15.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.15.1...v1.15.2) (2024-09-01) +## [1.16.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.16.0-beta.2...v1.16.0-beta.3) (2024-09-01) + + ### Bug Fixes * pyproject.toml ([360ce1c](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/360ce1c0e468c959e63555120ac7cecf55563846)) + +### CI + +* **release:** 1.15.2 [skip ci] ([d88730c](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/d88730ccc7190d09a54e6c24db1644512b576430)) + +## [1.15.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.15.1...v1.15.2) (2024-09-01) + + + + +### Bug Fixes + +* pyproject.toml ([360ce1c](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/360ce1c0e468c959e63555120ac7cecf55563846)) + + ## [1.15.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.15.0...v1.15.1) (2024-08-28) diff --git a/README.md b/README.md index 75cda0eb..32068761 100644 --- a/README.md +++ b/README.md @@ -38,9 +38,10 @@ Additional dependecies can be added while installing the library: - More Language Models: additional language models are installed, such as Fireworks, Groq, Anthropic, Hugging Face, and Nvidia AI Endpoints. - ```bash - pip install scrapegraphai[other-language-models] - ``` + +This group allows you to use additional language models like Fireworks, Groq, Anthropic, Together AI, Hugging Face, and Nvidia AI Endpoints. +```bash +pip install scrapegraphai[other-language-models] - Semantic Options: this group includes tools for advanced semantic processing, such as Graphviz. 
@@ -58,6 +59,13 @@ Additional dependecies can be added while installing the library: +### Installing "More Browser Options" + +This group includes an ocr scraper for websites +```bash +pip install scrapegraphai[screenshot_scraper] +``` + ## 💻 Usage There are multiple standard scraping pipelines that can be used to extract information from a website (or local file). diff --git a/docs/chinese.md b/docs/chinese.md index 0ffd0777..5eb6460c 100644 --- a/docs/chinese.md +++ b/docs/chinese.md @@ -133,7 +133,7 @@ from scrapegraphai.graphs import SpeechGraph graph_config = { "llm": { "api_key": "OPENAI_API_KEY", - "model": "gpt-3.5-turbo", + "model": "openai/gpt-3.5-turbo", }, "tts_model": { "api_key": "OPENAI_API_KEY", diff --git a/docs/japanese.md b/docs/japanese.md index 65dbc4e0..7279321f 100644 --- a/docs/japanese.md +++ b/docs/japanese.md @@ -133,7 +133,7 @@ from scrapegraphai.graphs import SpeechGraph graph_config = { "llm": { "api_key": "OPENAI_API_KEY", - "model": "gpt-3.5-turbo", + "model": "openai/gpt-3.5-turbo", }, "tts_model": { "api_key": "OPENAI_API_KEY", diff --git a/docs/korean.md b/docs/korean.md index f868f8d8..64c287a0 100644 --- a/docs/korean.md +++ b/docs/korean.md @@ -132,7 +132,7 @@ from scrapegraphai.graphs import SpeechGraph graph_config = { "llm": { "api_key": "OPENAI_API_KEY", - "model": "gpt-3.5-turbo", + "model": "openai/gpt-3.5-turbo", }, "tts_model": { "api_key": "OPENAI_API_KEY", diff --git a/docs/russian.md b/docs/russian.md index 366194fb..84da9796 100644 --- a/docs/russian.md +++ b/docs/russian.md @@ -138,7 +138,7 @@ from scrapegraphai.graphs import SpeechGraph graph_config = { "llm": { "api_key": "OPENAI_API_KEY", - "model": "gpt-3.5-turbo", + "model": "openai/gpt-3.5-turbo", }, "tts_model": { "api_key": "OPENAI_API_KEY", diff --git a/docs/source/getting_started/examples.rst b/docs/source/getting_started/examples.rst index b406f7b3..af746b26 100644 --- a/docs/source/getting_started/examples.rst +++ 
b/docs/source/getting_started/examples.rst @@ -22,7 +22,7 @@ OpenAI models graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-3.5-turbo", + "model": "openai/gpt-3.5-turbo", }, } diff --git a/examples/anthropic/custom_graph_haiku.py b/examples/anthropic/custom_graph_haiku.py index d8b4dc19..96115d2e 100644 --- a/examples/anthropic/custom_graph_haiku.py +++ b/examples/anthropic/custom_graph_haiku.py @@ -40,7 +40,7 @@ fetch_node = FetchNode( input="url | local_dir", - output=["doc", "link_urls", "img_urls"], + output=["doc"], node_config={ "verbose": True, "headless": True, diff --git a/examples/anthropic/rate_limit_haiku.py b/examples/anthropic/rate_limit_haiku.py new file mode 100644 index 00000000..a01bff44 --- /dev/null +++ b/examples/anthropic/rate_limit_haiku.py @@ -0,0 +1,48 @@ +""" +Basic example of scraping pipeline using SmartScraper while setting an API rate limit. +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info + + +# required environment variables in .env +# ANTHROPIC_API_KEY +load_dotenv() + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +graph_config = { + "llm": { + "api_key": os.getenv("ANTHROPIC_API_KEY"), + "model": "anthropic/claude-3-haiku-20240307", + "rate_limit": { + "requests_per_second": 1 + } + }, +} + +smart_scraper_graph = SmartScraperGraph( + prompt="""Don't say anything else. Output JSON only. 
List me all the events, with the following fields: company_name, event_name, event_start_date, event_start_time, + event_end_date, event_end_time, location, event_mode, event_category, + third_party_redirect, no_of_days, + time_in_hours, hosted_or_attending, refreshments_type, + registration_available, registration_link""", + # also accepts a string with the already downloaded HTML code + source="https://www.hmhco.com/event", + config=graph_config +) + +result = smart_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/anthropic/smart_scraper_multi_concat_haiku.py b/examples/anthropic/smart_scraper_multi_concat_haiku.py new file mode 100644 index 00000000..5faa60c8 --- /dev/null +++ b/examples/anthropic/smart_scraper_multi_concat_haiku.py @@ -0,0 +1,39 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" + +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiConcatGraph + +load_dotenv() + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +graph_config = { + "llm": { + "api_key": os.getenv("ANTHROPIC_API_KEY"), + "model": "anthropic/claude-3-haiku-20240307", + }, +} + + +# ******************************************************* +# Create the SmartScraperMultiGraph instance and run it +# ******************************************************* + +multiple_search_graph = SmartScraperMultiConcatGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git 
a/examples/azure/rate_limit_azure.py b/examples/azure/rate_limit_azure.py new file mode 100644 index 00000000..cfd05f1f --- /dev/null +++ b/examples/azure/rate_limit_azure.py @@ -0,0 +1,57 @@ +""" +Basic example of scraping pipeline using SmartScraper with a custom rate limit +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info + + +# required environment variable in .env +# AZURE_OPENAI_ENDPOINT +# AZURE_OPENAI_CHAT_DEPLOYMENT_NAME +# MODEL_NAME +# AZURE_OPENAI_API_KEY +# OPENAI_API_TYPE +# AZURE_OPENAI_API_VERSION +# AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME +load_dotenv() + + +# ************************************************ +# Initialize the model instances +# ************************************************ + +graph_config = { + "llm": { + "api_key": os.environ["AZURE_OPENAI_KEY"], + "model": "azure_openai/gpt-3.5-turbo", + "rate_limit": { + "requests_per_second": 1 + }, + }, + "verbose": True, + "headless": False +} + +smart_scraper_graph = SmartScraperGraph( + prompt="""List me all the events, with the following fields: company_name, event_name, event_start_date, event_start_time, + event_end_date, event_end_time, location, event_mode, event_category, + third_party_redirect, no_of_days, + time_in_hours, hosted_or_attending, refreshments_type, + registration_available, registration_link""", + # also accepts a string with the already downloaded HTML code + source="https://www.hmhco.com/event", + config=graph_config +) + +result = smart_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/azure/smart_scraper_multi_azure.py b/examples/azure/smart_scraper_multi_azure.py index a4f26d7e..f1f3451e 100644 --- 
a/examples/azure/smart_scraper_multi_azure.py +++ b/examples/azure/smart_scraper_multi_azure.py @@ -1,8 +1,8 @@ """ Basic example of scraping pipeline using SmartScraper """ - -import os, json +import os +import json from dotenv import load_dotenv from scrapegraphai.graphs import SmartScraperMultiGraph diff --git a/examples/azure/smart_scraper_multi_concat_azure.py b/examples/azure/smart_scraper_multi_concat_azure.py new file mode 100644 index 00000000..e3870a4c --- /dev/null +++ b/examples/azure/smart_scraper_multi_concat_azure.py @@ -0,0 +1,39 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" + +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiConcatGraph + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ +graph_config = { + "llm": { + "api_key": os.environ["AZURE_OPENAI_KEY"], + "model": "azure_openai/gpt-3.5-turbo", + }, + "verbose": True, + "headless": False +} + +# ******************************************************* +# Create the SmartScraperMultiGraph instance and run it +# ******************************************************* + +multiple_search_graph = SmartScraperMultiConcatGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/bedrock/custom_graph_bedrock.py b/examples/bedrock/custom_graph_bedrock.py index 9002a598..d72f6999 100644 --- a/examples/bedrock/custom_graph_bedrock.py +++ b/examples/bedrock/custom_graph_bedrock.py @@ -55,7 +55,7 @@ fetch_node = FetchNode( input="url | local_dir", - output=["doc", "link_urls", "img_urls"], + output=["doc"], node_config={ "verbose": True, "headless": True, diff --git a/examples/bedrock/rate_limit_bedrock.py 
b/examples/bedrock/rate_limit_bedrock.py new file mode 100644 index 00000000..79a76a3e --- /dev/null +++ b/examples/bedrock/rate_limit_bedrock.py @@ -0,0 +1,47 @@ +""" +Basic example of scraping pipeline using SmartScraper with a custom rate limit +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "client": "client_name", + "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + "temperature": 0.0, + "rate_limit": { + "requests_per_second": 1 + }, + } +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the projects with their description", + # also accepts a string with the already downloaded HTML code + source="https://perinim.github.io/projects/", + config=graph_config +) + +result = smart_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/bedrock/smart_scraper_multi_bedrock.py b/examples/bedrock/smart_scraper_multi_bedrock.py index b363d6ab..bbff3d12 100644 --- a/examples/bedrock/smart_scraper_multi_bedrock.py +++ b/examples/bedrock/smart_scraper_multi_bedrock.py @@ -1,12 +1,9 @@ """ Basic example of scraping pipeline using SmartScraper """ - -import os, json -from dotenv import load_dotenv +import json from scrapegraphai.graphs import SmartScraperMultiGraph -load_dotenv() # ************************************************ # Define the 
configuration for the graph diff --git a/examples/bedrock/smart_scraper_multi_concat_bedrock.py b/examples/bedrock/smart_scraper_multi_concat_bedrock.py new file mode 100644 index 00000000..74c30a3f --- /dev/null +++ b/examples/bedrock/smart_scraper_multi_concat_bedrock.py @@ -0,0 +1,35 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import json +from scrapegraphai.graphs import SmartScraperMultiConcatGraph + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "client": "client_name", + "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + "temperature": 0.0 + } +} + + +# ******************************************************* +# Create the SmartScraperMultiGraph instance and run it +# ******************************************************* + +multiple_search_graph = SmartScraperMultiConcatGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/benchmarks/GenerateScraper/benchmark_openai_gpt35.py b/examples/benchmarks/GenerateScraper/benchmark_openai_gpt35.py index e0a27161..83ed3913 100644 --- a/examples/benchmarks/GenerateScraper/benchmark_openai_gpt35.py +++ b/examples/benchmarks/GenerateScraper/benchmark_openai_gpt35.py @@ -24,7 +24,7 @@ graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-3.5-turbo", + "model": "openai/gpt-3.5-turbo", }, "library": "beautifoulsoup" } diff --git a/examples/benchmarks/GenerateScraper/benchmark_openai_gpt4.py b/examples/benchmarks/GenerateScraper/benchmark_openai_gpt4.py index 3e451a73..37791c29 100644 --- a/examples/benchmarks/GenerateScraper/benchmark_openai_gpt4.py +++ b/examples/benchmarks/GenerateScraper/benchmark_openai_gpt4.py @@ -24,7 +24,7 @@ 
graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-4-turbo-2024-04-09", + "model": "openai/gpt-4-turbo-2024-04-09", }, "library": "beautifoulsoup" } diff --git a/examples/benchmarks/SmartScraper/benchmark_openai_gpt35.py b/examples/benchmarks/SmartScraper/benchmark_openai_gpt35.py index d615be7f..659d2c78 100644 --- a/examples/benchmarks/SmartScraper/benchmark_openai_gpt35.py +++ b/examples/benchmarks/SmartScraper/benchmark_openai_gpt35.py @@ -24,7 +24,7 @@ graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-3.5-turbo", + "model": "openai/gpt-3.5-turbo", }, } diff --git a/examples/benchmarks/SmartScraper/benchmark_openai_gpt4.py b/examples/benchmarks/SmartScraper/benchmark_openai_gpt4.py index 835ec7b4..a23901a9 100644 --- a/examples/benchmarks/SmartScraper/benchmark_openai_gpt4.py +++ b/examples/benchmarks/SmartScraper/benchmark_openai_gpt4.py @@ -25,7 +25,7 @@ graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-4-turbo", + "model": "openai/gpt-4-turbo", }, } diff --git a/examples/benchmarks/SmartScraper/benchmark_openai_gpt4o.py b/examples/benchmarks/SmartScraper/benchmark_openai_gpt4o.py index aa273c5b..8b2da6d7 100644 --- a/examples/benchmarks/SmartScraper/benchmark_openai_gpt4o.py +++ b/examples/benchmarks/SmartScraper/benchmark_openai_gpt4o.py @@ -25,7 +25,7 @@ graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-4o", + "model": "openai/gpt-4o", }, } diff --git a/examples/deepseek/rate_limit_deepseek.py b/examples/deepseek/rate_limit_deepseek.py new file mode 100644 index 00000000..36278452 --- /dev/null +++ b/examples/deepseek/rate_limit_deepseek.py @@ -0,0 +1,49 @@ +""" +Basic example of scraping pipeline using SmartScraper with a custom rate limit +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + + +# ************************************************ +# Define the configuration for the 
graph +# ************************************************ + +deepseek_key = os.getenv("DEEPSEEK_APIKEY") + +graph_config = { + "llm": { + "model": "deepseek/deepseek-chat", + "api_key": deepseek_key, + "rate_limit": { + "requests_per_second": 1 + } + }, + "verbose": True, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the projects with their description.", + # also accepts a string with the already downloaded HTML code + source="https://perinim.github.io/projects/", + config=graph_config +) + +result = smart_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/deepseek/smart_scraper_deepseek.py b/examples/deepseek/smart_scraper_deepseek.py index 4c49b160..c94a5a80 100644 --- a/examples/deepseek/smart_scraper_deepseek.py +++ b/examples/deepseek/smart_scraper_deepseek.py @@ -9,7 +9,6 @@ load_dotenv() - # ************************************************ # Define the configuration for the graph # ************************************************ diff --git a/examples/deepseek/smart_scraper_multi_concat_deepseek.py b/examples/deepseek/smart_scraper_multi_concat_deepseek.py new file mode 100644 index 00000000..bf6c0c53 --- /dev/null +++ b/examples/deepseek/smart_scraper_multi_concat_deepseek.py @@ -0,0 +1,42 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" + +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiConcatGraph + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ 
+ +deepseek_key = os.getenv("DEEPSEEK_APIKEY") + +graph_config = { + "llm": { + "model": "deepseek/deepseek-chat", + "api_key": deepseek_key, + }, + "verbose": True, +} + + +# ******************************************************* +# Create the SmartScraperMultiGraph instance and run it +# ******************************************************* + +multiple_search_graph = SmartScraperMultiConcatGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/ernie/custom_graph_ernie.py b/examples/ernie/custom_graph_ernie.py index 5dad8bac..57d422e5 100644 --- a/examples/ernie/custom_graph_ernie.py +++ b/examples/ernie/custom_graph_ernie.py @@ -43,7 +43,7 @@ fetch_node = FetchNode( input="url | local_dir", - output=["doc", "link_urls", "img_urls"], + output=["doc"], node_config={ "verbose": True, "headless": True, diff --git a/examples/ernie/rate_limit_ernie.py b/examples/ernie/rate_limit_ernie.py new file mode 100644 index 00000000..41314e87 --- /dev/null +++ b/examples/ernie/rate_limit_ernie.py @@ -0,0 +1,49 @@ +""" +Basic example of scraping pipeline using SmartScraper with a custom rate limit +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "model": "ernie/ernie-bot-turbo", + "ernie_client_id": "", + "ernie_client_secret": "", + "temperature": 0.1, + "rate_limit": { + "requests_per_second": 1 + }, + }, + "library": "beautifulsoup" +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# 
************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the projects with their description", + # also accepts a string with the already downloaded HTML code + source="https://perinim.github.io/projects/", + config=graph_config, +) + +result = smart_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/ernie/smart_scraper_ernie.py b/examples/ernie/smart_scraper_ernie.py index 56084dad..9fcc7820 100644 --- a/examples/ernie/smart_scraper_ernie.py +++ b/examples/ernie/smart_scraper_ernie.py @@ -2,19 +2,14 @@ Basic example of scraping pipeline using SmartScraper """ -import os -from dotenv import load_dotenv from scrapegraphai.graphs import SmartScraperGraph from scrapegraphai.utils import prettify_exec_info -load_dotenv() - - # ************************************************ # Define the configuration for the graph # ************************************************ -graph_config = { +graph_config = { "llm": { "model": "ernie/ernie-bot-turbo", "ernie_client_id": "", diff --git a/examples/ernie/smart_scraper_multi_concat_ernie.py b/examples/ernie/smart_scraper_multi_concat_ernie.py new file mode 100644 index 00000000..5be9898d --- /dev/null +++ b/examples/ernie/smart_scraper_multi_concat_ernie.py @@ -0,0 +1,35 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import json +from scrapegraphai.graphs import SmartScraperMultiConcatGraph + +# ************************************************ +# Define the configuration for the graph +# ************************************************ +graph_config = { + "llm": { + "model": "ernie/ernie-bot-turbo", + "ernie_client_id": "", + "ernie_client_secret": "", + "temperature": 0.1 + }, + "library": "beautifulsoup" +} + 
+# ******************************************************* +# Create the SmartScraperMultiGraph instance and run it +# ******************************************************* + +multiple_search_graph = SmartScraperMultiConcatGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/extras/Savedscreenshots/test_image.jpeg b/examples/extras/Savedscreenshots/test_image.jpeg new file mode 100644 index 00000000..159625bc Binary files /dev/null and b/examples/extras/Savedscreenshots/test_image.jpeg differ diff --git a/examples/extras/browser_base_integration.py b/examples/extras/browser_base_integration.py index 61363024..7030e101 100644 --- a/examples/extras/browser_base_integration.py +++ b/examples/extras/browser_base_integration.py @@ -18,7 +18,7 @@ graph_config = { "llm": { "api_key": os.getenv("OPENAI_API_KEY"), - "model": "gpt-4o", + "model": "openai/gpt-4o", }, "browser_base": { "api_key": os.getenv("BROWSER_BASE_API_KEY"), diff --git a/examples/extras/custom_prompt.py b/examples/extras/custom_prompt.py index bfee86ce..7def35a3 100644 --- a/examples/extras/custom_prompt.py +++ b/examples/extras/custom_prompt.py @@ -21,7 +21,7 @@ graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-3.5-turbo", + "model": "openai/gpt-3.5-turbo", }, "additional_info": prompt, "verbose": True, diff --git a/examples/extras/no_cut.py b/examples/extras/no_cut.py index b7aa3452..71bfad86 100644 --- a/examples/extras/no_cut.py +++ b/examples/extras/no_cut.py @@ -15,7 +15,7 @@ graph_config = { "llm": { "api_key": "s", - "model": "gpt-3.5-turbo", + "model": "openai/gpt-3.5-turbo", }, "cut": False, "verbose": True, diff --git a/examples/extras/proxy_rotation.py b/examples/extras/proxy_rotation.py index 28400859..adfb87ed 100644 --- a/examples/extras/proxy_rotation.py +++ 
b/examples/extras/proxy_rotation.py @@ -13,7 +13,7 @@ graph_config = { "llm": { "api_key": "API_KEY", - "model": "gpt-3.5-turbo", + "model": "openai/gpt-3.5-turbo", }, "loader_kwargs": { "proxy" : { diff --git a/examples/extras/rag_caching.py b/examples/extras/rag_caching.py index 8f42dbbd..df73d2b4 100644 --- a/examples/extras/rag_caching.py +++ b/examples/extras/rag_caching.py @@ -19,7 +19,7 @@ graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-3.5-turbo", + "model": "openai/gpt-3.5-turbo", }, "caching": True } diff --git a/examples/extras/scrape_do.py b/examples/extras/scrape_do.py new file mode 100644 index 00000000..45026f21 --- /dev/null +++ b/examples/extras/scrape_do.py @@ -0,0 +1,40 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" + +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + + +graph_config = { + "llm": { + "api_key": os.getenv("OPENAI_API_KEY"), + "model": "openai/gpt-4o", + }, + "scrape_do": { + "api_key": os.getenv("SCRAPE_DO_API_KEY"), + }, + "verbose": True, + "headless": False, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the projects", + source="https://perinim.github.io/projects/", + config=graph_config +) + +result = smart_scraper_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/extras/screenshot_scaping.py b/examples/extras/screenshot_scaping.py new file mode 100644 index 00000000..439c2a0c --- /dev/null +++ b/examples/extras/screenshot_scaping.py @@ -0,0 +1,32 @@ +""" +example of scraping with screenshots +""" +import asyncio +from scrapegraphai.utils.screenshot_scraping import 
(take_screenshot, + select_area_with_opencv, + crop_image, detect_text) + +# STEP 1: Take a screenshot +image = asyncio.run(take_screenshot( + url="https://colab.google/", + save_path="Savedscreenshots/test_image.jpeg", + quality = 50 +)) + +# STEP 2 (Optional): Select an area of the image which you want to use for text detection. +LEFT, TOP, RIGHT, BOTTOM = select_area_with_opencv(image) +print("LEFT: ", LEFT, " TOP: ", TOP, " RIGHT: ", RIGHT, " BOTTOM: ", BOTTOM) + +# STEP 3 (Optional): Crop the image. +# Note: If any of the coordinates (LEFT, TOP, RIGHT, BOTTOM) is None, +# it will be set to the corresponding edge of the image. +cropped_image = crop_image(image, LEFT=LEFT, RIGHT=RIGHT,TOP=TOP,BOTTOM=BOTTOM) + +# STEP 4: Detect text +TEXT = detect_text( + cropped_image, # The image to detect text from + languages = ["en"] # The languages to detect text in +) + +print("DETECTED TEXT: ") +print(TEXT) diff --git a/examples/extras/serch_graph_scehma.py b/examples/extras/serch_graph_scehma.py index fe66530d..f4135d19 100644 --- a/examples/extras/serch_graph_scehma.py +++ b/examples/extras/serch_graph_scehma.py @@ -23,7 +23,7 @@ class Ceos(BaseModel): graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-4o", + "model": "openai/gpt-4o", }, "max_results": 2, "verbose": True, diff --git a/examples/fireworks/custom_graph_fireworks.py b/examples/fireworks/custom_graph_fireworks.py index d0dcd994..66784d5b 100644 --- a/examples/fireworks/custom_graph_fireworks.py +++ b/examples/fireworks/custom_graph_fireworks.py @@ -43,7 +43,7 @@ fetch_node = FetchNode( input="url | local_dir", - output=["doc", "link_urls", "img_urls"], + output=["doc"], node_config={ "verbose": True, "headless": True, diff --git a/examples/fireworks/rate_limit_fireworks.py b/examples/fireworks/rate_limit_fireworks.py new file mode 100644 index 00000000..b19cb770 --- /dev/null +++ b/examples/fireworks/rate_limit_fireworks.py @@ -0,0 +1,50 @@ +""" +Basic example of scraping pipeline using 
SmartScraper with a custom rate limit +""" + +import os, json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +fireworks_api_key = os.getenv("FIREWORKS_APIKEY") + +graph_config = { + "llm": { + "api_key": fireworks_api_key, + "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct", + "rate_limit": { + "requests_per_second": 1 + }, + }, + "verbose": True, + "headless": False, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the projects with their description", + # also accepts a string with the already downloaded HTML code + source="https://perinim.github.io/projects/", + config=graph_config, +) + +result = smart_scraper_graph.run() +print(json.dumps(result, indent=4)) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/fireworks/smart_scraper_fireworks.py b/examples/fireworks/smart_scraper_fireworks.py index cff9aedb..778f1a07 100644 --- a/examples/fireworks/smart_scraper_fireworks.py +++ b/examples/fireworks/smart_scraper_fireworks.py @@ -1,8 +1,8 @@ """ Basic example of scraping pipeline using SmartScraper """ - -import os, json +import os +import json from dotenv import load_dotenv from scrapegraphai.graphs import SmartScraperGraph from scrapegraphai.utils import prettify_exec_info diff --git a/examples/fireworks/smart_scraper_multi_concat_fireworks.py b/examples/fireworks/smart_scraper_multi_concat_fireworks.py new 
file mode 100644 index 00000000..c0da49a3 --- /dev/null +++ b/examples/fireworks/smart_scraper_multi_concat_fireworks.py @@ -0,0 +1,37 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiConcatGraph + +load_dotenv() + +fireworks_api_key = os.getenv("FIREWORKS_APIKEY") + +graph_config = { + "llm": { + "api_key": fireworks_api_key, + "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" + }, + "verbose": True, + "headless": False, +} + +# ******************************************************* +# Create the SmartScraperMultiGraph instance and run it +# ******************************************************* + +multiple_search_graph = SmartScraperMultiConcatGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/google_genai/rate_limit_gemini.py b/examples/google_genai/rate_limit_gemini.py new file mode 100644 index 00000000..f4e68f69 --- /dev/null +++ b/examples/google_genai/rate_limit_gemini.py @@ -0,0 +1,47 @@ +""" +Basic example of scraping pipeline using SmartScraper with a custom rate limit +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.utils import prettify_exec_info +from scrapegraphai.graphs import SmartScraperGraph +load_dotenv() + + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +gemini_key = os.getenv("GOOGLE_APIKEY") + +graph_config = { + "llm": { + "api_key": gemini_key, + "model": "google_genai/gemini-pro", + "rate_limit": { + "requests_per_second": 1 + } + }, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# 
************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the news with their description.", + # also accepts a string with the already downloaded HTML code + source="https://www.wired.com", + config=graph_config +) + +result = smart_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/google_genai/smart_scraper_multi_concat_gemini.py b/examples/google_genai/smart_scraper_multi_concat_gemini.py new file mode 100644 index 00000000..facd74c3 --- /dev/null +++ b/examples/google_genai/smart_scraper_multi_concat_gemini.py @@ -0,0 +1,40 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" + +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiConcatGraph + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +gemini_key = os.getenv("GOOGLE_APIKEY") + +graph_config = { + "llm": { + "api_key": gemini_key, + "model": "google_genai/gemini-pro", + }, +} + +# ******************************************************* +# Create the SmartScraperMultiGraph instance and run it +# ******************************************************* + +multiple_search_graph = SmartScraperMultiConcatGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/google_vertexai/rate_limit_gemini.py b/examples/google_vertexai/rate_limit_gemini.py new file mode 100644 index 00000000..c5f15a35 --- /dev/null +++ 
b/examples/google_vertexai/rate_limit_gemini.py @@ -0,0 +1,47 @@ +""" +Basic example of scraping pipeline using SmartScraper with a custom rate limit +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.utils import prettify_exec_info +from scrapegraphai.graphs import SmartScraperGraph +load_dotenv() + + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +gemini_key = os.getenv("GOOGLE_APIKEY") + +graph_config = { + "llm": { + "api_key": gemini_key, + "model": "google_vertexai/gemini-1.5-pro", + "rate_limit": { + "requests_per_second": 1 + } + }, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the news with their description.", + # also accepts a string with the already downloaded HTML code + source="https://www.wired.com", + config=graph_config +) + +result = smart_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/google_vertexai/smart_scraper_gemini.py b/examples/google_vertexai/smart_scraper_gemini.py index 0888d656..4ed7c352 100644 --- a/examples/google_vertexai/smart_scraper_gemini.py +++ b/examples/google_vertexai/smart_scraper_gemini.py @@ -6,8 +6,8 @@ from dotenv import load_dotenv from scrapegraphai.utils import prettify_exec_info from scrapegraphai.graphs import SmartScraperGraph -load_dotenv() +load_dotenv() # ************************************************ # Define the configuration for the graph diff --git a/examples/google_vertexai/smart_scraper_multi_concat_gemini.py 
b/examples/google_vertexai/smart_scraper_multi_concat_gemini.py new file mode 100644 index 00000000..c6874ff6 --- /dev/null +++ b/examples/google_vertexai/smart_scraper_multi_concat_gemini.py @@ -0,0 +1,36 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" + +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiConcatGraph + +load_dotenv() + +gemini_key = os.getenv("GOOGLE_APIKEY") + +graph_config = { + "llm": { + "api_key": gemini_key, + "model": "google_vertexai/gemini-1.5-pro", + }, +} + +# ******************************************************* +# Create the SmartScraperMultiGraph instance and run it +# ******************************************************* + +multiple_search_graph = SmartScraperMultiConcatGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/groq/custom_graph_groq.py b/examples/groq/custom_graph_groq.py index 79d2f0c6..f0d7e215 100644 --- a/examples/groq/custom_graph_groq.py +++ b/examples/groq/custom_graph_groq.py @@ -43,7 +43,7 @@ fetch_node = FetchNode( input="url | local_dir", - output=["doc", "link_urls", "img_urls"], + output=["doc"], node_config={ "verbose": True, "headless": True, diff --git a/examples/mixed_models/smart_scraper_groq_ollama.py b/examples/groq/rate_limit_groq.py similarity index 87% rename from examples/mixed_models/smart_scraper_groq_ollama.py rename to examples/groq/rate_limit_groq.py index f32f3493..976127be 100644 --- a/examples/mixed_models/smart_scraper_groq_ollama.py +++ b/examples/groq/rate_limit_groq.py @@ -9,7 +9,6 @@ load_dotenv() - # ************************************************ # Define the configuration for the graph # ************************************************ @@ -20,12 +19,10 @@ "llm": { "model": "groq/gemma-7b-it", 
"api_key": groq_key, - "temperature": 0 - }, - "embeddings": { - "model": "ollama/nomic-embed-text", "temperature": 0, - "base_url": "http://localhost:11434", # set ollama URL arbitrarily + "rate_limit": { + "requests_per_second": 1 + } }, "headless": False } diff --git a/examples/groq/smart_scraper_multi_concat_groq.py b/examples/groq/smart_scraper_multi_concat_groq.py new file mode 100644 index 00000000..038ca37c --- /dev/null +++ b/examples/groq/smart_scraper_multi_concat_groq.py @@ -0,0 +1,42 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" + +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiConcatGraph + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +groq_key = os.getenv("GROQ_APIKEY") + +graph_config = { + "llm": { + "model": "groq/gemma-7b-it", + "api_key": groq_key, + "temperature": 0 + }, + "headless": False +} + +# ******************************************************* +# Create the SmartScraperMultiGraph instance and run it +# ******************************************************* + +multiple_search_graph = SmartScraperMultiConcatGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/huggingfacehub/custom_graph_huggingfacehub.py b/examples/huggingfacehub/custom_graph_huggingfacehub.py index 604bfae8..cec007b7 100644 --- a/examples/huggingfacehub/custom_graph_huggingfacehub.py +++ b/examples/huggingfacehub/custom_graph_huggingfacehub.py @@ -55,7 +55,7 @@ fetch_node = FetchNode( input="url | local_dir", - output=["doc", "link_urls", "img_urls"], + output=["doc"], node_config={ "verbose": True, "headless": True, diff --git 
a/examples/huggingfacehub/smart_scraper_huggingfacehub.py b/examples/huggingfacehub/smart_scraper_huggingfacehub.py index 6f9a863f..a50b574e 100644 --- a/examples/huggingfacehub/smart_scraper_huggingfacehub.py +++ b/examples/huggingfacehub/smart_scraper_huggingfacehub.py @@ -10,8 +10,6 @@ from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings - - ## required environment variable in .env #HUGGINGFACEHUB_API_TOKEN load_dotenv() diff --git a/examples/huggingfacehub/smart_scraper_multi_concat_huggingfacehub.py b/examples/huggingfacehub/smart_scraper_multi_concat_huggingfacehub.py new file mode 100644 index 00000000..3f2d7135 --- /dev/null +++ b/examples/huggingfacehub/smart_scraper_multi_concat_huggingfacehub.py @@ -0,0 +1,53 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" + +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiConcatGraph +from langchain_community.llms import HuggingFaceEndpoint +from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings + +load_dotenv() + +HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN') +# ************************************************ +# Initialize the model instances +# ************************************************ + +repo_id = "mistralai/Mistral-7B-Instruct-v0.2" + +llm_model_instance = HuggingFaceEndpoint( + repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN +) + + +embedder_model_instance = HuggingFaceInferenceAPIEmbeddings( + api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2" +) + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +graph_config = { + "llm": {"model_instance": llm_model_instance}, +} + +# ******************************************************* +# Create the SmartScraperMultiGraph instance and run it +# 
******************************************************* + +multiple_search_graph = SmartScraperMultiConcatGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/integrations/indexify_node_example.py b/examples/integrations/indexify_node_example.py index a77b6bd2..fae2403a 100644 --- a/examples/integrations/indexify_node_example.py +++ b/examples/integrations/indexify_node_example.py @@ -32,7 +32,7 @@ class Images(BaseModel): graph_config = { "llm": { "api_key":openai_key, - "model": "gpt-3.5-turbo", + "model": "openai/gpt-3.5-turbo", }, "verbose": True, "headless": False, diff --git a/examples/local_models/custom_graph_ollama.py b/examples/local_models/custom_graph_ollama.py index 66dd59b6..c505d068 100644 --- a/examples/local_models/custom_graph_ollama.py +++ b/examples/local_models/custom_graph_ollama.py @@ -44,7 +44,7 @@ fetch_node = FetchNode( input="url | local_dir", - output=["doc", "link_urls", "img_urls"], + output=["doc"], node_config={ "verbose": True, "headless": True, diff --git a/examples/local_models/smart_scraper_multi_concat_ollama.py b/examples/local_models/smart_scraper_multi_concat_ollama.py new file mode 100644 index 00000000..665b5db4 --- /dev/null +++ b/examples/local_models/smart_scraper_multi_concat_ollama.py @@ -0,0 +1,42 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" + +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiConcatGraph + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "model": "ollama/llama3.1", + "temperature": 0, + "format": "json", # Ollama needs the format to be specified explicitly + "base_url": 
"http://localhost:11434", # set ollama URL arbitrarily + }, + "verbose": True, + "headless": False +} + +# ******************************************************* +# Create the SmartScraperMultiGraph instance and run it +# ******************************************************* + +multiple_search_graph = SmartScraperMultiConcatGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/local_models/smart_scraper_multi_ollama.py b/examples/local_models/smart_scraper_multi_ollama.py new file mode 100644 index 00000000..c9d49793 --- /dev/null +++ b/examples/local_models/smart_scraper_multi_ollama.py @@ -0,0 +1,39 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" + +import json +from scrapegraphai.graphs import SmartScraperMultiGraph + +# ************************************************ +# Define the configuration for the graph +# ************************************************ +graph_config = { + "llm": { + "model": "ollama/llama3.1", + "temperature": 0, + "format": "json", # Ollama needs the format to be specified explicitly + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + + "verbose": True, + "headless": False +} + + +# ******************************************************* +# Create the SmartScraperMultiGraph instance and run it +# ******************************************************* + +multiple_search_graph = SmartScraperMultiGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/local_models/smart_scraper_schema_ollama.py b/examples/local_models/smart_scraper_schema_ollama.py index a732aa8d..5f15b080 100644 --- 
a/examples/local_models/smart_scraper_schema_ollama.py +++ b/examples/local_models/smart_scraper_schema_ollama.py @@ -24,7 +24,7 @@ class Projects(BaseModel): "format": "json", # Ollama needs the format to be specified explicitly # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, - + "verbose": True, "headless": False } diff --git a/examples/mistral/custom_graph_mistral.py b/examples/mistral/custom_graph_mistral.py index f02ead0c..ec2878c1 100644 --- a/examples/mistral/custom_graph_mistral.py +++ b/examples/mistral/custom_graph_mistral.py @@ -42,7 +42,7 @@ fetch_node = FetchNode( input="url | local_dir", - output=["doc", "link_urls", "img_urls"], + output=["doc"], node_config={ "verbose": True, "headless": True, diff --git a/examples/mistral/rate_limit_mistral.py b/examples/mistral/rate_limit_mistral.py new file mode 100644 index 00000000..fbd65a1a --- /dev/null +++ b/examples/mistral/rate_limit_mistral.py @@ -0,0 +1,46 @@ +""" +Basic example of scraping pipeline using SmartScraper with a custom rate limit +""" + +import os, json +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info +from dotenv import load_dotenv +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + + +graph_config = { + "llm": { + "api_key": os.getenv("MISTRAL_API_KEY"), + "model": "mistralai/open-mistral-nemo", + "rate_limit": { + "requests_per_second": 1 + } + }, + "verbose": True, + "headless": False, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me what does the company do, the name and a contact email.", + source="https://scrapegraphai.com/", + config=graph_config +) + +result = smart_scraper_graph.run() +print(json.dumps(result, indent=4)) + 
+# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/mistral/smart_scraper_mistral.py b/examples/mistral/smart_scraper_mistral.py index 7291a40a..a2f82504 100644 --- a/examples/mistral/smart_scraper_mistral.py +++ b/examples/mistral/smart_scraper_mistral.py @@ -1,11 +1,11 @@ """ Basic example of scraping pipeline using SmartScraper """ - -import os, json -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info +import os +import json from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph + load_dotenv() # ************************************************ @@ -34,10 +34,3 @@ result = smart_scraper_graph.run() print(json.dumps(result, indent=4)) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/mistral/smart_scraper_multi_concat_mistral.py b/examples/mistral/smart_scraper_multi_concat_mistral.py new file mode 100644 index 00000000..cef9e16e --- /dev/null +++ b/examples/mistral/smart_scraper_multi_concat_mistral.py @@ -0,0 +1,39 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" + +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiConcatGraph + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ +graph_config = { + "llm": { + "api_key": os.getenv("MISTRAL_API_KEY"), + "model": "mistralai/open-mistral-nemo", + }, + "verbose": True, + "headless": False, +} + +# 
******************************************************* +# Create the SmartScraperMultiGraph instance and run it +# ******************************************************* + +multiple_search_graph = SmartScraperMultiConcatGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/mixed_models/.env.example b/examples/mixed_models/.env.example deleted file mode 100644 index ee2490d5..00000000 --- a/examples/mixed_models/.env.example +++ /dev/null @@ -1,5 +0,0 @@ -OPENAI_APIKEY="your openai api key" -GOOGLE_APIKEY="your google api key" -AZURE_OPENAI_API_KEY="your azure api key" -AZURE_OPENAI_ENDPOINT="https://.openai.azure.com/" -GROQ_APIKEY= "your groq key" \ No newline at end of file diff --git a/examples/mixed_models/custom_graph_groq_openai.py b/examples/mixed_models/custom_graph_groq_openai.py deleted file mode 100644 index 942b0fcb..00000000 --- a/examples/mixed_models/custom_graph_groq_openai.py +++ /dev/null @@ -1,118 +0,0 @@ -""" -Example of custom graph using existing nodes -""" - -import os -from dotenv import load_dotenv - -from langchain_openai import OpenAIEmbeddings -from langchain_openai import ChatOpenAI -from scrapegraphai.graphs import BaseGraph -from scrapegraphai.nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode, RobotsNode -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ -groq_key = os.getenv("GROQ_APIKEY") -openai_key = os.getenv("OPENAI_APIKEY") - -graph_config = { - "llm": { - "model": "groq/gemma-7b-it", - "api_key": groq_key, - "temperature": 0 - }, - "embeddings": { - "api_key": openai_key, - "model": "openai", - }, - "verbose": True, - "headless": False -} - -# ************************************************ -# 
Define the graph nodes -# ************************************************ - -llm_model = OpenAI(graph_config["llm"]) -embedder = OpenAIEmbeddings(api_key=llm_model.openai_api_key) - -# define the nodes for the graph -robot_node = RobotsNode( - input="url", - output=["is_scrapable"], - node_config={ - "llm_model": llm_model, - "force_scraping": True, - "verbose": True, - } -) - -fetch_node = FetchNode( - input="url | local_dir", - output=["doc", "link_urls", "img_urls"], - node_config={ - "verbose": True, - "headless": True, - } -) -parse_node = ParseNode( - input="doc", - output=["parsed_doc"], - node_config={ - "chunk_size": 4096, - "verbose": True, - } -) -rag_node = RAGNode( - input="user_prompt & (parsed_doc | doc)", - output=["relevant_chunks"], - node_config={ - "llm_model": llm_model, - "embedder_model": embedder, - "verbose": True, - } -) -generate_answer_node = GenerateAnswerNode( - input="user_prompt & (relevant_chunks | parsed_doc | doc)", - output=["answer"], - node_config={ - "llm_model": llm_model, - "verbose": True, - } -) - -# ************************************************ -# Create the graph by defining the connections -# ************************************************ - -graph = BaseGraph( - nodes=[ - robot_node, - fetch_node, - parse_node, - rag_node, - generate_answer_node, - ], - edges=[ - (robot_node, fetch_node), - (fetch_node, parse_node), - (parse_node, rag_node), - (rag_node, generate_answer_node) - ], - entry_point=robot_node -) - -# ************************************************ -# Execute the graph -# ************************************************ - -result, execution_info = graph.execute({ - "user_prompt": "Describe the content", - "url": "https://example.com/" -}) - -# get the answer from the result -result = result.get("answer", "No answer found.") -print(result) diff --git a/examples/mixed_models/inputs/plain_html_example.txt b/examples/mixed_models/inputs/plain_html_example.txt deleted file mode 100644 index 
78f814ae..00000000 --- a/examples/mixed_models/inputs/plain_html_example.txt +++ /dev/null @@ -1,105 +0,0 @@ - -
- - -
-
-
- - -
- \ No newline at end of file diff --git a/examples/mixed_models/readme.md b/examples/mixed_models/readme.md deleted file mode 100644 index 9e739212..00000000 --- a/examples/mixed_models/readme.md +++ /dev/null @@ -1 +0,0 @@ -This folder contains an example of how to use ScrapeGraph-AI with mixed models. The example shows how to extract information from a website using a natural language prompt and a machine learning model. \ No newline at end of file diff --git a/examples/moonshot/smart_scraper_multi_concat_moonshot.py b/examples/moonshot/smart_scraper_multi_concat_moonshot.py new file mode 100644 index 00000000..1e652db4 --- /dev/null +++ b/examples/moonshot/smart_scraper_multi_concat_moonshot.py @@ -0,0 +1,52 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from langchain_community.chat_models.moonshot import MoonshotChat +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiConcatGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + + +llm_instance_config = { + "model": "moonshot-v1-8k", + "base_url": "https://api.moonshot.cn/v1", + "moonshot_api_key": os.getenv("MOONLIGHT_API_KEY"), +} + + +llm_model_instance = MoonshotChat(**llm_instance_config) + +graph_config = { + "llm": { + "model_instance": llm_model_instance, + "model_tokens": 10000 + }, + "verbose": True, + "headless": True, +} + + +# ******************************************************* +# Create the SmartScraperMultiGraph instance and run it +# ******************************************************* + +multiple_search_graph = SmartScraperMultiConcatGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() 
+print(json.dumps(result, indent=4)) diff --git a/examples/moonshot/smart_scraper_with_moonshot.py b/examples/moonshot/smart_scraper_with_moonshot.py index b362414f..28635ba3 100644 --- a/examples/moonshot/smart_scraper_with_moonshot.py +++ b/examples/moonshot/smart_scraper_with_moonshot.py @@ -1,12 +1,13 @@ """ Basic example of scraping pipeline using SmartScraper and model_instace """ - -import os, json -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info +import os +import json from langchain_community.chat_models.moonshot import MoonshotChat from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info + load_dotenv() # ************************************************ diff --git a/examples/nemotron/custom_graph_nemotron.py b/examples/nemotron/custom_graph_nemotron.py index 07702680..22c6a4a1 100644 --- a/examples/nemotron/custom_graph_nemotron.py +++ b/examples/nemotron/custom_graph_nemotron.py @@ -42,7 +42,7 @@ fetch_node = FetchNode( input="url | local_dir", - output=["doc", "link_urls", "img_urls"], + output=["doc"], node_config={ "verbose": True, "headless": True, diff --git a/examples/nemotron/rate_limit_nemotron.py b/examples/nemotron/rate_limit_nemotron.py new file mode 100644 index 00000000..8b1a5eb4 --- /dev/null +++ b/examples/nemotron/rate_limit_nemotron.py @@ -0,0 +1,46 @@ +""" +Basic example of scraping pipeline using SmartScraper with a custom rate limit +""" + +import os, json +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info +from dotenv import load_dotenv +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + + +graph_config = { + "llm": { + "api_key": os.getenv("NEMOTRON_KEY"), + "model": "nvidia/meta/llama3-70b-instruct", + "rate_limit": { + 
"requests_per_second": 1 + } + }, + "verbose": True, + "headless": False, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="Extract me the python code inside the page", + source="https://www.exploit-db.com/exploits/51447", + config=graph_config +) + +result = smart_scraper_graph.run() +print(json.dumps(result, indent=4)) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/nemotron/search_link_graph_nemotron.py b/examples/nemotron/search_link_graph_nemotron.py index 6d1edbde..50dce11b 100644 --- a/examples/nemotron/search_link_graph_nemotron.py +++ b/examples/nemotron/search_link_graph_nemotron.py @@ -7,6 +7,7 @@ from scrapegraphai.utils import prettify_exec_info load_dotenv() + # ************************************************ # Define the configuration for the graph # ************************************************ diff --git a/examples/nemotron/smart_scraper_multi_concat_nemotron.py b/examples/nemotron/smart_scraper_multi_concat_nemotron.py new file mode 100644 index 00000000..0444e18e --- /dev/null +++ b/examples/nemotron/smart_scraper_multi_concat_nemotron.py @@ -0,0 +1,39 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" + +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiConcatGraph + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "api_key": os.getenv("NEMOTRON_APIKEY"), + "model": "nvidia/meta/llama3-70b-instruct", + }, + "verbose": True, + "headless": False, +} +# 
******************************************************* +# Create the SmartScraperMultiGraph instance and run it +# ******************************************************* + +multiple_search_graph = SmartScraperMultiConcatGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/oneapi/custom_graph_oneapi.py b/examples/oneapi/custom_graph_oneapi.py index be58d1d1..1e27dcf9 100644 --- a/examples/oneapi/custom_graph_oneapi.py +++ b/examples/oneapi/custom_graph_oneapi.py @@ -38,7 +38,7 @@ fetch_node = FetchNode( input="url | local_dir", - output=["doc", "link_urls", "img_urls"], + output=["doc"], node_config={ "verbose": True, "headless": True, diff --git a/examples/oneapi/pdf_scraper_multi_oneapi.py b/examples/oneapi/pdf_scraper_multi_oneapi.py index 8b6c57a1..7d0ce231 100644 --- a/examples/oneapi/pdf_scraper_multi_oneapi.py +++ b/examples/oneapi/pdf_scraper_multi_oneapi.py @@ -13,7 +13,7 @@ graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-3.5-turbo", + "model": "openai/gpt-3.5-turbo", }, } diff --git a/examples/mixed_models/smartscraper_oneapi_ollama.py b/examples/oneapi/rate_limit_oneapi.py similarity index 62% rename from examples/mixed_models/smartscraper_oneapi_ollama.py rename to examples/oneapi/rate_limit_oneapi.py index eff5a41d..64a170f7 100644 --- a/examples/mixed_models/smartscraper_oneapi_ollama.py +++ b/examples/oneapi/rate_limit_oneapi.py @@ -1,23 +1,21 @@ """ -Basic example of scraping pipeline using SmartScraper +Basic example of scraping pipeline using SmartScraper with a custom rate limit """ - from scrapegraphai.graphs import SmartScraperGraph from scrapegraphai.utils import prettify_exec_info - # ************************************************ # Define the configuration for the graph -# 
********************************************* +# ************************************************ + graph_config = { "llm": { "api_key": "***************************", "model": "oneapi/qwen-turbo", "base_url": "http://127.0.0.1:3000/v1", # 设置 OneAPI URL - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "base_url": "http://127.0.0.1:11434", # 设置 Ollama URL + "rate_limit": { + "requests_per_second": 1 + } } } @@ -26,15 +24,18 @@ # ************************************************ smart_scraper_graph = SmartScraperGraph( - prompt="该网站为XXXXX,请提取出标题、发布时间、发布来源以及内容摘要,并以中文回答。", - # 也可以使用已下载的 HTML 代码的字符串 - source="http://XXXX", + prompt="List me all the titles", + # also accepts a string with the already downloaded HTML code + source="https://www.wired.com/", config=graph_config ) +result = smart_scraper_graph.run() +print(result) + # ************************************************ # Get graph execution info # ************************************************ -result = smart_scraper_graph.run() -print(result) -print(prettify_exec_info(result)) + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/oneapi/smart_scraper_multi_concat_oneapi.py b/examples/oneapi/smart_scraper_multi_concat_oneapi.py new file mode 100644 index 00000000..e1f5490d --- /dev/null +++ b/examples/oneapi/smart_scraper_multi_concat_oneapi.py @@ -0,0 +1,37 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" + +import os +import json +from scrapegraphai.graphs import SmartScraperMultiConcatGraph + + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "api_key": "***************************", + "model": "oneapi/qwen-turbo", + "base_url": "http://127.0.0.1:3000/v1", # 设置 OneAPI URL + } +} + +# ******************************************************* +# Create the SmartScraperMultiGraph 
instance and run it +# ******************************************************* + +multiple_search_graph = SmartScraperMultiConcatGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/oneapi/xml_scraper_graph_multi_oneapi.py b/examples/oneapi/xml_scraper_graph_multi_oneapi.py index 564c2a3a..b459fdd3 100644 --- a/examples/oneapi/xml_scraper_graph_multi_oneapi.py +++ b/examples/oneapi/xml_scraper_graph_multi_oneapi.py @@ -28,7 +28,7 @@ graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-3.5-turbo", + "model": "openai/gpt-3.5-turbo", }, } diff --git a/examples/oneapi/xml_scraper_oneapi.py b/examples/oneapi/xml_scraper_oneapi.py index 15862052..cb92bbf2 100644 --- a/examples/oneapi/xml_scraper_oneapi.py +++ b/examples/oneapi/xml_scraper_oneapi.py @@ -28,7 +28,7 @@ graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-3.5-turbo", + "model": "openai/gpt-3.5-turbo", }, "verbose":False, } diff --git a/examples/openai/custom_graph_openai.py b/examples/openai/custom_graph_openai.py index b1471a21..a4cf9351 100644 --- a/examples/openai/custom_graph_openai.py +++ b/examples/openai/custom_graph_openai.py @@ -43,7 +43,7 @@ fetch_node = FetchNode( input="url | local_dir", - output=["doc", "link_urls", "img_urls"], + output=["doc"], node_config={ "verbose": True, "headless": True, diff --git a/examples/openai/rate_limit_openai.py b/examples/openai/rate_limit_openai.py new file mode 100644 index 00000000..9455e798 --- /dev/null +++ b/examples/openai/rate_limit_openai.py @@ -0,0 +1,48 @@ +""" +Basic example of scraping pipeline using SmartScraper with a custom rate limit +""" + +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + 
+# ************************************************ +# Define the configuration for the graph +# ************************************************ + + +graph_config = { + "llm": { + "api_key": os.getenv("OPENAI_API_KEY"), + "model": "openai/gpt-4o", + "rate_limit": { + "requests_per_second": 1 + } + }, + "verbose": True, + "headless": False, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me what does the company do, the name and a contact email.", + source="https://scrapegraphai.com/", + config=graph_config +) + +result = smart_scraper_graph.run() +print(json.dumps(result, indent=4)) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/openai/script_generator_openai.py b/examples/openai/script_generator_openai.py index 119f67e5..f5b231d2 100644 --- a/examples/openai/script_generator_openai.py +++ b/examples/openai/script_generator_openai.py @@ -5,7 +5,7 @@ import os import json from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.graphs import ScriptCreatorGraph from scrapegraphai.utils import prettify_exec_info load_dotenv() @@ -18,7 +18,7 @@ graph_config = { "llm": { "api_key": os.getenv("OPENAI_API_KEY"), - "model": "gpt-4o", + "model": "openai/gpt-4o", }, "verbose": True, "headless": False, @@ -28,9 +28,10 @@ # Create the SmartScraperGraph instance and run it # ************************************************ -smart_scraper_graph = SmartScraperGraph( - prompt="List me what does the company do, the name and a contact email.", - source="https://scrapegraphai.com/", +smart_scraper_graph = ScriptCreatorGraph( + prompt="List me all the news 
with their description.", + # also accepts a string with the already downloaded HTML code + source="https://perinim.github.io/projects", config=graph_config ) diff --git a/examples/openai/smart_scraper_multi_concat_openai.py b/examples/openai/smart_scraper_multi_concat_openai.py new file mode 100644 index 00000000..c6ee88cc --- /dev/null +++ b/examples/openai/smart_scraper_multi_concat_openai.py @@ -0,0 +1,41 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" + +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiConcatGraph + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ +openai_key = os.getenv("OPENAI_APIKEY") + +graph_config = { + "llm": { + "api_key": openai_key, + "model": "openai/gpt-4o", + }, + "verbose": True, + "headless": False, +} + +# ******************************************************* +# Create the SmartScraperMultiGraph instance and run it +# ******************************************************* + +multiple_search_graph = SmartScraperMultiConcatGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/together/.env.example b/examples/together/.env.example new file mode 100644 index 00000000..7004713a --- /dev/null +++ b/examples/together/.env.example @@ -0,0 +1 @@ +TOGETHER_APIKEY="your api key" \ No newline at end of file diff --git a/examples/together/csv_scraper_graph_multi_together.py b/examples/together/csv_scraper_graph_multi_together.py new file mode 100644 index 00000000..588d2c5e --- /dev/null +++ b/examples/together/csv_scraper_graph_multi_together.py @@ -0,0 +1,57 @@ +""" +Basic example of scraping pipeline using CSVScraperMultiGraph from CSV 
documents +""" + +import os +from dotenv import load_dotenv +import pandas as pd +from scrapegraphai.graphs import CSVScraperMultiGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info + +load_dotenv() +# ************************************************ +# Read the CSV file +# ************************************************ + +FILE_NAME = "inputs/username.csv" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +text = pd.read_csv(file_path) + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +together_key = os.getenv("TOGETHER_APIKEY") + +graph_config = { + "llm": { + "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", + "api_key": together_key, + }, + "verbose": True, +} +# ************************************************ +# Create the CSVScraperMultiGraph instance and run it +# ************************************************ + +csv_scraper_graph = CSVScraperMultiGraph( + prompt="List me all the last names", + source=[str(text), str(text)], + config=graph_config +) + +result = csv_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = csv_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/together/csv_scraper_together.py b/examples/together/csv_scraper_together.py new file mode 100644 index 00000000..9b1838ae --- /dev/null +++ b/examples/together/csv_scraper_together.py @@ -0,0 +1,57 @@ +""" +Basic example of scraping pipeline using CSVScraperGraph from CSV documents +""" + +import os +from dotenv import load_dotenv +import pandas as pd +from scrapegraphai.graphs import CSVScraperGraph 
+from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info +load_dotenv() + +# ************************************************ +# Read the CSV file +# ************************************************ + +FILE_NAME = "inputs/username.csv" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +text = pd.read_csv(file_path) + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +together_key = os.getenv("TOGETHER_APIKEY") + +graph_config = { + "llm": { + "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", + "api_key": together_key, + }, + "verbose": True, +} +# ************************************************ +# Create the CSVScraperGraph instance and run it +# ************************************************ + +csv_scraper_graph = CSVScraperGraph( + prompt="List me all the last names", + source=str(text), # Pass the content of the file, not the file object + config=graph_config +) + +result = csv_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = csv_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/mixed_models/inputs/books.xml b/examples/together/inputs/books.xml similarity index 100% rename from examples/mixed_models/inputs/books.xml rename to examples/together/inputs/books.xml diff --git a/examples/mixed_models/inputs/example.json b/examples/together/inputs/example.json similarity index 100% rename from examples/mixed_models/inputs/example.json rename to examples/together/inputs/example.json diff --git a/examples/together/inputs/username.csv b/examples/together/inputs/username.csv new file mode 100644 
index 00000000..006ac8e6 --- /dev/null +++ b/examples/together/inputs/username.csv @@ -0,0 +1,7 @@ +Username; Identifier;First name;Last name +booker12;9012;Rachel;Booker +grey07;2070;Laura;Grey +johnson81;4081;Craig;Johnson +jenkins46;9346;Mary;Jenkins +smith79;5079;Jamie;Smith + diff --git a/examples/together/json_scraper_multi_together.py b/examples/together/json_scraper_multi_together.py new file mode 100644 index 00000000..0d9ac293 --- /dev/null +++ b/examples/together/json_scraper_multi_together.py @@ -0,0 +1,38 @@ +""" +Module for showing how JSONScraperMultiGraph multi works +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import JSONScraperMultiGraph + +load_dotenv() + +together_key = os.getenv("TOGETHER_APIKEY") + +graph_config = { + "llm": { + "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", + "api_key": together_key, + }, + "verbose": True, +} + +FILE_NAME = "inputs/example.json" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +sources = [text, text] + +multiple_search_graph = JSONScraperMultiGraph( + prompt= "List me all the authors, title and genres of the books", + source= sources, + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/together/json_scraper_together.py b/examples/together/json_scraper_together.py new file mode 100644 index 00000000..b1e646f9 --- /dev/null +++ b/examples/together/json_scraper_together.py @@ -0,0 +1,57 @@ +""" +Basic example of scraping pipeline using JSONScraperGraph from JSON documents +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import JSONScraperGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info +load_dotenv() + +# ************************************************ +# Read the 
JSON file +# ************************************************ +together_key = os.getenv("TOGETHER_APIKEY") + +FILE_NAME = "inputs/example.json" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", + "api_key": together_key, + }, + "verbose": True, +} + +# ************************************************ +# Create the JSONScraperGraph instance and run it +# ************************************************ + +json_scraper_graph = JSONScraperGraph( + prompt="List me all the authors, title and genres of the books", + source=text, # Pass the content of the file, not the file object + config=graph_config +) + +result = json_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = json_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/together/pdf_scraper_graph_together.py b/examples/together/pdf_scraper_graph_together.py new file mode 100644 index 00000000..ee7a8c4b --- /dev/null +++ b/examples/together/pdf_scraper_graph_together.py @@ -0,0 +1,44 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" + +import os, json +from dotenv import load_dotenv +from scrapegraphai.utils import prettify_exec_info +from scrapegraphai.graphs import PDFScraperGraph +load_dotenv() + + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + 
+together_key = os.getenv("TOGETHER_APIKEY") + +graph_config = { + "llm": { + "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", + "api_key": together_key, + }, + "verbose": True, +} + +source = """ + The Divine Comedy, Italian La Divina Commedia, original name La commedia, long narrative poem written in Italian + circa 1308/21 by Dante. It is usually held to be one of the world s great works of literature. + Divided into three major sections—Inferno, Purgatorio, and Paradiso—the narrative traces the journey of Dante + from darkness and error to the revelation of the divine light, culminating in the Beatific Vision of God. + Dante is guided by the Roman poet Virgil, who represents the epitome of human knowledge, from the dark wood + through the descending circles of the pit of Hell (Inferno). He then climbs the mountain of Purgatory, guided + by the Roman poet Statius, who represents the fulfilment of human knowledge, and is finally led by his lifelong love, + the Beatrice of his earlier poetry, through the celestial spheres of Paradise. 
+""" + +pdf_scraper_graph = PDFScraperGraph( + prompt="Summarize the text and find the main topics", + source=source, + config=graph_config, +) +result = pdf_scraper_graph.run() + +print(json.dumps(result, indent=4)) diff --git a/examples/together/pdf_scraper_multi_together.py b/examples/together/pdf_scraper_multi_together.py new file mode 100644 index 00000000..a34b0337 --- /dev/null +++ b/examples/together/pdf_scraper_multi_together.py @@ -0,0 +1,74 @@ +""" +Module for showing how PDFScraper multi works +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import PdfScraperMultiGraph + +load_dotenv() + +together_key = os.getenv("TOGETHER_APIKEY") + +graph_config = { + "llm": { + "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", + "api_key": together_key, + }, + "verbose": True, +} + +# *************** +# Covert to list +# *************** + +sources = [ + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. 
We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. 
We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", +] + +prompt = """ +You are an expert in reviewing academic manuscripts. Please analyze the abstracts provided from an academic journal article to extract and clearly identify the following elements: + +Independent Variable (IV): The variable that is manipulated or considered as the primary cause affecting other variables. +Dependent Variable (DV): The variable that is measured or observed, which is expected to change as a result of variations in the Independent Variable. +Exogenous Shock: Identify any external or unexpected events used in the study that serve as a natural experiment or provide a unique setting for observing the effects on the IV and DV. 
Dependent Variable (DV): Worker productivity (sales performance). +Exogenous Shock: Variation in worker mood from visual exposure to weather (the interaction between call center architecture and outdoor weather conditions).
We find that the roll-out of Facebook at a college increased symptoms of poor mental health, especially depression. We also find that, among students predicted to be most susceptible to mental illness, the introduction of Facebook led to increased utilization of mental healthcare services. Lastly, we find that, after the introduction of Facebook, students were more likely to report experiencing impairments to academic performance resulting from poor mental health. Additional evidence on mechanisms suggests that the results are due to Facebook fostering unfavorable social comparisons. + +Response: + +Independent Variable (IV): Exposure to social media. +Dependent Variable (DV): Mental health outcomes. +Exogenous Shock: staggered introduction of Facebook across U.S. colleges. +""" +# ******************************************************* +# Create the SmartScraperMultiGraph instance and run it +# ******************************************************* + +multiple_search_graph = PdfScraperMultiGraph( + prompt=prompt, + source= sources, + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/together/rate_limit_together.py b/examples/together/rate_limit_together.py new file mode 100644 index 00000000..072f8557 --- /dev/null +++ b/examples/together/rate_limit_together.py @@ -0,0 +1,49 @@ +""" +Basic example of scraping pipeline using SmartScraper with a custom rate limit +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +together_key = os.getenv("TOGETHER_APIKEY") + +graph_config = { + "llm": { + "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", + "api_key": together_key, + "rate_limit": { + 
"requests_per_second": 1 + } + }, + "verbose": True, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the projects with their description.", + # also accepts a string with the already downloaded HTML code + source="https://perinim.github.io/projects/", + config=graph_config +) + +result = smart_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/together/scrape_plain_text_together.py b/examples/together/scrape_plain_text_together.py new file mode 100644 index 00000000..a0e222ae --- /dev/null +++ b/examples/together/scrape_plain_text_together.py @@ -0,0 +1,55 @@ +""" +Basic example of scraping pipeline using SmartScraper from text +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info +load_dotenv() + +# ************************************************ +# Read the text file +# ************************************************ + +FILE_NAME = "inputs/plain_html_example.txt" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +# It could be also a http request using the request model +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +together_key = os.getenv("TOGETHER_APIKEY") + +graph_config = { + "llm": { + "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", + "api_key": together_key, + }, + "verbose": True, +} + +# 
************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the news with their description.", + source=text, + config=graph_config +) + +result = smart_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/together/script_generator_together.py b/examples/together/script_generator_together.py new file mode 100644 index 00000000..a1007cd9 --- /dev/null +++ b/examples/together/script_generator_together.py @@ -0,0 +1,45 @@ +""" +Basic example of scraping pipeline using ScriptCreatorGraph +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import ScriptCreatorGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +together_key = os.getenv("TOGETHER_APIKEY") + +graph_config = { + "llm": { + "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", + "api_key": together_key, + }, + "verbose": True, +} + +# ************************************************ +# Create the ScriptCreatorGraph instance and run it +# ************************************************ + +script_creator_graph = ScriptCreatorGraph( + prompt="List me all the projects with their description.", + # also accepts a string with the already downloaded HTML code + source="https://perinim.github.io/projects", + config=graph_config +) + +result = script_creator_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# 
************************************************ + +graph_exec_info = script_creator_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/together/script_multi_generator_together.py b/examples/together/script_multi_generator_together.py new file mode 100644 index 00000000..b9c46246 --- /dev/null +++ b/examples/together/script_multi_generator_together.py @@ -0,0 +1,54 @@ +""" +Basic example of scraping pipeline using ScriptCreatorGraph +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import ScriptCreatorMultiGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +together_key = os.getenv("TOGETHER_APIKEY") + +graph_config = { + "llm": { + "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", + "api_key": together_key, + }, + "library": "beautifulsoup" +} + +# ************************************************ +# Create the ScriptCreatorGraph instance and run it +# ************************************************ + +urls=[ + "https://schultzbergagency.com/emil-raste-karlsen/", + "https://schultzbergagency.com/johanna-hedberg/", +] + +# ************************************************ +# Create the ScriptCreatorGraph instance and run it +# ************************************************ + +script_creator_graph = ScriptCreatorMultiGraph( + prompt="Find information about actors", + # also accepts a string with the already downloaded HTML code + source=urls, + config=graph_config +) + +result = script_creator_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = script_creator_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git 
a/examples/together/search_graph_schema_together.py b/examples/together/search_graph_schema_together.py new file mode 100644 index 00000000..b7d72250 --- /dev/null +++ b/examples/together/search_graph_schema_together.py @@ -0,0 +1,62 @@ +""" +Example of Search Graph +""" + +import os +from dotenv import load_dotenv +load_dotenv() + +from scrapegraphai.graphs import SearchGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info + +from pydantic import BaseModel, Field +from typing import List + +# ************************************************ +# Define the output schema for the graph +# ************************************************ + +class Dish(BaseModel): + name: str = Field(description="The name of the dish") + description: str = Field(description="The description of the dish") + +class Dishes(BaseModel): + dishes: List[Dish] + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +together_key = os.getenv("TOGETHER_APIKEY") + +graph_config = { + "llm": { + "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", + "api_key": together_key, + }, + "verbose": True, +} + +# ************************************************ +# Create the SearchGraph instance and run it +# ************************************************ + +search_graph = SearchGraph( + prompt="List me Chioggia's famous dishes", + config=graph_config, + schema=Dishes +) + +result = search_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = search_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json and csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/mixed_models/search_graph_groq_openai.py b/examples/together/search_graph_together.py similarity index 
51% rename from examples/mixed_models/search_graph_groq_openai.py rename to examples/together/search_graph_together.py index 3d581063..9c48699b 100644 --- a/examples/mixed_models/search_graph_groq_openai.py +++ b/examples/together/search_graph_together.py @@ -1,35 +1,31 @@ -""" -Basic example of scraping pipeline using SmartScraper +""" +Example of Search Graph """ import os from dotenv import load_dotenv from scrapegraphai.graphs import SearchGraph -from scrapegraphai.utils import prettify_exec_info - load_dotenv() - # ************************************************ # Define the configuration for the graph # ************************************************ -groq_key = os.getenv("GROQ_APIKEY") -openai_key = os.getenv("OPENAI_APIKEY") +together_key = os.getenv("TOGETHER_APIKEY") graph_config = { "llm": { - "model": "groq/gemma-7b-it", - "api_key": groq_key, - "temperature": 0 + "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", + "api_key": together_key, }, - "embeddings": { - "api_key": openai_key, - "model": "openai", - }, - "headless": False + "max_results": 2, + "verbose": True, } +# ************************************************ +# Create the SearchGraph instance and run it +# ************************************************ + search_graph = SearchGraph( prompt="List me the best escursions near Trento", config=graph_config @@ -37,10 +33,3 @@ result = search_graph.run() print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = search_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/mixed_models/search_graph_groq_ollama.py b/examples/together/search_link_graph_together.py similarity index 71% rename from examples/mixed_models/search_graph_groq_ollama.py rename to examples/together/search_link_graph_together.py index 7883fa77..46c86d5c 100644 --- 
a/examples/mixed_models/search_graph_groq_ollama.py +++ b/examples/together/search_link_graph_together.py @@ -1,32 +1,25 @@ """ Example of Search Graph """ - import os from dotenv import load_dotenv from scrapegraphai.graphs import SearchGraph from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info -load_dotenv() # ************************************************ # Define the configuration for the graph # ************************************************ -groq_key = os.getenv("GROQ_APIKEY") +load_dotenv() + +together_key = os.getenv("TOGETHER_APIKEY") graph_config = { "llm": { - "model": "groq/gemma-7b-it", - "api_key": groq_key, - "temperature": 0 - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - "base_url": "http://localhost:11434", # set ollama URL arbitrarily + "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", + "api_key": together_key, }, - "max_results": 2, - "verbose": True, + "verbose": True, } # ************************************************ @@ -34,7 +27,7 @@ # ************************************************ search_graph = SearchGraph( - prompt="List me all the regions of Italy.", + prompt="List me the best escursions near Trento", config=graph_config ) diff --git a/examples/together/smart_scraper_multi_together.py b/examples/together/smart_scraper_multi_together.py new file mode 100644 index 00000000..278c4ba5 --- /dev/null +++ b/examples/together/smart_scraper_multi_together.py @@ -0,0 +1,41 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" + +import os, json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiGraph + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +together_key = os.getenv("TOGETHER_APIKEY") + +graph_config = { + "llm": { + "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", + 
"api_key": together_key, + }, + "verbose": True, +} + + +# ******************************************************* +# Create the SmartScraperMultiGraph instance and run it +# ******************************************************* + +multiple_search_graph = SmartScraperMultiGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/mixed_models/smart_scraper_schema_groq_openai.py b/examples/together/smart_scraper_schema_together.py similarity index 72% rename from examples/mixed_models/smart_scraper_schema_groq_openai.py rename to examples/together/smart_scraper_schema_together.py index f177cb61..f59a521f 100644 --- a/examples/mixed_models/smart_scraper_schema_groq_openai.py +++ b/examples/together/smart_scraper_schema_together.py @@ -1,14 +1,11 @@ """ -Basic example of scraping pipeline using SmartScraper with schema +Basic example of scraping pipeline using SmartScraper """ -import json import os -from typing import Dict, List - +from typing import List +from pydantic import BaseModel, Field from dotenv import load_dotenv -from pydantic import BaseModel - from scrapegraphai.graphs import SmartScraperGraph from scrapegraphai.utils import prettify_exec_info @@ -19,34 +16,26 @@ # ************************************************ class Project(BaseModel): - title: str - description: str + title: str = Field(description="The title of the project") + description: str = Field(description="The description of the project") class Projects(BaseModel): - Projects: Dict[str, Project] + projects: List[Project] # ************************************************ # Define the configuration for the graph # ************************************************ -groq_key = os.getenv("GROQ_APIKEY") -openai_key = os.getenv("OPENAI_APIKEY") +together_key = os.getenv("TOGETHER_APIKEY") 
graph_config = { "llm": { - "model": "groq/gemma-7b-it", - "api_key": groq_key, - "temperature": 0 - }, - "embeddings": { - "api_key": openai_key, - "model": "openai", + "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", + "api_key": together_key, }, - "headless": False + "verbose": True, } - - # ************************************************ # Create the SmartScraperGraph instance and run it # ************************************************ diff --git a/examples/together/smart_scraper_together.py b/examples/together/smart_scraper_together.py new file mode 100644 index 00000000..7408df20 --- /dev/null +++ b/examples/together/smart_scraper_together.py @@ -0,0 +1,46 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +together_key = os.getenv("TOGETHER_APIKEY") + +graph_config = { + "llm": { + "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", + "api_key": together_key, + }, + "verbose": True, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the projects with their description.", + # also accepts a string with the already downloaded HTML code + source="https://perinim.github.io/projects/", + config=graph_config +) + +result = smart_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git 
a/examples/together/xml_scraper_graph_multi_together.py b/examples/together/xml_scraper_graph_multi_together.py new file mode 100644 index 00000000..1fde5c53 --- /dev/null +++ b/examples/together/xml_scraper_graph_multi_together.py @@ -0,0 +1,58 @@ +""" +Basic example of scraping pipeline using XMLScraperMultiGraph from XML documents +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import XMLScraperMultiGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info +load_dotenv() + +# ************************************************ +# Read the XML file +# ************************************************ + +FILE_NAME = "inputs/books.xml" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +together_key = os.getenv("TOGETHER_APIKEY") + +graph_config = { + "llm": { + "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", + "api_key": together_key, + }, + "verbose": True, +} + +# ************************************************ +# Create the XMLScraperMultiGraph instance and run it +# ************************************************ + +xml_scraper_graph = XMLScraperMultiGraph( + prompt="List me all the authors, title and genres of the books", + source=[text, text], # Pass the content of the file, not the file object + config=graph_config +) + +result = xml_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = xml_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git 
a/examples/together/xml_scraper_together.py b/examples/together/xml_scraper_together.py new file mode 100644 index 00000000..690d2cff --- /dev/null +++ b/examples/together/xml_scraper_together.py @@ -0,0 +1,60 @@ +""" +Basic example of scraping pipeline using XMLScraperGraph from XML documents +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import XMLScraperGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info +load_dotenv() + +# ************************************************ +# Read the XML file +# ************************************************ + +FILE_NAME = "inputs/books.xml" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + + +together_key = os.getenv("TOGETHER_APIKEY") + +graph_config = { + "llm": { + "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", + "api_key": together_key, + }, + "verbose": True, +} + + +# ************************************************ +# Create the XMLScraperGraph instance and run it +# ************************************************ + +xml_scraper_graph = XMLScraperGraph( + prompt="List me all the authors, title and genres of the books", + source=text, # Pass the content of the file, not the file object + config=graph_config +) + +result = xml_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = xml_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/pyproject.toml b/pyproject.toml index c7498521..ab4b5359 
100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,9 @@ [project] name = "scrapegraphai" -version = "1.18.3" + + +version = "1.19.0b8" + description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [ { name = "Marco Vinciguerra", email = "mvincig11@gmail.com" }, @@ -27,7 +30,9 @@ dependencies = [ "playwright>=1.43.0", "undetected-playwright>=0.3.0", "google>=3.0.0", - "semchunk>=1.0.1", + "langchain-ollama>=0.1.3", + "semchunk==2.2.0", + "transformers==4.44.2" ] license = "MIT" @@ -74,6 +79,7 @@ other-language-models = [ "langchain-anthropic>=0.1.11", "langchain-huggingface>=0.0.3", "langchain-nvidia-ai-endpoints>=0.1.6", + "langchain_together>=1.2.9" ] # Group 2: More Semantic Options @@ -86,6 +92,14 @@ more-browser-options = [ "browserbase>=0.3.0", ] +# Group 4: Surya Library +screenshot_scraper = [ + "surya-ocr>=0.5.0; python_version >= '3.10'", + "matplotlib>=3.7.2; python_version >= '3.10'", + "ipywidgets>=8.1.0; python_version >= '3.10'", + "pillow>=10.4.0", +] + [build-system] requires = ["hatchling"] build-backend = "hatchling.build" diff --git a/requirements-dev.lock b/requirements-dev.lock index b816db3d..66a0ec32 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -88,6 +88,7 @@ fastapi-pagination==0.12.26 # via burr filelock==3.15.4 # via huggingface-hub + # via transformers fonttools==4.53.1 # via matplotlib free-proxy==1.1.1 @@ -147,11 +148,13 @@ httplib2==0.22.0 # via google-auth-httplib2 httpx==0.27.0 # via langchain-mistralai + # via ollama # via openai httpx-sse==0.4.0 # via langchain-mistralai huggingface-hub==0.24.5 # via tokenizers + # via transformers idna==3.7 # via anyio # via httpx @@ -159,9 +162,9 @@ idna==3.7 # via yarl imagesize==1.4.1 # via sphinx -importlib-metadata==8.2.0 +importlib-metadata==8.4.0 # via sphinx -importlib-resources==6.4.0 +importlib-resources==6.4.4 # via matplotlib iniconfig==2.0.0 # via pytest @@ -194,18 +197,21 @@ 
langchain-aws==0.1.16 # via scrapegraphai langchain-community==0.2.11 # via scrapegraphai -langchain-core==0.2.33 +langchain-core==0.2.37 # via langchain # via langchain-aws # via langchain-community # via langchain-google-genai # via langchain-mistralai + # via langchain-ollama # via langchain-openai # via langchain-text-splitters langchain-google-genai==1.0.8 # via scrapegraphai langchain-mistralai==0.1.12 # via scrapegraphai +langchain-ollama==0.1.3 + # via scrapegraphai langchain-openai==0.1.22 # via scrapegraphai langchain-text-splitters==0.2.2 @@ -255,6 +261,9 @@ numpy==1.26.4 # via pydeck # via sf-hamilton # via streamlit + # via transformers +ollama==0.3.2 + # via langchain-ollama openai==1.40.3 # via burr # via langchain-openai @@ -270,6 +279,7 @@ packaging==24.1 # via pytest # via sphinx # via streamlit + # via transformers pandas==2.2.2 # via scrapegraphai # via sf-hamilton @@ -342,11 +352,13 @@ pyyaml==6.0.2 # via langchain # via langchain-community # via langchain-core + # via transformers referencing==0.35.1 # via jsonschema # via jsonschema-specifications regex==2024.7.24 # via tiktoken + # via transformers requests==2.32.3 # via burr # via free-proxy @@ -358,6 +370,7 @@ requests==2.32.3 # via sphinx # via streamlit # via tiktoken + # via transformers rich==13.7.1 # via streamlit rpds-py==0.20.0 @@ -367,6 +380,8 @@ rsa==4.9 # via google-auth s3transfer==0.10.2 # via boto3 +safetensors==0.4.5 + # via transformers semchunk==2.2.0 # via scrapegraphai sf-hamilton==1.73.1 @@ -418,6 +433,7 @@ tiktoken==0.7.0 # via scrapegraphai tokenizers==0.19.1 # via langchain-mistralai + # via transformers toml==0.10.2 # via streamlit tomli==2.0.1 @@ -434,6 +450,9 @@ tqdm==4.66.5 # via openai # via scrapegraphai # via semchunk + # via transformers +transformers==4.44.2 + # via scrapegraphai typing-extensions==4.12.2 # via altair # via anyio @@ -470,6 +489,6 @@ uvicorn==0.30.5 # via burr yarl==1.9.4 # via aiohttp -zipp==3.20.0 +zipp==3.20.1 # via importlib-metadata # via 
importlib-resources diff --git a/requirements.lock b/requirements.lock index 30d89366..f29ac340 100644 --- a/requirements.lock +++ b/requirements.lock @@ -51,6 +51,7 @@ faiss-cpu==1.8.0.post1 # via scrapegraphai filelock==3.15.4 # via huggingface-hub + # via transformers free-proxy==1.1.1 # via scrapegraphai frozenlist==1.4.1 @@ -99,11 +100,13 @@ httplib2==0.22.0 # via google-auth-httplib2 httpx==0.27.0 # via langchain-mistralai + # via ollama # via openai httpx-sse==0.4.0 # via langchain-mistralai huggingface-hub==0.24.1 # via tokenizers + # via transformers idna==3.7 # via anyio # via httpx @@ -125,18 +128,21 @@ langchain-aws==0.1.12 # via scrapegraphai langchain-community==0.2.10 # via scrapegraphai -langchain-core==0.2.33 +langchain-core==0.2.37 # via langchain # via langchain-aws # via langchain-community # via langchain-google-genai # via langchain-mistralai + # via langchain-ollama # via langchain-openai # via langchain-text-splitters langchain-google-genai==1.0.8 # via scrapegraphai langchain-mistralai==0.1.12 # via scrapegraphai +langchain-ollama==0.1.3 + # via scrapegraphai langchain-openai==0.1.22 # via scrapegraphai langchain-text-splitters==0.2.2 @@ -166,6 +172,9 @@ numpy==1.26.4 # via langchain-aws # via langchain-community # via pandas + # via transformers +ollama==0.3.2 + # via langchain-ollama openai==1.41.0 # via langchain-openai orjson==3.10.6 @@ -175,6 +184,7 @@ packaging==24.1 # via huggingface-hub # via langchain-core # via marshmallow + # via transformers pandas==2.2.2 # via scrapegraphai playwright==1.45.1 @@ -221,8 +231,10 @@ pyyaml==6.0.1 # via langchain # via langchain-community # via langchain-core + # via transformers regex==2024.5.15 # via tiktoken + # via transformers requests==2.32.3 # via free-proxy # via google-api-core @@ -231,10 +243,13 @@ requests==2.32.3 # via langchain-community # via langsmith # via tiktoken + # via transformers rsa==4.9 # via google-auth s3transfer==0.10.2 # via boto3 +safetensors==0.4.5 + # via transformers 
semchunk==2.2.0 # via scrapegraphai six==1.16.0 @@ -257,6 +272,7 @@ tiktoken==0.7.0 # via scrapegraphai tokenizers==0.19.1 # via langchain-mistralai + # via transformers tqdm==4.66.4 # via google-generativeai # via huggingface-hub @@ -264,6 +280,9 @@ tqdm==4.66.4 # via openai # via scrapegraphai # via semchunk + # via transformers +transformers==4.44.2 + # via scrapegraphai typing-extensions==4.12.2 # via anyio # via google-generativeai diff --git a/requirements.txt b/requirements.txt index 80cb0767..8a29f1c8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,3 +17,4 @@ playwright>=1.43.0 undetected-playwright>=0.3.0 google>=3.0.0 semchunk>=1.0.1 +langchain-ollama>=0.1.3 diff --git a/scrapegraphai/docloaders/__init__.py b/scrapegraphai/docloaders/__init__.py index 45a3783d..1010a6be 100644 --- a/scrapegraphai/docloaders/__init__.py +++ b/scrapegraphai/docloaders/__init__.py @@ -2,3 +2,4 @@ from .chromium import ChromiumLoader from .browser_base import browser_base_fetch +from .scrape_do import scrape_do_fetch diff --git a/scrapegraphai/docloaders/scrape_do.py b/scrapegraphai/docloaders/scrape_do.py new file mode 100644 index 00000000..cd9086c3 --- /dev/null +++ b/scrapegraphai/docloaders/scrape_do.py @@ -0,0 +1,41 @@ +""" +Scrape_do module +""" +import urllib.parse +import requests +import urllib3 + +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + +def scrape_do_fetch(token, target_url, use_proxy=False, geoCode=None, super_proxy=False): + """ + Fetches the IP address of the machine associated with the given URL using Scrape.do. + + Args: + token (str): The API token for Scrape.do service. + target_url (str): A valid web page URL to fetch its associated IP address. + use_proxy (bool): Whether to use Scrape.do proxy mode. Default is False. + geoCode (str, optional): Specify the country code for + geolocation-based proxies. Default is None. + super_proxy (bool): If True, use Residential & Mobile Proxy Networks. Default is False. 
+ + Returns: + str: The raw response from the target URL. + """ + encoded_url = urllib.parse.quote(target_url) + if use_proxy: + # Create proxy mode URL + proxyModeUrl = f"http://{token}:@proxy.scrape.do:8080" + proxies = { + "http": proxyModeUrl, + "https": proxyModeUrl, + } + # Add optional geoCode and super proxy parameters if provided + params = {"geoCode": geoCode, "super": str(super_proxy).lower()} if geoCode else {} + response = requests.get(target_url, proxies=proxies, verify=False, params=params) + else: + # API Mode URL + url = f"http://api.scrape.do?token={token}&url={encoded_url}" + response = requests.get(url) + + return response.text diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py index 6dda222d..966f9978 100644 --- a/scrapegraphai/graphs/__init__.py +++ b/scrapegraphai/graphs/__init__.py @@ -25,3 +25,4 @@ from .markdown_scraper_multi_graph import MDScraperMultiGraph from .search_link_graph import SearchLinkGraph from .screenshot_scraper_graph import ScreenshotScraperGraph +from .smart_scraper_multi_concat_graph import SmartScraperMultiConcatGraph diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index 7b161963..02869ab9 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -1,13 +1,13 @@ """ AbstractGraph Module """ - from abc import ABC, abstractmethod from typing import Optional import uuid import warnings from pydantic import BaseModel from langchain.chat_models import init_chat_model +from langchain_core.rate_limiters import InMemoryRateLimiter from ..helpers import models_tokens from ..models import ( OneApi, @@ -62,6 +62,7 @@ def __init__(self, prompt: str, config: dict, self.loader_kwargs = self.config.get("loader_kwargs", {}) self.cache_path = self.config.get("cache_path", False) self.browser_base = self.config.get("browser_base") + self.scrape_do = self.config.get("scrape_do") self.graph = self._create_graph() 
self.final_state = None @@ -119,52 +120,78 @@ def _create_llm(self, llm_config: dict) -> object: llm_defaults = {"temperature": 0, "streaming": False} llm_params = {**llm_defaults, **llm_config} + rate_limit_params = llm_params.pop("rate_limit", {}) + + if rate_limit_params: + requests_per_second = rate_limit_params.get("requests_per_second") + max_retries = rate_limit_params.get("max_retries") + if requests_per_second is not None: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + llm_params["rate_limiter"] = InMemoryRateLimiter(requests_per_second=requests_per_second) + if max_retries is not None: + llm_params["max_retries"] = max_retries if "model_instance" in llm_params: try: self.model_token = llm_params["model_tokens"] except KeyError as exc: raise KeyError("model_tokens not specified") from exc - return llm_params["model_instance"] + return llm_params["model_instance"] known_providers = {"openai", "azure_openai", "google_genai", "google_vertexai", "ollama", "oneapi", "nvidia", "groq", "anthropic", "bedrock", "mistralai", - "hugging_face", "deepseek", "ernie", "fireworks"} + "hugging_face", "deepseek", "ernie", "fireworks", "togetherai"} split_model_provider = llm_params["model"].split("/", 1) llm_params["model_provider"] = split_model_provider[0] llm_params["model"] = split_model_provider[1] if llm_params["model_provider"] not in known_providers: - raise ValueError(f"Provider {llm_params['model_provider']} is not supported. If possible, try to use a model instance instead.") + raise ValueError(f"""Provider {llm_params['model_provider']} is not supported. 
+ If possible, try to use a model instance instead.""") try: self.model_token = models_tokens[llm_params["model_provider"]][llm_params["model"]] except KeyError: - print("Model not found, using default token size (8192)") + print(f"""Model {llm_params['model_provider']}/{llm_params['model']} not found, + using default token size (8192)""") self.model_token = 8192 try: - if llm_params["model_provider"] not in {"oneapi", "nvidia", "ernie", "deepseek"}: + if llm_params["model_provider"] not in {"oneapi","nvidia","ernie","deepseek","togetherai"}: + if llm_params["model_provider"] == "bedrock": + llm_params["model_kwargs"] = { "temperature" : llm_params.pop("temperature") } with warnings.catch_warnings(): warnings.simplefilter("ignore") return init_chat_model(**llm_params) else: - if llm_params["model_provider"] == "deepseek": + model_provider = llm_params.pop("model_provider") + + if model_provider == "deepseek": return DeepSeek(**llm_params) - if llm_params["model_provider"] == "ernie": + if model_provider == "ernie": from langchain_community.chat_models import ErnieBotChat return ErnieBotChat(**llm_params) - if llm_params["model_provider"] == "oneapi": + elif model_provider == "oneapi": return OneApi(**llm_params) - if llm_params["model_provider"] == "nvidia": + elif model_provider == "togetherai": + try: + from langchain_together import ChatTogether + except ImportError: + raise ImportError("""The langchain_together module is not installed. + Please install it using `pip install scrapegraphai[other-language-models]`.""") + return ChatTogether(**llm_params) + + elif model_provider == "nvidia": try: from langchain_nvidia_ai_endpoints import ChatNVIDIA except ImportError: - raise ImportError("The langchain_nvidia_ai_endpoints module is not installed. Please install it using `pip install langchain_nvidia_ai_endpoints`.") + raise ImportError("""The langchain_nvidia_ai_endpoints module is not installed. 
+ Please install it using `pip install scrapegraphai[other-language-models]`.""") return ChatNVIDIA(**llm_params) except Exception as e: diff --git a/scrapegraphai/graphs/base_graph.py b/scrapegraphai/graphs/base_graph.py index 17c05032..71e42760 100644 --- a/scrapegraphai/graphs/base_graph.py +++ b/scrapegraphai/graphs/base_graph.py @@ -116,28 +116,21 @@ def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]: curr_time = time.time() current_node = next(node for node in self.nodes if node.node_name == current_node_name) - # check if there is a "source" key in the node config if current_node.__class__.__name__ == "FetchNode": - # get the second key name of the state dictionary source_type = list(state.keys())[1] if state.get("user_prompt", None): - # Set 'prompt' if 'user_prompt' is a string, otherwise None prompt = state["user_prompt"] if isinstance(state["user_prompt"], str) else None - # Convert 'local_dir' source type to 'html_dir' if source_type == "local_dir": source_type = "html_dir" elif source_type == "url": - # If the source is a list, add string URLs to 'source' if isinstance(state[source_type], list): for url in state[source_type]: if isinstance(url, str): source.append(url) - # If the source is a single string, add it to 'source' elif isinstance(state[source_type], str): source.append(state[source_type]) - # check if there is an "llm_model" variable in the class if hasattr(current_node, "llm_model") and llm_model is None: llm_model = current_node.llm_model if hasattr(llm_model, "model_name"): @@ -145,7 +138,6 @@ def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]: elif hasattr(llm_model, "model"): llm_model = llm_model.model - # check if there is an "embedder_model" variable in the class if hasattr(current_node, "embedder_model") and embedder_model is None: embedder_model = current_node.embedder_model if hasattr(embedder_model, "model_name"): @@ -157,7 +149,6 @@ def _execute_standard(self, initial_state: dict) -> 
Tuple[dict, list]: if isinstance(current_node.node_config,dict): if current_node.node_config.get("schema", None) and schema is None: if not isinstance(current_node.node_config["schema"], dict): - # convert to dict try: schema = current_node.node_config["schema"].schema() except Exception as e: @@ -220,7 +211,6 @@ def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]: "exec_time": total_exec_time, }) - # Log the graph execution telemetry graph_execution_time = time.time() - start_time response = state.get("answer", None) if source_type == "url" else None content = state.get("parsed_doc", None) if response is not None else None @@ -272,13 +262,10 @@ def append_node(self, node): # if node name already exists in the graph, raise an exception if node.node_name in {n.node_name for n in self.nodes}: - raise ValueError(f"Node with name '{node.node_name}' already exists in the graph. You can change it by setting the 'node_name' attribute.") + raise ValueError(f"""Node with name '{node.node_name}' already exists in the graph. + You can change it by setting the 'node_name' attribute.""") - # get the last node in the list last_node = self.nodes[-1] - # add the edge connecting the last node to the new node self.raw_edges.append((last_node, node)) - # add the node to the list of nodes self.nodes.append(node) - # update the edges connecting the last node to the new node self.edges = self._create_edges({e for e in self.raw_edges}) diff --git a/scrapegraphai/graphs/csv_scraper_graph.py b/scrapegraphai/graphs/csv_scraper_graph.py index 48d84c18..eb34383e 100644 --- a/scrapegraphai/graphs/csv_scraper_graph.py +++ b/scrapegraphai/graphs/csv_scraper_graph.py @@ -43,7 +43,8 @@ class CSVScraperGraph(AbstractGraph): the answer to the prompt as a string. run runs the CSVScraperGraph class to extract information from a CSV file based on the user's prompt. It requires no additional arguments since all necessary data - is stored within the class instance. 
The method fetches the relevant chunks of text or speech, + is stored within the class instance. + The method fetches the relevant chunks of text or speech, generates an answer based on these chunks, and returns this answer as a string. """ diff --git a/scrapegraphai/graphs/csv_scraper_multi_graph.py b/scrapegraphai/graphs/csv_scraper_multi_graph.py index 67498475..da2e2a81 100644 --- a/scrapegraphai/graphs/csv_scraper_multi_graph.py +++ b/scrapegraphai/graphs/csv_scraper_multi_graph.py @@ -4,8 +4,6 @@ from typing import List, Optional from pydantic import BaseModel - - from .base_graph import BaseGraph from .abstract_graph import AbstractGraph from .csv_scraper_graph import CSVScraperGraph @@ -38,7 +36,7 @@ class CSVScraperMultiGraph(AbstractGraph): Example: >>> search_graph = MultipleSearchGraph( ... "What is Chioggia famous for?", - ... {"llm": {"model": "gpt-3.5-turbo"}} + ... {"llm": {"model": "openai/gpt-3.5-turbo"}} ... ) >>> result = search_graph.run() """ @@ -60,20 +58,12 @@ def _create_graph(self) -> BaseGraph: BaseGraph: A graph instance representing the web scraping and searching workflow. 
""" - # ************************************************ - # Create a CSVScraperGraph instance - # ************************************************ - smart_scraper_instance = CSVScraperGraph( prompt="", source="", config=self.copy_config, ) - # ************************************************ - # Define the graph nodes - # ************************************************ - graph_iterator_node = GraphIteratorNode( input="user_prompt & jsons", output=["results"], diff --git a/scrapegraphai/graphs/deep_scraper_graph.py b/scrapegraphai/graphs/deep_scraper_graph.py index d07a5276..404fed9f 100644 --- a/scrapegraphai/graphs/deep_scraper_graph.py +++ b/scrapegraphai/graphs/deep_scraper_graph.py @@ -1,7 +1,6 @@ """ DeepScraperGraph Module """ - from typing import Optional from pydantic import BaseModel from .base_graph import BaseGraph @@ -47,14 +46,14 @@ class DeepScraperGraph(AbstractGraph): >>> deep_scraper = DeepScraperGraph( ... "List me all the job titles and detailed job description.", ... "https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India", - ... {"llm": {"model": "gpt-3.5-turbo"}} + ... {"llm": {"model": "openai/gpt-3.5-turbo"}} ... 
) >>> result = deep_scraper.run() ) """ def __init__(self, prompt: str, source: str, config: dict, schema: Optional[BaseModel] = None): - + super().__init__(prompt, config, source, schema) self.input_key = "url" if source.startswith("http") else "local_dir" @@ -69,16 +68,17 @@ def _create_repeated_graph(self) -> BaseGraph: """ fetch_node = FetchNode( input="url | local_dir", - output=["doc", "link_urls", "img_urls"] + output=["doc"] ) parse_node = ParseNode( input="doc", output=["parsed_doc"], node_config={ - "chunk_size": self.model_token + "chunk_size": self.model_token, + "llm_model": self.llm_model } ) - + generate_answer_node = GenerateAnswerNode( input="user_prompt & (relevant_chunks | parsed_doc | doc)", output=["answer"], @@ -88,6 +88,7 @@ def _create_repeated_graph(self) -> BaseGraph: "schema": self.schema } ) + search_node = SearchLinkNode( input="user_prompt & relevant_chunks", output=["relevant_links"], @@ -95,6 +96,7 @@ def _create_repeated_graph(self) -> BaseGraph: "llm_model": self.llm_model, } ) + graph_iterator_node = GraphIteratorNode( input="user_prompt & relevant_links", output=["results"], @@ -103,6 +105,7 @@ def _create_repeated_graph(self) -> BaseGraph: "batchsize": 1 } ) + merge_answers_node = MergeAnswersNode( input="user_prompt & results", output=["answer"], @@ -142,8 +145,8 @@ def _create_graph(self) -> BaseGraph: """ base_graph = self._create_repeated_graph() - graph_iterator_node = list(filter(lambda x: x.node_name == "GraphIterator", base_graph.nodes))[0] - # Graph iterator will repeat the same graph for multiple hyperlinks found within input webpage + graph_iterator_node = list(filter(lambda x: x.node_name == "GraphIterator", + base_graph.nodes))[0] graph_iterator_node.node_config["graph_instance"] = self return base_graph diff --git a/scrapegraphai/graphs/json_scraper_graph.py b/scrapegraphai/graphs/json_scraper_graph.py index 288b8ee1..69749a44 100644 --- a/scrapegraphai/graphs/json_scraper_graph.py +++ 
b/scrapegraphai/graphs/json_scraper_graph.py @@ -1,7 +1,6 @@ """ JSONScraperGraph Module """ - from typing import Optional from pydantic import BaseModel from .base_graph import BaseGraph @@ -36,7 +35,7 @@ class JSONScraperGraph(AbstractGraph): >>> json_scraper = JSONScraperGraph( ... "List me all the attractions in Chioggia.", ... "data/chioggia.json", - ... {"llm": {"model": "gpt-3.5-turbo"}} + ... {"llm": {"model": "openai/gpt-3.5-turbo"}} ... ) >>> result = json_scraper.run() """ @@ -56,7 +55,7 @@ def _create_graph(self) -> BaseGraph: fetch_node = FetchNode( input="json | json_dir", - output=["doc", "link_urls", "img_urls"], + output=["doc"], ) generate_answer_node = GenerateAnswerNode( diff --git a/scrapegraphai/graphs/json_scraper_multi_graph.py b/scrapegraphai/graphs/json_scraper_multi_graph.py index c72d8afd..e008325c 100644 --- a/scrapegraphai/graphs/json_scraper_multi_graph.py +++ b/scrapegraphai/graphs/json_scraper_multi_graph.py @@ -5,7 +5,6 @@ from copy import deepcopy from typing import List, Optional from pydantic import BaseModel - from .base_graph import BaseGraph from .abstract_graph import AbstractGraph from .json_scraper_graph import JSONScraperGraph @@ -38,12 +37,13 @@ class JSONScraperMultiGraph(AbstractGraph): Example: >>> search_graph = MultipleSearchGraph( ... "What is Chioggia famous for?", - ... {"llm": {"model": "gpt-3.5-turbo"}} + ... {"llm": {"model": "openai/gpt-3.5-turbo"}} ... ) >>> result = search_graph.run() """ - def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None): + def __init__(self, prompt: str, source: List[str], + config: dict, schema: Optional[BaseModel] = None): self.max_results = config.get("max_results", 3) @@ -61,10 +61,6 @@ def _create_graph(self) -> BaseGraph: BaseGraph: A graph instance representing the web scraping and searching workflow. 
""" - # ************************************************ - # Create a JSONScraperGraph instance - # ************************************************ - smart_scraper_instance = JSONScraperGraph( prompt="", source="", @@ -72,10 +68,6 @@ def _create_graph(self) -> BaseGraph: schema=self.copy_schema ) - # ************************************************ - # Define the graph nodes - # ************************************************ - graph_iterator_node = GraphIteratorNode( input="user_prompt & jsons", output=["results"], diff --git a/scrapegraphai/graphs/markdown_scraper_graph.py b/scrapegraphai/graphs/markdown_scraper_graph.py index c177facd..ed3c6856 100644 --- a/scrapegraphai/graphs/markdown_scraper_graph.py +++ b/scrapegraphai/graphs/markdown_scraper_graph.py @@ -1,3 +1,6 @@ +""" +md_scraper module +""" from typing import Optional import logging from pydantic import BaseModel @@ -17,7 +20,8 @@ class MDScraperGraph(AbstractGraph): config (dict): Configuration parameters for the graph. schema (BaseModel): The schema for the graph output. llm_model: An instance of a language model client, configured for generating answers. - embedder_model: An instance of an embedding model client, configured for generating embeddings. + embedder_model: An instance of an embedding model client, + configured for generating embeddings. verbose (bool): A flag indicating whether to show print statements during execution. headless (bool): A flag indicating whether to run the graph in headless mode. @@ -31,7 +35,7 @@ class MDScraperGraph(AbstractGraph): >>> smart_scraper = MDScraperGraph( ... "List me all the attractions in Chioggia.", ... "https://en.wikipedia.org/wiki/Chioggia", - ... {"llm": {"model": "gpt-3.5-turbo"}} + ... {"llm": {"model": "openai/gpt-3.5-turbo"}} ... 
) >>> result = smart_scraper.run() """ @@ -60,7 +64,8 @@ def _create_graph(self) -> BaseGraph: output=["parsed_doc"], node_config={ "parse_html": False, - "chunk_size": self.model_token + "chunk_size": self.model_token, + "llm_model": self.llm_model } ) generate_answer_node = GenerateAnswerNode( diff --git a/scrapegraphai/graphs/markdown_scraper_multi_graph.py b/scrapegraphai/graphs/markdown_scraper_multi_graph.py index 772eebe6..bdbcc5c1 100644 --- a/scrapegraphai/graphs/markdown_scraper_multi_graph.py +++ b/scrapegraphai/graphs/markdown_scraper_multi_graph.py @@ -1,7 +1,6 @@ """ MDScraperMultiGraph Module """ - from copy import copy, deepcopy from typing import List, Optional from pydantic import BaseModel @@ -37,12 +36,13 @@ class MDScraperMultiGraph(AbstractGraph): >>> search_graph = MDScraperMultiGraph( ... "What is Chioggia famous for?", ... ["http://example.com/page1", "http://example.com/page2"], - ... {"llm_model": {"model": "gpt-3.5-turbo"}} + ... {"llm_model": {"model": "openai/gpt-3.5-turbo"}} ... ) >>> result = search_graph.run() """ - def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None): + def __init__(self, prompt: str, source: List[str], + config: dict, schema: Optional[BaseModel] = None): self.copy_config = safe_deepcopy(config) self.copy_schema = deepcopy(schema) diff --git a/scrapegraphai/graphs/omni_scraper_graph.py b/scrapegraphai/graphs/omni_scraper_graph.py index 8b5f7fc9..500d9461 100644 --- a/scrapegraphai/graphs/omni_scraper_graph.py +++ b/scrapegraphai/graphs/omni_scraper_graph.py @@ -42,7 +42,7 @@ class OmniScraperGraph(AbstractGraph): >>> omni_scraper = OmniScraperGraph( ... "List me all the attractions in Chioggia and describe their pictures.", ... "https://en.wikipedia.org/wiki/Chioggia", - ... {"llm": {"model": "gpt-4o"}} + ... {"llm": {"model": "openai/gpt-4o"}} ... 
) >>> result = omni_scraper.run() ) @@ -65,16 +65,18 @@ def _create_graph(self) -> BaseGraph: """ fetch_node = FetchNode( input="url | local_dir", - output=["doc", "link_urls", "img_urls"], + output=["doc"], node_config={ "loader_kwargs": self.config.get("loader_kwargs", {}), } ) parse_node = ParseNode( - input="doc", - output=["parsed_doc"], + input="doc & (url | local_dir)", + output=["parsed_doc", "link_urls", "img_urls"], node_config={ - "chunk_size": self.model_token + "chunk_size": self.model_token, + "parse_urls": True, + "llm_model": self.llm_model } ) image_to_text_node = ImageToTextNode( diff --git a/scrapegraphai/graphs/omni_search_graph.py b/scrapegraphai/graphs/omni_search_graph.py index c005dbac..669e2df8 100644 --- a/scrapegraphai/graphs/omni_search_graph.py +++ b/scrapegraphai/graphs/omni_search_graph.py @@ -5,11 +5,9 @@ from copy import deepcopy from typing import Optional from pydantic import BaseModel - from .base_graph import BaseGraph from .abstract_graph import AbstractGraph from .omni_scraper_graph import OmniScraperGraph - from ..nodes import ( SearchInternetNode, GraphIteratorNode, @@ -40,7 +38,7 @@ class OmniSearchGraph(AbstractGraph): Example: >>> omni_search_graph = OmniSearchGraph( ... "What is Chioggia famous for?", - ... {"llm": {"model": "gpt-4o"}} + ... {"llm": {"model": "openai/gpt-4o"}} ... ) >>> result = search_graph.run() """ @@ -63,35 +61,30 @@ def _create_graph(self) -> BaseGraph: BaseGraph: A graph instance representing the web scraping and searching workflow. 
""" - # ************************************************ - # Create a OmniScraperGraph instance - # ************************************************ - - omni_scraper_instance = OmniScraperGraph( - prompt="", - source="", - config=self.copy_config, - schema=self.copy_schema - ) - - # ************************************************ - # Define the graph nodes - # ************************************************ + # omni_scraper_instance = OmniScraperGraph( + # prompt="", + # source="", + # config=self.copy_config, + # schema=self.copy_schema + # ) search_internet_node = SearchInternetNode( input="user_prompt", output=["urls"], node_config={ "llm_model": self.llm_model, - "max_results": self.max_results + "max_results": self.max_results, + "search_engine": self.copy_config.get("search_engine") } ) graph_iterator_node = GraphIteratorNode( input="user_prompt & urls", output=["results"], node_config={ - "graph_instance": omni_scraper_instance, - } + "graph_instance": OmniScraperGraph, + "scraper_config": self.copy_config, + }, + schema=self.copy_schema ) merge_answers_node = MergeAnswersNode( diff --git a/scrapegraphai/graphs/pdf_scraper_graph.py b/scrapegraphai/graphs/pdf_scraper_graph.py index ae783aba..341243a4 100644 --- a/scrapegraphai/graphs/pdf_scraper_graph.py +++ b/scrapegraphai/graphs/pdf_scraper_graph.py @@ -2,7 +2,6 @@ """ PDFScraperGraph Module """ - from typing import Optional from pydantic import BaseModel from .base_graph import BaseGraph @@ -40,7 +39,7 @@ class PDFScraperGraph(AbstractGraph): >>> pdf_scraper = PDFScraperGraph( ... "List me all the attractions in Chioggia.", ... "data/chioggia.pdf", - ... {"llm": {"model": "gpt-3.5-turbo"}} + ... {"llm": {"model": "openai/gpt-3.5-turbo"}} ... 
) >>> result = pdf_scraper.run() """ @@ -68,7 +67,8 @@ def _create_graph(self) -> BaseGraph: output=["parsed_doc"], node_config={ "parse_html": False, - "chunk_size": self.model_token + "chunk_size": self.model_token, + "llm_model": self.llm_model } ) diff --git a/scrapegraphai/graphs/pdf_scraper_multi_graph.py b/scrapegraphai/graphs/pdf_scraper_multi_graph.py index 06da6944..9551dc90 100644 --- a/scrapegraphai/graphs/pdf_scraper_multi_graph.py +++ b/scrapegraphai/graphs/pdf_scraper_multi_graph.py @@ -37,7 +37,7 @@ class PdfScraperMultiGraph(AbstractGraph): Example: >>> search_graph = MultipleSearchGraph( ... "What is Chioggia famous for?", - ... {"llm": {"model": "gpt-3.5-turbo"}} + ... {"llm": {"model": "openai/gpt-3.5-turbo"}} ... ) >>> result = search_graph.run() """ @@ -59,10 +59,6 @@ def _create_graph(self) -> BaseGraph: BaseGraph: A graph instance representing the web scraping and searching workflow. """ - # ************************************************ - # Create a PDFScraperGraph instance - # ************************************************ - pdf_scraper_instance = PDFScraperGraph( prompt="", source="", @@ -70,10 +66,6 @@ def _create_graph(self) -> BaseGraph: schema=self.copy_schema ) - # ************************************************ - # Define the graph nodes - # ************************************************ - graph_iterator_node = GraphIteratorNode( input="user_prompt & pdfs", output=["results"], diff --git a/scrapegraphai/graphs/script_creator_graph.py b/scrapegraphai/graphs/script_creator_graph.py index bb5629c5..732fb3cf 100644 --- a/scrapegraphai/graphs/script_creator_graph.py +++ b/scrapegraphai/graphs/script_creator_graph.py @@ -1,7 +1,6 @@ """ ScriptCreatorGraph Module """ - from typing import Optional from pydantic import BaseModel from .base_graph import BaseGraph @@ -39,7 +38,7 @@ class ScriptCreatorGraph(AbstractGraph): >>> script_creator = ScriptCreatorGraph( ... "List me all the attractions in Chioggia.", ... 
"https://en.wikipedia.org/wiki/Chioggia", - ... {"llm": {"model": "gpt-3.5-turbo"}} + ... {"llm": {"model": "openai/gpt-3.5-turbo"}} ... ) >>> result = script_creator.run() """ @@ -62,7 +61,7 @@ def _create_graph(self) -> BaseGraph: fetch_node = FetchNode( input="url | local_dir", - output=["doc", "link_urls", "img_urls"], + output=["doc"], node_config={ "llm_model": self.llm_model, "loader_kwargs": self.config.get("loader_kwargs", {}), @@ -73,11 +72,12 @@ def _create_graph(self) -> BaseGraph: input="doc", output=["parsed_doc"], node_config={"chunk_size": self.model_token, - "parse_html": False + "parse_html": False, + "llm_model": self.llm_model } ) generate_scraper_node = GenerateScraperNode( - input="user_prompt & (doc)", + input="user_prompt & (parsed_doc)", output=["answer"], node_config={ "llm_model": self.llm_model, diff --git a/scrapegraphai/graphs/script_creator_multi_graph.py b/scrapegraphai/graphs/script_creator_multi_graph.py index b2ea8465..864485fb 100644 --- a/scrapegraphai/graphs/script_creator_multi_graph.py +++ b/scrapegraphai/graphs/script_creator_multi_graph.py @@ -3,13 +3,10 @@ """ from typing import List, Optional - from pydantic import BaseModel - from .base_graph import BaseGraph from .abstract_graph import AbstractGraph from .script_creator_graph import ScriptCreatorGraph - from ..nodes import ( GraphIteratorNode, MergeGeneratedScriptsNode @@ -37,13 +34,14 @@ class ScriptCreatorMultiGraph(AbstractGraph): >>> script_graph = ScriptCreatorMultiGraph( ... "What is Chioggia famous for?", ... source=[], - ... config={"llm": {"model": "gpt-3.5-turbo"}} + ... config={"llm": {"model": "openai/gpt-3.5-turbo"}} ... schema={} ... 
) >>> result = script_graph.run() """ - def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None): + def __init__(self, prompt: str, source: List[str], + config: dict, schema: Optional[BaseModel] = None): self.max_results = config.get("max_results", 3) @@ -58,10 +56,6 @@ def _create_graph(self) -> BaseGraph: BaseGraph: A graph instance representing the web scraping and searching workflow. """ - # ************************************************ - # Create a ScriptCreatorGraph instance - # ************************************************ - script_generator_instance = ScriptCreatorGraph( prompt="", source="", @@ -69,10 +63,6 @@ def _create_graph(self) -> BaseGraph: schema=self.schema ) - # ************************************************ - # Define the graph nodes - # ************************************************ - graph_iterator_node = GraphIteratorNode( input="user_prompt & urls", output=["scripts"], diff --git a/scrapegraphai/graphs/search_graph.py b/scrapegraphai/graphs/search_graph.py index d27e7186..461dc80c 100644 --- a/scrapegraphai/graphs/search_graph.py +++ b/scrapegraphai/graphs/search_graph.py @@ -39,7 +39,7 @@ class SearchGraph(AbstractGraph): Example: >>> search_graph = SearchGraph( ... "What is Chioggia famous for?", - ... {"llm": {"model": "gpt-3.5-turbo"}} + ... {"llm": {"model": "openai/gpt-3.5-turbo"}} ... ) >>> result = search_graph.run() >>> print(search_graph.get_considered_urls()) @@ -62,29 +62,30 @@ def _create_graph(self) -> BaseGraph: BaseGraph: A graph instance representing the web scraping and searching workflow. 
""" - # Create a SmartScraperGraph instance - smart_scraper_instance = SmartScraperGraph( - prompt="", - source="", - config=self.copy_config, - schema=self.copy_schema - ) + # smart_scraper_instance = SmartScraperGraph( + # prompt="", + # source="", + # config=self.copy_config, + # schema=self.copy_schema + # ) - # Define the graph nodes search_internet_node = SearchInternetNode( input="user_prompt", output=["urls"], node_config={ "llm_model": self.llm_model, - "max_results": self.max_results + "max_results": self.max_results, + "search_engine": self.copy_config.get("search_engine") } ) graph_iterator_node = GraphIteratorNode( input="user_prompt & urls", output=["results"], node_config={ - "graph_instance": smart_scraper_instance, - } + "graph_instance": SmartScraperGraph, + "scraper_config": self.copy_config + }, + schema=self.copy_schema ) merge_answers_node = MergeAnswersNode( diff --git a/scrapegraphai/graphs/search_link_graph.py b/scrapegraphai/graphs/search_link_graph.py index 66b2f223..c44d707a 100644 --- a/scrapegraphai/graphs/search_link_graph.py +++ b/scrapegraphai/graphs/search_link_graph.py @@ -1,4 +1,6 @@ -""" SearchLinkGraph Module """ +""" +SearchLinkGraph Module +""" from typing import Optional import logging from pydantic import BaseModel @@ -32,7 +34,7 @@ class SearchLinkGraph(AbstractGraph): >>> smart_scraper = SearchLinkGraph( ... "List me all the attractions in Chioggia.", ... "https://en.wikipedia.org/wiki/Chioggia", - ... {"llm": {"model": "gpt-3.5-turbo"}} + ... {"llm": {"model": "openai/gpt-3.5-turbo"}} ... 
) >>> result = smart_scraper.run() """ @@ -52,7 +54,7 @@ def _create_graph(self) -> BaseGraph: fetch_node = FetchNode( input="url| local_dir", - output=["doc", "link_urls", "img_urls"], + output=["doc"], node_config={ "llm_model": self.llm_model, "force": self.config.get("force", False), @@ -64,7 +66,8 @@ def _create_graph(self) -> BaseGraph: input="doc", output=["parsed_doc"], node_config={ - "chunk_size": self.model_token + "chunk_size": self.model_token, + "llm_model": self.llm_model } ) search_link_node = SearchLinkNode( diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index 0167103e..0c025c3a 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ -1,13 +1,11 @@ """ SmartScraperGraph Module """ - from typing import Optional import logging from pydantic import BaseModel from .base_graph import BaseGraph from .abstract_graph import AbstractGraph - from ..nodes import ( FetchNode, ParseNode, @@ -41,7 +39,7 @@ class SmartScraperGraph(AbstractGraph): >>> smart_scraper = SmartScraperGraph( ... "List me all the attractions in Chioggia.", ... "https://en.wikipedia.org/wiki/Chioggia", - ... {"llm": {"model": "gpt-3.5-turbo"}} + ... {"llm": {"model": "openai/gpt-3.5-turbo"}} ... 
) >>> result = smart_scraper.run() ) @@ -61,19 +59,21 @@ def _create_graph(self) -> BaseGraph: """ fetch_node = FetchNode( input="url| local_dir", - output=["doc", "link_urls", "img_urls"], + output=["doc"], node_config={ "llm_model": self.llm_model, "force": self.config.get("force", False), "cut": self.config.get("cut", True), "loader_kwargs": self.config.get("loader_kwargs", {}), - "browser_base": self.config.get("browser_base") + "browser_base": self.config.get("browser_base"), + "scrape_do": self.config.get("scrape_do") } ) parse_node = ParseNode( input="doc", output=["parsed_doc"], node_config={ + "llm_model": self.llm_model, "chunk_size": self.model_token } ) diff --git a/scrapegraphai/graphs/smart_scraper_multi_concat_graph.py b/scrapegraphai/graphs/smart_scraper_multi_concat_graph.py new file mode 100644 index 00000000..4d1867f9 --- /dev/null +++ b/scrapegraphai/graphs/smart_scraper_multi_concat_graph.py @@ -0,0 +1,109 @@ +""" +SmartScraperMultiGraph Module +""" + +from copy import copy, deepcopy +from typing import List, Optional +from pydantic import BaseModel + +from .base_graph import BaseGraph +from .abstract_graph import AbstractGraph +from .smart_scraper_graph import SmartScraperGraph + +from ..nodes import ( + GraphIteratorNode, + ConcatAnswersNode +) + + +class SmartScraperMultiConcatGraph(AbstractGraph): + """ + SmartScraperMultiGraph is a scraping pipeline that scrapes a + list of URLs and generates answers to a given prompt. + It only requires a user prompt and a list of URLs. + + Attributes: + prompt (str): The user prompt to search the internet. + llm_model (dict): The configuration for the language model. + embedder_model (dict): The configuration for the embedder model. + headless (bool): A flag to run the browser in headless mode. + verbose (bool): A flag to display the execution information. + model_token (int): The token limit for the language model. + + Args: + prompt (str): The user prompt to search the internet. 
+ source (List[str]): The source of the graph. + config (dict): Configuration parameters for the graph. + schema (Optional[BaseModel]): The schema for the graph output. + + Example: + >>> search_graph = SmartScraperMultiConcatGraph( + ... "What is Chioggia famous for?", + ... {"llm": {"model": "openai/gpt-3.5-turbo"}} + ... ) + >>> result = search_graph.run() + """ + + def __init__(self, prompt: str, source: List[str], + config: dict, schema: Optional[BaseModel] = None): + + if all(isinstance(value, str) for value in config.values()): + self.copy_config = copy(config) + else: + self.copy_config = deepcopy(config) + + self.copy_schema = deepcopy(schema) + + super().__init__(prompt, config, source, schema) + + def _create_graph(self) -> BaseGraph: + """ + Creates the graph of nodes representing the workflow for web scraping and searching. + + Returns: + BaseGraph: A graph instance representing the web scraping and searching workflow. + """ + + smart_scraper_instance = SmartScraperGraph( + prompt="", + source="", + config=self.copy_config, + schema=self.copy_schema + ) + + graph_iterator_node = GraphIteratorNode( + input="user_prompt & urls", + output=["results"], + node_config={ + "graph_instance": smart_scraper_instance, + } + ) + + concat_answers_node = ConcatAnswersNode( + input="results", + output=["answer"] + ) + + return BaseGraph( + nodes=[ + graph_iterator_node, + concat_answers_node, + ], + edges=[ + (graph_iterator_node, concat_answers_node), + ], + entry_point=graph_iterator_node, + graph_name=self.__class__.__name__ + ) + + def run(self) -> str: + """ + Executes the web scraping and searching process. + + Returns: + str: The answer to the prompt. 
+ """ + inputs = {"user_prompt": self.prompt, "urls": self.source} + self.final_state, self.execution_info = self.graph.execute(inputs) + + return self.final_state.get("answer", "No answer found.") diff --git a/scrapegraphai/graphs/smart_scraper_multi_graph.py b/scrapegraphai/graphs/smart_scraper_multi_graph.py index 82585cf0..b44a01de 100644 --- a/scrapegraphai/graphs/smart_scraper_multi_graph.py +++ b/scrapegraphai/graphs/smart_scraper_multi_graph.py @@ -5,11 +5,9 @@ from copy import deepcopy from typing import List, Optional from pydantic import BaseModel - from .base_graph import BaseGraph from .abstract_graph import AbstractGraph from .smart_scraper_graph import SmartScraperGraph - from ..nodes import ( GraphIteratorNode, MergeAnswersNode @@ -39,7 +37,7 @@ class SmartScraperMultiGraph(AbstractGraph): Example: >>> search_graph = MultipleSearchGraph( ... "What is Chioggia famous for?", - ... {"llm": {"model": "gpt-3.5-turbo"}} + ... {"llm": {"model": "openai/gpt-3.5-turbo"}} ... ) >>> result = search_graph.run() """ @@ -63,10 +61,6 @@ def _create_graph(self) -> BaseGraph: BaseGraph: A graph instance representing the web scraping and searching workflow. 
""" - # ************************************************ - # Create a SmartScraperGraph instance - # ************************************************ - smart_scraper_instance = SmartScraperGraph( prompt="", source="", @@ -74,10 +68,6 @@ def _create_graph(self) -> BaseGraph: schema=self.copy_schema ) - # ************************************************ - # Define the graph nodes - # ************************************************ - graph_iterator_node = GraphIteratorNode( input="user_prompt & urls", output=["results"], diff --git a/scrapegraphai/graphs/speech_graph.py b/scrapegraphai/graphs/speech_graph.py index 8d77621a..6065bcf4 100644 --- a/scrapegraphai/graphs/speech_graph.py +++ b/scrapegraphai/graphs/speech_graph.py @@ -1,20 +1,16 @@ """ SpeechGraph Module """ - from typing import Optional from pydantic import BaseModel - from .base_graph import BaseGraph from .abstract_graph import AbstractGraph - from ..nodes import ( FetchNode, ParseNode, GenerateAnswerNode, TextToSpeechNode, ) - from ..utils.save_audio_from_bytes import save_audio_from_bytes from ..models import OpenAITextToSpeech @@ -29,7 +25,8 @@ class SpeechGraph(AbstractGraph): config (dict): Configuration parameters for the graph. schema (BaseModel): The schema for the graph output. llm_model: An instance of a language model client, configured for generating answers. - embedder_model: An instance of an embedding model client, configured for generating embeddings. + embedder_model: An instance of an embedding model clienta + configured for generating embeddings. verbose (bool): A flag indicating whether to show print statements during execution. headless (bool): A flag indicating whether to run the graph in headless mode. model_token (int): The token limit for the language model. @@ -44,7 +41,7 @@ class SpeechGraph(AbstractGraph): >>> speech_graph = SpeechGraph( ... "List me all the attractions in Chioggia and generate an audio summary.", ... "https://en.wikipedia.org/wiki/Chioggia", - ... 
{"llm": {"model": "gpt-3.5-turbo"}} + ... {"llm": {"model": "openai/gpt-3.5-turbo"}} """ def __init__(self, prompt: str, source: str, config: dict, schema: Optional[BaseModel] = None): @@ -62,13 +59,14 @@ def _create_graph(self) -> BaseGraph: fetch_node = FetchNode( input="url | local_dir", - output=["doc", "link_urls", "img_urls"] + output=["doc"] ) parse_node = ParseNode( input="doc", output=["parsed_doc"], node_config={ - "chunk_size": self.model_token + "chunk_size": self.model_token, + "llm_model": self.llm_model } ) generate_answer_node = GenerateAnswerNode( diff --git a/scrapegraphai/graphs/xml_scraper_graph.py b/scrapegraphai/graphs/xml_scraper_graph.py index f5806f56..ec75aee9 100644 --- a/scrapegraphai/graphs/xml_scraper_graph.py +++ b/scrapegraphai/graphs/xml_scraper_graph.py @@ -4,10 +4,8 @@ from typing import Optional from pydantic import BaseModel - from .base_graph import BaseGraph from .abstract_graph import AbstractGraph - from ..nodes import ( FetchNode, GenerateAnswerNode @@ -40,7 +38,7 @@ class XMLScraperGraph(AbstractGraph): >>> xml_scraper = XMLScraperGraph( ... "List me all the attractions in Chioggia.", ... "data/chioggia.xml", - ... {"llm": {"model": "gpt-3.5-turbo"}} + ... {"llm": {"model": "openai/gpt-3.5-turbo"}} ... 
) >>> result = xml_scraper.run() """ @@ -60,7 +58,7 @@ def _create_graph(self) -> BaseGraph: fetch_node = FetchNode( input="xml | xml_dir", - output=["doc", "link_urls", "img_urls"] + output=["doc"] ) generate_answer_node = GenerateAnswerNode( diff --git a/scrapegraphai/graphs/xml_scraper_multi_graph.py b/scrapegraphai/graphs/xml_scraper_multi_graph.py index 493d12ca..6eba69b4 100644 --- a/scrapegraphai/graphs/xml_scraper_multi_graph.py +++ b/scrapegraphai/graphs/xml_scraper_multi_graph.py @@ -5,11 +5,9 @@ from copy import deepcopy from typing import List, Optional from pydantic import BaseModel - from .base_graph import BaseGraph from .abstract_graph import AbstractGraph from .xml_scraper_graph import XMLScraperGraph - from ..nodes import ( GraphIteratorNode, MergeAnswersNode @@ -39,7 +37,7 @@ class XMLScraperMultiGraph(AbstractGraph): Example: >>> search_graph = MultipleSearchGraph( ... "What is Chioggia famous for?", - ... {"llm": {"model": "gpt-3.5-turbo"}} + ... {"llm": {"model": "openai/gpt-3.5-turbo"}} ... ) >>> result = search_graph.run() """ @@ -61,10 +59,6 @@ def _create_graph(self) -> BaseGraph: BaseGraph: A graph instance representing the web scraping and searching workflow. 
""" - # ************************************************ - # Create a XMLScraperGraph instance - # ************************************************ - smart_scraper_instance = XMLScraperGraph( prompt="", source="", @@ -72,10 +66,6 @@ def _create_graph(self) -> BaseGraph: schema=self.copy_schema ) - # ************************************************ - # Define the graph nodes - # ************************************************ - graph_iterator_node = GraphIteratorNode( input="user_prompt & jsons", output=["results"], diff --git a/scrapegraphai/helpers/default_filters.py b/scrapegraphai/helpers/default_filters.py index a997736d..c3846f86 100644 --- a/scrapegraphai/helpers/default_filters.py +++ b/scrapegraphai/helpers/default_filters.py @@ -10,4 +10,4 @@ '/login', '/signup', '/register', '/contact', 'facebook.com', 'twitter.com', 'linkedin.com', 'instagram.com', '.js', '.css', ] -} \ No newline at end of file +} diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py index ea35ed71..c90cf1f6 100644 --- a/scrapegraphai/helpers/models_tokens.py +++ b/scrapegraphai/helpers/models_tokens.py @@ -127,6 +127,9 @@ "gemma-7b-it": 8192, "claude-3-haiku-20240307'": 8192, }, + "togheterai": { + "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo": 128000 + }, "anthropic": { "claude_instant": 100000, "claude2": 9000, diff --git a/scrapegraphai/integrations/burr_bridge.py b/scrapegraphai/integrations/burr_bridge.py index e5eb3c6a..76cce914 100644 --- a/scrapegraphai/integrations/burr_bridge.py +++ b/scrapegraphai/integrations/burr_bridge.py @@ -18,8 +18,6 @@ raise ImportError("burr package is not installed. Please install it with 'pip install scrapegraphai[burr]'") - - class PrintLnHook(PostRunStepHook, PreRunStepHook): """ Hook to print the action name before and after it is executed. 
diff --git a/scrapegraphai/models/deepseek.py b/scrapegraphai/models/deepseek.py index 31b2bd5d..1901269e 100644 --- a/scrapegraphai/models/deepseek.py +++ b/scrapegraphai/models/deepseek.py @@ -18,5 +18,5 @@ def __init__(self, **llm_config): if 'api_key' in llm_config: llm_config['openai_api_key'] = llm_config.pop('api_key') llm_config['openai_api_base'] = 'https://api.deepseek.com/v1' - + super().__init__(**llm_config) diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py index dd1c3fcc..1e990400 100644 --- a/scrapegraphai/nodes/__init__.py +++ b/scrapegraphai/nodes/__init__.py @@ -22,3 +22,4 @@ from .merge_generated_scripts import MergeGeneratedScriptsNode from .fetch_screen_node import FetchScreenNode from .generate_answer_from_image_node import GenerateAnswerFromImageNode +from .concat_answers_node import ConcatAnswersNode diff --git a/scrapegraphai/nodes/base_node.py b/scrapegraphai/nodes/base_node.py index 206b0547..90dbea51 100644 --- a/scrapegraphai/nodes/base_node.py +++ b/scrapegraphai/nodes/base_node.py @@ -27,7 +27,8 @@ class BaseNode(ABC): input (str): Expression defining the input keys needed from the state. output (List[str]): List of output keys to be updated in the state. min_input_len (int, optional): Minimum required number of input keys; defaults to 1. - node_config (Optional[dict], optional): Additional configuration for the node; defaults to None. + node_config (Optional[dict], optional): Additional configuration + for the node; defaults to None. Raises: ValueError: If `node_type` is not one of the allowed types. @@ -233,7 +234,9 @@ def evaluate_expression(expression: str) -> List[str]: result = evaluate_expression(expression) if not result: - raise ValueError(f"No state keys matched the expression. Expression was {expression}. State contains keys: {', '.join(state.keys())}") + raise ValueError(f"""No state keys matched the expression. + Expression was {expression}. 
+ State contains keys: {', '.join(state.keys())}""") # Remove redundant state keys from the result, without changing their order final_result = [] diff --git a/scrapegraphai/nodes/concat_answers_node.py b/scrapegraphai/nodes/concat_answers_node.py new file mode 100644 index 00000000..5af81702 --- /dev/null +++ b/scrapegraphai/nodes/concat_answers_node.py @@ -0,0 +1,77 @@ +""" +ConcatAnswersNode Module +""" + +from typing import List, Optional +from ..utils.logging import get_logger +from .base_node import BaseNode + +class ConcatAnswersNode(BaseNode): + """ + A node responsible for concatenating the answers from multiple + graph instances into a single answer. + + Attributes: + verbose (bool): A flag indicating whether to show print statements during execution. + + Args: + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. + node_name (str): The unique identifier name for the node, defaulting to "GenerateAnswer". + """ + + def __init__( + self, + input: str, + output: List[str], + node_config: Optional[dict] = None, + node_name: str = "ConcatAnswers", + ): + super().__init__(node_name, "node", input, output, 1, node_config) + + self.verbose = ( + False if node_config is None else node_config.get("verbose", False) + ) + + def _merge_dict(self, items): + + return {"products": {f"item_{i+1}": item for i, item in enumerate(items)}} + + def execute(self, state: dict) -> dict: + """ + Executes the node's logic to concatenate the answers from multiple graph instances into a + single answer. + + Args: + state (dict): The current state of the graph. The input keys will be used + to fetch the correct data from the state. + + Returns: + dict: The updated state with the output key containing the generated answer. 
+ + Raises: + KeyError: If the input keys are not found in the state, indicating + that the necessary information for generating an answer is missing. + """ + + self.logger.info(f"--- Executing {self.node_name} Node ---") + + # Interpret input keys based on the provided input expression + input_keys = self.get_input_keys(state) + + # Fetching data from the state based on the input keys + input_data = [state[key] for key in input_keys] + + answers = input_data[0] + + if len(answers) > 1: + # merge the answers in one string + answer = self._merge_dict(answers) + + # Update the state with the generated answer + state.update({self.output[0]: answer}) + + else: + state.update({self.output[0]: answers[0]}) + return state diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index f015278d..19d59004 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -76,6 +76,10 @@ def __init__( None if node_config is None else node_config.get("browser_base", None) ) + self.scrape_do = ( + None if node_config is None else node_config.get("scrape_do", None) + ) + def execute(self, state): """ Executes the node's logic to fetch HTML content from a specified URL and @@ -102,7 +106,7 @@ def execute(self, state): source = input_data[0] input_type = input_keys[0] - + handlers = { "json_dir": self.handle_directory, "xml_dir": self.handle_directory, @@ -271,28 +275,48 @@ def handle_web_source(self, state, source): try: from ..docloaders.browser_base import browser_base_fetch except ImportError: - raise ImportError("The browserbase module is not installed. Please install it using `pip install browserbase`.") + raise ImportError("""The browserbase module is not installed. 
+ Please install it using `pip install browserbase`.""") data = browser_base_fetch(self.browser_base.get("api_key"), self.browser_base.get("project_id"), [source]) document = [Document(page_content=content, metadata={"source": source}) for content in data] + elif self.scrape_do is not None: + from ..docloaders.scrape_do import scrape_do_fetch + if (self.scrape_do.get("use_proxy") is None) or \ + self.scrape_do.get("geoCode") is None or \ + self.scrape_do.get("super_proxy") is None: + data = scrape_do_fetch(self.scrape_do.get("api_key"), + source) + else: + data = scrape_do_fetch(self.scrape_do.get("api_key"), + source, self.scrape_do.get("use_proxy"), + self.scrape_do.get("geoCode"), + self.scrape_do.get("super_proxy")) + + document = [Document(page_content=data, + metadata={"source": source})] else: loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs) document = loader.load() if not document or not document[0].page_content.strip(): - raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.") + raise ValueError("""No HTML body content found in + the document fetched by ChromiumLoader.""") parsed_content = document[0].page_content - if (isinstance(self.llm_model, ChatOpenAI) or isinstance(self.llm_model, AzureChatOpenAI)) and not self.script_creator or self.force and not self.script_creator and not self.openai_md_enabled: + if (isinstance(self.llm_model, ChatOpenAI) \ + or isinstance(self.llm_model, AzureChatOpenAI)) \ + and not self.script_creator or self.force \ + and not self.script_creator and not self.openai_md_enabled: parsed_content = convert_to_md(document[0].page_content, parsed_content) compressed_document = [ Document(page_content=parsed_content, metadata={"source": "html file"}) ] - + return self.update_state(state, compressed_document) def update_state(self, state, compressed_document): diff --git a/scrapegraphai/nodes/generate_answer_csv_node.py b/scrapegraphai/nodes/generate_answer_csv_node.py index 
0907dfb9..de127f47 100644 --- a/scrapegraphai/nodes/generate_answer_csv_node.py +++ b/scrapegraphai/nodes/generate_answer_csv_node.py @@ -6,11 +6,13 @@ from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel +from langchain_core.utils.pydantic import is_basemodel_subclass +from langchain_openai import ChatOpenAI +from langchain_mistralai import ChatMistralAI from tqdm import tqdm from ..utils.logging import get_logger from .base_node import BaseNode -from ..prompts.generate_answer_node_csv_prompts import (TEMPLATE_CHUKS_CSV, - TEMPLATE_NO_CHUKS_CSV, TEMPLATE_MERGE_CSV) +from ..prompts import TEMPLATE_CHUKS_CSV, TEMPLATE_NO_CHUKS_CSV, TEMPLATE_MERGE_CSV class GenerateAnswerCSVNode(BaseNode): """ @@ -92,9 +94,24 @@ def execute(self, state): # Initialize the output parser if self.node_config.get("schema", None) is not None: - output_parser = JsonOutputParser(pydantic_object=self.node_config["schema"]) + + if isinstance(self.llm_model, (ChatOpenAI, ChatMistralAI)): + self.llm_model = self.llm_model.with_structured_output( + schema = self.node_config["schema"], + method="function_calling") # json schema works only on specific models + + # default parser to empty lambda function + output_parser = lambda x: x + if is_basemodel_subclass(self.node_config["schema"]): + output_parser = dict + format_instructions = "NA" + else: + output_parser = JsonOutputParser(pydantic_object=self.node_config["schema"]) + format_instructions = output_parser.get_format_instructions() + else: output_parser = JsonOutputParser() + format_instructions = output_parser.get_format_instructions() TEMPLATE_NO_CHUKS_CSV_PROMPT = TEMPLATE_NO_CHUKS_CSV TEMPLATE_CHUKS_CSV_PROMPT = TEMPLATE_CHUKS_CSV @@ -105,8 +122,6 @@ def execute(self, state): TEMPLATE_CHUKS_CSV_PROMPT = self.additional_info + TEMPLATE_CHUKS_CSV TEMPLATE_MERGE_CSV_PROMPT = self.additional_info + TEMPLATE_MERGE_CSV - format_instructions = 
output_parser.get_format_instructions() - chains_dict = {} if len(doc) == 1: diff --git a/scrapegraphai/nodes/generate_answer_from_image_node.py b/scrapegraphai/nodes/generate_answer_from_image_node.py index 4cc93d18..7134cabe 100644 --- a/scrapegraphai/nodes/generate_answer_from_image_node.py +++ b/scrapegraphai/nodes/generate_answer_from_image_node.py @@ -73,7 +73,7 @@ async def execute_async(self, state: dict) -> dict: supported_models = ("gpt-4o", "gpt-4o-mini", "gpt-4-turbo") - if self.node_config["config"]["llm"]["model"] not in supported_models: + if self.node_config["config"]["llm"]["model"].split("/")[-1]not in supported_models: raise ValueError(f"""Model '{self.node_config['config']['llm']['model']}' is not supported. Supported models are: {', '.join(supported_models)}.""") diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index 966a758f..ae92f6c5 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -1,18 +1,20 @@ """ GenerateAnswerNode Module """ -from sys import modules from typing import List, Optional from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel +from langchain_core.utils.pydantic import is_basemodel_subclass from langchain_openai import ChatOpenAI, AzureChatOpenAI from langchain_mistralai import ChatMistralAI from langchain_community.chat_models import ChatOllama from tqdm import tqdm -from ..utils.logging import get_logger from .base_node import BaseNode -from ..prompts import TEMPLATE_CHUNKS, TEMPLATE_NO_CHUNKS, TEMPLATE_MERGE, TEMPLATE_CHUNKS_MD, TEMPLATE_NO_CHUNKS_MD, TEMPLATE_MERGE_MD +from ..prompts import (TEMPLATE_CHUNKS, + TEMPLATE_NO_CHUNKS, TEMPLATE_MERGE, + TEMPLATE_CHUNKS_MD, TEMPLATE_NO_CHUNKS_MD, + TEMPLATE_MERGE_MD) class GenerateAnswerNode(BaseNode): """ @@ -80,37 +82,37 @@ def execute(self, state: dict) -> dict: 
self.logger.info(f"--- Executing {self.node_name} Node ---") - # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) - # Fetching data from the state based on the input keys + input_data = [state[key] for key in input_keys] user_prompt = input_data[0] doc = input_data[1] - # Initialize the output parser if self.node_config.get("schema", None) is not None: - output_parser = JsonOutputParser(pydantic_object=self.node_config["schema"]) - - # Use built-in structured output for providers that allow it - optional_modules = {"langchain_anthropic", "langchain_fireworks", "langchain_groq", "langchain_google_vertexai"} - if all(key in modules for key in optional_modules): - if isinstance(self.llm_model, (ChatOpenAI, ChatMistralAI, ChatAnthropic, ChatFireworks, ChatGroq, ChatVertexAI)): - self.llm_model = self.llm_model.with_structured_output( - schema = self.node_config["schema"], - method="json_schema") + + if isinstance(self.llm_model, (ChatOpenAI, ChatMistralAI)): + self.llm_model = self.llm_model.with_structured_output( + schema = self.node_config["schema"]) # json schema works only on specific models + + # default parser to empty lambda function + def output_parser(x): + return x + if is_basemodel_subclass(self.node_config["schema"]): + output_parser = dict + format_instructions = "NA" else: - if isinstance(self.llm_model, (ChatOpenAI, ChatMistralAI)): - self.llm_model = self.llm_model.with_structured_output( - schema = self.node_config["schema"], - method="json_schema") - + output_parser = JsonOutputParser(pydantic_object=self.node_config["schema"]) + format_instructions = output_parser.get_format_instructions() else: output_parser = JsonOutputParser() + format_instructions = output_parser.get_format_instructions() - format_instructions = output_parser.get_format_instructions() + if isinstance(self.llm_model, (ChatOpenAI, AzureChatOpenAI)) \ + and not self.script_creator \ + or self.force \ + and not self.script_creator 
or self.is_md_scraper: - if isinstance(self.llm_model, (ChatOpenAI, AzureChatOpenAI)) and not self.script_creator or self.force and not self.script_creator or self.is_md_scraper: template_no_chunks_prompt = TEMPLATE_NO_CHUNKS_MD template_chunks_prompt = TEMPLATE_CHUNKS_MD template_merge_prompt = TEMPLATE_MERGE_MD diff --git a/scrapegraphai/nodes/generate_answer_omni_node.py b/scrapegraphai/nodes/generate_answer_omni_node.py index 34ee3e87..32dfbff6 100644 --- a/scrapegraphai/nodes/generate_answer_omni_node.py +++ b/scrapegraphai/nodes/generate_answer_omni_node.py @@ -5,10 +5,15 @@ from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel +from langchain_core.utils.pydantic import is_basemodel_subclass +from langchain_openai import ChatOpenAI +from langchain_mistralai import ChatMistralAI from tqdm import tqdm from langchain_community.chat_models import ChatOllama from .base_node import BaseNode -from ..prompts.generate_answer_node_omni_prompts import TEMPLATE_NO_CHUNKS_OMNI, TEMPLATE_CHUNKS_OMNI, TEMPLATE_MERGE_OMNI +from ..prompts.generate_answer_node_omni_prompts import (TEMPLATE_NO_CHUNKS_OMNI, + TEMPLATE_CHUNKS_OMNI, + TEMPLATE_MERGE_OMNI) class GenerateAnswerOmniNode(BaseNode): """ @@ -78,9 +83,25 @@ def execute(self, state: dict) -> dict: # Initialize the output parser if self.node_config.get("schema", None) is not None: - output_parser = JsonOutputParser(pydantic_object=self.node_config["schema"]) + + if isinstance(self.llm_model, (ChatOpenAI, ChatMistralAI)): + self.llm_model = self.llm_model.with_structured_output( + schema = self.node_config["schema"], + method="function_calling") # json schema works only on specific models + + # default parser to empty lambda function + output_parser = lambda x: x + if is_basemodel_subclass(self.node_config["schema"]): + output_parser = dict + format_instructions = "NA" + else: + output_parser = 
JsonOutputParser(pydantic_object=self.node_config["schema"]) + format_instructions = output_parser.get_format_instructions() + else: output_parser = JsonOutputParser() + format_instructions = output_parser.get_format_instructions() + TEMPLATE_NO_CHUNKS_OMNI_prompt = TEMPLATE_NO_CHUNKS_OMNI TEMPLATE_CHUNKS_OMNI_prompt = TEMPLATE_CHUNKS_OMNI TEMPLATE_MERGE_OMNI_prompt= TEMPLATE_MERGE_OMNI @@ -90,7 +111,6 @@ def execute(self, state: dict) -> dict: TEMPLATE_CHUNKS_OMNI_prompt = self.additional_info + TEMPLATE_CHUNKS_OMNI_prompt TEMPLATE_MERGE_OMNI_prompt = self.additional_info + TEMPLATE_MERGE_OMNI_prompt - format_instructions = output_parser.get_format_instructions() chains_dict = {} diff --git a/scrapegraphai/nodes/generate_answer_pdf_node.py b/scrapegraphai/nodes/generate_answer_pdf_node.py index f3e68eab..3f7daf73 100644 --- a/scrapegraphai/nodes/generate_answer_pdf_node.py +++ b/scrapegraphai/nodes/generate_answer_pdf_node.py @@ -5,11 +5,16 @@ from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel +from langchain_core.utils.pydantic import is_basemodel_subclass +from langchain_openai import ChatOpenAI +from langchain_mistralai import ChatMistralAI from tqdm import tqdm from langchain_community.chat_models import ChatOllama from ..utils.logging import get_logger from .base_node import BaseNode -from ..prompts.generate_answer_node_pdf_prompts import TEMPLATE_CHUNKS_PDF, TEMPLATE_NO_CHUNKS_PDF, TEMPLATE_MERGE_PDF +from ..prompts.generate_answer_node_pdf_prompts import (TEMPLATE_CHUNKS_PDF, + TEMPLATE_NO_CHUNKS_PDF, + TEMPLATE_MERGE_PDF) class GenerateAnswerPDFNode(BaseNode): """ @@ -82,20 +87,32 @@ def execute(self, state): self.logger.info(f"--- Executing {self.node_name} Node ---") - # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) - # Fetching data from the state based on the input keys input_data = [state[key] 
for key in input_keys] user_prompt = input_data[0] doc = input_data[1] - # Initialize the output parser if self.node_config.get("schema", None) is not None: - output_parser = JsonOutputParser(pydantic_object=self.node_config["schema"]) + + if isinstance(self.llm_model, (ChatOpenAI, ChatMistralAI)): + self.llm_model = self.llm_model.with_structured_output( + schema = self.node_config["schema"], + method="function_calling") # json schema works only on specific models + + output_parser = lambda x: x + if is_basemodel_subclass(self.node_config["schema"]): + output_parser = dict + format_instructions = "NA" + else: + output_parser = JsonOutputParser(pydantic_object=self.node_config["schema"]) + format_instructions = output_parser.get_format_instructions() + else: output_parser = JsonOutputParser() + format_instructions = output_parser.get_format_instructions() + TEMPLATE_NO_CHUNKS_PDF_prompt = TEMPLATE_NO_CHUNKS_PDF TEMPLATE_CHUNKS_PDF_prompt = TEMPLATE_CHUNKS_PDF TEMPLATE_MERGE_PDF_prompt = TEMPLATE_MERGE_PDF @@ -105,8 +122,6 @@ def execute(self, state): TEMPLATE_CHUNKS_PDF_prompt = self.additional_info + TEMPLATE_CHUNKS_PDF_prompt TEMPLATE_MERGE_PDF_prompt = self.additional_info + TEMPLATE_MERGE_PDF_prompt - format_instructions = output_parser.get_format_instructions() - if len(doc) == 1: prompt = PromptTemplate( template=TEMPLATE_NO_CHUNKS_PDF_prompt, @@ -122,9 +137,9 @@ def execute(self, state): state.update({self.output[0]: answer}) return state - + chains_dict = {} - + for i, chunk in enumerate( tqdm(doc, desc="Processing chunks", disable=not self.verbose)): prompt = PromptTemplate( diff --git a/scrapegraphai/nodes/generate_scraper_node.py b/scrapegraphai/nodes/generate_scraper_node.py index a7c5e5bb..93ad9cf3 100644 --- a/scrapegraphai/nodes/generate_scraper_node.py +++ b/scrapegraphai/nodes/generate_scraper_node.py @@ -102,9 +102,20 @@ def execute(self, state: dict) -> dict: TEMPLATE_NO_CHUNKS += self.additional_info if len(doc) > 1: - raise NotImplementedError( 
- "Currently GenerateScraperNode cannot handle more than 1 context chunks" - ) + # Short term partial fix for issue #543 (Context length exceeded) + # If there are more than one chunks returned by ParseNode we just use the first one + # on the basis that the structure of the remainder of the HTML page is probably + # very similar to the first chunk therefore the generated script should still work. + # The better fix is to generate multiple scripts then use the LLM to merge them. + + #raise NotImplementedError( + # "Currently GenerateScraperNode cannot handle more than 1 context chunks" + #) + self.logger.warn(f"""Warning: {self.node_name} + Node provided with {len(doc)} chunks but can only " + "support 1, ignoring remaining chunks""") + doc = [doc[0]] + template = TEMPLATE_NO_CHUNKS else: template = TEMPLATE_NO_CHUNKS diff --git a/scrapegraphai/nodes/graph_iterator_node.py b/scrapegraphai/nodes/graph_iterator_node.py index a765da28..8781cf2d 100644 --- a/scrapegraphai/nodes/graph_iterator_node.py +++ b/scrapegraphai/nodes/graph_iterator_node.py @@ -2,11 +2,10 @@ GraphIterator Module """ import asyncio -import copy from typing import List, Optional from tqdm.asyncio import tqdm -from ..utils.logging import get_logger from .base_node import BaseNode +from langchain_core.pydantic_v1 import BaseModel DEFAULT_BATCHSIZE = 16 @@ -31,12 +30,14 @@ def __init__( output: List[str], node_config: Optional[dict] = None, node_name: str = "GraphIterator", + schema: Optional[BaseModel] = None, ): super().__init__(node_name, "node", input, output, 2, node_config) self.verbose = ( False if node_config is None else node_config.get("verbose", False) ) + self.schema = schema def execute(self, state: dict) -> dict: """ @@ -89,26 +90,32 @@ async def _async_execute(self, state: dict, batchsize: int) -> dict: KeyError: If the input keys are not found in the state. 
""" - # interprets input keys based on the provided input expression input_keys = self.get_input_keys(state) - # fetches data from the state based on the input keys input_data = [state[key] for key in input_keys] user_prompt = input_data[0] urls = input_data[1] graph_instance = self.node_config.get("graph_instance", None) + scraper_config = self.node_config.get("scraper_config", None) if graph_instance is None: raise ValueError("graph instance is required for concurrent execution") - if "graph_depth" in graph_instance.config: - graph_instance.config["graph_depth"] += 1 - else: - graph_instance.config["graph_depth"] = 1 + graph_instance = [graph_instance( + prompt="", + source="", + config=scraper_config, + schema=self.schema) for _ in range(len(urls))] + + for graph in graph_instance: + if "graph_depth" in graph.config: + graph.config["graph_depth"] += 1 + else: + graph.config["graph_depth"] = 1 - graph_instance.prompt = user_prompt + graph.prompt = user_prompt participants = [] @@ -118,13 +125,12 @@ async def _async_run(graph): async with semaphore: return await asyncio.to_thread(graph.run) - for url in urls: - instance = copy.copy(graph_instance) - instance.source = url + for url, graph in zip(urls, graph_instance): + graph.source = url if url.startswith("http"): - instance.input_key = "url" - participants.append(instance) - + graph.input_key = "url" + participants.append(graph) + futures = [_async_run(graph) for graph in participants] answers = await tqdm.gather( diff --git a/scrapegraphai/nodes/image_to_text_node.py b/scrapegraphai/nodes/image_to_text_node.py index 934710ef..13113aa0 100644 --- a/scrapegraphai/nodes/image_to_text_node.py +++ b/scrapegraphai/nodes/image_to_text_node.py @@ -1,9 +1,11 @@ """ ImageToTextNode Module """ +import traceback from typing import List, Optional from ..utils.logging import get_logger from .base_node import BaseNode +from langchain_core.messages import HumanMessage class ImageToTextNode(BaseNode): """ @@ -58,16 +60,25 @@ def 
execute(self, state: dict) -> dict: if isinstance(urls, str): urls = [urls] elif len(urls) == 0: - return state + return state.update({self.output[0]: []}) # Skip the image-to-text conversion if self.max_images < 1: - return state - + return state.update({self.output[0]: []}) + img_desc = [] for url in urls[: self.max_images]: try: - text_answer = self.llm_model.run(url) + message = HumanMessage( + content=[ + {"type": "text", "text": "Describe the provided image."}, + { + "type": "image_url", + "image_url": {"url": url}, + }, + ] + ) + text_answer = self.llm_model.invoke([message]).content except Exception as e: text_answer = f"Error: incompatible image format or model failure." img_desc.append(text_answer) diff --git a/scrapegraphai/nodes/merge_answers_node.py b/scrapegraphai/nodes/merge_answers_node.py index f2559a09..a269425f 100644 --- a/scrapegraphai/nodes/merge_answers_node.py +++ b/scrapegraphai/nodes/merge_answers_node.py @@ -4,6 +4,9 @@ from typing import List, Optional from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser +from langchain_core.utils.pydantic import is_basemodel_subclass +from langchain_openai import ChatOpenAI +from langchain_mistralai import ChatMistralAI from ..utils.logging import get_logger from .base_node import BaseNode from ..prompts import TEMPLATE_COMBINED @@ -68,11 +71,23 @@ def execute(self, state: dict) -> dict: answers_str += f"CONTENT WEBSITE {i+1}: {answer}\n" if self.node_config.get("schema", None) is not None: - output_parser = JsonOutputParser(pydantic_object=self.node_config["schema"]) + + if isinstance(self.llm_model, (ChatOpenAI, ChatMistralAI)): + self.llm_model = self.llm_model.with_structured_output( + schema = self.node_config["schema"], + method="function_calling") # json schema works only on specific models + # default parser to empty lambda function + output_parser = lambda x: x + if is_basemodel_subclass(self.node_config["schema"]): + output_parser = dict + 
format_instructions = "NA" + else: + output_parser = JsonOutputParser(pydantic_object=self.node_config["schema"]) + format_instructions = output_parser.get_format_instructions() + else: output_parser = JsonOutputParser() - - format_instructions = output_parser.get_format_instructions() + format_instructions = output_parser.get_format_instructions() prompt_template = PromptTemplate( template=TEMPLATE_COMBINED, @@ -85,6 +100,7 @@ def execute(self, state: dict) -> dict: merge_chain = prompt_template | self.llm_model | output_parser answer = merge_chain.invoke({"user_prompt": user_prompt}) + answer["sources"] = state.get("urls") state.update({self.output[0]: answer}) return state diff --git a/scrapegraphai/nodes/merge_generated_scripts.py b/scrapegraphai/nodes/merge_generated_scripts.py index ffc2b2fd..e3a138a8 100644 --- a/scrapegraphai/nodes/merge_generated_scripts.py +++ b/scrapegraphai/nodes/merge_generated_scripts.py @@ -60,7 +60,6 @@ def execute(self, state: dict) -> dict: user_prompt = input_data[0] scripts = input_data[1] - # merge the scripts in one string scripts_str = "" for i, script in enumerate(scripts): scripts_str += "-----------------------------------\n" @@ -68,16 +67,6 @@ def execute(self, state: dict) -> dict: scripts_str += "-----------------------------------\n" scripts_str += script - # TODO: should we pass the schema to the output parser even if the scripts already have it implemented? 
- - # schema to be used for output parsing - # if self.node_config.get("schema", None) is not None: - # output_schema = JsonOutputParser(pydantic_object=self.node_config["schema"]) - # else: - # output_schema = JsonOutputParser() - - # format_instructions = output_schema.get_format_instructions() - TEMPLATE_MERGE = """ You are a python expert in web scraping and you have just generated multiple scripts to scrape different URLs.\n The scripts are generated based on a user question and the content of the websites.\n @@ -103,4 +92,4 @@ def execute(self, state: dict) -> dict: # Update the state with the generated answer state.update({self.output[0]: answer}) - return state \ No newline at end of file + return state diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py index 3e8ed5ac..240daf1f 100644 --- a/scrapegraphai/nodes/parse_node.py +++ b/scrapegraphai/nodes/parse_node.py @@ -2,10 +2,10 @@ ParseNode Module """ from typing import List, Optional -from semchunk import chunk from langchain_community.document_transformers import Html2TextTransformer from langchain_core.documents import Document from .base_node import BaseNode +from ..utils.split_text_into_chunks import split_text_into_chunks class ParseNode(BaseNode): """ @@ -41,6 +41,9 @@ def __init__( True if node_config is None else node_config.get("parse_html", True) ) + self.llm_model = node_config.get("llm_model") + self.chunk_size = node_config.get("chunk_size") + def execute(self, state: dict) -> dict: """ Executes the node's logic to parse the HTML document content and split it into chunks. 
@@ -65,29 +68,25 @@ def execute(self, state: dict) -> dict: docs_transformed = input_data[0] if self.parse_html: - docs_transformed = Html2TextTransformer().transform_documents(input_data[0]) + docs_transformed = Html2TextTransformer(ignore_links=False).transform_documents(input_data[0]) docs_transformed = docs_transformed[0] - chunks = chunk(text=docs_transformed.page_content, - chunk_size=self.node_config.get("chunk_size", 4096)-250, - token_counter=lambda text: len(text.split()), - memoize=False) + chunks = split_text_into_chunks(text=docs_transformed.page_content, + chunk_size=self.chunk_size-250, model=self.llm_model) else: docs_transformed = docs_transformed[0] - chunk_size = self.node_config.get("chunk_size", 4096) + chunk_size = self.chunk_size chunk_size = min(chunk_size - 500, int(chunk_size * 0.9)) if isinstance(docs_transformed, Document): - chunks = chunk(text=docs_transformed.page_content, - chunk_size=chunk_size, - token_counter=lambda text: len(text.split()), - memoize=False) + chunks = split_text_into_chunks(text=docs_transformed.page_content, + chunk_size=chunk_size, + model=self.llm_model) else: - chunks = chunk(text=docs_transformed, - chunk_size=chunk_size, - token_counter=lambda text: len(text.split()), - memoize=False) + chunks = split_text_into_chunks(text=docs_transformed, + chunk_size=chunk_size, + model=self.llm_model) state.update({self.output[0]: chunks}) diff --git a/scrapegraphai/nodes/robots_node.py b/scrapegraphai/nodes/robots_node.py index 6f9bc352..2bb47e74 100644 --- a/scrapegraphai/nodes/robots_node.py +++ b/scrapegraphai/nodes/robots_node.py @@ -119,7 +119,8 @@ def execute(self, state: dict) -> dict: raise ValueError("The website you selected is not scrapable") else: self.logger.warning( - "\033[33m(WARNING: Scraping this website is not allowed but you decided to force it)\033[0m" + """\033[33m(WARNING: Scraping this website is + not allowed but you decided to force it)\033[0m""" ) else: self.logger.warning("\033[32m(Scraping 
this website is allowed)\033[0m") diff --git a/scrapegraphai/nodes/search_internet_node.py b/scrapegraphai/nodes/search_internet_node.py index df1b6277..14ce3207 100644 --- a/scrapegraphai/nodes/search_internet_node.py +++ b/scrapegraphai/nodes/search_internet_node.py @@ -41,7 +41,11 @@ def __init__( self.verbose = ( False if node_config is None else node_config.get("verbose", False) ) - self.search_engine = node_config.get("search_engine", "google") + self.search_engine = ( + node_config["search_engine"] + if node_config.get("search_engine") + else "google" + ) self.max_results = node_config.get("max_results", 3) def execute(self, state: dict) -> dict: diff --git a/scrapegraphai/nodes/search_link_node.py b/scrapegraphai/nodes/search_link_node.py index 60c3e1aa..034599ea 100644 --- a/scrapegraphai/nodes/search_link_node.py +++ b/scrapegraphai/nodes/search_link_node.py @@ -42,9 +42,7 @@ def __init__( self.llm_model = node_config["llm_model"] - # Apply filters if filter_links is True or if filter_config is provided if node_config.get("filter_links", False) or "filter_config" in node_config: - # Merge provided filter config with default filter config for partial configuration provided_filter_config = node_config.get("filter_config", {}) self.filter_config = {**default_filters.filter_dict, **provided_filter_config} self.filter_links = True @@ -79,7 +77,7 @@ def _is_language_url(self, url): return any(indicator in parsed_url.path.lower() or indicator in query_params for indicator in lang_indicators) def _is_potentially_irrelevant(self, url): if not self.filter_links: - return False # Skip irrelevant URL filtering if filtering is not enabled + return False irrelevant_keywords = self.filter_config.get("irrelevant_keywords", []) return any(keyword in url.lower() for keyword in irrelevant_keywords) @@ -118,7 +116,6 @@ def execute(self, state: dict) -> dict: ): try: - # Primary approach: Regular expression to extract links links = re.findall(r'https?://[^\s"<>\]]+', 
str(chunk.page_content)) if not self.filter_links: @@ -140,7 +137,6 @@ def execute(self, state: dict) -> dict: self.seen_links.update(relevant_links) except Exception as e: - # Fallback approach: Using the LLM to extract links self.logger.error(f"Error extracting links: {e}. Falling back to LLM.") merge_prompt = PromptTemplate( diff --git a/scrapegraphai/utils/__init__.py b/scrapegraphai/utils/__init__.py index 707d2b18..fbd03800 100644 --- a/scrapegraphai/utils/__init__.py +++ b/scrapegraphai/utils/__init__.py @@ -1,7 +1,6 @@ """ __init__.py file for utils folder """ - from .convert_to_csv import convert_to_csv from .convert_to_json import convert_to_json from .prettify_exec_info import prettify_exec_info @@ -11,3 +10,10 @@ from .cleanup_html import cleanup_html from .logging import * from .convert_to_md import convert_to_md +from .screenshot_scraping.screenshot_preparation import (take_screenshot, + select_area_with_opencv, + select_area_with_ipywidget, + crop_image) +from .screenshot_scraping.text_detection import detect_text +from .tokenizer import num_tokens_calculus +from .split_text_into_chunks import split_text_into_chunks diff --git a/scrapegraphai/utils/copy.py b/scrapegraphai/utils/copy.py index 2defbfa3..d6bb839a 100644 --- a/scrapegraphai/utils/copy.py +++ b/scrapegraphai/utils/copy.py @@ -1,3 +1,6 @@ +""" +copy module +""" import copy from typing import Any, Dict, Optional from pydantic.v1 import BaseModel @@ -24,52 +27,36 @@ def safe_deepcopy(obj: Any) -> Any: """ try: - - # Try to use copy.deepcopy first return copy.deepcopy(obj) except (TypeError, AttributeError) as e: - # If deepcopy fails, handle specific types manually - - # Handle dictionaries if isinstance(obj, dict): new_obj = {} - for k, v in obj.items(): new_obj[k] = safe_deepcopy(v) return new_obj - # Handle lists elif isinstance(obj, list): new_obj = [] - for v in obj: new_obj.append(safe_deepcopy(v)) return new_obj - # Handle tuples (immutable, but might contain mutable objects) elif 
isinstance(obj, tuple): new_obj = tuple(safe_deepcopy(v) for v in obj) - return new_obj - # Handle frozensets (immutable, but might contain mutable objects) elif isinstance(obj, frozenset): new_obj = frozenset(safe_deepcopy(v) for v in obj) return new_obj - # Handle objects with attributes elif hasattr(obj, "__dict__"): - # If an object cannot be deep copied, then the sub-properties of \ - # the object will not be analyzed and shallow copy will be used directly. try: return copy.copy(obj) except (TypeError, AttributeError): raise DeepCopyError(f"Cannot deep copy the object of type {type(obj)}") from e - # Attempt shallow copy as a fallback try: return copy.copy(obj) except (TypeError, AttributeError): raise DeepCopyError(f"Cannot deep copy the object of type {type(obj)}") from e - diff --git a/scrapegraphai/utils/parse_state_keys.py b/scrapegraphai/utils/parse_state_keys.py index f4bd2ea5..79de329c 100644 --- a/scrapegraphai/utils/parse_state_keys.py +++ b/scrapegraphai/utils/parse_state_keys.py @@ -3,7 +3,6 @@ """ import re - def parse_expression(expression, state: dict) -> list: """ Parses a complex boolean expression involving state keys. 
@@ -22,7 +21,8 @@ def parse_expression(expression, state: dict) -> list: Example: >>> parse_expression("user_input & (relevant_chunks | parsed_document | document)", - {"user_input": None, "document": None, "parsed_document": None, "relevant_chunks": None}) + {"user_input": None, "document": None, + "parsed_document": None, "relevant_chunks": None}) ['user_input', 'relevant_chunks', 'parsed_document', 'document'] This function evaluates the expression to determine the @@ -69,7 +69,6 @@ def evaluate_simple_expression(exp): return [elem.strip() for elem in and_segment if elem.strip() in state] return [] - # Helper function to evaluate expressions with parentheses def evaluate_expression(expression): while '(' in expression: start = expression.rfind('(') diff --git a/scrapegraphai/utils/prettify_exec_info.py b/scrapegraphai/utils/prettify_exec_info.py index 8cfef81a..07a36e49 100644 --- a/scrapegraphai/utils/prettify_exec_info.py +++ b/scrapegraphai/utils/prettify_exec_info.py @@ -12,10 +12,12 @@ def prettify_exec_info(complete_result: list[dict]) -> pd.DataFrame: complete_result (list[dict]): The complete execution information of the graph. Returns: - pd.DataFrame: A DataFrame that organizes the execution information for better readability and analysis. + pd.DataFrame: A DataFrame that organizes the execution information + for better readability and analysis. Example: - >>> prettify_exec_info([{'node': 'A', 'status': 'success'}, {'node': 'B', 'status': 'failure'}]) + >>> prettify_exec_info([{'node': 'A', 'status': 'success'}, + {'node': 'B', 'status': 'failure'}]) DataFrame with columns 'node' and 'status' showing execution results for each node. 
""" diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py index fe7902d3..0a10c8f2 100644 --- a/scrapegraphai/utils/research_web.py +++ b/scrapegraphai/utils/research_web.py @@ -62,13 +62,12 @@ def search_on_web(query: str, search_engine: str = "Google", url = f"http://localhost:{port}" params = {"q": query, "format": "json"} - # Send the GET request to the server response = requests.get(url, params=params) - # Parse the response and limit to the specified max_results data = response.json() limited_results = data["results"][:max_results] return limited_results else: - raise ValueError("The only search engines available are DuckDuckGo, Google, Bing, or SearXNG") + raise ValueError("""The only search engines available are + DuckDuckGo, Google, Bing, or SearXNG""") diff --git a/scrapegraphai/utils/screenshot_scraping/__init__.py b/scrapegraphai/utils/screenshot_scraping/__init__.py new file mode 100644 index 00000000..20cfb3c0 --- /dev/null +++ b/scrapegraphai/utils/screenshot_scraping/__init__.py @@ -0,0 +1,2 @@ +from .screenshot_preparation import take_screenshot, select_area_with_opencv, select_area_with_ipywidget, crop_image +from .text_detection import detect_text diff --git a/scrapegraphai/utils/screenshot_scraping/screenshot_preparation.py b/scrapegraphai/utils/screenshot_scraping/screenshot_preparation.py new file mode 100644 index 00000000..6bbc562f --- /dev/null +++ b/scrapegraphai/utils/screenshot_scraping/screenshot_preparation.py @@ -0,0 +1,232 @@ +""" +screenshot_preparation module +""" +import asyncio +from io import BytesIO +from playwright.async_api import async_playwright +import numpy as np +from io import BytesIO + +async def take_screenshot(url: str, save_path: str = None, quality: int = 100): + """ + Takes a screenshot of a webpage at the specified URL and saves it if the save_path is specified. + Parameters: + url (str): The URL of the webpage to take a screenshot of. 
+ save_path (str): The path to save the screenshot to. Defaults to None. + quality (int): The quality of the jpeg image, between 1 and 100. Defaults to 100. + Returns: + PIL.Image: The screenshot of the webpage as a PIL Image object. + """ + try: + from PIL import Image + except: + raise ImportError("The dependencies for screenshot scraping are not installed. Please install them using `pip install scrapegraphai[screenshot_scraper]`.") + + async with async_playwright() as p: + browser = await p.chromium.launch(headless=True) + page = await browser.new_page() + await page.goto(url) + image_bytes = await page.screenshot(path=save_path, + type="jpeg", + full_page=True, + quality=quality) + await browser.close() + return Image.open(BytesIO(image_bytes)) + +def select_area_with_opencv(image): + """ + Allows you to manually select an image area using OpenCV. + It is recommended to use this function if your project is on your computer, + otherwise use select_area_with_ipywidget(). + Parameters: + image (PIL.Image): The image from which to select an area. + Returns: + A tuple containing the LEFT, TOP, RIGHT, and BOTTOM coordinates of the selected area. + """ + + try: + import cv2 as cv + from PIL import ImageGrab + except ImportError: + raise ImportError("The dependencies for screenshot scraping are not installed. 
Please install them using `pip install scrapegraphai[screenshot_scraper]`.") + + + fullscreen_screenshot = ImageGrab.grab() + dw, dh = fullscreen_screenshot.size + + def draw_selection_rectanlge(event, x, y, flags, param): + global ix, iy, drawing, overlay, img + if event == cv.EVENT_LBUTTONDOWN: + drawing = True + ix, iy = x, y + elif event == cv.EVENT_MOUSEMOVE: + if drawing == True: + cv.rectangle(img, (ix, iy), (x, y), (41, 215, 162), -1) + cv.putText(img, 'PRESS ANY KEY TO SELECT THIS AREA', (ix, + iy-10), cv.FONT_HERSHEY_SIMPLEX, 1.5, (55, 46, 252), 5) + img = cv.addWeighted(overlay, alpha, img, 1 - alpha, 0) + elif event == cv.EVENT_LBUTTONUP: + global LEFT, TOP, RIGHT, BOTTOM + + drawing = False + if ix < x: + LEFT = int(ix) + RIGHT = int(x) + else: + LEFT = int(x) + RIGHT = int(ix) + if iy < y: + TOP = int(iy) + BOTTOM = int(y) + else: + TOP = int(y) + BOTTOM = int(iy) + + global drawing, ix, iy, overlay, img + drawing = False + ix, iy = -1, -1 + + img = np.array(image) + img = cv.cvtColor(img, cv.COLOR_RGB2BGR) + + img = cv.rectangle( + img, (0, 0), (image.size[0], image.size[1]), (0, 0, 255), 10) + img = cv.putText(img, 'SELECT AN AREA', (int( + image.size[0]*0.3), 100), cv.FONT_HERSHEY_SIMPLEX, 2, (0, 0, 255), 5) + + overlay = img.copy() + alpha = 0.3 + + while True: + cv.namedWindow('SELECT AREA', cv.WINDOW_KEEPRATIO) + cv.setMouseCallback('SELECT AREA', draw_selection_rectanlge) + cv.resizeWindow('SELECT AREA', int( + image.size[0]/(image.size[1]/dh)), dh) + + cv.imshow('SELECT AREA', img) + + if cv.waitKey(20) > -1: + break + + cv.destroyAllWindows() + return LEFT, TOP, RIGHT, BOTTOM + + +def select_area_with_ipywidget(image): + """ + Allows you to manually select an image area using ipywidgets. + It is recommended to use this function if your project is in Google Colab, + Kaggle or other similar platform, otherwise use select_area_with_opencv(). + Parameters: + image (PIL Image): The input image. 
+ Returns: + None + """ + + import matplotlib.pyplot as plt + import numpy as np + try: + from ipywidgets import interact, IntSlider + import ipywidgets as widgets + except: + raise ImportError("The dependencies for screenshot scraping are not installed. Please install them using `pip install scrapegraphai[screenshot_scraper]`.") + + from PIL import Image + + img_array = np.array(image) + + print(img_array.shape) + + def update_plot(top_bottom, left_right, image_size): + plt.figure(figsize=(image_size, image_size)) + plt.imshow(img_array) + plt.axvline(x=left_right[0], color='blue', linewidth=1) + plt.text(left_right[0]+1, -25, 'LEFT', rotation=90, color='blue') + plt.axvline(x=left_right[1], color='red', linewidth=1) + plt.text(left_right[1]+1, -25, 'RIGHT', rotation=90, color='red') + + plt.axhline(y=img_array.shape[0] - + top_bottom[0], color='green', linewidth=1) + plt.text(-100, img_array.shape[0] - + top_bottom[0]+1, 'BOTTOM', color='green') + plt.axhline(y=img_array.shape[0]-top_bottom[1], + color='darkorange', linewidth=1) + plt.text(-100, img_array.shape[0] - + top_bottom[1]+1, 'TOP', color='darkorange') + plt.axis('off') + plt.show() + + top_bottom_slider = widgets.IntRangeSlider( + value=[int(img_array.shape[0]*0.25), int(img_array.shape[0]*0.75)], + min=0, + max=img_array.shape[0], + step=1, + description='top_bottom:', + disabled=False, + continuous_update=True, + orientation='vertical', + readout=True, + readout_format='d', + ) + + left_right_slider = widgets.IntRangeSlider( + value=[int(img_array.shape[1]*0.25), int(img_array.shape[1]*0.75)], + min=0, + max=img_array.shape[1], + step=1, + description='left_right:', + disabled=False, + continuous_update=True, + orientation='horizontal', + readout=True, + readout_format='d', + ) + image_size_bt = widgets.BoundedIntText( + value=10, + min=2, + max=20, + step=1, + description='Image size:', + disabled=False + ) + + interact(update_plot, top_bottom=top_bottom_slider, + left_right=left_right_slider, 
image_size=image_size_bt) + + return left_right_slider, top_bottom_slider + + +def crop_image(image, LEFT=None, TOP=None, RIGHT=None, BOTTOM=None, save_path: str = None): + """ + Crop an image using the specified coordinates. + Parameters: + image (PIL.Image): The image to be cropped. + LEFT (int, optional): The x-coordinate of the left edge of the crop area. Defaults to None. + TOP (int, optional): The y-coordinate of the top edge of the crop area. Defaults to None. + RIGHT (int, optional): The x-coordinate of + the right edge of the crop area. Defaults to None. + BOTTOM (int, optional): The y-coordinate of the bottom edge of the crop area. Defaults to None. + save_path (str, optional): The path to save the cropped image. Defaults to None. + Returns: + PIL.Image: The cropped image. + Notes: + If any of the coordinates (LEFT, TOP, RIGHT, BOTTOM) is None, + it will be set to the corresponding edge of the image. + If save_path is specified, the cropped image will be saved as a JPEG file at the specified path. + """ + + if LEFT is None: + LEFT = 0 + if TOP is None: + TOP = 0 + if RIGHT is None: + RIGHT = image.size[0] + if BOTTOM is None: + BOTTOM = image.size[1] + + croped_image = image.crop((LEFT, TOP, RIGHT, BOTTOM)) + if save_path is not None: + from pathlib import Path + croped_image.save(save_path, "JPEG") + + return image.crop((LEFT, TOP, RIGHT, BOTTOM)) diff --git a/scrapegraphai/utils/screenshot_scraping/text_detection.py b/scrapegraphai/utils/screenshot_scraping/text_detection.py new file mode 100644 index 00000000..16367a21 --- /dev/null +++ b/scrapegraphai/utils/screenshot_scraping/text_detection.py @@ -0,0 +1,34 @@ +""" +text_detection_module +""" + + +def detect_text(image, languages: list = ["en"]): + """ + Detects and extracts text from a given image. + Parameters: + image (PIL Image): The input image to extract text from. + lahguages (list): A list of languages to detect text in. Defaults to ["en"]. 
List of languages can be found here: https://github.com/VikParuchuri/surya/blob/master/surya/languages.py + Returns: + str: The extracted text from the image. + Notes: + Model weights will automatically download the first time you run this function. + """ + + try: + from surya.ocr import run_ocr + from surya.model.detection.model import (load_model as load_det_model, + load_processor as load_det_processor) + from surya.model.recognition.model import load_model as load_rec_model + from surya.model.recognition.processor import load_processor as load_rec_processor + except: + raise ImportError("The dependencies for screenshot scraping are not installed. Please install them using `pip install scrapegraphai[screenshot_scraper]`.") + + + langs = languages + det_processor, det_model = load_det_processor(), load_det_model() + rec_model, rec_processor = load_rec_model(), load_rec_processor() + predictions = run_ocr([image], [langs], det_model, + det_processor, rec_model, rec_processor) + text = "\n".join([line.text for line in predictions[0].text_lines]) + return text \ No newline at end of file diff --git a/scrapegraphai/utils/split_text_into_chunks.py b/scrapegraphai/utils/split_text_into_chunks.py new file mode 100644 index 00000000..73b2856b --- /dev/null +++ b/scrapegraphai/utils/split_text_into_chunks.py @@ -0,0 +1,58 @@ +""" +split_text_into_chunks module +""" +from typing import List +from .tokenizer import num_tokens_calculus # Import the new tokenizing function +from langchain_core.language_models.chat_models import BaseChatModel + +def split_text_into_chunks(text: str, chunk_size: int, model: BaseChatModel, use_semchunk=True) -> List[str]: + """ + Splits the text into chunks based on the number of tokens. + + Args: + text (str): The text to split. + chunk_size (int): The maximum number of tokens per chunk. + + Returns: + List[str]: A list of text chunks. 
+ """ + + if use_semchunk: + from semchunk import chunk + def count_tokens(text): + return num_tokens_calculus(text, model) + + chunk_size = min(chunk_size - 500, int(chunk_size * 0.9)) + + chunks = chunk(text=text, + chunk_size=chunk_size, + token_counter=count_tokens, + memoize=False) + return chunks + + else: + + tokens = num_tokens_calculus(text, model) + + if tokens <= chunk_size: + return [text] + + chunks = [] + current_chunk = [] + current_length = 0 + + words = text.split() + for word in words: + word_tokens = num_tokens_calculus(word, model) + if current_length + word_tokens > chunk_size: + chunks.append(' '.join(current_chunk)) + current_chunk = [word] + current_length = word_tokens + else: + current_chunk.append(word) + current_length += word_tokens + + if current_chunk: + chunks.append(' '.join(current_chunk)) + + return chunks diff --git a/scrapegraphai/utils/sys_dynamic_import.py b/scrapegraphai/utils/sys_dynamic_import.py index 14910b3f..aa45a9b1 100644 --- a/scrapegraphai/utils/sys_dynamic_import.py +++ b/scrapegraphai/utils/sys_dynamic_import.py @@ -3,11 +3,9 @@ source code inspired by https://gist.github.com/DiTo97/46f4b733396b8d7a8f1d4d22db902cfc """ - import sys import typing import importlib.util - if typing.TYPE_CHECKING: import types diff --git a/scrapegraphai/utils/token_calculator.py b/scrapegraphai/utils/token_calculator.py deleted file mode 100644 index c5e5fbbb..00000000 --- a/scrapegraphai/utils/token_calculator.py +++ /dev/null @@ -1,38 +0,0 @@ -""" -Module for truncating in chunks the messages -""" -from typing import List -import tiktoken -from ..helpers.models_tokens import models_tokens - - -def truncate_text_tokens(text: str, model: str, encoding_name: str) -> List[str]: - """ - Truncates text into chunks that are small enough to be processed by specified llm models. - - Args: - text (str): The input text to be truncated. - model (str): The name of the llm model to determine the maximum token limit. 
- encoding_name (str): The encoding strategy used to encode the text before truncation. - - Returns: - List[str]: A list of text chunks, each within the token limit of the specified model. - - Example: - >>> truncate_text_tokens("This is a sample text for truncation.", "GPT-3", "EMBEDDING_ENCODING") - ["This is a sample text", "for truncation."] - - This function ensures that each chunk of text can be tokenized - by the specified model without exceeding the model's token limit. - """ - - encoding = tiktoken.get_encoding(encoding_name) - max_tokens = min(models_tokens[model] - 500, int(models_tokens[model] * 0.9)) - encoded_text = encoding.encode(text) - - chunks = [encoded_text[i:i + max_tokens] - for i in range(0, len(encoded_text), max_tokens)] - - result = [encoding.decode(chunk) for chunk in chunks] - - return result diff --git a/scrapegraphai/utils/tokenizer.py b/scrapegraphai/utils/tokenizer.py new file mode 100644 index 00000000..2e20a244 --- /dev/null +++ b/scrapegraphai/utils/tokenizer.py @@ -0,0 +1,30 @@ +""" +Module for counting tokens and splitting text into chunks +""" +from typing import List +from langchain_openai import ChatOpenAI +from langchain_ollama import ChatOllama +from langchain_mistralai import ChatMistralAI +from langchain_core.language_models.chat_models import BaseChatModel + +def num_tokens_calculus(string: str, llm_model: BaseChatModel) -> int: + """Returns the number of tokens in a text string.""" + + if isinstance(llm_model, ChatOpenAI): + from .tokenizers.tokenizer_openai import num_tokens_openai + num_tokens_fn = num_tokens_openai + + elif isinstance(llm_model, ChatMistralAI): + from .tokenizers.tokenizer_mistral import num_tokens_mistral + num_tokens_fn = num_tokens_mistral + + elif isinstance(llm_model, ChatOllama): + from .tokenizers.tokenizer_ollama import num_tokens_ollama + num_tokens_fn = num_tokens_ollama + + else: + from .tokenizers.tokenizer_openai import num_tokens_openai + num_tokens_fn = num_tokens_openai + + 
num_tokens = num_tokens_fn(string, llm_model) + return num_tokens diff --git a/scrapegraphai/utils/tokenizers/tokenizer_mistral.py b/scrapegraphai/utils/tokenizers/tokenizer_mistral.py new file mode 100644 index 00000000..22dae552 --- /dev/null +++ b/scrapegraphai/utils/tokenizers/tokenizer_mistral.py @@ -0,0 +1,46 @@ +""" +Tokenization utilities for Mistral models +""" +from mistral_common.protocol.instruct.messages import UserMessage +from mistral_common.protocol.instruct.request import ChatCompletionRequest +from mistral_common.protocol.instruct.tool_calls import Function, Tool +from mistral_common.tokens.tokenizers.mistral import MistralTokenizer +from langchain_core.language_models.chat_models import BaseChatModel +from ..logging import get_logger + + +def num_tokens_mistral(text: str, llm_model:BaseChatModel) -> int: + """ + Estimate the number of tokens in a given text using Mistral's tokenization method, + adjusted for different Mistral models. + + Args: + text (str): The text to be tokenized and counted. + llm_model (BaseChatModel): The specific Mistral model to adjust tokenization. + + Returns: + int: The number of tokens in the text. 
+ """ + + logger = get_logger() + + logger.debug(f"Counting tokens for text of {len(text)} characters") + try: + model = llm_model.model + except AttributeError: + raise NotImplementedError(f"The model provider you are using ('{llm_model}') " + "does not give us a model name so we cannot identify which encoding to use") + + tokenizer = MistralTokenizer.from_model(model) + + tokenized = tokenizer.encode_chat_completion( + ChatCompletionRequest( + tools=[], + messages=[ + UserMessage(content=text), + ], + model=model, + ) + ) + tokens = tokenized.tokens + return len(tokens) diff --git a/scrapegraphai/utils/tokenizers/tokenizer_ollama.py b/scrapegraphai/utils/tokenizers/tokenizer_ollama.py new file mode 100644 index 00000000..a981e25c --- /dev/null +++ b/scrapegraphai/utils/tokenizers/tokenizer_ollama.py @@ -0,0 +1,28 @@ +""" +Tokenization utilities for Ollama models +""" +from langchain_core.language_models.chat_models import BaseChatModel +from ..logging import get_logger + +def num_tokens_ollama(text: str, llm_model:BaseChatModel) -> int: + """ + Estimate the number of tokens in a given text using Ollama's tokenization method, + adjusted for different Ollama models. + + Args: + text (str): The text to be tokenized and counted. + llm_model (BaseChatModel): The specific Ollama model to adjust tokenization. + + Returns: + int: The number of tokens in the text. 
+ """ + + logger = get_logger() + + logger.debug(f"Counting tokens for text of {len(text)} characters") + + # Use langchain token count implementation + # NB: https://github.com/ollama/ollama/issues/1716#issuecomment-2074265507 + tokens = llm_model.get_num_tokens(text) + return tokens + diff --git a/scrapegraphai/utils/tokenizers/tokenizer_openai.py b/scrapegraphai/utils/tokenizers/tokenizer_openai.py new file mode 100644 index 00000000..ede53905 --- /dev/null +++ b/scrapegraphai/utils/tokenizers/tokenizer_openai.py @@ -0,0 +1,28 @@ +""" +Tokenization utilities for OpenAI models +""" +import tiktoken +from langchain_core.language_models.chat_models import BaseChatModel +from ..logging import get_logger + +def num_tokens_openai(text: str, llm_model:BaseChatModel) -> int: + """ + Estimate the number of tokens in a given text using OpenAI's tokenization method, + adjusted for different OpenAI models. + + Args: + text (str): The text to be tokenized and counted. + llm_model (BaseChatModel): The specific OpenAI model to adjust tokenization. + + Returns: + int: The number of tokens in the text. 
+ """ + + logger = get_logger() + + logger.debug(f"Counting tokens for text of {len(text)} characters") + + encoding = tiktoken.encoding_for_model("gpt-4") + + num_tokens = len(encoding.encode(text)) + return num_tokens diff --git a/tests/graphs/abstract_graph_test.py b/tests/graphs/abstract_graph_test.py index 60c8ab4c..642868fb 100644 --- a/tests/graphs/abstract_graph_test.py +++ b/tests/graphs/abstract_graph_test.py @@ -10,8 +10,9 @@ ) from scrapegraphai.models import OneApi, DeepSeek from langchain_openai import ChatOpenAI, AzureChatOpenAI -from langchain_community.chat_models import ChatOllama +from langchain_ollama import ChatOllama from langchain_google_genai import ChatGoogleGenerativeAI +from langchain_aws import ChatBedrock @@ -22,7 +23,7 @@ def __init__(self, prompt: str, config: dict): def _create_graph(self) -> BaseGraph: fetch_node = FetchNode( input="url| local_dir", - output=["doc", "link_urls", "img_urls"], + output=["doc"], node_config={ "llm_model": self.llm_model, "force": self.config.get("force", False), @@ -35,6 +36,7 @@ def _create_graph(self) -> BaseGraph: input="doc", output=["parsed_doc"], node_config={ + "llm_model": self.llm_model, "chunk_size": self.model_token } ) @@ -70,6 +72,7 @@ class TestAbstractGraph: ({"model": "ollama/llama2"}, ChatOllama), ({"model": "oneapi/qwen-turbo", "api_key": "oneapi-api-key"}, OneApi), ({"model": "deepseek/deepseek-coder", "api_key": "deepseek-api-key"}, DeepSeek), + ({"model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", "region_name": "IDK"}, ChatBedrock), ]) def test_create_llm(self, llm_config, expected_model): @@ -80,3 +83,17 @@ def test_create_llm_unknown_provider(self): with pytest.raises(ValueError): TestGraph("Test prompt", {"llm": {"model": "unknown_provider/model"}}) + @pytest.mark.parametrize("llm_config, expected_model", [ + ({"model": "openai/gpt-3.5-turbo", "openai_api_key": "sk-randomtest001", "rate_limit": {"requests_per_second": 1}}, ChatOpenAI), + ({"model": 
"azure_openai/gpt-3.5-turbo", "api_key": "random-api-key", "api_version": "no version", "azure_endpoint": "https://www.example.com/", "rate_limit": {"requests_per_second": 1}}, AzureChatOpenAI), + ({"model": "google_genai/gemini-pro", "google_api_key": "google-key-test", "rate_limit": {"requests_per_second": 1}}, ChatGoogleGenerativeAI), + ({"model": "ollama/llama2", "rate_limit": {"requests_per_second": 1}}, ChatOllama), + ({"model": "oneapi/qwen-turbo", "api_key": "oneapi-api-key", "rate_limit": {"requests_per_second": 1}}, OneApi), + ({"model": "deepseek/deepseek-coder", "api_key": "deepseek-api-key", "rate_limit": {"requests_per_second": 1}}, DeepSeek), + ({"model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", "region_name": "IDK", "rate_limit": {"requests_per_second": 1}}, ChatBedrock), + ]) + + + def test_create_llm_with_rate_limit(self, llm_config, expected_model): + graph = TestGraph("Test prompt", {"llm": llm_config}) + assert isinstance(graph.llm_model, expected_model) \ No newline at end of file