diff --git a/CHANGELOG.md b/CHANGELOG.md index 7cdea914..ccb58c7b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,22 +1,25 @@ + ## [1.4.0-beta.2](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.4.0-beta.1...v1.4.0-beta.2) (2024-05-19) ### Features -* **docloaders:** undetected-playwright ([7b3ee4e](https://github.com/VinciGit00/Scrapegraph-ai/commit/7b3ee4e71e4af04edeb47999d70d398b67c93ac4)) +* Add new models and update existing ones ([58289ec](https://github.com/VinciGit00/Scrapegraph-ai/commit/58289eccc523814a2898650c41410f9a35b4e4c2)) -## [1.4.0-beta.1](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.3.0...v1.4.0-beta.1) (2024-05-19) +## [1.3.2](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.3.1...v1.3.2) (2024-05-22) -### Features +### Bug Fixes -* **base_graph:** alligned with main ([73fa31d](https://github.com/VinciGit00/Scrapegraph-ai/commit/73fa31db0f791d1fd63b489ac88cc6e595aa07f9)) +* pdf scraper bug ([f2dffe5](https://github.com/VinciGit00/Scrapegraph-ai/commit/f2dffe534f51aa83aed5ac491243604a443f4373)) +## [1.3.1](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.3.0...v1.3.1) (2024-05-21) -### CI -* **release:** 1.2.0-beta.1 [skip ci] ([fd3e0aa](https://github.com/VinciGit00/Scrapegraph-ai/commit/fd3e0aa5823509dfb46b4f597521c24d4eb345f1)) -* **release:** 1.3.0-beta.1 [skip ci] ([191db0b](https://github.com/VinciGit00/Scrapegraph-ai/commit/191db0bc779e4913713b47b68ec4162a347da3ea)) +### Bug Fixes + +* add deepseek embeddings ([659fad7](https://github.com/VinciGit00/Scrapegraph-ai/commit/659fad770a5b6ace87511513e5233a3bc1269009)) + ## [1.3.0](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.2.4...v1.3.0) (2024-05-19) diff --git a/README.md b/README.md index e0c13e96..00eb0540 100644 --- a/README.md +++ b/README.md @@ -180,9 +180,14 @@ Wanna visualize the roadmap in a more interactive way? Check out the [markmap](h ## ❤️ Contributors [![Contributors](https://contrib.rocks/image?repo=VinciGit00/Scrapegraph-ai)](https://github.com/VinciGit00/Scrapegraph-ai/graphs/contributors) ## Sponsors -

- SerpAPI -

+
+ + SerpAPI + + + Stats + +
## 🎓 Citations If you have used our library for research purposes please quote us with the following reference: diff --git a/docs/assets/transparent_stat.png b/docs/assets/transparent_stat.png new file mode 100644 index 00000000..d4f3c5fc Binary files /dev/null and b/docs/assets/transparent_stat.png differ diff --git a/pyproject.toml b/pyproject.toml index 21cb3e59..8b51660e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,10 @@ [project] name = "scrapegraphai" + version = "1.4.0b2" + description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [ { name = "Marco Vinciguerra", email = "mvincig11@gmail.com" }, diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index b923c89d..0377506a 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -7,6 +7,8 @@ from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings from langchain_community.embeddings import HuggingFaceHubEmbeddings, OllamaEmbeddings from langchain_google_genai import GoogleGenerativeAIEmbeddings +from ..helpers import models_tokens +from ..models import AzureOpenAI, Bedrock, Gemini, Groq, HuggingFace, Ollama, OpenAI, Anthropic, DeepSeek from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings from ..helpers import models_tokens @@ -169,7 +171,7 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: raise KeyError("Model not supported") from exc return Anthropic(llm_params) elif "ollama" in llm_params["model"]: - llm_params["model"] = llm_params["model"].split("/")[-1] + llm_params["model"] = llm_params["model"].split("ollama/")[-1] # allow user to set model_tokens in config try: @@ -243,6 +245,8 @@ def _create_default_embedder(self, llm_config=None) -> object: model="models/embedding-001") if isinstance(self.llm_model, OpenAI): return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key) + elif isinstance(self.llm_model, DeepSeek): + return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key) elif isinstance(self.llm_model, AzureOpenAIEmbeddings): return self.llm_model elif isinstance(self.llm_model, AzureOpenAI): @@ -283,7 +287,7 @@ def _create_embedder(self, embedder_config: dict) -> object: elif "azure" in embedder_config["model"]: return AzureOpenAIEmbeddings() elif "ollama" in embedder_config["model"]: - embedder_config["model"] = embedder_config["model"].split("/")[-1] + embedder_config["model"] = embedder_config["model"].split("ollama/")[-1] try: models_tokens["ollama"][embedder_config["model"]] except KeyError as exc: diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py index 934bf5fe..18be7bf8 100644 --- a/scrapegraphai/helpers/models_tokens.py +++ b/scrapegraphai/helpers/models_tokens.py @@ -33,9 +33,18 @@ }, "ollama": { + "command-r": 12800, + "command-r-plus": 12800, + "codellama": 16000, + "dbrx": 32768, + "dbrx:instruct": 32768, + "deepseek-coder:33b": 16000, + "dolphin-mixtral": 32000, "llama2": 4096, "llama3": 8192, + "llama3:70b-instruct": 8192, "llava": 4096, + "llava:34b": 4096, "llava_next": 4096, "mistral": 8192, "falcon": 2048, @@ -46,13 +55,21 @@ "command-r-plus": 12800, "command-r": 12800, "mistral:7b-instruct": 32768, - "llama3:70b-instruct": 8192, + "mistral-openorca": 32000, "mixtral:8x22b-instruct": 65536, - "wizardlm2:8x22b": 65536, - "dbrx": 32768, - "dbrx:instruct": 32768, "nous-hermes2:34b": 4096, "orca-mini": 2048, + "phi3:3.8b": 12800, + "phi3:14b": 12800, + "qwen:0.5b": 32000, + "qwen:1.8b": 32000, + "qwen:4b": 32000, + "qwen:14b": 32000, + "qwen:32b": 32000, + "qwen:72b": 32000, + "qwen:110b": 32000, + "stablelm-zephyr": 8192, + "wizardlm2:8x22b": 65536, # embedding models "nomic-embed-text": 8192, "snowflake-arctic-embed:335m": 8192, diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index 6528f098..6c9858c9 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -86,13 +86,14 @@ def execute(self, state): input_keys[0] == "json_dir" or input_keys[0] == "xml_dir" or input_keys[0] == "csv_dir" + or input_keys[0] == "pdf_dir" ): compressed_document = [ Document(page_content=source, metadata={"source": "local_dir"}) ] state.update({self.output[0]: compressed_document}) return state - + # handling for pdf elif input_keys[0] == "pdf": loader = PyPDFLoader(source) @@ -108,7 +109,7 @@ def execute(self, state): ] state.update({self.output[0]: compressed_document}) return state - + elif input_keys[0] == "json": f = open(source) compressed_document = [