Skip to content

Lightweighting library #573

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Aug 23, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 12 additions & 16 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,12 +1,7 @@
[project]
name = "scrapegraphai"


version = "1.14.1b1"


description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."

authors = [
{ name = "Marco Vinciguerra", email = "mvincig11@gmail.com" },
{ name = "Marco Perini", email = "perinim.98@gmail.com" },
Expand All @@ -15,32 +10,23 @@ authors = [

dependencies = [
"langchain>=0.2.14",
"langchain-fireworks>=0.1.3",
"langchain_community>=0.2.9",
"langchain-google-genai>=1.0.7",
"langchain-google-vertexai>=1.0.7",
"langchain-openai>=0.1.22",
"langchain-groq>=0.1.3",
"langchain-aws>=0.1.3",
"langchain-anthropic>=0.1.11",
"langchain-mistralai>=0.1.12",
"langchain-huggingface>=0.0.3",
"langchain-nvidia-ai-endpoints>=0.1.6",
"langchain_community>=0.2.9",
"langchain-aws>=0.1.3",
"html2text>=2024.2.26",
"faiss-cpu>=1.8.0",
"beautifulsoup4>=4.12.3",
"pandas>=2.2.2",
"python-dotenv>=1.0.1",
"tiktoken>=0.7",
"tqdm>=4.66.4",
"graphviz>=0.20.3",
"minify-html>=0.15.0",
"free-proxy>=1.1.1",
"playwright>=1.43.0",
"google>=3.0.0",
"undetected-playwright>=0.3.0",
"semchunk>=1.0.1",
"browserbase>=0.3.0",
]

license = "MIT"
Expand Down Expand Up @@ -78,6 +64,16 @@ requires-python = ">=3.9,<4.0"
[project.optional-dependencies]
burr = ["burr[start]==0.22.1"]
docs = ["sphinx==6.0", "furo==2024.5.6"]
other = [
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

split them in three groups of extras, one for "other" language models, one for more semantic options, one for more browser options

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok @DiTo97, I updated

"graphviz>=0.20.3",
"langchain-fireworks>=0.1.3",
"langchain-groq>=0.1.3",
"langchain-anthropic>=0.1.11",
"langchain-huggingface>=0.0.3",
"langchain-nvidia-ai-endpoints>=0.1.6",
"semchunk>=1.0.1",
"browserbase>=0.3.0",
]

[build-system]
requires = ["hatchling"]
Expand Down
14 changes: 8 additions & 6 deletions scrapegraphai/graphs/abstract_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@
import uuid
import warnings
from pydantic import BaseModel
from langchain_community.chat_models import ErnieBotChat
from langchain_nvidia_ai_endpoints import ChatNVIDIA
from langchain.chat_models import init_chat_model
from ..helpers import models_tokens
from ..models import (
Expand Down Expand Up @@ -147,16 +145,17 @@ def handle_model(model_name, provider, token_key, default_token=8192):
warnings.simplefilter("ignore")
return init_chat_model(**llm_params)

known_models = ["chatgpt","gpt","openai", "azure_openai", "google_genai", "ollama", "oneapi", "nvidia", "groq", "google_vertexai", "bedrock", "mistralai", "hugging_face", "deepseek", "ernie", "fireworks"]
known_models = ["chatgpt","gpt","openai", "azure_openai", "google_genai",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

known_models = {...}, not square brackets

"ollama", "oneapi", "nvidia", "groq", "google_vertexai",
"bedrock", "mistralai", "hugging_face", "deepseek", "ernie", "fireworks"]

if llm_params["model"].split("/")[0] not in known_models and llm_params["model"].split("-")[0] not in known_models:
raise ValueError(f"Model '{llm_params['model']}' is not supported")

try:
if "azure" in llm_params["model"]:
model_name = llm_params["model"].split("/")[-1]
return handle_model(model_name, "azure_openai", model_name)

return handle_model(model_name, "azure_openai", model_name)
if "fireworks" in llm_params["model"]:
model_name = "/".join(llm_params["model"].split("/")[1:])
token_key = llm_params["model"].split("/")[-1]
Expand Down Expand Up @@ -188,7 +187,6 @@ def handle_model(model_name, provider, token_key, default_token=8192):
model_name = llm_params["model"].split("/")[-1]
return handle_model(model_name, "mistralai", model_name)

# Instantiate the language model based on the model name (models that do not use the common interface)
elif "deepseek" in llm_params["model"]:
try:
self.model_token = models_tokens["deepseek"][llm_params["model"]]
Expand All @@ -198,6 +196,8 @@ def handle_model(model_name, provider, token_key, default_token=8192):
return DeepSeek(llm_params)

elif "ernie" in llm_params["model"]:
from langchain_community.chat_models import ErnieBotChat

try:
self.model_token = models_tokens["ernie"][llm_params["model"]]
except KeyError:
Expand All @@ -215,6 +215,8 @@ def handle_model(model_name, provider, token_key, default_token=8192):
return OneApi(llm_params)

elif "nvidia" in llm_params["model"]:
from langchain_nvidia_ai_endpoints import ChatNVIDIA

try:
self.model_token = models_tokens["nvidia"][llm_params["model"].split("/")[-1]]
llm_params["model"] = "/".join(llm_params["model"].split("/")[1:])
Expand Down
3 changes: 2 additions & 1 deletion scrapegraphai/nodes/fetch_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from langchain_core.documents import Document
from ..utils.cleanup_html import cleanup_html
from ..docloaders import ChromiumLoader
from ..docloaders.browser_base import browser_base_fetch
from ..utils.convert_to_md import convert_to_md
from ..utils.logging import get_logger
from .base_node import BaseNode
Expand Down Expand Up @@ -269,6 +268,8 @@ def handle_web_source(self, state, source):
loader_kwargs = self.node_config.get("loader_kwargs", {})

if self.browser_base is not None:
from ..docloaders.browser_base import browser_base_fetch

data = browser_base_fetch(self.browser_base.get("api_key"),
self.browser_base.get("project_id"), [source])

Expand Down
Loading