From ae6d8a2b50dd04d949968a06d61857f7123a8ccb Mon Sep 17 00:00:00 2001 From: ks6088ts Date: Sat, 27 Jul 2024 15:48:54 +0900 Subject: [PATCH 1/3] add summarize youtube app --- frontend/.env.sample | 4 +- frontend/pages/chat.py | 2 +- frontend/pages/summarize_youtube.py | 139 ++++++++++++++++++++++ frontend/pages/transcription.py | 2 +- poetry.lock | 178 ++++++++-------------------- pyproject.toml | 5 + 6 files changed, 196 insertions(+), 134 deletions(-) create mode 100644 frontend/pages/summarize_youtube.py diff --git a/frontend/.env.sample b/frontend/.env.sample index f101a48..23c33cc 100644 --- a/frontend/.env.sample +++ b/frontend/.env.sample @@ -5,5 +5,5 @@ BACKEND_URL = "http://localhost:8888" AZURE_OPENAI_ENDPOINT = "https://.openai.azure.com" AZURE_OPENAI_API_KEY = "" AZURE_OPENAI_API_VERSION = "2024-05-01-preview" -AZURE_OPENAI_WHISPER_MODEL = "whisper" -AZURE_OPENAI_GPT_MODEL = "gpt-4o" +AZURE_OPENAI_MODEL_WHISPER = "whisper" +AZURE_OPENAI_MODEL_CHAT = "gpt-4o" diff --git a/frontend/pages/chat.py b/frontend/pages/chat.py index e61186a..45074d7 100644 --- a/frontend/pages/chat.py +++ b/frontend/pages/chat.py @@ -37,7 +37,7 @@ def main( st.markdown(prompt) response = client.chat.completions.create( - model=getenv("AZURE_OPENAI_GPT_MODEL"), + model=getenv("AZURE_OPENAI_MODEL_CHAT"), messages=[{"role": m["role"], "content": m["content"]} for m in st.session_state.messages], stream=True, ) diff --git a/frontend/pages/summarize_youtube.py b/frontend/pages/summarize_youtube.py new file mode 100644 index 0000000..6ff093b --- /dev/null +++ b/frontend/pages/summarize_youtube.py @@ -0,0 +1,139 @@ +import logging +import traceback +from os import getenv +from urllib.parse import urlparse + +import streamlit as st +import tiktoken +from dotenv import load_dotenv +from langchain_community.document_loaders import YoutubeLoader # Youtube用 +from langchain_core.output_parsers import StrOutputParser +from langchain_core.prompts import ChatPromptTemplate +from 
langchain_core.runnables import RunnableLambda +from langchain_openai import AzureChatOpenAI +from langchain_text_splitters import RecursiveCharacterTextSplitter + +logger = logging.getLogger(__name__) +load_dotenv() + + +SUMMARIZE_PROMPT = """Please provide a clear 300 word summary of the following content in Japanese. + +======== + +{content} + +======== +""" + + +def init_page(): + st.set_page_config(page_title="Summarize YouTube", page_icon="💻") + st.header("Summarize YouTube") + st.sidebar.title("Options") + + +def select_model(temperature=0): + return AzureChatOpenAI( + temperature=temperature, + api_key=getenv("AZURE_OPENAI_API_KEY"), + api_version=getenv("AZURE_OPENAI_API_VERSION"), + azure_endpoint=getenv("AZURE_OPENAI_ENDPOINT"), + model=getenv("AZURE_OPENAI_MODEL_CHAT"), + ) + + +def init_summarize_chain(): + llm = select_model() + prompt = ChatPromptTemplate.from_messages( + [ + ("user", SUMMARIZE_PROMPT), + ] + ) + output_parser = StrOutputParser() + return prompt | llm | output_parser + + +def init_map_reduce_chain(): + summarize_chain = init_summarize_chain() + + text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder( + model_name="gpt-4o", # hard-coded for now + chunk_size=16000, + chunk_overlap=0, + ) + text_split = RunnableLambda(lambda x: [{"content": doc} for doc in text_splitter.split_text(x["content"])]) + text_concat = RunnableLambda(lambda x: {"content": "\n".join(x)}) + return text_split | summarize_chain.map() | text_concat | summarize_chain + + +def init_chain(): + summarize_chain = init_summarize_chain() + map_reduce_chain = init_map_reduce_chain() + + def route(x): + encoding = tiktoken.encoding_for_model("gpt-4o") + token_count = len(encoding.encode(x["content"])) + if token_count > 16000: + return map_reduce_chain + else: + return summarize_chain + + chain = RunnableLambda(route) + + return chain + + +def validate_url(url): + """URLが有効かどうかを判定する関数""" + try: + result = urlparse(url) + if result.netloc != "www.youtube.com": 
+ return False + if not result.path.startswith("/watch"): + return False + return all([result.scheme, result.netloc]) + except ValueError: + return False + + +def get_content(url): + with st.spinner("Fetching Youtube ..."): + loader = YoutubeLoader.from_youtube_url( + url, + add_video_info=True, # タイトルや再生数も取得できる + language=["en", "ja"], # 英語→日本語の優先順位で字幕を取得 + ) + res = loader.load() # list of `Document` (page_content, metadata) + try: + if res: + content = res[0].page_content + title = res[0].metadata["title"] + return f"Title: {title}\n\n{content}" + else: + return None + except Exception as e: + logger.error(f"An error occurred: {e}") + st.write(traceback.format_exc()) + return None + + +def main(): + init_page() + chain = init_chain() + if url := st.text_input("URL: ", key="input"): + # clear text input + is_valid_url = validate_url(url) + if not is_valid_url: + st.write("Please input valid url") + else: + if content := get_content(url): + st.markdown("## Summary") + st.write_stream(chain.stream({"content": content})) + st.markdown("---") + st.markdown("## Original Text") + st.write(content) + + +if __name__ == "__main__": + main() diff --git a/frontend/pages/transcription.py b/frontend/pages/transcription.py index 533e778..14995f9 100644 --- a/frontend/pages/transcription.py +++ b/frontend/pages/transcription.py @@ -21,7 +21,7 @@ def get_transcription(file_path: str) -> Transcription: return client.audio.transcriptions.create( file=open(file=file_path, mode="rb"), - model=getenv("AZURE_OPENAI_WHISPER_MODEL"), + model=getenv("AZURE_OPENAI_MODEL_WHISPER"), ) diff --git a/poetry.lock b/poetry.lock index 19cb4ad..74e8ee8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1694,19 +1694,19 @@ referencing = ">=0.31.0" [[package]] name = "langchain" -version = "0.2.10" +version = "0.2.11" description = "Building applications with LLMs through composability" optional = false python-versions = "<4.0,>=3.8.1" files = [ - {file = "langchain-0.2.10-py3-none-any.whl", hash = 
"sha256:b4fb58c7faf4f4999cfe3325474979a7121a1737dd101655a723a1d957ef0617"}, - {file = "langchain-0.2.10.tar.gz", hash = "sha256:1f861c1b59ac9c91b02bb0fa58d3adad1c1d0686636872b5b357bbce3ce41d06"}, + {file = "langchain-0.2.11-py3-none-any.whl", hash = "sha256:5a7a8b4918f3d3bebce9b4f23b92d050699e6f7fb97591e8941177cf07a260a2"}, + {file = "langchain-0.2.11.tar.gz", hash = "sha256:d7a9e4165f02dca0bd78addbc2319d5b9286b5d37c51d784124102b57e9fd297"}, ] [package.dependencies] aiohttp = ">=3.8.3,<4.0.0" async-timeout = {version = ">=4.0.0,<5.0.0", markers = "python_version < \"3.11\""} -langchain-core = ">=0.2.22,<0.3.0" +langchain-core = ">=0.2.23,<0.3.0" langchain-text-splitters = ">=0.2.0,<0.3.0" langsmith = ">=0.1.17,<0.2.0" numpy = [ @@ -1721,20 +1721,20 @@ tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<9.0.0" [[package]] name = "langchain-community" -version = "0.2.9" +version = "0.2.10" description = "Community contributed LangChain integrations." optional = false python-versions = "<4.0,>=3.8.1" files = [ - {file = "langchain_community-0.2.9-py3-none-any.whl", hash = "sha256:b51d3adf9346a1161c1098917585b9e303cf24e2f5c71f5d232a0504edada5f2"}, - {file = "langchain_community-0.2.9.tar.gz", hash = "sha256:1e7c180232916cbe35fe00509680dd1f805e32d7c87b5e80b3a9ec8754ecae37"}, + {file = "langchain_community-0.2.10-py3-none-any.whl", hash = "sha256:9f4d1b5ab7f0b0a704f538e26e50fce45a461da6d2bf6b7b636d24f22fbc088a"}, + {file = "langchain_community-0.2.10.tar.gz", hash = "sha256:3a0404bad4bd07d6f86affdb62fb3d080a456c66191754d586a409d9d6024d62"}, ] [package.dependencies] aiohttp = ">=3.8.3,<4.0.0" dataclasses-json = ">=0.5.7,<0.7" langchain = ">=0.2.9,<0.3.0" -langchain-core = ">=0.2.22,<0.3.0" +langchain-core = ">=0.2.23,<0.3.0" langsmith = ">=0.1.0,<0.2.0" numpy = [ {version = ">=1,<2", markers = "python_version < \"3.12\""}, @@ -1747,13 +1747,13 @@ tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<9.0.0" [[package]] name = "langchain-core" -version = "0.2.22" +version = "0.2.24" description = 
"Building applications with LLMs through composability" optional = false python-versions = "<4.0,>=3.8.1" files = [ - {file = "langchain_core-0.2.22-py3-none-any.whl", hash = "sha256:7731a86440c0958b3186c003fb9b26b2d5a682a6344bda7bfb9174e2898f8b43"}, - {file = "langchain_core-0.2.22.tar.gz", hash = "sha256:582d6f929a43b830139444e4124123cd415331ad62f25757b1406252958cdcac"}, + {file = "langchain_core-0.2.24-py3-none-any.whl", hash = "sha256:9444fc082d21ef075d925590a684a73fe1f9688a3d90087580ec929751be55e7"}, + {file = "langchain_core-0.2.24.tar.gz", hash = "sha256:f2e3fa200b124e8c45d270da9bf836bed9c09532612c96ff3225e59b9a232f5a"}, ] [package.dependencies] @@ -1769,17 +1769,17 @@ tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<9.0.0" [[package]] name = "langchain-openai" -version = "0.1.17" +version = "0.1.19" description = "An integration package connecting OpenAI and LangChain" optional = false python-versions = "<4.0,>=3.8.1" files = [ - {file = "langchain_openai-0.1.17-py3-none-any.whl", hash = "sha256:30bef5574ecbbbb91b8025b2dc5a1bd81fd62157d3ad1a35d820141f31c5b443"}, - {file = "langchain_openai-0.1.17.tar.gz", hash = "sha256:c5d70ddecdcb93e146f376bdbadbb6ec69de9ac0f402cd5b83de50b655ba85ee"}, + {file = "langchain_openai-0.1.19-py3-none-any.whl", hash = "sha256:a7a739f1469d54cd988865420e7fc21b50fb93727b2e6da5ad30273fc61ecf19"}, + {file = "langchain_openai-0.1.19.tar.gz", hash = "sha256:3bf342bb302d1444f4abafdf01c467dbd9b248497e1133808c4bae70396c79b3"}, ] [package.dependencies] -langchain-core = ">=0.2.20,<0.3.0" +langchain-core = ">=0.2.24,<0.3.0" openai = ">=1.32.0,<2.0.0" tiktoken = ">=0.7,<1" @@ -3111,25 +3111,6 @@ files = [ {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"}, ] -[[package]] -name = "pydantic" -version = "2.7.4" -description = "Data validation using Python type hints" -optional = false -python-versions = ">=3.8" -files = [ - {file = "pydantic-2.7.4-py3-none-any.whl", hash = 
"sha256:ee8538d41ccb9c0a9ad3e0e5f07bf15ed8015b481ced539a1759d8cc89ae90d0"}, - {file = "pydantic-2.7.4.tar.gz", hash = "sha256:0c84efd9548d545f63ac0060c1e4d39bb9b14db8b3c0652338aecc07b5adec52"}, -] - -[package.dependencies] -annotated-types = ">=0.4.0" -pydantic-core = "2.18.4" -typing-extensions = ">=4.6.1" - -[package.extras] -email = ["email-validator (>=2.0.0)"] - [[package]] name = "pydantic" version = "2.8.2" @@ -3144,102 +3125,14 @@ files = [ [package.dependencies] annotated-types = ">=0.4.0" pydantic-core = "2.20.1" -typing-extensions = {version = ">=4.6.1", markers = "python_version < \"3.13\""} +typing-extensions = [ + {version = ">=4.6.1", markers = "python_version < \"3.13\""}, + {version = ">=4.12.2", markers = "python_version >= \"3.13\""}, +] [package.extras] email = ["email-validator (>=2.0.0)"] -[[package]] -name = "pydantic-core" -version = "2.18.4" -description = "Core functionality for Pydantic validation and serialization" -optional = false -python-versions = ">=3.8" -files = [ - {file = "pydantic_core-2.18.4-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:f76d0ad001edd426b92233d45c746fd08f467d56100fd8f30e9ace4b005266e4"}, - {file = "pydantic_core-2.18.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:59ff3e89f4eaf14050c8022011862df275b552caef8082e37b542b066ce1ff26"}, - {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a55b5b16c839df1070bc113c1f7f94a0af4433fcfa1b41799ce7606e5c79ce0a"}, - {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4d0dcc59664fcb8974b356fe0a18a672d6d7cf9f54746c05f43275fc48636851"}, - {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8951eee36c57cd128f779e641e21eb40bc5073eb28b2d23f33eb0ef14ffb3f5d"}, - {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:4701b19f7e3a06ea655513f7938de6f108123bf7c86bbebb1196eb9bd35cf724"}, - {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e00a3f196329e08e43d99b79b286d60ce46bed10f2280d25a1718399457e06be"}, - {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:97736815b9cc893b2b7f663628e63f436018b75f44854c8027040e05230eeddb"}, - {file = "pydantic_core-2.18.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6891a2ae0e8692679c07728819b6e2b822fb30ca7445f67bbf6509b25a96332c"}, - {file = "pydantic_core-2.18.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bc4ff9805858bd54d1a20efff925ccd89c9d2e7cf4986144b30802bf78091c3e"}, - {file = "pydantic_core-2.18.4-cp310-none-win32.whl", hash = "sha256:1b4de2e51bbcb61fdebd0ab86ef28062704f62c82bbf4addc4e37fa4b00b7cbc"}, - {file = "pydantic_core-2.18.4-cp310-none-win_amd64.whl", hash = "sha256:6a750aec7bf431517a9fd78cb93c97b9b0c496090fee84a47a0d23668976b4b0"}, - {file = "pydantic_core-2.18.4-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:942ba11e7dfb66dc70f9ae66b33452f51ac7bb90676da39a7345e99ffb55402d"}, - {file = "pydantic_core-2.18.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b2ebef0e0b4454320274f5e83a41844c63438fdc874ea40a8b5b4ecb7693f1c4"}, - {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a642295cd0c8df1b86fc3dced1d067874c353a188dc8e0f744626d49e9aa51c4"}, - {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5f09baa656c904807e832cf9cce799c6460c450c4ad80803517032da0cd062e2"}, - {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:98906207f29bc2c459ff64fa007afd10a8c8ac080f7e4d5beff4c97086a3dabd"}, - {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:19894b95aacfa98e7cb093cd7881a0c76f55731efad31073db4521e2b6ff5b7d"}, - {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0fbbdc827fe5e42e4d196c746b890b3d72876bdbf160b0eafe9f0334525119c8"}, - {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f85d05aa0918283cf29a30b547b4df2fbb56b45b135f9e35b6807cb28bc47951"}, - {file = "pydantic_core-2.18.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e85637bc8fe81ddb73fda9e56bab24560bdddfa98aa64f87aaa4e4b6730c23d2"}, - {file = "pydantic_core-2.18.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:2f5966897e5461f818e136b8451d0551a2e77259eb0f73a837027b47dc95dab9"}, - {file = "pydantic_core-2.18.4-cp311-none-win32.whl", hash = "sha256:44c7486a4228413c317952e9d89598bcdfb06399735e49e0f8df643e1ccd0558"}, - {file = "pydantic_core-2.18.4-cp311-none-win_amd64.whl", hash = "sha256:8a7164fe2005d03c64fd3b85649891cd4953a8de53107940bf272500ba8a788b"}, - {file = "pydantic_core-2.18.4-cp311-none-win_arm64.whl", hash = "sha256:4e99bc050fe65c450344421017f98298a97cefc18c53bb2f7b3531eb39bc7805"}, - {file = "pydantic_core-2.18.4-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:6f5c4d41b2771c730ea1c34e458e781b18cc668d194958e0112455fff4e402b2"}, - {file = "pydantic_core-2.18.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2fdf2156aa3d017fddf8aea5adfba9f777db1d6022d392b682d2a8329e087cef"}, - {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4748321b5078216070b151d5271ef3e7cc905ab170bbfd27d5c83ee3ec436695"}, - {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:847a35c4d58721c5dc3dba599878ebbdfd96784f3fb8bb2c356e123bdcd73f34"}, - {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3c40d4eaad41f78e3bbda31b89edc46a3f3dc6e171bf0ecf097ff7a0ffff7cb1"}, - {file = 
"pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:21a5e440dbe315ab9825fcd459b8814bb92b27c974cbc23c3e8baa2b76890077"}, - {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:01dd777215e2aa86dfd664daed5957704b769e726626393438f9c87690ce78c3"}, - {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4b06beb3b3f1479d32befd1f3079cc47b34fa2da62457cdf6c963393340b56e9"}, - {file = "pydantic_core-2.18.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:564d7922e4b13a16b98772441879fcdcbe82ff50daa622d681dd682175ea918c"}, - {file = "pydantic_core-2.18.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:0eb2a4f660fcd8e2b1c90ad566db2b98d7f3f4717c64fe0a83e0adb39766d5b8"}, - {file = "pydantic_core-2.18.4-cp312-none-win32.whl", hash = "sha256:8b8bab4c97248095ae0c4455b5a1cd1cdd96e4e4769306ab19dda135ea4cdb07"}, - {file = "pydantic_core-2.18.4-cp312-none-win_amd64.whl", hash = "sha256:14601cdb733d741b8958224030e2bfe21a4a881fb3dd6fbb21f071cabd48fa0a"}, - {file = "pydantic_core-2.18.4-cp312-none-win_arm64.whl", hash = "sha256:c1322d7dd74713dcc157a2b7898a564ab091ca6c58302d5c7b4c07296e3fd00f"}, - {file = "pydantic_core-2.18.4-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:823be1deb01793da05ecb0484d6c9e20baebb39bd42b5d72636ae9cf8350dbd2"}, - {file = "pydantic_core-2.18.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ebef0dd9bf9b812bf75bda96743f2a6c5734a02092ae7f721c048d156d5fabae"}, - {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae1d6df168efb88d7d522664693607b80b4080be6750c913eefb77e34c12c71a"}, - {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f9899c94762343f2cc2fc64c13e7cae4c3cc65cdfc87dd810a31654c9b7358cc"}, - {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:99457f184ad90235cfe8461c4d70ab7dd2680e28821c29eca00252ba90308c78"}, - {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:18f469a3d2a2fdafe99296a87e8a4c37748b5080a26b806a707f25a902c040a8"}, - {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b7cdf28938ac6b8b49ae5e92f2735056a7ba99c9b110a474473fd71185c1af5d"}, - {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:938cb21650855054dc54dfd9120a851c974f95450f00683399006aa6e8abb057"}, - {file = "pydantic_core-2.18.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:44cd83ab6a51da80fb5adbd9560e26018e2ac7826f9626bc06ca3dc074cd198b"}, - {file = "pydantic_core-2.18.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:972658f4a72d02b8abfa2581d92d59f59897d2e9f7e708fdabe922f9087773af"}, - {file = "pydantic_core-2.18.4-cp38-none-win32.whl", hash = "sha256:1d886dc848e60cb7666f771e406acae54ab279b9f1e4143babc9c2258213daa2"}, - {file = "pydantic_core-2.18.4-cp38-none-win_amd64.whl", hash = "sha256:bb4462bd43c2460774914b8525f79b00f8f407c945d50881568f294c1d9b4443"}, - {file = "pydantic_core-2.18.4-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:44a688331d4a4e2129140a8118479443bd6f1905231138971372fcde37e43528"}, - {file = "pydantic_core-2.18.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a2fdd81edd64342c85ac7cf2753ccae0b79bf2dfa063785503cb85a7d3593223"}, - {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:86110d7e1907ab36691f80b33eb2da87d780f4739ae773e5fc83fb272f88825f"}, - {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:46387e38bd641b3ee5ce247563b60c5ca098da9c56c75c157a05eaa0933ed154"}, - {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:123c3cec203e3f5ac7b000bd82235f1a3eced8665b63d18be751f115588fea30"}, - 
{file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dc1803ac5c32ec324c5261c7209e8f8ce88e83254c4e1aebdc8b0a39f9ddb443"}, - {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:53db086f9f6ab2b4061958d9c276d1dbe3690e8dd727d6abf2321d6cce37fa94"}, - {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:abc267fa9837245cc28ea6929f19fa335f3dc330a35d2e45509b6566dc18be23"}, - {file = "pydantic_core-2.18.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a0d829524aaefdebccb869eed855e2d04c21d2d7479b6cada7ace5448416597b"}, - {file = "pydantic_core-2.18.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:509daade3b8649f80d4e5ff21aa5673e4ebe58590b25fe42fac5f0f52c6f034a"}, - {file = "pydantic_core-2.18.4-cp39-none-win32.whl", hash = "sha256:ca26a1e73c48cfc54c4a76ff78df3727b9d9f4ccc8dbee4ae3f73306a591676d"}, - {file = "pydantic_core-2.18.4-cp39-none-win_amd64.whl", hash = "sha256:c67598100338d5d985db1b3d21f3619ef392e185e71b8d52bceacc4a7771ea7e"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:574d92eac874f7f4db0ca653514d823a0d22e2354359d0759e3f6a406db5d55d"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:1f4d26ceb5eb9eed4af91bebeae4b06c3fb28966ca3a8fb765208cf6b51102ab"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77450e6d20016ec41f43ca4a6c63e9fdde03f0ae3fe90e7c27bdbeaece8b1ed4"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d323a01da91851a4f17bf592faf46149c9169d68430b3146dcba2bb5e5719abc"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:43d447dd2ae072a0065389092a231283f62d960030ecd27565672bd40746c507"}, - {file = 
"pydantic_core-2.18.4-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:578e24f761f3b425834f297b9935e1ce2e30f51400964ce4801002435a1b41ef"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:81b5efb2f126454586d0f40c4d834010979cb80785173d1586df845a632e4e6d"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ab86ce7c8f9bea87b9d12c7f0af71102acbf5ecbc66c17796cff45dae54ef9a5"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:90afc12421df2b1b4dcc975f814e21bc1754640d502a2fbcc6d41e77af5ec312"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:51991a89639a912c17bef4b45c87bd83593aee0437d8102556af4885811d59f5"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:293afe532740370aba8c060882f7d26cfd00c94cae32fd2e212a3a6e3b7bc15e"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b48ece5bde2e768197a2d0f6e925f9d7e3e826f0ad2271120f8144a9db18d5c8"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:eae237477a873ab46e8dd748e515c72c0c804fb380fbe6c85533c7de51f23a8f"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:834b5230b5dfc0c1ec37b2fda433b271cbbc0e507560b5d1588e2cc1148cf1ce"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:e858ac0a25074ba4bce653f9b5d0a85b7456eaddadc0ce82d3878c22489fa4ee"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:2fd41f6eff4c20778d717af1cc50eca52f5afe7805ee530a4fbd0bae284f16e9"}, - {file = "pydantic_core-2.18.4.tar.gz", hash = "sha256:ec3beeada09ff865c344ff3bc2f427f5e6c26401cc6113d77e372c3fdac73864"}, -] - -[package.dependencies] -typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" - [[package]] 
name = "pydantic-core" version = "2.20.1" @@ -3584,6 +3477,17 @@ files = [ [package.extras] dev = ["atomicwrites (==1.4.1)", "attrs (==23.2.0)", "coverage (==7.4.1)", "hatch", "invoke (==2.2.0)", "more-itertools (==10.2.0)", "pbr (==6.0.0)", "pluggy (==1.4.0)", "py (==1.11.0)", "pytest (==8.0.0)", "pytest-cov (==4.1.0)", "pytest-timeout (==2.2.0)", "pyyaml (==6.0.1)", "ruff (==0.2.1)"] +[[package]] +name = "pytube" +version = "15.0.0" +description = "Python 3 library for downloading YouTube Videos." +optional = false +python-versions = ">=3.7" +files = [ + {file = "pytube-15.0.0-py3-none-any.whl", hash = "sha256:07b9904749e213485780d7eb606e5e5b8e4341aa4dccf699160876da00e12d78"}, + {file = "pytube-15.0.0.tar.gz", hash = "sha256:076052efe76f390dfa24b1194ff821d4e86c17d41cb5562f3a276a8bcbfc9d1d"}, +] + [[package]] name = "pytz" version = "2024.1" @@ -4380,13 +4284,13 @@ typing-extensions = ">=3.7.4.3" [[package]] name = "typing-extensions" -version = "4.10.0" +version = "4.12.2" description = "Backported and Experimental Type Hints for Python 3.8+" optional = false python-versions = ">=3.8" files = [ - {file = "typing_extensions-4.10.0-py3-none-any.whl", hash = "sha256:69b1a937c3a517342112fb4c6df7e72fc39a38e7891a5730ed4985b5214b5475"}, - {file = "typing_extensions-4.10.0.tar.gz", hash = "sha256:b0abd7c89e8fb96f98db18d86106ff1d90ab692004eb746cf6eda2682f91b3cb"}, + {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, + {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, ] [[package]] @@ -4984,6 +4888,20 @@ requests = ">=2.31" nospam = ["requests-cache (>=1.0)", "requests-ratelimiter (>=0.3.1)"] repair = ["scipy (>=1.6.3)"] +[[package]] +name = "youtube-transcript-api" +version = "0.6.2" +description = "This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. 
It also works for automatically generated subtitles, supports translating subtitles and it does not require a headless browser, like other selenium based solutions do!" +optional = false +python-versions = "*" +files = [ + {file = "youtube_transcript_api-0.6.2-py3-none-any.whl", hash = "sha256:019dbf265c6a68a0591c513fff25ed5a116ce6525832aefdfb34d4df5567121c"}, + {file = "youtube_transcript_api-0.6.2.tar.gz", hash = "sha256:cad223d7620633cec44f657646bffc8bbc5598bd8e70b1ad2fa8277dec305eb7"}, +] + +[package.dependencies] +requests = "*" + [[package]] name = "zipp" version = "3.18.1" @@ -5002,4 +4920,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "0f740c71ed6ff07bb8ba4dce046aa3839a3d5002fb2c24c63c6bd02820ce2659" +content-hash = "83a459d0ebd65ba4cfd4925a48309599784890f57ddd9d4963c3b37a10eee505" diff --git a/pyproject.toml b/pyproject.toml index 80d9763..7d1e36f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,6 +56,11 @@ microsoft-kiota-serialization-multipart = "^0.1.0" microsoft-kiota-serialization-text = "^1.0.0" streamlit-audiorecorder = "^0.0.5" openai = "^1.30.5" +langchain = "^0.2.11" +langchain-openai = "^0.1.19" +langchain-community = "^0.2.10" +youtube-transcript-api = "^0.6.2" +pytube = "^15.0.0" [tool.poetry.group.azure-functions.dependencies] From 275deed4dc7932314f6fb1db7a7f2bd8fc26aaa8 Mon Sep 17 00:00:00 2001 From: ks6088ts Date: Sat, 27 Jul 2024 16:12:08 +0900 Subject: [PATCH 2/3] add tool agent app --- frontend/.env.sample | 6 ++ frontend/pages/tool_agent.py | 113 +++++++++++++++++++++++++++++++++++ frontend/tools/__init__.py | 0 frontend/tools/search_ddg.py | 54 +++++++++++++++++ poetry.lock | 41 ++++++++++++- pyproject.toml | 1 + 6 files changed, 214 insertions(+), 1 deletion(-) create mode 100644 frontend/pages/tool_agent.py create mode 100644 frontend/tools/__init__.py create mode 100644 frontend/tools/search_ddg.py diff --git 
a/frontend/.env.sample b/frontend/.env.sample index 23c33cc..aff4350 100644 --- a/frontend/.env.sample +++ b/frontend/.env.sample @@ -7,3 +7,9 @@ AZURE_OPENAI_API_KEY = "" AZURE_OPENAI_API_VERSION = "2024-05-01-preview" AZURE_OPENAI_MODEL_WHISPER = "whisper" AZURE_OPENAI_MODEL_CHAT = "gpt-4o" + +# LangSmith +LANGCHAIN_TRACING_V2 = "false" # set to "true" to enable tracing +LANGCHAIN_API_KEY = "" +LANGCHAIN_ENDPOINT = "https://api.smith.langchain.com" +LANGCHAIN_PROJECT = "default" diff --git a/frontend/pages/tool_agent.py b/frontend/pages/tool_agent.py new file mode 100644 index 0000000..9963d5a --- /dev/null +++ b/frontend/pages/tool_agent.py @@ -0,0 +1,113 @@ +import logging +from os import getenv + +import streamlit as st +from dotenv import load_dotenv +from langchain.agents import AgentExecutor, create_tool_calling_agent +from langchain.memory import ConversationBufferWindowMemory +from langchain_community.callbacks import StreamlitCallbackHandler +from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder +from langchain_core.runnables import RunnableConfig +from langchain_openai import AzureChatOpenAI +from tools.search_ddg import search_ddg + +logger = logging.getLogger(__name__) +load_dotenv() + + +CUSTOM_SYSTEM_PROMPT = """ +あなたは、ユーザーのリクエストに基づいてインターネットで調べ物を行うアシスタントです。 +利用可能なツールを使用して、調査した情報を説明してください。 +既に知っていることだけに基づいて答えないでください。回答する前にできる限り検索を行ってください。 +(ユーザーが読むページを指定するなど、特別な場合は、検索する必要はありません。) + +検索結果ページを見ただけでは情報があまりないと思われる場合は、次の2つのオプションを検討して試してみてください。 + +- 検索結果のリンクをクリックして、各ページのコンテンツにアクセスし、読んでみてください。 +- 1ページが長すぎる場合は、3回以上ページ送りしないでください(メモリの負荷がかかるため)。 +- 検索クエリを変更して、新しい検索を実行してください。 +- 検索する内容に応じて検索に利用する言語を適切に変更してください。 +- 例えば、プログラミング関連の質問については英語で検索するのがいいでしょう。 + +ユーザーは非常に忙しく、あなたほど自由ではありません。 +そのため、ユーザーの労力を節約するために、直接的な回答を提供してください。 + +=== 悪い回答の例 === +- これらのページを参照してください。 +- これらのページを参照してコードを書くことができます。 +- 次のページが役立つでしょう。 + +=== 良い回答の例 === +- これはサンプルコードです。 -- サンプルコードをここに -- +- あなたの質問の答えは -- 回答をここに -- + 
+回答の最後には、参照したページのURLを**必ず**記載してください。(これにより、ユーザーは回答を検証することができます) + +ユーザーが使用している言語で回答するようにしてください。 +ユーザーが日本語で質問した場合は、日本語で回答してください。ユーザーがスペイン語で質問した場合は、スペイン語で回答してください。 +""" + + +def init_page(): + st.set_page_config(page_title="Web Browsing Agent", page_icon="🤗") + st.header("Web Browsing Agent 🤗") + st.sidebar.title("Options") + + +def init_messages(): + clear_button = st.sidebar.button("Clear Conversation", key="clear") + if clear_button or "messages" not in st.session_state: + st.session_state.messages = [{"role": "assistant", "content": "Please ask me any questions you may have."}] + st.session_state["memory"] = ConversationBufferWindowMemory( + return_messages=True, memory_key="chat_history", k=10 + ) + + +def create_agent(): + tools = [search_ddg] + prompt = ChatPromptTemplate.from_messages( + [ + ("system", CUSTOM_SYSTEM_PROMPT), + MessagesPlaceholder(variable_name="chat_history"), + ("user", "{input}"), + MessagesPlaceholder(variable_name="agent_scratchpad"), + ] + ) + llm = AzureChatOpenAI( + temperature=0, + api_key=getenv("AZURE_OPENAI_API_KEY"), + api_version=getenv("AZURE_OPENAI_API_VERSION"), + azure_endpoint=getenv("AZURE_OPENAI_ENDPOINT"), + model=getenv("AZURE_OPENAI_MODEL_CHAT"), + ) + agent = create_tool_calling_agent(llm, tools, prompt) + return AgentExecutor( + agent=agent, + tools=tools, + verbose=True, + memory=st.session_state["memory"], + ) + + +def main(): + init_page() + init_messages() + web_browsing_agent = create_agent() + + for msg in st.session_state["memory"].chat_memory.messages: + st.chat_message(msg.type).write(msg.content) + + if prompt := st.chat_input(placeholder="Type your question here..."): + st.chat_message("user").write(prompt) + + with st.chat_message("assistant"): + st_cb = StreamlitCallbackHandler(st.container(), expand_new_thoughts=True) + response = web_browsing_agent.invoke( + {"input": prompt}, + config=RunnableConfig({"callbacks": [st_cb]}), + ) + st.write(response["output"]) + + +if __name__ == "__main__": + main() 
diff --git a/frontend/tools/__init__.py b/frontend/tools/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/frontend/tools/search_ddg.py b/frontend/tools/search_ddg.py new file mode 100644 index 0000000..5ff4265 --- /dev/null +++ b/frontend/tools/search_ddg.py @@ -0,0 +1,54 @@ +# GitHub: https://github.com/naotaka1128/llm_app_codes/chapter_009/tools/search_ddg.py + +from itertools import islice + +from duckduckgo_search import DDGS +from langchain_core.pydantic_v1 import BaseModel, Field +from langchain_core.tools import tool + +""" +Sample Response of DuckDuckGo python library +-------------------------------------------- +[ + { + 'title': '日程・結果|Fifa 女子ワールドカップ オーストラリア&ニュージーランド 2023|なでしこジャパン|日本代表|Jfa|日本サッカー協会', + 'href': 'https://www.jfa.jp/nadeshikojapan/womensworldcup2023/schedule_result/', + 'body': '日程・結果|FIFA 女子ワールドカップ オーストラリア&ニュージーランド 2023|なでしこジャパン|日本代表|JFA|日本サッカー協会. FIFA 女子ワールドカップ. オーストラリア&ニュージーランド 2023.' + }, ... +] +""" # noqa: E501 + + +class SearchDDGInput(BaseModel): + query: str = Field(description="Type the keyword you want to search.") + + +@tool(args_schema=SearchDDGInput) +def search_ddg(query, max_result_num=5): + """ + DuckDuckGo検索を実行するためのツールです。 + 検索したいキーワードを入力して使用してください。 + 検索結果の各ページのタイトル、スニペット(説明文)、URLが返されます。 + このツールから得られる情報は非常に簡素化されており、時には古い情報の場合もあります。 + + 必要な情報が見つからない場合は、必ず `WEB Page Fetcher` ツールを使用して各ページの内容を確認してください。 + 文脈に応じて最も適切な言語を使用してください(ユーザーの言語と同じである必要はありません)。 + 例えば、プログラミング関連の質問では、英語で検索するのが最適です。 + + Returns + ------- + List[Dict[str, str]]: + - title + - snippet + - url + """ + res = DDGS().text( + query, + region="wt-wt", + safesearch="off", + backend="lite", + ) + return [ + {"title": r.get("title", ""), "snippet": r.get("body", ""), "url": r.get("href", "")} + for r in islice(res, max_result_num) + ] diff --git a/poetry.lock b/poetry.lock index 74e8ee8..8fe4f1f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1050,6 +1050,25 @@ files = [ {file = "distro-1.9.0.tar.gz", hash = 
"sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed"}, ] +[[package]] +name = "duckduckgo-search" +version = "6.2.3" +description = "Search for words, documents, images, news, maps and text translation using the DuckDuckGo.com search engine." +optional = false +python-versions = ">=3.8" +files = [ + {file = "duckduckgo_search-6.2.3-py3-none-any.whl", hash = "sha256:07e717dc8a2dc195086a319a6d094979ff0637be4a829b0f001cd6c220fb7490"}, + {file = "duckduckgo_search-6.2.3.tar.gz", hash = "sha256:69e51c88504212c5ce371b3684fa6fa2459ad8f296d2c94759909ae096f05319"}, +] + +[package.dependencies] +click = ">=8.1.7" +pyreqwest-impersonate = ">=0.5.3" + +[package.extras] +dev = ["mypy (>=1.11.0)", "pytest (>=8.3.1)", "pytest-asyncio (>=0.23.8)", "ruff (>=0.5.4)"] +lxml = ["lxml (>=5.2.2)"] + [[package]] name = "exceptiongroup" version = "1.2.0" @@ -3383,6 +3402,26 @@ files = [ {file = "PyMuPDFb-1.24.6.tar.gz", hash = "sha256:f5a40b1732d65a1e519916d698858b9ce7473e23edf9001ddd085c5293d59d30"}, ] +[[package]] +name = "pyreqwest-impersonate" +version = "0.5.3" +description = "HTTP client that can impersonate web browsers, mimicking their headers and `TLS/JA3/JA4/HTTP2` fingerprints" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pyreqwest_impersonate-0.5.3-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:f15922496f728769fb9e1b116d5d9d7ba5525d0f2f7a76a41a1daef8b2e0c6c3"}, + {file = "pyreqwest_impersonate-0.5.3-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:77533133ae73020e59bc56d776eea3fe3af4ac41d763a89f39c495436da0f4cf"}, + {file = "pyreqwest_impersonate-0.5.3-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:436055fa3eeb3e01e2e8efd42a9f6c4ab62fd643eddc7c66d0e671b71605f273"}, + {file = "pyreqwest_impersonate-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e9d2e981a525fb72c1521f454e5581d2c7a3b1fcf1c97c0acfcb7a923d8cf3e"}, + {file = 
"pyreqwest_impersonate-0.5.3-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:a6bf986d4a165f6976b3e862111e2a46091883cb55e9e6325150f5aea2644229"}, + {file = "pyreqwest_impersonate-0.5.3-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b7397f6dad3d5ae158e0b272cb3eafe8382e71775d829b286ae9c21cb5a879ff"}, + {file = "pyreqwest_impersonate-0.5.3-cp38-abi3-win_amd64.whl", hash = "sha256:6026e4751b5912aec1e45238c07daf1e2c9126b3b32b33396b72885021e8990c"}, + {file = "pyreqwest_impersonate-0.5.3.tar.gz", hash = "sha256:f21c10609958ff5be18df0c329eed42d2b3ba8a339b65dc5f96ab74537231692"}, +] + +[package.extras] +dev = ["pytest (>=8.1.1)"] + [[package]] name = "pysocks" version = "1.7.1" @@ -4920,4 +4959,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "83a459d0ebd65ba4cfd4925a48309599784890f57ddd9d4963c3b37a10eee505" +content-hash = "d100c32bbf61872d13e085fae62f21b705d5a8274d4e0a6c1292929061ade66c" diff --git a/pyproject.toml b/pyproject.toml index 7d1e36f..78fb067 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,6 +61,7 @@ langchain-openai = "^0.1.19" langchain-community = "^0.2.10" youtube-transcript-api = "^0.6.2" pytube = "^15.0.0" +duckduckgo-search = "^6.2.3" [tool.poetry.group.azure-functions.dependencies] From 27e686083258e52706c0433bc31bafb0a5ed0dd9 Mon Sep 17 00:00:00 2001 From: ks6088ts Date: Sat, 27 Jul 2024 17:53:20 +0900 Subject: [PATCH 3/3] add contoso rules tool --- frontend/.env.sample | 5 ++ frontend/pages/tool_agent.py | 6 +- frontend/tools/fetch_contoso_rules.py | 88 +++++++++++++++++++++++++++ poetry.lock | 2 +- pyproject.toml | 1 + 5 files changed, 100 insertions(+), 2 deletions(-) create mode 100644 frontend/tools/fetch_contoso_rules.py diff --git a/frontend/.env.sample b/frontend/.env.sample index aff4350..4bfec8c 100644 --- a/frontend/.env.sample +++ b/frontend/.env.sample @@ -7,6 +7,11 @@ AZURE_OPENAI_API_KEY = "" 
AZURE_OPENAI_API_VERSION = "2024-05-01-preview" AZURE_OPENAI_MODEL_WHISPER = "whisper" AZURE_OPENAI_MODEL_CHAT = "gpt-4o" +AZURE_OPENAI_MODEL_EMBEDDING = "text-embedding-3-large" + +# Azure AI Search +AZURE_AI_SEARCH_ENDPOINT = "https://.search.windows.net" +AZURE_AI_SEARCH_API_KEY = "" # LangSmith LANGCHAIN_TRACING_V2 = "false" # set to "true" to enable tracing diff --git a/frontend/pages/tool_agent.py b/frontend/pages/tool_agent.py index 9963d5a..f10ec7f 100644 --- a/frontend/pages/tool_agent.py +++ b/frontend/pages/tool_agent.py @@ -9,6 +9,7 @@ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder from langchain_core.runnables import RunnableConfig from langchain_openai import AzureChatOpenAI +from tools.fetch_contoso_rules import fetch_contoso_rules from tools.search_ddg import search_ddg logger = logging.getLogger(__name__) @@ -64,7 +65,10 @@ def init_messages(): def create_agent(): - tools = [search_ddg] + tools = [ + search_ddg, + fetch_contoso_rules, + ] prompt = ChatPromptTemplate.from_messages( [ ("system", CUSTOM_SYSTEM_PROMPT),
diff --git a/frontend/tools/fetch_contoso_rules.py b/frontend/tools/fetch_contoso_rules.py
new file mode 100644
index 0000000..33915d6
--- /dev/null
+++ b/frontend/tools/fetch_contoso_rules.py
@@ -0,0 +1,88 @@
+# GitHub: https://github.com/naotaka1128/llm_app_codes/chapter_010/tools/fetch_qa_content.py
+
+from os import getenv
+
+from langchain_community.vectorstores.azuresearch import AzureSearch
+from langchain_core.pydantic_v1 import BaseModel, Field
+from langchain_core.tools import tool
+from langchain_openai import AzureOpenAIEmbeddings
+
+
+class FetchContentInput(BaseModel):  # Argument schema for fetch_contoso_rules: a single `query` string.
+    """型を指定するためのクラス"""  # English: "Class used to specify the argument types."
+
+    query: str = Field()
+
+
+def get_embeddings():  # Build the Azure OpenAI embeddings client from environment variables.
+    return AzureOpenAIEmbeddings(
+        api_key=getenv("AZURE_OPENAI_API_KEY"),
+        api_version=getenv("AZURE_OPENAI_API_VERSION"),
+        azure_endpoint=getenv("AZURE_OPENAI_ENDPOINT"),
+        azure_deployment=getenv("AZURE_OPENAI_MODEL_EMBEDDING"),  # Deployment name; .env.sample uses "text-embedding-3-large".
+    )
+
+
+def create_azure_search(index_name: str) -> AzureSearch:  # Build an AzureSearch vector store for the given index name.
+    return AzureSearch(
+        azure_search_endpoint=getenv("AZURE_AI_SEARCH_ENDPOINT"),
+        azure_search_key=getenv("AZURE_AI_SEARCH_API_KEY"),
+        index_name=index_name,
+        embedding_function=get_embeddings().embed_query,  # Bound method of a freshly built embeddings client.
+        additional_search_client_options={"retry_total": 4},  # presumably forwarded to the Azure search client as its retry limit — confirm.
+    )
+
+
+@tool(args_schema=FetchContentInput)
+def fetch_contoso_rules(query):  # Return up to 3 passages from the "contoso_rules" index together with their relevance scores.
+    """
+    Contoso 社の就業規則情報から、関連するコンテンツを見つけるツールです。
+    Contoso 社に関する具体的な知識を得るのに役立ちます。
+
+    このツールは `similarity`(類似度)と `content`(コンテンツ)を返します。
+    - 'similarity'は、回答が質問にどの程度関連しているかを示します。
+      値が高いほど、質問との関連性が高いことを意味します。
+      'similarity'値が0.5未満のドキュメントは返されません。
+    - 'content'は、質問に対する回答のテキストを提供します。
+      通常、よくある質問とその対応する回答で構成されています。
+
+    空のリストが返された場合、ユーザーの質問に対する回答が見つからなかったことを意味します。
+    その場合、ユーザーに質問内容を明確にしてもらうのが良いでしょう。
+
+    Returns
+    -------
+    List[Dict[str, Any]]:
+    - page_content
+    - similarity: float
+    - content: str
+    """  # NOTE(review): docstring kept verbatim (presumably used by @tool as the tool description — confirm); its Returns section lists `page_content`, but the dicts built below only carry `similarity` and `content`.
+    db = create_azure_search("contoso_rules")  # A new vector-store object is constructed on every call.
+    docs = db.similarity_search_with_relevance_scores(
+        query=query,
+        k=3,
+        score_threshold=0.5,  # Same 0.5 cut-off that the docstring states.
+    )
+    return [
+        {
+            "similarity": similarity,
+            "content": i.page_content,
+        }
+        for i, similarity in docs  # Each element is unpacked as a (document, score) pair.
+    ]
+
+
+if __name__ == "__main__":  # Manual smoke test: run one query and print each hit.
+    import logging
+
+    from dotenv import load_dotenv
+
+    logging.basicConfig(
+        format="[%(asctime)s] %(levelname)7s from %(name)s in %(pathname)s:%(lineno)d: " "%(message)s",
+        level=logging.DEBUG,
+        force=True,
+    )
+
+    load_dotenv()
+    docs = fetch_contoso_rules("ドレスコード")  # "ドレスコード" = "dress code". NOTE(review): this calls the @tool object directly; recent langchain-core appears to prefer .invoke() — confirm.
+    for doc in docs:
+        print(doc)
diff --git a/poetry.lock b/poetry.lock index 8fe4f1f..ff3f694 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4959,4 +4959,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "d100c32bbf61872d13e085fae62f21b705d5a8274d4e0a6c1292929061ade66c" +content-hash = "f9e223a78c18c925a1e57b5d8cb646b707a2a7497c838c2110eb95c1aebc431a" diff --git a/pyproject.toml b/pyproject.toml index 78fb067..eb98bb9 100644
--- a/pyproject.toml +++ b/pyproject.toml @@ -62,6 +62,7 @@ langchain-community = "^0.2.10" youtube-transcript-api = "^0.6.2" pytube = "^15.0.0" duckduckgo-search = "^6.2.3" +azure-search-documents = "^11.5.0" [tool.poetry.group.azure-functions.dependencies]