Parse a PDF from a URL link (fetched by crawling) instead of only uploading a local file

Lin-jun-xiang · Lin-jun-xiang · commit 63ff0d6d2b0a · 2023-08-30T10:54:15.000+08:00
diff --git a/README.md b/README.md
@@ -62,6 +62,15 @@ If you like this project, please give it a ⭐`Star` to support the developers~
 
 ---
 
+### 🧨Features
+
+- **`gpt4free` Integration**: Everyone can use `docGPT` for **free** without needing an OpenAI API key.
+- **Direct PDF URL Input**: Users can input PDF `URL` links for parsing without uploading `.pdf` files.
+- **Langchain Agent**: Enables the AI to answer questions about current information, providing Google search-like functionality.
+- **User-Friendly Environment**: Easy-to-use interface for simple operations.
+
+---
+
 ### 🦜️What's LangChain?
 
 * LangChain is a framework for developing applications powered by language models. It supports the following applications:
@@ -101,7 +110,9 @@ Through LangChain, you can create a universal AI model or tailor it for business
    - `OpenAI API KEY`: Ensure you have available usage.
    - `SERPAPI API KEY`: Required if you want to query content not present in the PDF.
 
-3. 📁Upload a PDF file from local storage.
+3. 📁Upload a PDF file (choose one method)
+    * Method 1: Browse and upload your own `.pdf` file from your local machine.
+    * Method 2: Enter the PDF `URL` link directly.
 
 4. 🚀Start asking questions!
 
diff --git a/README.zh-TW.md b/README.zh-TW.md
@@ -65,6 +65,15 @@
 
 ---
 
+### 🧨Features
+
+- **`gpt4free` 整合**：任何人都可以免費使用 GPT4，無需輸入 OpenAI API 金鑰。
+- **直接輸入 PDF 網址**：使用者可以直接輸入 PDF 網址進行解析，無需上傳 .pdf 檔案。
+- **Langchain Agent**：AI 能夠回答當前問題，實現類似 Google 搜尋功能。
+- **簡易操作環境**：友善的界面，操作簡便
+
+---
+
 ### 🦜️What's LangChain?
 
 * LangChain 是一個用於**開發由語言模型支持的應用程序的框架**。它支持以下應用程序
@@ -105,7 +114,10 @@ LangChain 填補了 ChatGPT 的不足之處。通過以下示例，您可以理
     * `OpenAI API KEY`: 確保還有可用的使用次數。
     * `SERPAPI API KEY`: 如果您要查詢 PDF 中不存在的內容，則需要使用此金鑰。
 
-3. 📁上傳來自本地的 PDF 檔案
+3. 📁上傳來自本地的 PDF 檔案 (選擇一個方法)
+    * 方法一: 從本地機瀏覽並上傳自己的 `.pdf` 檔
+    * 方法二: 輸入 PDF URL 連結
+
 4. 🚀開始提問 ! 
 
 ![RGB_cleanup](https://github.com/Lin-jun-xiang/docGPT-streamlit/blob/main/img/docGPT.gif?raw=true)
diff --git a/app.py b/app.py
@@ -42,7 +42,9 @@ def theme() -> None:
                 1. Enter your API keys: (You can choose to skip it and use the `gpt4free` free model)
                     * `OpenAI API Key`: Make sure you still have usage left
                     * `SERPAPI API Key`: Optional. If you want to ask questions about content not appearing in the PDF document, you need this key.
-                2. Upload a PDF file from your local machine.
+                2. Upload a PDF file (choose one method):
+                    * method1: Browse and upload your own `.pdf` file from your local machine.
+                    * method2: Enter the PDF `URL` link directly.
                 3. Start asking questions!
                 4. More details.(https://github.com/Lin-jun-xiang/docGPT-streamlit)
                 5. If you have any questions, feel free to leave comments and engage in discussions.(https://github.com/Lin-jun-xiang/docGPT-streamlit/issues)
@@ -92,10 +94,30 @@ def load_api_key() -> None:
 
 
 def upload_and_process_pdf() -> list:
-    upload_file = st.file_uploader('#### Upload a PDF file:', type='pdf')
+    st.write('#### Upload a PDF file:')
+    browse, url_link = st.tabs(
+        ['Drag and drop file (Browse files)', 'Enter PDF URL link']
+    )
+    with browse:
+        upload_file = st.file_uploader(
+            'Browse file',
+            type='pdf',
+            label_visibility='hidden'
+        )
+        upload_file = upload_file.read() if upload_file else None
+
+    with url_link:
+        pdf_url = st.text_input(
+            "Enter PDF URL Link",
+            placeholder='https://www.xxx/uploads/file.pdf',
+            label_visibility='hidden'
+        )
+        if pdf_url:
+            upload_file = PDFLoader.crawl_pdf_file(pdf_url)
+
     if upload_file:
         temp_file = tempfile.NamedTemporaryFile(delete=False)
-        temp_file.write(upload_file.read())
+        temp_file.write(upload_file)
         temp_file_path = temp_file.name
 
         docs = PDFLoader.load_documents(temp_file_path)
diff --git a/model/data_connection.py b/model/data_connection.py
@@ -1,9 +1,10 @@
-import json
 import os
 from typing import Iterator
 
+import requests
 from langchain.document_loaders import PyMuPDFLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+import streamlit as st
 
 
 class PDFLoader:
@@ -35,3 +36,26 @@ def split_documents(
         )
 
         return splitter.split_documents(document)
+
+    @staticmethod
+    def crawl_pdf_file(url: str) -> 'bytes | None':
+        """Download the PDF at ``url`` and return its raw bytes.
+
+        Shows a Streamlit warning and returns ``None`` when the request
+        fails, the status is not 200, or the response's content type does
+        not mention ``pdf``.
+        """
+        try:
+            # A timeout keeps the app from hanging on an unresponsive host.
+            response = requests.get(url, timeout=30)
+        except (requests.RequestException, ValueError):
+            st.warning('Url cannot parse to PDF')
+            return None
+
+        # The header may be absent; default to '' so the check cannot raise.
+        content_type = response.headers.get('content-type', '').lower()
+        if response.status_code == 200 and 'pdf' in content_type:
+            return response.content
+
+        st.warning('Url cannot parse to PDF')
+        return None
diff --git a/requirements.txt b/requirements.txt
@@ -9,3 +9,4 @@ tiktoken==0.4.0
 tenacity==8.1.0
 google-search-results==2.4.2
 sentence_transformers
+requests