Historical research, and analysis request #1860
Unanswered
AchillesActual
asked this question in
Q&A
Replies: 0 comments
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Uh oh!
There was an error while loading. Please reload this page.
-
Does anybody know how to optimize private GPT to use automation for historical research and writing while going through a bunch of primary and secondary sources? I tried setting up a prompt to go through all of my PDFs at once, and I've made a little bit of progress, but I figured you guys might know a better way, using automated processes to go through research and write out responses that are long enough to equal a 50-page document that could potentially be published.
Here is my code so far:
import pytesseract
from pdf2image import convert_from_path
import pdfplumber
import os
import requests
from concurrent.futures import ProcessPoolExecutor
def extract_text(pdf_path):
    """Extract all text from a PDF, falling back to OCR for scanned files.

    Tries pdfplumber's embedded-text extraction first; if no page yields any
    text (typical of scanned/image-only PDFs), renders the pages with
    pdf2image and runs pytesseract OCR on each image.

    Args:
        pdf_path: Filesystem path to the PDF file.

    Returns:
        The extracted text as a single string, with pages separated by a
        newline. Returns whatever was gathered so far (possibly "") if an
        error occurs; the error is printed rather than raised.
    """
    pages = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    pages.append(page_text)
        if not pages:  # No embedded text extracted -> assume scanned PDF, use OCR
            for image in convert_from_path(pdf_path):
                pages.append(pytesseract.image_to_string(image))
    except Exception as e:
        # Best-effort: report and return what we have instead of crashing
        # the whole batch over one bad file.
        print(f"Error processing {pdf_path}: {str(e)}")
    # Join with newlines so the last word of one page is not glued to the
    # first word of the next (the original concatenated with no separator).
    return "\n".join(pages)
def send_to_gpt(text):
    """Send extracted text to the local privateGPT endpoint.

    Args:
        text: The document text to process.

    Returns:
        A ``(response, error)`` tuple: on success ``(generated_text, None)``,
        on failure ``(None, error_message)``. Exactly one element is None.
    """
    if not text.strip():  # Ensure text isn't just whitespace
        return None, "No text to process."
    url = "http://localhost:8001/api/generate"
    payload = {"text": text}
    try:
        # Timeout keeps a hung or unreachable server from blocking this
        # worker process forever; generation can be slow, so be generous.
        response = requests.post(url, json=payload, timeout=600)
    except requests.RequestException as e:
        # Connection refused, DNS failure, timeout, etc. — report instead
        # of crashing the worker.
        return None, f"Error: {e}"
    if response.status_code == 200:
        return response.json()['response'], None
    else:
        return None, f"Error: {response.text}"
def check_essay_rules(text):
    """Check a generated essay against simple style guidelines.

    Args:
        text: The essay text to check.

    Returns:
        A list of human-readable issue strings; empty when no rule fires.
    """
    issues = []
    # Lowercase before matching so "Conclusion" / "CONCLUSION" near the
    # start is also caught (a plain substring test is case-sensitive).
    if "conclusion" in text[:500].lower():  # Example of premature conclusion
        issues.append("Use 'conclusion' only in the final part of the essay.")
    # Add more rule checks as needed
    return issues
def process_pdf(filename, directory_path):
    """Run the full pipeline for one PDF: extract, send to GPT, rule-check.

    Args:
        filename: Name of the PDF file (relative to ``directory_path``).
        directory_path: Directory containing the PDF.

    Returns:
        The GPT-generated text on success, or None when extraction produced
        no usable text or the GPT call failed.
    """
    pdf_path = os.path.join(directory_path, filename)
    # NOTE(review): the pasted source showed "(unknown)" in these messages —
    # almost certainly mangled f-string placeholders; restored to {filename}.
    print(f"Starting processing of {filename}...")
    pdf_text = extract_text(pdf_path)
    if not pdf_text.strip():
        print(f"No usable text extracted from {filename}.")
        return None
    print(f"Sending extracted text of {filename} to GPT...")
    gpt_response, error = send_to_gpt(pdf_text)
    if error:
        print(f"Error during GPT processing of {filename}: {error}")
        return None  # explicit, consistent with the early exit above (was a bare `return`)
    feedback = check_essay_rules(gpt_response)
    if feedback:
        print(f"Feedback for {filename}: {' '.join(feedback)}")
    else:
        print(f"Successfully processed {filename} with no guideline issues.")
    return gpt_response
if __name__ == "__main__":
    # The original `if name == "main":` raises NameError. The dunder guard
    # is also required here: ProcessPoolExecutor re-imports this module in
    # each worker (spawn start method on macOS/Windows), and without the
    # guard every worker would recursively launch its own pool.
    directory_path = "/Users/lucascarreon/privateGPT/essay_project/Access PDFs"
    files = [f for f in os.listdir(directory_path) if f.endswith('.pdf')]
    with ProcessPoolExecutor() as executor:
        results = list(executor.map(process_pdf, files, [directory_path] * len(files)))
Beta Was this translation helpful? Give feedback.
All reactions