Historical research, and analysis request #1860
Unanswered
AchillesActual
asked this question in
Q&A
Replies: 0 comments
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Uh oh!
There was an error while loading. Please reload this page.
-
Does anybody know how to optimize private GPT to use automation for historical research and writing while going through a bunch of primary and secondary sources? I tried setting up a prompt to go through all of my PDFs at once, and I've made a little bit of progress, but I figured you guys might know a better way, using automated processes to go through research and write out responses that are long enough to equal a 50-page document that could potentially be published.
Here is my code so far:
import pytesseract
from pdf2image import convert_from_path
import pdfplumber
import os
import requests
from concurrent.futures import ProcessPoolExecutor
def extract_text(pdf_path):
    """Extract all text from a PDF, falling back to OCR for scanned files.

    Tries pdfplumber's embedded-text extraction first; if no page yields any
    text (typical of scanned/image-only PDFs), renders the pages with
    pdf2image and runs pytesseract OCR on each image.

    Args:
        pdf_path: Filesystem path to the PDF file.

    Returns:
        The extracted text as a single string, with pages separated by a
        newline. Returns whatever was gathered so far (possibly "") if an
        error occurs; the error is printed rather than raised.
    """
    pages = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    pages.append(page_text)
        if not pages:  # No embedded text extracted -> assume scanned PDF, use OCR
            for image in convert_from_path(pdf_path):
                pages.append(pytesseract.image_to_string(image))
    except Exception as e:
        # Best-effort: report and return what we have instead of crashing
        # the whole batch over one bad file.
        print(f"Error processing {pdf_path}: {str(e)}")
    # Join with newlines so the last word of one page is not glued to the
    # first word of the next (the original concatenated with no separator).
    return "\n".join(pages)
def send_to_gpt(text):
    """Send extracted text to the local privateGPT endpoint.

    Args:
        text: The document text to process.

    Returns:
        A ``(response, error)`` tuple: on success ``(generated_text, None)``,
        on failure ``(None, error_message)``. Exactly one element is None.
    """
    if not text.strip():  # Ensure text isn't just whitespace
        return None, "No text to process."
    url = "http://localhost:8001/api/generate"
    payload = {"text": text}
    try:
        # Timeout keeps a hung or unreachable server from blocking this
        # worker process forever; generation can be slow, so be generous.
        response = requests.post(url, json=payload, timeout=600)
    except requests.RequestException as e:
        # Connection refused, DNS failure, timeout, etc. — report instead
        # of crashing the worker.
        return None, f"Error: {e}"
    if response.status_code == 200:
        return response.json()['response'], None
    else:
        return None, f"Error: {response.text}"
def check_essay_rules(text):
    """Check a generated essay against simple style guidelines.

    Args:
        text: The essay text to check.

    Returns:
        A list of human-readable issue strings; empty when no rule fires.
    """
    issues = []
    # Lowercase before matching so "Conclusion" / "CONCLUSION" near the
    # start is also caught (a plain substring test is case-sensitive).
    if "conclusion" in text[:500].lower():  # Example of premature conclusion
        issues.append("Use 'conclusion' only in the final part of the essay.")
    # Add more rule checks as needed
    return issues
def process_pdf(filename, directory_path):
    """Run the full pipeline for one PDF: extract, send to GPT, rule-check.

    Args:
        filename: Name of the PDF file (relative to ``directory_path``).
        directory_path: Directory containing the PDF.

    Returns:
        The GPT-generated text on success, or None when extraction produced
        no usable text or the GPT call failed.
    """
    pdf_path = os.path.join(directory_path, filename)
    # NOTE(review): the pasted source showed "(unknown)" in these messages —
    # almost certainly mangled f-string placeholders; restored to {filename}.
    print(f"Starting processing of {filename}...")
    pdf_text = extract_text(pdf_path)
    if not pdf_text.strip():
        print(f"No usable text extracted from {filename}.")
        return None
    print(f"Sending extracted text of {filename} to GPT...")
    gpt_response, error = send_to_gpt(pdf_text)
    if error:
        print(f"Error during GPT processing of {filename}: {error}")
        return None  # explicit, consistent with the early exit above (was a bare `return`)
    feedback = check_essay_rules(gpt_response)
    if feedback:
        print(f"Feedback for {filename}: {' '.join(feedback)}")
    else:
        print(f"Successfully processed {filename} with no guideline issues.")
    return gpt_response
if __name__ == "__main__":
    # The original `if name == "main":` raises NameError. The dunder guard
    # is also required here: ProcessPoolExecutor re-imports this module in
    # each worker (spawn start method on macOS/Windows), and without the
    # guard every worker would recursively launch its own pool.
    directory_path = "/Users/lucascarreon/privateGPT/essay_project/Access PDFs"
    files = [f for f in os.listdir(directory_path) if f.endswith('.pdf')]
    with ProcessPoolExecutor() as executor:
        results = list(executor.map(process_pdf, files, [directory_path] * len(files)))
Beta Was this translation helpful? Give feedback.
All reactions