Skip to content

Commit f947b6e

Browse files
committed
adding PDF reading scripts... tbd
1 parent 5e985b0 commit f947b6e

File tree

6 files changed

+280
-2
lines changed

6 files changed

+280
-2
lines changed

coverage.xml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
<?xml version="1.0" ?>
2-
<coverage version="7.5.2" timestamp="1716757725577" lines-valid="660" lines-covered="186" line-rate="0.2818" branches-covered="0" branches-valid="0" branch-rate="0" complexity="0">
3-
<!-- Generated by coverage.py: https://coverage.readthedocs.io/en/7.5.2 -->
2+
<coverage version="7.5.3" timestamp="1718284262278" lines-valid="660" lines-covered="186" line-rate="0.2818" branches-covered="0" branches-valid="0" branch-rate="0" complexity="0">
3+
<!-- Generated by coverage.py: https://coverage.readthedocs.io/en/7.5.3 -->
44
<!-- Based on https://raw.githubusercontent.com/cobertura/web/master/htdocs/xml/coverage-04.dtd -->
55
<sources>
66
<source>/workspaces/devsetgo_lib</source>

unreleased/pdf_margin.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
import fitz # PyMuPDF
2+
3+
def get_margins(pdf_path):
    """Measure the whitespace around the text on the first page of a PDF.

    The text area is the union of the bounding boxes of every text block
    reported for page 0, clamped to the page rectangle. Each margin is the
    gap between that area and the matching page edge, expressed in the same
    units as ``page.rect``.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A dict with the keys ``left_margin``, ``right_margin``,
        ``top_margin`` and ``bottom_margin``, or ``None`` when the file
        cannot be processed or its first page has no text blocks.
    """
    try:
        # The context manager closes the document even when parsing fails.
        with fitz.open(pdf_path) as document:
            page = document[0]  # only the first page is measured
            page_rect = page.rect
            page_width, page_height = page_rect.width, page_rect.height
            text_boxes = [
                block["bbox"]
                for block in page.get_text("dict")["blocks"]
                if block["type"] == 0  # type 0 indicates a text block
            ]
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
        return None

    if not text_boxes:
        # Without this guard the "margins" would be the page size itself,
        # which is meaningless for a page that has no text at all.
        print(f"Error processing {pdf_path}: no text blocks on the first page")
        return None

    # Clamp to the page rectangle so text that overflows the page cannot
    # push the text area outside it.
    text_x0 = min(page_width, min(box[0] for box in text_boxes))
    text_y0 = min(page_height, min(box[1] for box in text_boxes))
    text_x1 = max(0, max(box[2] for box in text_boxes))
    text_y1 = max(0, max(box[3] for box in text_boxes))

    return {
        "left_margin": text_x0,
        "right_margin": page_width - text_x1,
        "top_margin": text_y0,
        "bottom_margin": page_height - text_y1,
    }
44+
# Report the first-page margins of each bundled sample PDF.
pdf_files = ['pdf_sample.pdf', 'pdf_sample_narrow.pdf']
# map() is lazy, so each file is still measured immediately before its line
# is printed.
for pdf_file, margins in zip(pdf_files, map(get_margins, pdf_files)):
    print(f"Margins for {pdf_file}: {margins}")

unreleased/pdf_processing.py

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
from fastapi import FastAPI, UploadFile, File
2+
from fastapi.responses import ORJSONResponse
3+
import time
4+
import io
5+
from pypdf import PdfReader
6+
from loguru import logger
7+
8+
app = FastAPI()
9+
10+
def _reject(response, message):
    """Log *message*, record it on *response* and return it as an HTTP 400."""
    logger.error(message)
    response["message"] = message
    return ORJSONResponse(content=response, status_code=400)


@app.post('/validate-pdf', response_class=ORJSONResponse, status_code=201)
async def check_pdf(
    file: UploadFile = File(...),
    include_text: bool = False,
    check_text: bool = False,
    include_page_errors: bool = False
):
    """Validate an uploaded PDF and report what was found in it.

    The checks run in this order; the first failure ends the request with a
    400 response whose ``message`` explains the rejection:

    1. the upload's content type is ``application/pdf``;
    2. the bytes can be opened by ``PdfReader``;
    3. the PDF is not encrypted;
    4. the PDF has at least one page;
    5. the PDF has metadata;
    6. (only when ``check_text``) text can be extracted and contains at
       least one of a few common English words;
    7. at least one font is reported for the pages;
    8. the PDF has no form fields.

    Args:
        file: The uploaded file.
        include_text: Add the extracted text to the response. Text is only
            extracted when ``check_text`` is true; otherwise this adds an
            empty string.
        check_text: Extract the text and run the readable-text checks.
        include_page_errors: With ``check_text``, add the per-page
            extraction errors to the response.

    Returns:
        An ``ORJSONResponse`` with status 201 on success or 400 on
        rejection. The body always carries ``file_name``, ``content_type``
        and ``file_size``, plus whatever later checks had added before the
        request finished.
    """
    t0 = time.time()
    response = {
        "file_name": file.filename,
        "content_type": file.content_type,
        "file_size": file.size,
    }

    if file.content_type != "application/pdf":
        return _reject(response, f"File is not a PDF, but type {file.content_type}")

    pdf_content = await file.read()
    try:
        reader = PdfReader(io.BytesIO(pdf_content))
    except Exception as ex:
        # Request boundary: a corrupt upload is the client's error, so report
        # it as a 400 instead of letting the parser error escape the handler.
        return _reject(response, f"The file could not be read as a PDF: {ex}")

    # Must run before any page or metadata access: an encrypted file that has
    # not been decrypted cannot be read further.
    if reader.is_encrypted:
        return _reject(response, "The PDF is encrypted and not allowed")

    page_count = len(reader.pages)
    if page_count == 0:
        return _reject(response, "The PDF is empty")
    response["page_count"] = page_count

    meta = reader.metadata
    if meta is None:
        return _reject(response, "The PDF does not contain meta data")
    # NUL characters are stripped from the metadata values before they are
    # put in the JSON body.
    response["meta"] = {k: str(v).replace("\x00", "") for k, v in meta.items()}

    text = ""
    if check_text:
        results = get_pdf_content(pdf_content=pdf_content)
        text = results["text"]
        if not text.strip():
            return _reject(response, "The PDF does not contain readable text")

        # Plain, case-sensitive substring test: "the" also matches inside
        # "other".
        common_words = ["the", "and", "is"]
        words_found = [word for word in common_words if word in text]
        if not words_found:
            return _reject(
                response,
                "The PDF does not contain readable text, like the word 'the'",
            )

        response["characters"] = len(text)
        response["words_found"] = words_found
        # `results` only exists inside this branch, so page errors can only
        # be reported when the text was actually extracted.
        if include_page_errors:
            response["errors"] = results["errors"]

    embedded_fonts = []
    for page in tqdm(reader.pages, desc="Finding Fonts"):
        # NOTE(review): confirm that the installed pypdf exposes a public
        # `get_fonts()` returning dict-like entries with a "BaseFont" key.
        for font in page.get_fonts():
            font_name = font.get("BaseFont", "").replace("/", "").replace("+", "")
            if font_name not in embedded_fonts:
                embedded_fonts.append(font_name)

    if not embedded_fonts:
        return _reject(response, "The PDF does not have embedded fonts")
    response["fonts"] = embedded_fonts

    # get_fields() returns None when the document has no interactive form
    # fields; the previous per-page trailer lookup did not inspect the form.
    if reader.get_fields():
        return _reject(response, "The PDF contains form fields")

    if include_text:
        response["text"] = text

    t1 = time.time()
    logger.debug(f"PDF check response: {response}")
    response["processing_time_seconds"] = f"{t1 - t0:.2f}"
    return ORJSONResponse(content=response, status_code=201)
113+
114+
115+
# Function to extract data from a PDF file
116+
117+
118+
# coding: utf-8
119+
import io
120+
import re
121+
from functools import lru_cache
122+
123+
from loguru import logger # Import the Loguru logger
124+
from pypdf import PdfReader, PaperSize
125+
from tqdm import tqdm
126+
from unsync import unsync
127+
128+
@unsync
def extract_pdf_text(pdf_content, page_number: int):
    """Extract the text and media box of one page of a PDF.

    Because of the ``@unsync`` decorator, callers receive a future-like
    object and obtain the dict described below through ``.result()``.

    Args:
        pdf_content: The raw PDF content handed to ``get_reader``.
        page_number: Zero-based index into ``reader.pages``.

    Returns:
        A dict with the keys ``text``, ``page_num``, ``margin`` and
        ``error``. ``margin`` holds the page's ``mediabox``. On failure
        ``text`` is empty, ``margin`` is ``None`` and ``error`` is the caught
        exception; failures are logged rather than raised.
    """
    try:
        # Look the page up once; the previous version indexed it repeatedly
        # and also ran a layout-mode extraction whose result was discarded.
        page = get_reader(pdf_content).pages[page_number]
        text = page.extract_text()
        box = page.mediabox
        logger.debug(f"Page {page_number} media box: {box}")
        return {"text": text, "page_num": page_number, "margin": box, "error": None}
    except Exception as ex:
        logger.error(ex)
        return {"text": "", "page_num": page_number, "margin": None, "error": ex}
149+
150+
@lru_cache(maxsize=300, typed=False)
def get_reader(pdf_content):
    """Return a ``PdfReader`` over *pdf_content*, memoized on that argument.

    Up to 300 distinct arguments are remembered by the LRU cache, so repeated
    calls with equal content get the same reader object back.
    """
    stream = io.BytesIO(pdf_content)
    return PdfReader(stream)
154+
155+
# Compiled once at import time. re.ASCII keeps \d to 0-9 so look-alike digits
# from other scripts are not accepted.
_SSN_PATTERN = re.compile(
    r"(?!000|666)[0-8]\d{2}-(?!00)\d{2}-(?!0000)\d{4}",
    re.ASCII,
)


def is_valid_ssn(ssn):
    """Return True when *ssn* is exactly one well-formed ``AAA-GG-SSSS`` value.

    Rejected: a first group of ``000`` or ``666`` or one starting with ``9``,
    a middle group of ``00``, and a last group of ``0000``. The whole string
    must match, so surrounding text or a trailing newline makes it invalid.

    Args:
        ssn: The candidate string.

    Returns:
        ``True`` if the string is a valid-looking SSN, otherwise ``False``.
    """
    # fullmatch instead of match + "$": "$" also matches just before a
    # trailing newline, which let "123-45-6789\n" through.
    return _SSN_PATTERN.fullmatch(ssn) is not None
158+
159+
def get_pdf_content(pdf_content):
    """Extract the text of every page of a PDF and summarize the results.

    One ``extract_pdf_text`` task is started per page and the per-page
    results are then combined in page order.

    Args:
        pdf_content: The raw PDF content.

    Returns:
        A dict with:
            ``text``: the page texts joined with newlines;
            ``margins``: the ``margin`` value of every page result;
            ``errors``: one message string per page that failed;
            ``PII``: ``True`` when the text contains a valid-looking SSN.
    """
    # Going through the cached reader avoids parsing the PDF a second time
    # and fills the cache before the per-page tasks ask for it.
    page_count = len(get_reader(pdf_content).pages)

    tasks = [
        extract_pdf_text(pdf_content=pdf_content, page_number=page_number)
        for page_number in tqdm(range(page_count), desc="PDF Text Processing")
    ]
    results = sorted(
        (task.result() for task in tqdm(tasks, desc="PDF Text Results")),
        key=lambda result: result["page_num"],
    )

    combined_text = "\n".join(result["text"] for result in results)

    # is_valid_ssn judges one whole string, so it can never accept the full
    # document text; pull out the SSN-shaped tokens and validate each one.
    ssn_candidates = re.findall(r"(?<!\d)\d{3}-\d{2}-\d{4}(?!\d)", combined_text)
    has_ssn = any(is_valid_ssn(candidate) for candidate in ssn_candidates)

    margins = [result["margin"] for result in results]

    # Messages only: the previous list also carried the raw result dicts,
    # which hold exception objects and duplicated every failure.
    error_list = [
        f"Error on page {result['page_num']} of {result['error']}"
        for result in results
        if result["error"] is not None
    ]

    return {"text": combined_text, "margins": margins, "errors": error_list, "PII": has_ssn}

unreleased/pdf_sample.pdf

23.9 KB
Binary file not shown.

unreleased/pdf_sample_narrow.pdf

23.8 KB
Binary file not shown.

unreleased/pdf_script.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
import io
2+
from pypdf import PdfReader
3+
4+
# Read the sample once; the `with` block closes the file handle, which the
# bare open(...).read() left to the garbage collector.
with open("pdf_sample_narrow.pdf", "rb") as pdf_file:
    pdf_content = pdf_file.read()

reader = PdfReader(io.BytesIO(pdf_content))

print(f"PDF Version {reader.pdf_header}")

# Text fragments collected by visitor_body while the pages are extracted.
parts = []
11+
12+
def visitor_body(text, cm, tm, font_dict, font_size):
    """Collect every text fragment reported during page text extraction.

    Passed as ``visitor_text`` to ``extract_text``. Only *text* is used; the
    other parameters are accepted because the callback is invoked with them.

    An earlier experiment read ``cm[5]`` as the vertical position and kept
    only fragments between 50 and 720; that filter was disabled, so every
    fragment is appended to the module-level ``parts`` list.
    """
    parts.append(text)
16+
17+
# Index-based on purpose: fetching the page happens inside the try so one
# unreadable page is reported and the remaining pages are still processed.
for page_number in range(len(reader.pages)):
    try:
        page = reader.pages[page_number]

        # The return value is not needed; the call is made so that
        # visitor_body receives every text fragment of the page.
        page.extract_text(visitor_text=visitor_body, layout_mode_scale_weight=1.0)

        boxes = (
            ("MediaBox", page.mediabox),
            ("CropBox", page.cropbox),
            ("TrimBox", page.trimbox),
            ("ArtBox", page.artbox),
            ("BleedBox", page.bleedbox),
        )
        unit_size = page.user_unit

        print(f"Page {page_number}")
        for label, box in boxes:
            # Divide the box's own width and height by 72; the upper-right
            # coordinates used before only equal the size when the box
            # starts at the origin.
            print(f"{label}: {box.width}x{box.height} divid by 72 = {box.width / 72} x {box.height / 72}")
        print(f"Unit Size: {unit_size}")

    except Exception as ex:
        print(f"Error on page {page_number}: {ex}")
40+
41+
42+
43+
# Everything the visitor collected, concatenated without separators.
text_body = "".join(parts)
print(text_body)

# List the fragments longer than 100 characters together with their length.
for p in filter(lambda fragment: len(fragment) > 100, parts):
    print(len(p), p)

# Reference line: print its length alongside the text for comparison with the
# fragment lengths above.
line = "embed code for the video you want to add. You can also type a keyword to search online for the video that best fits"
print(len(line), line)

0 commit comments

Comments
 (0)