Skip to content

Commit 455df15

Browse files
committed
adding an unreleased idea
1 parent d8e52e9 commit 455df15

File tree

1 file changed

+115
-22
lines changed

1 file changed

+115
-22
lines changed

unreleased/pdf_margin.py

Lines changed: 115 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,48 +1,141 @@
11
import fitz # PyMuPDF
2+
import os
3+
from tqdm import tqdm
4+
import json
5+
from datetime import datetime
6+
from pytz import timezone
27

3-
def get_margins(pdf_path):
8+
def check_interference(page, corner, width, height):
    """Report whether page content overlaps a reserved rectangle in a corner.

    Args:
        page: PyMuPDF page object to inspect.
        corner: Which corner to test; either "top_right" or "bottom_right".
        width: Width of the reserved rectangle, in page coordinate units.
        height: Height of the reserved rectangle, in page coordinate units.

    Returns:
        None when neither text nor images overlap the rectangle; otherwise a
        message naming the first kind of content found (text is checked before
        images). Any failure, including an invalid ``corner``, is not raised
        but returned as a string starting with "Error in processing".
    """
    try:
        page_rect = page.rect
        page_width, page_height = page_rect.width, page_rect.height

        if corner == "top_right":
            check_rect = fitz.Rect(page_width - width, 0, page_width, height)
        elif corner == "bottom_right":
            check_rect = fitz.Rect(
                page_width - width, page_height - height, page_width, page_height
            )
        else:
            raise ValueError("Invalid corner, can only be top_right or bottom_right")

        for block in page.get_text("dict")["blocks"]:
            # Block type 0 marks a text block; other block types are skipped here.
            if block["type"] == 0 and check_rect.intersects(fitz.Rect(block["bbox"])):
                return f"Text interference detected in {corner} corner."

        for img in page.get_images(full=True):
            # get_image_bbox expects the whole get_images() item (or the image
            # reference name), not the bare xref: an integer xref matches no
            # image name and makes the lookup fail.
            img_rect = fitz.Rect(page.get_image_bbox(img))
            if check_rect.intersects(img_rect):
                return f"Image interference detected in {corner} corner."

        return None
    except Exception as e:
        # Deliberate best-effort: callers recognise the "Error in" prefix.
        return f"Error in processing {corner} {page}: {e}"
41+
42+
def get_margin_by_page(page):
    """Measure the text margins of one page and probe its right-hand corners.

    Args:
        page: PyMuPDF page object to measure.

    Returns:
        On success, a dict with the four ``*_margin`` values (page units
        divided by 72 and rounded to two decimals), ``page_text`` (the page's
        plain text encoded as UTF-8 bytes) and ``interference`` (a list of
        corner messages). On failure the error is printed and
        ``{"error": <message>}`` is returned instead.
    """
    try:
        rect = page.rect
        page_width, page_height = rect.width, rect.height

        blocks = page.get_text("dict")["blocks"]
        text = page.get_text().encode("utf8")

        # Only text blocks (type 0) contribute to the text bounding box.
        boxes = [blk["bbox"] for blk in blocks if blk["type"] == 0]

        # Seed each extreme with the page edge / zero so a page without text
        # still yields a value.
        text_x0 = min([page_width] + [box[0] for box in boxes])
        text_y0 = min([page_height] + [box[1] for box in boxes])
        text_x1 = max([0] + [box[2] for box in boxes])
        text_y1 = max([0] + [box[3] for box in boxes])

        margins = (
            ("left_margin", text_x0),
            ("right_margin", page_width - text_x1),
            ("top_margin", text_y0),
            ("bottom_margin", page_height - text_y1),
        )

        # (corner, width, height) of each reserved area to probe.
        reserved_areas = (
            ("top_right", 144, 36),
            ("bottom_right", 36, 216),
        )

        interference = []
        for corner_name, area_width, area_height in reserved_areas:
            message = check_interference(
                page=page, corner=corner_name, width=area_width, height=area_height
            )
            if not message:
                continue
            # Processing errors from the probe are dropped, not reported.
            if message.startswith("Error in"):
                continue
            interference.append(f"{corner_name} {message}")

        result = {name: round(value / 72, 2) for name, value in margins}
        result["page_text"] = text
        result["interference"] = interference
        return result
    except Exception as e:
        print(f"Error processing {page}: {e}")
        return {"error": str(e)}
90+
91+
def run_pdf(pdf_folder, output_folder, safety_margin=0.4, max_files=200):
    """Scan the PDFs in a folder and write a JSON report of margin issues.

    Each page of each PDF is measured with ``get_margin_by_page``. A page is
    reported when any of its ``*_margin`` values is at or below
    ``safety_margin``, or when it has corner interference messages. The report
    is written to ``margin_issues_<timestamp>.json`` in ``output_folder`` and a
    one-line summary is printed.

    Args:
        pdf_folder: Folder containing the ``.pdf`` files to inspect.
        output_folder: Folder that receives the JSON report; created if missing.
        safety_margin: Threshold compared against the per-page margin values.
        max_files: Maximum number of PDFs to process; ``None`` processes all.
    """
    pdf_names = [f for f in os.listdir(pdf_folder) if f.endswith(".pdf")][:max_files]

    page_count = 0
    document_margins = []
    for pdf_file in tqdm(pdf_names, desc="processing file", leave=True):
        document = fitz.open(os.path.join(pdf_folder, pdf_file))
        try:
            margin_list = []
            for i in tqdm(
                range(len(document)), desc=f"processing {pdf_file}", leave=False
            ):
                margin_dict = get_margin_by_page(document[i])
                margin_list.append({"page": i + 1, "margin": margin_dict})
                page_count += 1
        finally:
            # Release the file handle even if a page fails unexpectedly.
            document.close()

        document_margins.append({"file": pdf_file, "margins": margin_list})

    margin_issues = []
    for doc_entry in document_margins:
        pages_with_issues = []
        for page_entry in doc_entry["margins"]:
            page_margin = page_entry["margin"]

            # Pages that failed to process carry only an "error" key; skip them.
            # Membership is tested on the dict itself: testing against
            # .items() compares with (key, value) tuples and never matches.
            if "interference" not in page_margin:
                continue

            warnings = [
                key
                for key, value in page_margin.items()
                if key.endswith("_margin") and value <= safety_margin
            ]
            warnings.extend(page_margin["interference"])

            if warnings:
                pages_with_issues.append(
                    {"page_number": page_entry["page"], "issues": warnings}
                )

        if pages_with_issues:
            margin_issues.append(
                {"file": doc_entry["file"], "issues": pages_with_issues}
            )

    dt = datetime.now().astimezone(timezone("America/New_York"))
    timestamp = dt.strftime("%Y-%m-%d-%H%M")

    os.makedirs(output_folder, exist_ok=True)
    output_path = os.path.join(output_folder, f"margin_issues_{timestamp}.json")
    with open(output_path, "w") as write_file:
        json.dump(margin_issues, write_file)

    print(f"Page count: {page_count} | len(margin_issues): {len(margin_issues)}")
139+
140+
if __name__ == "__main__":
    # Placeholder paths: point these at real folders before running as a script.
    run_pdf(
        pdf_folder="/your/pdf/folder/",
        output_folder="/your/output/folder/",
    )

0 commit comments

Comments
 (0)