|
1 | 1 | import fitz # PyMuPDF
|
| 2 | +import os |
| 3 | +from tqdm import tqdm |
| 4 | +import json |
| 5 | +from datetime import datetime |
| 6 | +from pytz import timezone |
2 | 7 |
|
3 |
| -def get_margins(pdf_path): |
def check_interference(page, corner, width, height):
    """Report whether text or an image overlaps a reserved corner of *page*.

    A rectangle of ``width`` x ``height`` is anchored in the requested corner
    of the page and compared against every text block and every image placed
    on the page.

    Args:
        page: A PyMuPDF page object (must expose ``rect``, ``get_text``,
            ``get_images`` and ``get_image_bbox``).
        corner: Either ``"top_right"`` or ``"bottom_right"``.
        width: Width of the reserved rectangle, in the page's coordinate units.
        height: Height of the reserved rectangle, in the page's coordinate
            units.

    Returns:
        ``None`` when nothing overlaps the rectangle; otherwise a message
        naming the first kind of content (text, then image) found there.
        Failures are never raised: any exception, including an invalid
        ``corner``, is returned as a string starting with
        ``"Error in processing"``.
    """
    try:
        page_rect = page.rect
        page_width, page_height = page_rect.width, page_rect.height

        if corner == "top_right":
            x0, y0 = page_width - width, 0
            x1, y1 = page_width, height
        elif corner == "bottom_right":
            x0, y0 = page_width - width, page_height - height
            x1, y1 = page_width, page_height
        else:
            raise ValueError("Invalid corner, can only be top_right or bottom_right")

        check_rect = fitz.Rect(x0, y0, x1, y1)

        for block in page.get_text("dict")["blocks"]:
            # Block type 0 is a text block; other types are skipped here.
            if block["type"] == 0 and check_rect.intersects(fitz.Rect(block["bbox"])):
                return f"Text interference detected in {corner} corner."

        for img in page.get_images(full=True):
            # get_image_bbox() needs the full image item (or its name), not
            # the bare xref integer. Passing the xref raises, and that error
            # was then swallowed below, so images were never detected.
            img_rect = fitz.Rect(page.get_image_bbox(img))
            if check_rect.intersects(img_rect):
                return f"Image interference detected in {corner} corner."

        return None
    except Exception as e:
        # Callers distinguish failures from findings by the "Error in" prefix.
        return f"Error in processing {corner} {page}: {e}"
| 41 | + |
def get_margin_by_page(page):
    """Measure the text margins of *page* and check its reserved corners.

    The margins are the distances between the page edges and the bounding box
    enclosing all text blocks, divided by 72 and rounded to two decimals.

    Args:
        page: A PyMuPDF page object.

    Returns:
        A dict with ``left_margin``, ``right_margin``, ``top_margin``,
        ``bottom_margin``, ``page_text`` (the page text encoded as UTF-8
        bytes) and ``interference`` (a list of messages for occupied
        corners). If anything fails, the error is printed and
        ``{"error": <message>}`` is returned instead.
    """
    try:
        bounds = page.rect
        page_width, page_height = bounds.width, bounds.height

        blocks = page.get_text("dict")["blocks"]
        text = page.get_text().encode("utf8")

        # Start from an inverted box so the first text block always shrinks
        # the minima and grows the maxima.
        left, top = page_width, page_height
        right, bottom = 0, 0
        for block in blocks:
            if block["type"] != 0:
                continue
            box = block["bbox"]
            left = min(left, box[0])
            top = min(top, box[1])
            right = max(right, box[2])
            bottom = max(bottom, box[3])

        raw_margins = {
            "left_margin": left,
            "right_margin": page_width - right,
            "top_margin": top,
            "bottom_margin": page_height - bottom,
        }

        # (corner name, width, height) of each reserved region to inspect.
        reserved = (
            ("top_right", 144, 36),
            ("bottom_right", 36, 216),
        )

        interference = []
        for name, region_width, region_height in reserved:
            finding = check_interference(
                page=page, corner=name, width=region_width, height=region_height
            )
            if not finding or finding.startswith("Error in"):
                # Nothing found, or the check itself failed: not reported.
                continue
            interference.append(f"{name} {finding}")

        result = {key: round(value / 72, 2) for key, value in raw_margins.items()}
        result["page_text"] = text
        result["interference"] = interference
        return result
    except Exception as e:
        print(f"Error processing {page}: {e}")
        return {"error": str(e)}
| 90 | + |
def run_pdf(pdf_folder, output_folder, safety_margin=0.4, max_files=200):
    """Scan PDFs in a folder and write a JSON report of margin issues.

    Every page of every selected PDF is measured with
    ``get_margin_by_page``. A page is reported when any of its margins is at
    or below ``safety_margin`` or when a reserved corner is occupied.

    Args:
        pdf_folder: Directory containing the ``.pdf`` files to inspect.
        output_folder: Existing directory where the report is written as
            ``margin_issues_<timestamp>.json``.
        safety_margin: Threshold compared against the margin values returned
            by ``get_margin_by_page``; values less than or equal to it are
            flagged.
        max_files: Maximum number of PDFs to process, taken in sorted name
            order. ``None`` processes all of them.

    Returns:
        None. The report is written to disk and a summary line is printed.
    """
    # Sort so the capped selection is deterministic; os.listdir order is
    # arbitrary.
    pdf_names = sorted(f for f in os.listdir(pdf_folder) if f.endswith(".pdf"))
    pdf_names = pdf_names[:max_files]

    page_count = 0
    document_margins = []
    for pdf_file in tqdm(pdf_names, desc="processing file", leave=True):
        margin_list = []
        # The context manager closes the document once its pages are measured.
        with fitz.open(os.path.join(pdf_folder, pdf_file)) as document:
            for i in tqdm(range(len(document)), desc=f"processing {pdf_file}", leave=False):
                margin_dict = get_margin_by_page(document[i])
                margin_list.append({"page": i + 1, "margin": margin_dict})
                page_count += 1

        document_margins.append({"file": pdf_file, "margins": margin_list})

    margin_issues = []
    for doc_entry in document_margins:
        pages_with_issues = []
        for page_entry in doc_entry["margins"]:
            page_margin = page_entry["margin"]

            # Pages that failed to process only carry an "error" key and have
            # no measurements to evaluate. The membership test must be on the
            # dict keys: testing against .items() compares with (key, value)
            # tuples and is never true.
            if "interference" not in page_margin:
                continue

            warnings = [
                key
                for key, value in page_margin.items()
                if key.endswith("_margin") and value <= safety_margin
            ]
            warnings.extend(page_margin["interference"])

            if warnings:
                pages_with_issues.append(
                    {"page_number": page_entry["page"], "issues": warnings}
                )

        if pages_with_issues:
            margin_issues.append({"file": doc_entry["file"], "issues": pages_with_issues})

    dt = datetime.now().astimezone(timezone("America/New_York"))
    timestamp = dt.strftime("%Y-%m-%d-%H%M")

    output_path = os.path.join(output_folder, f"margin_issues_{timestamp}.json")
    with open(output_path, "w", encoding="utf-8") as write_file:
        json.dump(margin_issues, write_file)

    print(f"Page count: {page_count} | len(margin_issues): {len(margin_issues)}")
| 139 | + |
if __name__ == "__main__":
    # Placeholder locations; replace with real directories before running.
    source_dir = "/your/pdf/folder/"
    report_dir = "/your/output/folder/"
    run_pdf(pdf_folder=source_dir, output_folder=report_dir)
0 commit comments