diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index 5d9ae2e3..19f7a104 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.13.0
+current_version = 0.13.1
 commit = False
 tag = False
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(-(?P<release>[a-z]+)(?P<num>\d+))?
diff --git a/.github/workflows/autofill_pullrequest.yml b/.github/workflows/autofill_pullrequest.yml
index 9872baa5..c7ba10de 100644
--- a/.github/workflows/autofill_pullrequest.yml
+++ b/.github/workflows/autofill_pullrequest.yml
@@ -19,5 +19,5 @@ jobs:
     with:
       github_token: ${{ secrets.GITHUB_TOKEN }}
       openai_api_key: ${{ secrets.OPENAI_API_KEY }}
-      max_tokens: 4000
-      openai_model: gpt-4
+      max_tokens: 16384
+      openai_model: gpt-4o-mini
diff --git a/coverage-badge.svg b/coverage-badge.svg
index 792dcf68..e2abb10d 100644
--- a/coverage-badge.svg
+++ b/coverage-badge.svg
@@ -1 +1 @@
-coverage: 30.44%
+coverage: 13.25%
diff --git a/coverage.xml b/coverage.xml
index daf64bd3..25b90910 100644
--- a/coverage.xml
+++ b/coverage.xml
[generated coverage-report XML; hunk content lost to markup stripping, only the /github/workspace source path was recoverable]
diff --git a/dsg_lib/__init__.py b/dsg_lib/__init__.py
index 20aa0748..682d036b 100644
--- a/dsg_lib/__init__.py
+++ b/dsg_lib/__init__.py
@@ -1,3 +1,3 @@
 # -*- coding: utf-8 -*-
-__version__ = '0.13.0'
+__version__ = '0.13.1'
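The bump2version `parse` pattern above lost its named groups to angle-bracket stripping; `major`/`minor`/`patch` are the bump2version defaults, while the prerelease group names (`release`, `num`) are assumptions. A quick sketch to sanity-check the reconstruction:

```python
# Verify the reconstructed parse pattern against real version strings.
# The prerelease group names (release, num) are assumed, not confirmed.
import re

PARSE = re.compile(
    r'(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)'
    r'(-(?P<release>[a-z]+)(?P<num>\d+))?'
)

assert PARSE.fullmatch('0.13.1')
m = PARSE.fullmatch('0.14.0-beta1')
assert m and m.group('release') == 'beta' and m.group('num') == '1'
```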
diff --git a/dsg_lib/common_functions/logging_config.py b/dsg_lib/common_functions/logging_config.py
index b1b93c17..a5093418 100644
--- a/dsg_lib/common_functions/logging_config.py
+++ b/dsg_lib/common_functions/logging_config.py
@@ -1,29 +1,22 @@
 # -*- coding: utf-8 -*-
 """
-This module provides a function to configure and set up a logger using the loguru package.
+This module provides a comprehensive logging setup using the loguru library,
+making logging easy to manage in Python applications. The central `config_log`
+function supports extensive customization: the logging directory, log file
+name, logging level, and controls for rotation, retention, and formatting.
+It also offers backtrace and diagnose options for in-depth debugging, and can
+append the application name to the log file name for clearer identification.

-The `config_log` function takes several optional parameters to customize the logger's behavior,
-including the logging directory, log name, logging level, log rotation size, log retention period,
-and more. It also provides an option to append the application name to the log file name.
+Usage example:

-Example:
 ```python
 from dsg_lib.common_functions.logging_config import config_log

 config_log(
-    logging_directory='logs',  # Directory where logs will be stored
-    log_name='log',  # Name of the log file (extension added automatically since v0.12.2)
-    logging_level='DEBUG',  # Logging level
-    log_rotation='100 MB',  # Log rotation size
-    log_retention='30 days',  # Log retention period
-    log_backtrace=True,  # Enable backtrace
-    log_format="{time:YYYY-MM-DD HH:mm:ss.SSSSSS} | {level: <8} | {name}:{function}:{line} - {message}",  # Log format
-    log_serializer=False,  # Disable log serialization
-    log_diagnose=True,  # Enable diagnose
-    app_name='my_app',  # Application name
-    append_app_name=True  # Append application name to the log file name
+    logging_directory='logs',  # Directory for storing logs
+    log_name='log',  # Base name for log files
+    logging_level='DEBUG',  # Minimum logging level
+    log_rotation='100 MB',  # Size threshold for log rotation
+    log_retention='30 days',  # Duration to retain old log files
+    enqueue=True,  # Enqueue log messages
 )

+# Example log messages
 logger.debug("This is a debug message")
 logger.info("This is an info message")
 logger.error("This is an error message")
@@ -32,10 +25,12 @@
 ```

 Author: Mike Ryan
-Date: 2024/05/16
+DateCreated: 2021/07/16
+DateUpdated: 2024/07/24
+
 License: MIT
 """
-
+import time
 import logging
 from pathlib import Path
 from uuid import uuid4
@@ -55,6 +50,9 @@ def config_log(
     log_diagnose: bool = False,
     app_name: str = None,
     append_app_name: bool = False,
+    enqueue: bool = True,
+    intercept_standard_logging: bool = True,
+    file_sink: bool = True,
 ):
     """
     Configures and sets up a logger using the loguru package.
@@ -71,6 +69,9 @@
     - log_diagnose (bool): Whether to enable diagnose. Default is False.
     - app_name (str): The application name. Default is None.
     - append_app_name (bool): Whether to append the application name to the log file name. Default is False.
+    - enqueue (bool): Whether to enqueue log messages. Default is True.
+    - intercept_standard_logging (bool): Whether to intercept standard logging. Default is True.
+    - file_sink (bool): Whether to use a file sink. Default is True.

     Raises:
     - ValueError: If the provided logging level is not valid.
@@ -83,14 +84,17 @@
         logging_directory='logs',
         log_name='app.log',
         logging_level='DEBUG',
-        log_rotation='500 MB',
+        log_rotation='100 MB',
         log_retention='10 days',
         log_backtrace=True,
         log_format="{time:YYYY-MM-DD HH:mm:ss.SSSSSS} | {level: <8} | {name}:{function}:{line} - {message}",
         log_serializer=False,
         log_diagnose=True,
         app_name='my_app',
-        append_app_name=True
+        append_app_name=True,
+        enqueue=True,
+        intercept_standard_logging=True,
+        file_sink=True
     )
     ```
     """
@@ -108,6 +112,7 @@
             log_format = '{time:YYYY-MM-DD HH:mm:ss.SSSSSS} | {level: <8} | {name}:{function}:{line} - {message}'  # pragma: no cover

     if log_serializer is True:
+        log_format = '{message}'  # pragma: no cover
         log_name = f'{log_name}.json'  # pragma: no cover
     else:
         log_name = f'{log_name}.log'  # pragma: no cover
@@ -140,7 +145,7 @@
         log_path,
         level=logging_level.upper(),
         format=log_format,
-        enqueue=True,
+        enqueue=enqueue,
         backtrace=log_backtrace,
         rotation=log_rotation,
         retention=log_retention,
@@ -149,6 +154,8 @@
         diagnose=log_diagnose,
     )

+    basic_config_handlers: list = []
+
     class InterceptHandler(logging.Handler):
         """
         Interceptor for standard logging.
@@ -194,12 +201,60 @@ def emit(self, record):
                 level, record.getMessage()
             )  # pragma: no cover

-    # Configure standard logging to use interceptor handler
-    logging.basicConfig(handlers=[InterceptHandler()], level=logging_level.upper())
-
-    # Add interceptor handler to all existing loggers
-    for name in logging.root.manager.loggerDict:
-        logging.getLogger(name).addHandler(InterceptHandler())
+    if intercept_standard_logging:
+        # Add interceptor handler to all existing loggers
+        for name in logging.root.manager.loggerDict:
+            logging.getLogger(name).addHandler(InterceptHandler())
+
+        # Add a single interceptor handler for the root logger
+        # (added via basicConfig below; appending it twice would duplicate records)
+        basic_config_handlers.append(InterceptHandler())

     # Set the root logger's level to the lowest level possible
     logging.getLogger().setLevel(logging.NOTSET)
+
+    class ResilientFileSink:
+        """
+        A file sink designed for resilience, capable of retrying write operations.
+
+        Attempts to write messages to a file, retrying the operation a specified
+        number of times if it fails. This is particularly useful where writes can
+        fail intermittently, for example due to file system locks or networked
+        file system delays.
+
+        Attributes:
+            path (str): The path to the file where messages will be written.
+            max_retries (int): The maximum number of retry attempts for a failed write operation.
+            retry_delay (float): The delay between retry attempts, in seconds.
+
+        Methods:
+            write(message): Attempts to write a message to the file, retrying on failure up to `max_retries` times.
+        """
+        def __init__(self, path, max_retries=5, retry_delay=0.1):
+            self.path = path
+            self.max_retries = max_retries
+            self.retry_delay = retry_delay
+
+        def write(self, message):  # pragma: no cover
+            for attempt in range(self.max_retries):
+                try:
+                    with open(self.path, 'a') as file:
+                        file.write(str(message))
+                    break  # Successfully written, break the loop
+                except FileNotFoundError:
+                    if attempt < self.max_retries - 1:
+                        time.sleep(self.retry_delay)  # Wait before retrying
+                    else:
+                        raise  # Reraise if max retries exceeded
+
+    if file_sink:
+        # ResilientFileSink only implements write(), so wrap it in a
+        # StreamHandler to expose the logging.Handler interface basicConfig expects
+        resilient_sink = ResilientFileSink(str(log_path))
+        basic_config_handlers.append(logging.StreamHandler(resilient_sink))
+
+    if basic_config_handlers:
+        logging.basicConfig(handlers=basic_config_handlers, level=logging_level.upper())
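The `InterceptHandler` used here is the pattern loguru documents for routing stdlib `logging` records into loguru. A minimal standalone version, outside `config_log` (this follows loguru's published recipe; the demo logger name is illustrative):

```python
# Minimal sketch of the stdlib-to-loguru intercept pattern used above.
import inspect
import logging

from loguru import logger

class InterceptHandler(logging.Handler):
    def emit(self, record: logging.LogRecord) -> None:
        # Map the stdlib level name to a loguru level when one exists.
        try:
            level = logger.level(record.levelname).name
        except ValueError:
            level = record.levelno
        # Walk back to the frame that issued the stdlib logging call so
        # loguru reports the real caller, not the logging machinery.
        frame, depth = inspect.currentframe(), 0
        while frame and (depth == 0 or frame.f_code.co_filename == logging.__file__):
            frame = frame.f_back
            depth += 1
        logger.opt(depth=depth, exception=record.exc_info).log(
            level, record.getMessage()
        )

logging.basicConfig(handlers=[InterceptHandler()], level=0)
logging.getLogger("some.library").warning("routed through loguru")
```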
diff --git a/examples/log_example.py b/examples/log_example.py
index 1c9ab3c5..71329b33 100644
--- a/examples/log_example.py
+++ b/examples/log_example.py
@@ -5,43 +5,29 @@
 License: MIT
 """
 import logging
-import random
 import secrets
-
+import threading
 from loguru import logger
 from tqdm import tqdm
-
 from dsg_lib.common_functions import logging_config

+# Configure logging as before
 logging_config.config_log(
-    logging_directory='log',  # Directory where logs will be stored
-    log_name='log',  # Name of the log file
-    logging_level='DEBUG',  # Logging level
-    log_rotation='500 MB',  # Log rotation size
-    log_retention='10 days',  # Log retention period
-    log_backtrace=True,  # Enable backtrace
-    # log_format="{time:YYYY-MM-DD HH:mm:ss.SSSSSS} | {level: <8} | {name}:{function}:{line} - {message}",  # Log format
-    log_serializer=False,  # Disable log serialization
-    log_diagnose=True,  # Enable diagnose
-    app_name='my_app',  # Application name
-    append_app_name=True,  # Append application name to the log file name
+    logging_directory='log',
+    log_name='log',
+    logging_level='DEBUG',
+    log_rotation='100 MB',
+    log_retention='10 days',
+    log_backtrace=True,
+    log_serializer=True,
+    log_diagnose=True,
+    # app_name='my_app',
+    # append_app_name=True,
+    file_sink=True,
+    intercept_standard_logging=True,
+    enqueue=False
 )
-# after configuring logging
-# use loguru to log messages
-logger.debug('This is a debug message')
-logger.info('This is an info message')
-logger.error('This is an error message')
-logger.warning('This is a warning message')
-logger.critical('This is a critical message')
-
-# will intercept all standard logging messages also
-logging.debug('This is a debug message')
-logging.info('This is an info message')
-logging.error('This is an error message')
-logging.warning('This is a warning message')
-logging.critical('This is a critical message')
-

 def div_zero(x, y):
     try:
@@ -56,12 +42,42 @@ def div_zero_two(x, y):
         return x / y

-a = div_zero(x=1, y=0)
-b = div_zero_two(x=1, y=0)
-for _ in tqdm(range(5000), ascii=True):
-    big_string = ''
-    for _ in range(random.randint(275, 1000)):
-        big_string += f'{secrets.token_urlsafe(random.randint(1,5))} '
-    # log a lot of data
-    logging.debug(f'Lets make this a big message {big_string}')
+def log_big_string(lqty=100, size=256):
+    big_string = secrets.token_urlsafe(size)
+    for _ in range(lqty):
+        logging.debug(f'Lets make this a big message {big_string}')
+        div_zero(x=1, y=0)
+        div_zero_two(x=1, y=0)
+        # after configuring logging
+        # use loguru to log messages
+        logger.debug('This is a debug message')
+        logger.info('This is an info message')
+        logger.error('This is an error message')
+        logger.warning('This is a warning message')
+        logger.critical('This is a critical message')
+
+        # will intercept all standard logging messages also
+        logging.debug('This is a debug message')
+        logging.info('This is an info message')
+        logging.error('This is an error message')
+        logging.warning('This is a warning message')
+        logging.critical('This is a critical message')
+
+
+def worker(wqty=100, lqty=100, size=256):
+    for _ in tqdm(range(wqty), ascii=True):
+        log_big_string(lqty=lqty, size=size)
+
+
+def main(wqty=100, lqty=100, size=256, workers=2):
+    threads = []
+    for _ in range(workers):  # Create worker threads
+        t = threading.Thread(target=worker, args=(wqty, lqty, size))
+        threads.append(t)
+        t.start()
+
+    for t in threads:
+        t.join()
+
+
+if __name__ == "__main__":
+    main(wqty=100, lqty=10, size=256, workers=10)
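With `log_serializer=True`, the loguru sink writes one JSON record per line to `log/log.json`, while the stdlib `file_sink` handler may append plain intercepted lines to the same file, so a reader should tolerate both. A sketch, assuming the path from the example above and loguru's documented serialize schema:

```python
# Read back the serialized log written by the example, skipping any
# non-JSON lines contributed by the plain stdlib file handler.
import json
from pathlib import Path

for line in Path('log/log.json').read_text().splitlines():
    try:
        record = json.loads(line)['record']
    except (json.JSONDecodeError, KeyError):
        continue  # plain-text line from the stdlib handler
    print(record['time']['repr'], record['level']['name'], record['message'])
```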
diff --git a/pyproject.toml b/pyproject.toml
index 934d1bc7..b293a9a1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,7 @@ build-backend = "hatchling.build"

 [project]
 name = "devsetgo_lib"
-version = "0.13.0"
+version = "0.13.1"
 requires-python = ">=3.9"
 description = """
 DevSetGo Library is a Python library offering reusable functions for efficient coding. It includes file operations, calendar utilities, pattern matching, advanced logging with loguru, FastAPI endpoints, async database handling, and email validation. Designed for ease of use and versatility, it's a valuable tool for Python developers.
@@ -116,9 +116,9 @@ quote-style = "single"

 [tool.flake8]
-max-line-length = 132
-max-doc-length = 132
-ignore = ["E302","E501"]
+max-line-length = 100
+max-doc-length = 100
+ignore = ["E302", "E501", "E303"]  # E302/E501 kept from before (ruff's specific ignores aren't directly transferable); E303 newly ignored
 exclude = [
     ".git",
     "__pycache__",
diff --git a/requirements.txt b/requirements.txt
index c5da490a..9e7268d4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,40 +1,40 @@
-aiomysql==0.2.0 # Vulnerabilities: None
-aiosqlite==0.20.0 # Vulnerabilities: None
-asyncpg==0.29.0 # Vulnerabilities: None
-autoflake==2.3.1 # Vulnerabilities: None
-autopep8==2.2.0 # From 2.1.0 | Vulnerabilities: None
-black==24.4.2 # Vulnerabilities: None
-bump2version==1.0.1 # Vulnerabilities: None
-click==8.1.7 # Vulnerabilities: None
-cx-Oracle==8.3.0 # Vulnerabilities: None
-fastapi[all]==0.111.0 # Vulnerabilities: None
-flake8==7.0.0 # Vulnerabilities: None
-genbadge[all]==1.1.1 # Vulnerabilities: None
-hatchling==1.24.2 # Vulnerabilities: None
-loguru==0.7.2 # Vulnerabilities: None
-mkdocs-material==9.5.25 # From 9.5.23 | Vulnerabilities: None
-mkdocs-print-site-plugin==2.5.0 # Vulnerabilities: None
-mkdocstrings[python,shell]==0.25.1 # Vulnerabilities: None
-packaging==24.0 # Vulnerabilities: None
-pre-commit==3.7.1 # Vulnerabilities: None
-psycopg2==2.9.9 # Vulnerabilities: None
-Pygments==2.18.0 # Vulnerabilities: None
-pylint==3.2.2 # From 3.2.0 | Vulnerabilities: None
-pymdown-extensions==10.8.1 # Vulnerabilities: None
-pytest==8.2.1 # From 8.2.0 | Vulnerabilities: None
-pytest-asyncio==0.23.7 # From 0.23.6 | Vulnerabilities: None
-pytest-cov==5.0.0 # Vulnerabilities: None
-pytest-mock==3.14.0 # Vulnerabilities: None
-pytest-runner==6.0.1 # Vulnerabilities: None
-pytest-xdist==3.6.1 # Vulnerabilities: None
-pytz==2024.1 # Vulnerabilities: None
-PyYAML==6.0.1 # Vulnerabilities: None
-ruff==0.4.7 # From 0.4.4 | Vulnerabilities: None
-SQLAlchemy==2.0.30 # Vulnerabilities: None
-toml==0.10.2 # Vulnerabilities: None
-tox==4.15.0 # Vulnerabilities: None
-tqdm==4.66.4 # Vulnerabilities: None
-twine==5.1.0 # Vulnerabilities: None
-watchdog==4.0.1 # From 4.0.0 | Vulnerabilities: None
-wheel==0.43.0 # Vulnerabilities: None
-xmltodict==0.13.0 # Vulnerabilities: None
+aiomysql==0.2.0 # Vulnerabilities: None
+aiosqlite==0.20.0 # Vulnerabilities: None
+asyncpg==0.29.0 # Vulnerabilities: None
+autoflake==2.3.1 # Vulnerabilities: None
+autopep8==2.3.1 # From 2.2.0 | Vulnerabilities: None
+black==24.4.2 # Vulnerabilities: None
+bump2version==1.0.1 # Vulnerabilities: None
+click==8.1.7 # Vulnerabilities: None
+cx-Oracle==8.3.0 # Vulnerabilities: None
+fastapi[all]==0.111.1 # From 0.111.0 | Vulnerabilities: None
+flake8==7.1.0 # From 7.0.0 | Vulnerabilities: None
+genbadge[all]==1.1.1 # Vulnerabilities: None
+hatchling==1.25.0 # From 1.24.2 | Vulnerabilities: None
+loguru==0.7.2 # Vulnerabilities: None
+mkdocs-material==9.5.29 # From 9.5.25 | Vulnerabilities: None
+mkdocs-print-site-plugin==2.5.0 # Vulnerabilities: None
+mkdocstrings[python,shell]==0.25.1 # Vulnerabilities: None
+packaging==24.1 # From 24.0 | Vulnerabilities: None
+pre-commit==3.7.1 # Vulnerabilities: None
+psycopg2==2.9.9 # Vulnerabilities: None
+Pygments==2.18.0 # Vulnerabilities: None
+pylint==3.2.5 # From 3.2.2 | Vulnerabilities: None
+pymdown-extensions==10.8.1 # Vulnerabilities: None
+pytest==8.3.1 # From 8.2.1 | Vulnerabilities: None
+pytest-asyncio==0.23.8 # From 0.23.7 | Vulnerabilities: None
+pytest-cov==5.0.0 # Vulnerabilities: None
+pytest-mock==3.14.0 # Vulnerabilities: None
+pytest-runner==6.0.1 # Vulnerabilities: None
+pytest-xdist==3.6.1 # Vulnerabilities: None
+pytz==2024.1 # Vulnerabilities: None
+PyYAML==6.0.1 # Vulnerabilities: None
+ruff==0.5.4 # From 0.4.7 | Vulnerabilities: None
+SQLAlchemy==2.0.31 # From 2.0.30 | Vulnerabilities: None
+toml==0.10.2 # Vulnerabilities: None
+tox==4.16.0 # From 4.15.0 | Vulnerabilities: None
+tqdm==4.66.4 # Vulnerabilities: None
+twine==5.1.1 # From 5.1.0 | Vulnerabilities: None
+watchdog==4.0.1 # Vulnerabilities: None
+wheel==0.43.0 # Vulnerabilities: None
+xmltodict==0.13.0 # Vulnerabilities: None
diff --git a/tests/test_common_functions/test_logging_config.py b/tests/test_common_functions/test_logging_config.py
index 6693db53..9516ddde 100644
--- a/tests/test_common_functions/test_logging_config.py
+++ b/tests/test_common_functions/test_logging_config.py
@@ -10,10 +10,20 @@ class TestConfigLog(unittest.TestCase):
     @patch('dsg_lib.common_functions.logging_config.logger')
     def test_config_log_with_valid_params(self, mock_logger):
         config_log(
-            logging_directory='logs',
-            log_name='app.log',
-            logging_level='DEBUG',
-            log_rotation='1 MB',
+            logging_directory='log',
+            log_name='log',
+            logging_level='INFO',
+            log_rotation='2 MB',
+            log_retention='30 days',
+            log_backtrace=False,
+            log_format=None,
+            log_serializer=False,
+            log_diagnose=False,
+            app_name=None,
+            append_app_name=False,
+            enqueue=True,
+            intercept_standard_logging=True,
+            file_sink=True,
         )
         mock_logger.configure.assert_called_once()
         mock_logger.add.assert_called_once()
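A possible companion test, not part of the suite above (names and fixtures are illustrative): instead of mocking the loguru logger, let `config_log` run for real and verify a log file lands on disk.

```python
# Sketch of an integration-style check using pytest's tmp_path fixture.
from pathlib import Path

from loguru import logger

from dsg_lib.common_functions.logging_config import config_log

def test_config_log_writes_file(tmp_path: Path):
    config_log(
        logging_directory=str(tmp_path),
        log_name='log',
        logging_level='INFO',
        log_serializer=False,
        enqueue=False,  # write synchronously so the assert sees the file
    )
    logger.info('hello')
    assert any(tmp_path.glob('*.log'))
```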
diff --git a/unreleased/pdf_margin.py b/unreleased/pdf_margin.py
new file mode 100644
index 00000000..e74760dc
--- /dev/null
+++ b/unreleased/pdf_margin.py
@@ -0,0 +1,141 @@
+import fitz  # PyMuPDF
+import os
+from tqdm import tqdm
+import json
+from datetime import datetime
+from pytz import timezone
+
+def check_interference(page, corner, width, height):
+    try:
+        page_rect = page.rect
+        page_width, page_height = page_rect.width, page_rect.height
+
+        if corner == "top_right":
+            x0, y0 = page_width - width, 0
+            x1, y1 = page_width, height
+        elif corner == "bottom_right":
+            x0, y0 = page_width - width, page_height - height
+            x1, y1 = page_width, page_height
+        else:
+            raise ValueError("Invalid corner, can only be top_right or bottom_right")
+
+        check_rect = fitz.Rect(x0, y0, x1, y1)
+
+        text_blocks = page.get_text("dict")["blocks"]
+        for block in text_blocks:
+            if block["type"] == 0:
+                bbox = fitz.Rect(block["bbox"])
+                if check_rect.intersects(bbox):
+                    return f"Text interference detected in {corner} corner."
+
+        images = page.get_images(full=True)
+        for img in images:
+            # get_image_bbox expects an item from get_images(full=True), not the bare xref
+            img_rect = fitz.Rect(page.get_image_bbox(img))
+            if check_rect.intersects(img_rect):
+                return f"Image interference detected in {corner} corner."
+
+        return None
+    except Exception as e:
+        return f"Error in processing {corner} {page}: {e}"
+
+def get_margin_by_page(page):
+    try:
+        page_rect = page.rect
+        page_width, page_height = page_rect.width, page_rect.height
+
+        text_blocks = page.get_text("dict")["blocks"]
+        text = page.get_text()
+
+        text_x0, text_y0 = page_width, page_height
+        text_x1, text_y1 = 0, 0
+
+        for block in text_blocks:
+            if block["type"] == 0:
+                bbox = block["bbox"]
+                text_x0 = min(text_x0, bbox[0])
+                text_y0 = min(text_y0, bbox[1])
+                text_x1 = max(text_x1, bbox[2])
+                text_y1 = max(text_y1, bbox[3])
+
+        left_margin = text_x0
+        right_margin = page_width - text_x1
+        top_margin = text_y0
+        bottom_margin = page_height - text_y1
+
+        corners = [
+            {"corner": "top_right", "width": 144, "height": 36},
+            {"corner": "bottom_right", "width": 36, "height": 216},
+        ]
+
+        interference = []
+        for c in corners:
+            inter = check_interference(
+                page=page, corner=c["corner"], width=c["width"], height=c["height"]
+            )
+            if inter and not inter.startswith("Error in"):
+                interference.append(f"{c['corner']} {inter}")
+
+        return {
+            "left_margin": round(left_margin / 72, 2),  # points to inches
+            "right_margin": round(right_margin / 72, 2),
+            "top_margin": round(top_margin / 72, 2),
+            "bottom_margin": round(bottom_margin / 72, 2),
+            "page_text": text,
+            "interference": interference,
+        }
+    except Exception as e:
+        print(f"Error processing {page}: {e}")
+        return {"error": str(e)}
+
+def run_pdf(pdf_folder, output_folder, safety_margin=0.4):
+    dir_list = [f for f in os.listdir(pdf_folder) if f.endswith(".pdf")][:200]
+
+    page_count = 0
+    document_margins = []
+    for pdf_file in tqdm(dir_list, desc="processing file", leave=True):
+        document = fitz.open(f"{pdf_folder}/{pdf_file}")
+
+        margin_list = []
+        for i in tqdm(range(len(document)), desc=f"processing {pdf_file}", leave=False):
+            page = document[i]
+            margin_dict = get_margin_by_page(page)
+            margin_list.append({"page": i + 1, "margin": margin_dict})
+            page_count += 1
+
+        document.close()
+        document_margins.append({"file": pdf_file, "margins": margin_list})
+
+    margin_issues = []
+    for da in document_margins:
+        file_name = da.get("file")
+        margin_page_list = []
+        for a in da.get("margins"):
+            page_number = a.get("page")
+            page_margin = a.get("margin")
+            warnings = []
+
+            # membership test on the dict itself (not .items()); also skips
+            # error dicts, which carry no margin or interference keys
+            if "interference" in page_margin:
+                for key, value in page_margin.items():
+                    if key.endswith("_margin") and value <= safety_margin:
+                        warnings.append(key)
+
+                if page_margin["interference"]:
+                    warnings.extend(page_margin["interference"])
+
+            if warnings:
+                margin_page_list.append({"page_number": page_number, "issues": warnings})
+
+        if margin_page_list:
+            margin_issues.append({"file": file_name, "issues": margin_page_list})
+
+    dt = datetime.now().astimezone(timezone("America/New_York"))
+    timestamp = dt.strftime("%Y-%m-%d-%H%M")
+
+    output_path = os.path.join(output_folder, f"margin_issues_{timestamp}.json")
+    with open(output_path, "w") as write_file:
+        json.dump(margin_issues, write_file)
+
+    print(f"Page count: {page_count} | len(margin_issues): {len(margin_issues)}")
+
+if __name__ == "__main__":
+    run_pdf(pdf_folder="/your/pdf/folder/", output_folder="/your/output/folder/")
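The interference check above reduces to rectangle intersection against a corner keep-out zone. A standalone sketch with PyMuPDF only (the file name reuses the sample PDF added in this PR; zone sizes are illustrative):

```python
# Flag text that enters a 2" x 0.5" keep-out zone in the top-right corner.
import fitz  # PyMuPDF

doc = fitz.open("unreleased/pdf_sample.pdf")
page = doc[0]
w = page.rect.width

# 72 points per inch: 144 x 36 points = 2" x 0.5".
keep_out = fitz.Rect(w - 144, 0, w, 36)

for block in page.get_text("dict")["blocks"]:
    if block["type"] == 0 and keep_out.intersects(fitz.Rect(block["bbox"])):
        print("text enters the top-right keep-out zone")
        break
doc.close()
```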
diff --git a/unreleased/pdf_processing.py b/unreleased/pdf_processing.py
new file mode 100644
index 00000000..e3be9e49
--- /dev/null
+++ b/unreleased/pdf_processing.py
@@ -0,0 +1,179 @@
+from fastapi import FastAPI, UploadFile, File
+from fastapi.responses import ORJSONResponse
+import time
+import io
+from pypdf import PdfReader
+from loguru import logger
+from tqdm import tqdm  # used in check_pdf; imported here rather than relying on the later import block
+
+app = FastAPI()
+
+@app.post('/validate-pdf', response_class=ORJSONResponse, status_code=201)
+async def check_pdf(
+    file: UploadFile = File(...),
+    include_text: bool = False,
+    check_text: bool = False,
+    include_page_errors: bool = False
+):
+    response = dict()
+    t0 = time.time()
+
+    response["file_name"] = file.filename
+    response["content_type"] = file.content_type
+    response["file_size"] = file.size
+    filters = {
+        "include_text": include_text,
+        "check_text": check_text
+    }
+
+    if file.content_type != "application/pdf":
+        message = f"File is not a PDF, but type {file.content_type}"
+        logger.error(message)
+        response["message"] = message
+        return ORJSONResponse(content=response, status_code=400)
+
+    pdf_content = await file.read()
+    reader = PdfReader(io.BytesIO(pdf_content))
+
+    if len(reader.pages) == 0:
+        message = "The PDF is empty"
+        logger.error(message)
+        response["message"] = message
+        return ORJSONResponse(content=response, status_code=400)
+
+    response["page_count"] = len(reader.pages)
+
+    meta = reader.metadata
+    if meta is None:
+        message = "The PDF does not contain meta data"
+        logger.error(message)
+        response["message"] = message
+        return ORJSONResponse(content=response, status_code=400)
+
+    cleaned_meta = {k: str(v).replace("\x00", "") for k, v in meta.items()}
+    response["meta"] = cleaned_meta
+
+    text = ""
+    if check_text:
+        results = get_pdf_content(pdf_content=pdf_content)
+        text = results["text"]
+        if not text.strip():
+            message = "The PDF does not contain readable text"
+            logger.error(message)
+            response["message"] = message
+            return ORJSONResponse(content=response, status_code=400)
+
+        common_words = ["the", "and", "is"]
+        words_found = [word for word in common_words if word in text]
+        if len(words_found) == 0:
+            message = "The PDF does not contain readable text, like the word 'the'"
+            logger.error(message)
+            response["message"] = message
+            return ORJSONResponse(content=response, status_code=400)
+
+        response["characters"] = len(text)
+        response["words_found"] = words_found
+        if include_page_errors:
+            response["errors"] = results["errors"]
+
+    if reader.is_encrypted:
+        message = "The PDF is encrypted and not allowed"
+        logger.error(message)
+        response["message"] = message
+        return ORJSONResponse(content=response, status_code=400)
+
+    embedded_fonts = []
+    for page in tqdm(reader.pages, desc="Finding Fonts"):
+        fonts = page.get_fonts()
+        for font in fonts:
+            font_name = font.get("BaseFont", "").replace("/", "").replace("+", "")
+            if font_name not in embedded_fonts:
+                embedded_fonts.append(font_name)
+
+    if not embedded_fonts:
+        message = "The PDF does not have embedded fonts"
+        logger.error(message)
+        response["message"] = message
+        return ORJSONResponse(content=response, status_code=400)
+
+    response["fonts"] = embedded_fonts
+    # /AcroForm lives in the document catalog (trailer["/Root"]), not in the trailer itself
+    form_fields = "/AcroForm" in reader.trailer["/Root"]
+    if form_fields:
+        message = "The PDF contains form fields"
+        logger.error(message)
+        response["message"] = message
+        return ORJSONResponse(content=response, status_code=400)
+
+    if include_text:
+        response["text"] = text
+
+    t1 = time.time()
+    logger.debug(f"PDF check response: {response}")
+    response["processing_time_seconds"] = f"{t1 - t0:.2f}"
+    return ORJSONResponse(content=response, status_code=201)
+
+
+# Function to extract data from a PDF file
+
+
+# coding: utf-8
+import io
+import re
+from functools import lru_cache
+
+from loguru import logger  # Import the Loguru logger
+from pypdf import PdfReader, PaperSize
+from tqdm import tqdm
+from unsync import unsync
+
+@unsync
+def extract_pdf_text(pdf_content, page_number: int):
+    try:
+        reader = get_reader(pdf_content)
+        # layout-mode extraction is exercised here; the plain extraction below is what gets returned
+        layout_text = reader.pages[page_number].extract_text(extraction_mode="layout", layout_mode_strip_rotated=True)
+        text = reader.pages[page_number].extract_text()
+        box = reader.pages[page_number].mediabox
+
+        print(f"left {box.left}")
+        print(f"right {box.right}")
+        print(f"lower left {box.lower_left}")
+        print(f"lower right {box.lower_right}")
+        print(f"upper left {box.upper_left}")
+        print(f"upper right {box.upper_right}")
+        print(f"top {box.top}")
+        print(f"bottom {box.bottom}")
+
+        return {"text": text, "page_num": page_number, "margin": box, "error": None}
+    except Exception as ex:
+        logger.error(ex)
+        return {"text": "", "page_num": page_number, "margin": None, "error": ex}
+
+@lru_cache(maxsize=300, typed=False)
+def get_reader(pdf_content):
+    reader = PdfReader(io.BytesIO(pdf_content))
+    return reader
+
+def is_valid_ssn(ssn):
+    ssn_regex = re.compile(r"^(?!000|666)[0-8]\d{2}-(?!00)\d{2}-(?!0000)\d{4}$")
+    return bool(ssn_regex.match(ssn))
+
+def get_pdf_content(pdf_content):
+    reader = PdfReader(io.BytesIO(pdf_content))
+
+    tasks = [
+        extract_pdf_text(pdf_content=pdf_content, page_number=page_number)
+        for page_number in tqdm(range(len(reader.pages)), desc="PDF Text Processing")
+    ]
+
+    results = [task.result() for task in tqdm(tasks, desc="PDF Text Results")]
+
+    results.sort(key=lambda x: x["page_num"])
+    combined_text = "\n".join([result["text"] for result in results])
+    # scan token by token; is_valid_ssn validates a single SSN, not a whole document
+    has_ssn = any(is_valid_ssn(token) for token in combined_text.split())
+    margins = [result["margin"] for result in results]
+    error_list = [
+        f"Error on page {result['page_num']}: {result['error']}"
+        for result in results
+        if result["error"] is not None
+    ]
+
+    return {"text": combined_text, "margins": margins, "errors": error_list, "PII": has_ssn}
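The endpoint can be exercised locally with FastAPI's TestClient. A sketch, assuming the module path matches the file location above and using the sample PDF from this PR:

```python
# POST the sample PDF to /validate-pdf and inspect the response.
from fastapi.testclient import TestClient

from unreleased.pdf_processing import app

client = TestClient(app)

with open("unreleased/pdf_sample.pdf", "rb") as f:
    resp = client.post(
        "/validate-pdf",
        params={"check_text": True, "include_page_errors": True},
        files={"file": ("pdf_sample.pdf", f, "application/pdf")},
    )
print(resp.status_code, resp.json().get("page_count"))
```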
print(f"TrimBox: {trimbox.width}x{trimbox.height} divid by 72 = {trimbox[2] / 72} x {trimbox[3] / 72}") + print(f"ArtBox: {artbox.width}x{artbox.height} divid by 72 = {artbox[2] / 72} x {artbox[3] / 72}") + print(f"BleedBox: {bleedbox.width}x{bleedbox.height} divid by 72 = {bleedbox[2] / 72} x {bleedbox[3] / 72}") + print(f"Unit Size: {unit_size}") + + except Exception as ex: + print(f"Error on page {page_number}: {ex}") + + + +text_body = "".join(parts) + +print(text_body) +for p in parts: + if len(p) > 100: + print(len(p),p) + +line = "embed code for the video you want to add. You can also type a keyword to search online for the video that best fits" +print(len(line), line)