CogStack
diff --git a/‎Dockerfile‎
Lines changed: 34 additions & 37 deletions b/‎Dockerfile‎
Lines changed: 34 additions & 37 deletions
diff --git a/‎config.py‎
Lines changed: 34 additions & 31 deletions b/‎config.py‎
Lines changed: 34 additions & 31 deletions
diff --git a/‎env/general.env‎
Lines changed: 2 additions & 1 deletion b/‎env/general.env‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎env/ocr_service.env‎
Lines changed: 2 additions & 1 deletion b/‎env/ocr_service.env‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎ocr_service/app/app.py‎
Lines changed: 2 additions & 1 deletion b/‎ocr_service/app/app.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎ocr_service/processor/processor.py‎
Lines changed: 10 additions & 10 deletions b/‎ocr_service/processor/processor.py‎
Lines changed: 10 additions & 10 deletions
@@ -21,18 +21,17 @@ ENV NVIDIA_VISIBLE_DEVICES=all
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,display
 
 # Keeps Python from generating .pyc files in the container (NOTE(review): any non-empty value, including 0, still disables .pyc writing on import)
-ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONDONTWRITEBYTECODE=0
 # Turns off buffering for easier container logging
 ENV PYTHONUNBUFFERED=1
 
-ENV SETUPTOOLS_USE_DISTUTILS=stdlib
+# ENV SETUPTOOLS_USE_DISTUTILS=stdlib
 
 # default user
 USER root
 
-# Update and install python3
-RUN apt-get update && apt-get upgrade -y && \
-     apt-get install -y software-properties-common
+# install extra features
+RUN apt-get update && apt-get upgrade -y && apt-get install -y software-properties-common
 
 # add extra repos
 RUN apt-add-repository multiverse && \
@@ -42,12 +41,9 @@ RUN apt-add-repository multiverse && \
     apt-get update && apt-get upgrade -y 
 
 # install req packages
-RUN apt-get install -y python3.11 python3.11-dev python3.11-venv python3-dev python3-pip
-
-RUN apt-get update && apt-get upgrade -y && \
-    apt-get --force-yes -o Dpkg::Options::="--force-confold" --force-yes -o Dpkg::Options::="--force-confdef" -fuy  dist-upgrade  && \
-    apt-get install -y \
-    pkg-config \
+RUN apt-get install -y --no-install-recommends python3-all-dev python3-dev python3.12 python3-pip libpython3.12-dev python3.12-dev
+RUN apt-get -y --no-install-recommends -o Dpkg::Options::="--force-confold" -y -o Dpkg::Options::="--force-confdef" -fuy dist-upgrade && \
+    apt-get install -y --no-install-recommends \
     gnupg \
     libssl-dev \
     wget \
@@ -66,54 +62,55 @@ RUN apt-get update && apt-get upgrade -y && \
     g++
 
 ##### utils for python and TESSERACT
-
 RUN echo "ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true" | debconf-set-selections
-RUN apt-get install -y --no-install-recommends fontconfig ttf-mscorefonts-installer
-RUN fc-cache -f -v
 
-RUN apt-get install -y libimage-exiftool-perl libtcnative-1 && \
-    apt-get install -y ttf-mscorefonts-installer fontconfig && \
-    apt-get install -y --fix-missing libsm6 libxext6 gstreamer1.0-libav fonts-deva fonts-dejavu fonts-gfs-didot fonts-gfs-didot-classic fonts-junicode fonts-ebgaramond fonts-noto-cjk fonts-takao-gothic fonts-vlgothic && \
-    apt-get install -y --fix-missing ghostscript ghostscript-x gsfonts gsfonts-other gsfonts-x11 fonts-croscore fonts-crosextra-caladea fonts-crosextra-carlito fonts-liberation fonts-open-sans fonts-noto-core fonts-ibm-plex fonts-urw-base35 && \
-    apt-get install -y --fix-missing fonts-noto fonts-noto-cjk fonts-noto-extra xfonts-terminus fonts-font-awesome fonts-hack fonts-inconsolata fonts-liberation2 fonts-mononoki && \
-    apt-get install -y --fix-missing libpcre3 libpcre3-dev && \
-    apt-get install -y --fix-missing mesa-opencl-icd pocl-opencl-icd && \
-    apt-get install -y --fix-missing libvips-tools libvips libvips-dev && \
-    apt-get install -y --fix-missing imagemagick libcairo2-dev tesseract-ocr tesseract-ocr-all libtesseract5 libtesseract-dev libleptonica-dev liblept5
+RUN apt-get install -y --no-install-recommends fontconfig ttf-mscorefonts-installer libimage-exiftool-perl libtcnative-1 \
+    libsm6 libxext6 gstreamer1.0-libav fonts-deva fonts-dejavu fonts-gfs-didot fonts-gfs-didot-classic fonts-junicode fonts-ebgaramond fonts-noto-cjk fonts-takao-gothic fonts-vlgothic \
+    ghostscript ghostscript-x gsfonts gsfonts-other gsfonts-x11 fonts-croscore fonts-crosextra-caladea fonts-crosextra-carlito fonts-liberation fonts-open-sans fonts-noto-core fonts-ibm-plex fonts-urw-base35 \
+    fonts-noto fonts-noto-cjk fonts-noto-extra xfonts-terminus fonts-font-awesome fonts-hack fonts-inconsolata fonts-liberation2 fonts-mononoki \
+    libpcre3 libpcre3-dev \
+    mesa-opencl-icd pocl-opencl-icd libvips-tools libvips libvips-dev \
+    imagemagick libcairo2-dev tesseract-ocr tesseract-ocr-all libtesseract5 libtesseract-dev libleptonica-dev liblept5
 
 # tesseract language packages
-RUN apt-get install -y --fix-missing tesseract-ocr-eng tesseract-ocr-osd tesseract-ocr-lat  && \
-    apt-get install -y --fix-missing tesseract-ocr-eng tesseract-ocr-enm tesseract-ocr-ita tesseract-ocr-osd tesseract-ocr-script-latn && \
-    apt-get install -y --fix-missing tesseract-ocr-fra tesseract-ocr-frk tesseract-ocr-deu tesseract-ocr-ces tesseract-ocr-dan tesseract-ocr-nld tesseract-ocr-nor && \
-    apt-get install -y --fix-missing tesseract-ocr-spa tesseract-ocr-swe tesseract-ocr-slk tesseract-ocr-ron tesseract-ocr-script-grek
+RUN apt-get install -y --no-install-recommends --fix-missing tesseract-ocr-eng tesseract-ocr-osd tesseract-ocr-lat \
+    tesseract-ocr-eng tesseract-ocr-enm tesseract-ocr-ita tesseract-ocr-osd tesseract-ocr-script-latn \
+    tesseract-ocr-fra tesseract-ocr-frk tesseract-ocr-deu tesseract-ocr-ces tesseract-ocr-dan tesseract-ocr-nld tesseract-ocr-nor \
+    tesseract-ocr-spa tesseract-ocr-swe tesseract-ocr-slk tesseract-ocr-ron tesseract-ocr-script-grek
 
 # Pillow package requirements
-RUN apt-get install -y python3-tk tcl8.6-dev tk8.6-dev libopenjp2-7-dev libharfbuzz-dev libfribidi-dev libxcb1-dev libtiff5-dev libjpeg8-dev zlib1g-dev libfreetype6-dev liblcms2-dev libwebp-dev 
+RUN apt-get install -y --no-install-recommends tcl8.6-dev tk8.6-dev libopenjp2-7-dev libharfbuzz-dev libfribidi-dev libxcb1-dev libtiff5-dev libjpeg8-dev zlib1g-dev libfreetype6-dev liblcms2-dev libwebp-dev libglib2.0-dev libgl1
 
 # python3 poppler requirement
-RUN apt-get install poppler-utils -y
+RUN apt-get install -y --no-install-recommends poppler-utils
 
+# libre office and java
 RUN apt-get install -y --no-install-recommends default-jre libreoffice-java-common libreoffice libreoffice-script-provider-python
 
-RUN apt-get clean autoclean && \
-    apt-get autoremove --purge -y
+# build font cache
+RUN fc-cache -f -v
+
+# there is a bug in the blinker package that causes issues with uwsgi
+# (this removes software-properties-common)
+RUN apt remove -y python3-blinker
+
+RUN apt-get clean autoclean && apt-get autoremove --purge -y
 
 # other openCL packages
 # beignet-opencl-icd
 
 RUN rm -rf /var/lib/apt/lists/*
 
-# python3 packages
-# RUN python3.11 -m pip install --no-cache-dir --upgrade pip --break-system-packages
-
 # create and copy the app  
 RUN mkdir /ocr_service
 COPY ./ /ocr_service
 WORKDIR /ocr_service
 
-# Install requirements for the app
-#RUN apt-get remove python3-wheel -y
-RUN python3.11 -m pip install --no-cache-dir --ignore-installed --break-system-packages -r ./requirements.txt
+# Install the app requirements; --no-build-isolation builds source packages (e.g. uwsgi) with the globally installed build tools
+RUN python3.12 -m pip install --no-cache-dir --break-system-packages --no-build-isolation -r ./requirements.txt
+
+# compile the python files
+RUN python3.12 -m compileall /ocr_service
 
 # Now run the simple api
 CMD ["/bin/bash", "start_service_production.sh"]
@@ -2,102 +2,105 @@
 import multiprocessing
 
 from sys import platform
+from typing import Tuple
 
-OCR_SERVICE_VERSION = "0.2.3"
+OCR_SERVICE_VERSION: str = "0.3.0"
 # 50 - CRITICAL, 40 - ERROR, 30 - WARNING, 20 - INFO, 10 - DEBUG, 0 - NOTSET
-LOG_LEVEL = int(os.environ.get("OCR_SERVICE_LOG_LEVEL", 40))
+LOG_LEVEL: int = int(os.environ.get("OCR_SERVICE_LOG_LEVEL", 40))
 
-DEBUG_MODE = os.environ.get("OCR_SERVICE_DEBUG_MODE", False)
+DEBUG_MODE: bool = True if os.environ.get("OCR_SERVICE_DEBUG_MODE", False) in [True, "True", "true"] else False
 
-ROOT_DIR = os.path.abspath(os.curdir)
-TMP_FILE_DIR = os.path.join(ROOT_DIR, "tmp")
-WORKER_PORT_MAP_FILE_PATH = os.path.join(TMP_FILE_DIR, './worker_process_data.txt')
+ROOT_DIR: str = os.path.abspath(os.curdir)
+TMP_FILE_DIR: str = os.path.join(ROOT_DIR, "tmp")
+WORKER_PORT_MAP_FILE_PATH: str = os.path.join(TMP_FILE_DIR, './worker_process_data.txt')
 
 
 # Should we actually ocr or just extract text from PDFs ?
 #  NOTE: OCR IS STILL APPLIED TO IMAGES if detected | possible vals : "OCR", "NO_OCR"
-OPERATION_MODE = os.environ.get("OCR_SERVICE_OPERATION_MODE", "OCR")
+OPERATION_MODE: str = os.environ.get("OCR_SERVICE_OPERATION_MODE", "OCR")
 
 # basic app settings
-OCR_SERVICE_PORT = os.environ.get("OCR_SERVICE_PORT", 8090)
+OCR_SERVICE_PORT: int = int(os.environ.get("OCR_SERVICE_PORT", 8090))
 
 # Tesseract model path: macos - /opt/homebrew/share/tessdata | linux - "/usr/local/share/tessdata"
-TESSDATA_PREFIX = os.environ.get("OCR_TESSDATA_PREFIX", "/opt/homebrew/share/tessdata")
+TESSDATA_PREFIX: str = os.environ.get("OCR_TESSDATA_PREFIX", "/opt/homebrew/share/tessdata")
 
 # Integer or Float - duration in seconds for the OCR processing, after which,
 #   tesseract will terminate and raise RuntimeError
-TESSERACT_TIMEOUT = int(os.environ.get("OCR_SERVICE_TESSERACT_TIMEOUT", 30))
+TESSERACT_TIMEOUT: int = int(os.environ.get("OCR_SERVICE_TESSERACT_TIMEOUT", 30))
 
 # Tesseract language code string. Defaults to eng if not specified! Example for multiple languages: lang='eng+fra'
-TESSERACT_LANGUAGE = os.environ.get("OCR_SERVICE_TESSERACT_LANG", "eng+lat")
+TESSERACT_LANGUAGE: str = os.environ.get("OCR_SERVICE_TESSERACT_LANG", "eng+lat")
 
 # Integer - modifies the processor priority for the Tesseract run. Not supported on Windows.
 #   Nice adjusts the niceness of unix-like processes.
-TESSERACT_NICE = int(os.environ.get("OCR_SERVICE_TESSERACT_NICE", -18))
+TESSERACT_NICE: int = int(os.environ.get("OCR_SERVICE_TESSERACT_NICE", -18))
 
 # Any additional custom configuration flags that are not available via the tesseract function.
 # For example: config='--psm 6'
-TESSERACT_CUSTOM_CONFIG_FLAGS = os.environ.get("OCR_SERVICE_TESSERACT_CUSTOM_CONFIG_FLAGS", "")
+TESSERACT_CUSTOM_CONFIG_FLAGS: str = os.environ.get("OCR_SERVICE_TESSERACT_CUSTOM_CONFIG_FLAGS", "")
 
 # Controls both threads and cpus for one WEB SERVICE thread, basically, one request handler.
 # This is derived normally from the amount of threads Gunicorn is running with, for example:
 #  - if we have OCR_SERVICE_THREADS = 4, the OCR service can handle at most 4 requests at the same time,
 #    this means that you can not use all of your CPUS for OCR-ing for 1 request,
 #    because that means the other requests are sitting idle while the first one uses all resources,
 #    and so it is recommended to regulate the number of threads per request
-OCR_WEB_SERVICE_THREADS = int(os.environ.get("OCR_WEB_SERVICE_THREADS", 1))
+OCR_WEB_SERVICE_THREADS: int = int(os.environ.get("OCR_WEB_SERVICE_THREADS", 1))
 
 # This controls the number of workers the ocr service may have; it is recommended to use this value
 #   instead of OCR_WEB_SERVICE_THREADS if you want to process multiple requests in parallel
 # WARNING: using more than 1 worker assumes you have set the OCR_WEB_SERVICE_WORKER_CLASS setting to sync !
 #          with the above mentioned,
 #          setting OCR_WEB_SERVICE_WORKER_CLASS to sync means that a worker will use 1 THREAD only,
 #          therefore OCR_WEB_SERVICE_THREADS is disregarded
-OCR_WEB_SERVICE_WORKERS = int(os.environ.get("OCR_WEB_SERVICE_WORKERS", 1))
+OCR_WEB_SERVICE_WORKERS: int = int(os.environ.get("OCR_WEB_SERVICE_WORKERS", 1))
 
 # set this to control the number of threads used for OCR-ing per web request thread (check OCR_WEB_SERVICE_THREADS)
-CPU_THREADS = int(os.environ.get("OCR_SERVICE_CPU_THREADS", (multiprocessing.cpu_count() / OCR_WEB_SERVICE_WORKERS)))
+CPU_THREADS: int = int(os.environ.get("OCR_SERVICE_CPU_THREADS",
+                                      (multiprocessing.cpu_count() / OCR_WEB_SERVICE_WORKERS)))
 
 # conversion thread number for the pdf -> PIL img conversion, per web request thread (check OCR_WEB_SERVICE_THREADS)
-CONVERTER_THREAD_NUM = int(os.environ.get("OCR_SERVICE_CONVERTER_THREADS",
-                                          (multiprocessing.cpu_count() / OCR_WEB_SERVICE_WORKERS)))
+CONVERTER_THREAD_NUM: int = int(os.environ.get("OCR_SERVICE_CONVERTER_THREADS",
+                                               (multiprocessing.cpu_count() / OCR_WEB_SERVICE_WORKERS)))
 
 # should we convert detected images to greyscale before OCR-ing
-OCR_CONVERT_GRAYSCALE_IMAGES = True
+OCR_CONVERT_GRAYSCALE_IMAGES: bool = True
 
 # dpi used for images in TESSERACT and other stuff
-OCR_IMAGE_DPI = int(os.environ.get("OCR_SERVICE_IMAGE_DPI", 200))
+OCR_IMAGE_DPI: int = int(os.environ.get("OCR_SERVICE_IMAGE_DPI", 200))
 
 # possible values: json (stringified output), dict (dict means no json.dumps() is applied to the output)
 OCR_SERVICE_RESPONSE_OUTPUT_TYPE: str = str(os.environ.get("OCR_SERVICE_RESPONSE_OUTPUT_TYPE", "json"))
 
 # LIBRE OFFICE SECTION
 
 # timeout in seconds before LibreOffice processes are terminated (env: OCR_SERVICE_LIBRE_OFFICE_PROCESS_TIMEOUT)
-LIBRE_OFFICE_PROCESS_TIMEOUT = int(os.environ.get("OCR_SERVICE_LIBRE_OFFICE_PROCESS_TIMEOUT", 20))
+LIBRE_OFFICE_PROCESS_TIMEOUT: int = int(os.environ.get("OCR_SERVICE_LIBRE_OFFICE_PROCESS_TIMEOUT", 100))
 
 # This is the port for the background soffice listener service that gets started with the app
 # used internally for LibreOffice doc conversion
 # the service should start multiple libre office servers for doc conversions,
 # a libre office server will only use 1 CPU by default (not changeable), thus,
 # for handling multiple requests, we will have one service per OCR_WEB_SERVICE_THREAD
-DEFAULT_LIBRE_OFFICE_SERVER_PORT = 9900
+DEFAULT_LIBRE_OFFICE_SERVER_PORT: int = 9900
 
-LIBRE_OFFICE_PORT_CAP = DEFAULT_LIBRE_OFFICE_SERVER_PORT + 1
+LIBRE_OFFICE_PORT_CAP: int = DEFAULT_LIBRE_OFFICE_SERVER_PORT + 1
 
 if OCR_WEB_SERVICE_THREADS > 1:
     LIBRE_OFFICE_PORT_CAP = DEFAULT_LIBRE_OFFICE_SERVER_PORT + OCR_WEB_SERVICE_THREADS
 if OCR_WEB_SERVICE_WORKERS > 1:
     LIBRE_OFFICE_PORT_CAP = DEFAULT_LIBRE_OFFICE_SERVER_PORT + OCR_WEB_SERVICE_WORKERS
 
-LIBRE_OFFICE_LISTENER_PORT_RANGE = os.environ.get("OCR_SERVICE_LIBRE_OFFICE_LISTENER_PORT_RANGE",
-                                                  range(DEFAULT_LIBRE_OFFICE_SERVER_PORT, LIBRE_OFFICE_PORT_CAP))
+LIBRE_OFFICE_LISTENER_PORT_RANGE: range | str = os.environ.get("OCR_SERVICE_LIBRE_OFFICE_LISTENER_PORT_RANGE",
+                                                               range(DEFAULT_LIBRE_OFFICE_SERVER_PORT,
+                                                                     LIBRE_OFFICE_PORT_CAP))
 
-LIBRE_OFFICE_NETWORK_INTERFACE = "localhost"
+LIBRE_OFFICE_NETWORK_INTERFACE: str = "localhost"
 
 
 # seconds to check for possible failure of port
-LIBRE_OFFICE_PROCESSES_LISTENER_INTERVAL = 10
+LIBRE_OFFICE_PROCESSES_LISTENER_INTERVAL: int = 10
 
 
 # DO NOT CHANGE THIS UNLESS YOU ARE DEVELOPING OR RUNNING THIS APP LOCALLY
@@ -106,9 +109,9 @@
 #
 # MacOS X: /Applications/LibreOffice.app/Contents/Resources/python
 # Windows: C:/Windows/py.exe
-# Linux(Ubuntu): /usr/bin/python3.11 (forcefully uses python3.11,
+# Linux(Ubuntu): /usr/bin/python3.12 (forcefully uses python3.12,
 #  to point to the default python on your system just use /usr/bin/python3)
-LIBRE_OFFICE_PYTHON_PATH = "/Applications/LibreOffice.app/Contents/Resources/python"
+LIBRE_OFFICE_PYTHON_PATH: str = "/Applications/LibreOffice.app/Contents/Resources/python"
 
 # DO NOT CHANGE THIS UNLESS YOU ARE DEVELOPING OR RUNNING THIS APP LOCALLY
 # Description: this sets the path to the LibreOffice executable,
@@ -118,11 +121,11 @@
 # MacOS X: /Applications/LibreOffice.app/Contents/MacOS/soffice
 # Windows: %ProgramFiles%/LibreOffice/Program/soffice
 # Linux(Ubuntu): /usr/bin/soffice
-LIBRE_OFFICE_EXEC_PATH = "/Applications/LibreOffice.app/Contents/MacOS/soffice"
+LIBRE_OFFICE_EXEC_PATH: str = "/Applications/LibreOffice.app/Contents/MacOS/soffice"
 
 if platform == "linux" or platform == "linux2":
     LIBRE_OFFICE_EXEC_PATH = "/usr/bin/soffice"
-    LIBRE_OFFICE_PYTHON_PATH = "/usr/bin/python3.11"
+    LIBRE_OFFICE_PYTHON_PATH = "/usr/bin/python3.12"
 
     # this is the path from the Docker image, Ubuntu Lunar, Noble too.
     TESSDATA_PREFIX = "/usr/share/tesseract-ocr/5/tessdata"
 
@@ -5,4 +5,5 @@
 # remove if this causes issues on any other platform
 # possible values: amd64, arm64
 CPU_ARCHITECTURE=amd64
-DOCKER_DEFAULT_PLATFORM=linux/${CPU_ARCHITECTURE:-amd64}
+DOCKER_DEFAULT_PLATFORM=linux/${CPU_ARCHITECTURE:-amd64}
+COMPOSE_BAKE=True
@@ -16,7 +16,8 @@ OCR_WEB_SERVICE_WORKER_CLASS="sync"
 OCR_SERVICE_OPERATION_MODE=OCR
 
 # 50 - CRITICAL, 40 - ERROR, 30 - WARNING, 20 - INFO, 10 - DEBUG, 0 - NOTSET
-OCR_SERVICE_LOG_LEVEL=10
+OCR_SERVICE_LOG_LEVEL=40
+
 
 # possible values: json (stringified output), dict (dict means no json.dumps() is applied to the output)
 OCR_SERVICE_RESPONSE_OUTPUT_TYPE="json"
 
@@ -72,6 +72,7 @@ def create_app():
         _loffice_processes.update(start_office_converter_servers())
 
         app.register_blueprint(api)
+        app.debug = DEBUG_MODE
 
         # share processes for api call resource allocation
         api.processor = Processor()
@@ -115,4 +116,4 @@ def exit_handler(port_num: int):
 
 
 if __name__ == '__main__':
-    atexit.register(exit_handler)
+    atexit.register(exit_handler, port_num=get_assigned_port(os.getpid()))
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import json
 import logging
 import os
 import sys
@@ -15,8 +14,8 @@
 import injector
 import pypdfium2 as pdfium
 
-from tesserocr import get_languages, PyTessBaseAPI
-from filetype.types import DOCUMENT, IMAGE, archive, document, image
+from tesserocr import PyTessBaseAPI
+from filetype.types import DOCUMENT, IMAGE, archive
 from html2image import Html2Image
 from PIL import Image, ImageFile
 
@@ -39,9 +38,8 @@ class Processor:
 
     @injector.inject
     def __init__(self):
-        app_log_level = os.getenv("LOG_LEVEL", LOG_LEVEL)
-        self.log = setup_logging(component_name="processor", log_level=app_log_level)
-        self.log.debug("log level set to : " + str(app_log_level))
+        self.log = setup_logging(component_name="processor", log_level=LOG_LEVEL)
+        self.log.debug("log level set to : " + str(LOG_LEVEL))
         self.loffice_process_list = {}
 
     def _preprocess_html_to_img(self, stream: bytes, file_name: str) -> List[ImageFile.ImageFile]:
@@ -145,6 +143,9 @@ def _preprocess_doc(self, stream: bytes, file_name: str) -> bytes:
         """
 
         pdf_stream = None
+        doc_file_path = ""
+        pdf_file_path = ""
+        used_port_num = None
 
         try:
             # generate unique id
@@ -160,7 +161,6 @@ def _preprocess_doc(self, stream: bytes, file_name: str) -> bytes:
             conversion_time_start = time.time()
 
             loffice_subprocess = None
-            used_port_num = None
 
             for port_num, loffice_process in self.loffice_process_list.items():
                 if loffice_process["used"] is False:
@@ -266,7 +266,7 @@ def _process_image(self, img: PILImage, img_id: int, tess_api: PyTessBaseAPI) ->
         return output_str, img_id, tess_data
 
     def _init_tesseract_api_worker(self):
-        tess_api = PyTessBaseAPI(path=TESSDATA_PREFIX, lang=TESSERACT_LANGUAGE)
+        tess_api = PyTessBaseAPI(path=TESSDATA_PREFIX, lang=TESSERACT_LANGUAGE)  # type: ignore
         self.log.debug("Initialised pytesseract api worker for language:" + str(TESSERACT_LANGUAGE))
         return tess_api
 
@@ -295,7 +295,7 @@ def _process(self, stream: bytes, file_name: str) -> tuple[str, dict]:
         doc_metadata: dict = {}
 
         if file_type is not None:
-            doc_metadata["content-type"] = str(file_type.mime)
+            doc_metadata["content-type"] = str(file_type.mime)  # type: ignore
         else:
             doc_metadata["content-type"] = "text/plain"
 
@@ -373,7 +373,7 @@ def _process(self, stream: bytes, file_name: str) -> tuple[str, dict]:
                 doc_metadata["pages"] = image_count
                 doc_metadata["confidence"] = round(sum([page["confidence"] for page in tess_data]) / image_count, 4)
 
-            output_text = output_text.translate({'\\n': '', '\\t': '', '\n\n': '\n'})
+            output_text = output_text.translate({'\\n': '', '\\t': '', '\n\n': '\n'})  # type: ignore
         except Exception:
             raise Exception("Failed to convert/generate image content: " + str(traceback.format_exc()))