Skip to content

Commit 2a346e4

Browse files
committed
Updated requirements and moved to python 3.12
1 parent 35ac478 commit 2a346e4

File tree

10 files changed

+106
-105
lines changed

10 files changed

+106
-105
lines changed

Dockerfile

Lines changed: 34 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -21,18 +21,17 @@ ENV NVIDIA_VISIBLE_DEVICES=all
2121
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,display
2222

2323
# Keeps Python from generating .pyc files in the container
24-
ENV PYTHONDONTWRITEBYTECODE=1
24+
ENV PYTHONDONTWRITEBYTECODE=0
2525
# Turns off buffering for easier container logging
2626
ENV PYTHONUNBUFFERED=1
2727

28-
ENV SETUPTOOLS_USE_DISTUTILS=stdlib
28+
# ENV SETUPTOOLS_USE_DISTUTILS=stdlib
2929

3030
# default user
3131
USER root
3232

33-
# Update and install python3
34-
RUN apt-get update && apt-get upgrade -y && \
35-
apt-get install -y software-properties-common
33+
# install extra features
34+
RUN apt-get update && apt-get upgrade -y && apt-get install -y software-properties-common
3635

3736
# add extra repos
3837
RUN apt-add-repository multiverse && \
@@ -42,12 +41,9 @@ RUN apt-add-repository multiverse && \
4241
apt-get update && apt-get upgrade -y
4342

4443
# install req packages
45-
RUN apt-get install -y python3.11 python3.11-dev python3.11-venv python3-dev python3-pip
46-
47-
RUN apt-get update && apt-get upgrade -y && \
48-
apt-get --force-yes -o Dpkg::Options::="--force-confold" --force-yes -o Dpkg::Options::="--force-confdef" -fuy dist-upgrade && \
49-
apt-get install -y \
50-
pkg-config \
44+
RUN apt-get install -y --no-install-recommends python3-all-dev python3-dev python3.12 python3-pip libpython3.12-dev python3.12-dev
45+
RUN apt-get -y --no-install-recommends -o Dpkg::Options::="--force-confold" -y -o Dpkg::Options::="--force-confdef" -fuy dist-upgrade && \
46+
apt-get install -y --no-install-recommends \
5147
gnupg \
5248
libssl-dev \
5349
wget \
@@ -66,54 +62,55 @@ RUN apt-get update && apt-get upgrade -y && \
6662
g++
6763

6864
##### utils for python and TESSERACT
69-
7065
RUN echo "ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true" | debconf-set-selections
71-
RUN apt-get install -y --no-install-recommends fontconfig ttf-mscorefonts-installer
72-
RUN fc-cache -f -v
7366

74-
RUN apt-get install -y libimage-exiftool-perl libtcnative-1 && \
75-
apt-get install -y ttf-mscorefonts-installer fontconfig && \
76-
apt-get install -y --fix-missing libsm6 libxext6 gstreamer1.0-libav fonts-deva fonts-dejavu fonts-gfs-didot fonts-gfs-didot-classic fonts-junicode fonts-ebgaramond fonts-noto-cjk fonts-takao-gothic fonts-vlgothic && \
77-
apt-get install -y --fix-missing ghostscript ghostscript-x gsfonts gsfonts-other gsfonts-x11 fonts-croscore fonts-crosextra-caladea fonts-crosextra-carlito fonts-liberation fonts-open-sans fonts-noto-core fonts-ibm-plex fonts-urw-base35 && \
78-
apt-get install -y --fix-missing fonts-noto fonts-noto-cjk fonts-noto-extra xfonts-terminus fonts-font-awesome fonts-hack fonts-inconsolata fonts-liberation2 fonts-mononoki && \
79-
apt-get install -y --fix-missing libpcre3 libpcre3-dev && \
80-
apt-get install -y --fix-missing mesa-opencl-icd pocl-opencl-icd && \
81-
apt-get install -y --fix-missing libvips-tools libvips libvips-dev && \
82-
apt-get install -y --fix-missing imagemagick libcairo2-dev tesseract-ocr tesseract-ocr-all libtesseract5 libtesseract-dev libleptonica-dev liblept5
67+
RUN apt-get install -y --no-install-recommends fontconfig ttf-mscorefonts-installer libimage-exiftool-perl libtcnative-1 \
68+
libsm6 libxext6 gstreamer1.0-libav fonts-deva fonts-dejavu fonts-gfs-didot fonts-gfs-didot-classic fonts-junicode fonts-ebgaramond fonts-noto-cjk fonts-takao-gothic fonts-vlgothic \
69+
ghostscript ghostscript-x gsfonts gsfonts-other gsfonts-x11 fonts-croscore fonts-crosextra-caladea fonts-crosextra-carlito fonts-liberation fonts-open-sans fonts-noto-core fonts-ibm-plex fonts-urw-base35 \
70+
fonts-noto fonts-noto-cjk fonts-noto-extra xfonts-terminus fonts-font-awesome fonts-hack fonts-inconsolata fonts-liberation2 fonts-mononoki \
71+
libpcre3 libpcre3-dev \
72+
mesa-opencl-icd pocl-opencl-icd libvips-tools libvips libvips-dev \
73+
imagemagick libcairo2-dev tesseract-ocr tesseract-ocr-all libtesseract5 libtesseract-dev libleptonica-dev liblept5
8374

8475
# tesseract language packages
85-
RUN apt-get install -y --fix-missing tesseract-ocr-eng tesseract-ocr-osd tesseract-ocr-lat && \
86-
apt-get install -y --fix-missing tesseract-ocr-eng tesseract-ocr-enm tesseract-ocr-ita tesseract-ocr-osd tesseract-ocr-script-latn && \
87-
apt-get install -y --fix-missing tesseract-ocr-fra tesseract-ocr-frk tesseract-ocr-deu tesseract-ocr-ces tesseract-ocr-dan tesseract-ocr-nld tesseract-ocr-nor && \
88-
apt-get install -y --fix-missing tesseract-ocr-spa tesseract-ocr-swe tesseract-ocr-slk tesseract-ocr-ron tesseract-ocr-script-grek
76+
RUN apt-get install -y --no-install-recommends --fix-missing tesseract-ocr-eng tesseract-ocr-osd tesseract-ocr-lat \
77+
tesseract-ocr-eng tesseract-ocr-enm tesseract-ocr-ita tesseract-ocr-osd tesseract-ocr-script-latn \
78+
tesseract-ocr-fra tesseract-ocr-frk tesseract-ocr-deu tesseract-ocr-ces tesseract-ocr-dan tesseract-ocr-nld tesseract-ocr-nor \
79+
tesseract-ocr-spa tesseract-ocr-swe tesseract-ocr-slk tesseract-ocr-ron tesseract-ocr-script-grek
8980

9081
# Pillow package requirements
91-
RUN apt-get install -y python3-tk tcl8.6-dev tk8.6-dev libopenjp2-7-dev libharfbuzz-dev libfribidi-dev libxcb1-dev libtiff5-dev libjpeg8-dev zlib1g-dev libfreetype6-dev liblcms2-dev libwebp-dev
82+
RUN apt-get install -y --no-install-recommends tcl8.6-dev tk8.6-dev libopenjp2-7-dev libharfbuzz-dev libfribidi-dev libxcb1-dev libtiff5-dev libjpeg8-dev zlib1g-dev libfreetype6-dev liblcms2-dev libwebp-dev libglib2.0-dev libgl1
9283

9384
# python3 poppler requirement
94-
RUN apt-get install poppler-utils -y
85+
RUN apt-get install -y --no-install-recommends poppler-utils
9586

87+
# libre office and java
9688
RUN apt-get install -y --no-install-recommends default-jre libreoffice-java-common libreoffice libreoffice-script-provider-python
9789

98-
RUN apt-get clean autoclean && \
99-
apt-get autoremove --purge -y
90+
# build font cache
91+
RUN fc-cache -f -v
92+
93+
# there is a bug in the blinker package that causes issues with uwsgi
94+
# (this removes software-properties-common)
95+
RUN apt remove -y python3-blinker
96+
97+
RUN apt-get clean autoclean && apt-get autoremove --purge -y
10098

10199
# other openCL packages
102100
# beignet-opencl-icd
103101

104102
RUN rm -rf /var/lib/apt/lists/*
105103

106-
# python3 packages
107-
# RUN python3.11 -m pip install --no-cache-dir --upgrade pip --break-system-packages
108-
109104
# create and copy the app
110105
RUN mkdir /ocr_service
111106
COPY ./ /ocr_service
112107
WORKDIR /ocr_service
113108

114-
# Install requirements for the app
115-
#RUN apt-get remove python3-wheel -y
116-
RUN python3.11 -m pip install --no-cache-dir --ignore-installed --break-system-packages -r ./requirements.txt
109+
# Install uwsgi from PyPI source using the global tools
110+
RUN python3.12 -m pip install --no-cache-dir --break-system-packages --no-build-isolation -r ./requirements.txt
111+
112+
# compile the python files
113+
RUN python3.12 -m compileall /ocr_service
117114

118115
# Now run the simple api
119116
CMD ["/bin/bash", "start_service_production.sh"]

config.py

Lines changed: 34 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -2,102 +2,105 @@
22
import multiprocessing
33

44
from sys import platform
5+
from typing import Tuple
56

6-
OCR_SERVICE_VERSION = "0.2.3"
7+
OCR_SERVICE_VERSION: str = "0.3.0"
78
# 50 - CRITICAL, 40 - ERROR, 30 - WARNING, 20 - INFO, 10 - DEBUG, 0 - NOTSET
8-
LOG_LEVEL = int(os.environ.get("OCR_SERVICE_LOG_LEVEL", 40))
9+
LOG_LEVEL: int = int(os.environ.get("OCR_SERVICE_LOG_LEVEL", 40))
910

10-
DEBUG_MODE = os.environ.get("OCR_SERVICE_DEBUG_MODE", False)
11+
DEBUG_MODE: bool = True if os.environ.get("OCR_SERVICE_DEBUG_MODE", False) in [True, "True", "true"] else False
1112

12-
ROOT_DIR = os.path.abspath(os.curdir)
13-
TMP_FILE_DIR = os.path.join(ROOT_DIR, "tmp")
14-
WORKER_PORT_MAP_FILE_PATH = os.path.join(TMP_FILE_DIR, './worker_process_data.txt')
13+
ROOT_DIR: str = os.path.abspath(os.curdir)
14+
TMP_FILE_DIR: str = os.path.join(ROOT_DIR, "tmp")
15+
WORKER_PORT_MAP_FILE_PATH: str = os.path.join(TMP_FILE_DIR, './worker_process_data.txt')
1516

1617

1718
# Should we actually ocr or just extract text from PDFs ?
1819
# NOTE: OCR IS STILL APPLIED TO IMAGES if detected | possible vals : "OCR", "NO_OCR"
19-
OPERATION_MODE = os.environ.get("OCR_SERVICE_OPERATION_MODE", "OCR")
20+
OPERATION_MODE: str = os.environ.get("OCR_SERVICE_OPERATION_MODE", "OCR")
2021

2122
# basic app settings
22-
OCR_SERVICE_PORT = os.environ.get("OCR_SERVICE_PORT", 8090)
23+
OCR_SERVICE_PORT: int = int(os.environ.get("OCR_SERVICE_PORT", 8090))
2324

2425
# Tesseract model path: macos - /opt/homebrew/share/tessdata | linux - "/usr/local/share/tessdata"
25-
TESSDATA_PREFIX = os.environ.get("OCR_TESSDATA_PREFIX", "/opt/homebrew/share/tessdata")
26+
TESSDATA_PREFIX: str = os.environ.get("OCR_TESSDATA_PREFIX", "/opt/homebrew/share/tessdata")
2627

2728
# Integer or Float - duration in seconds for the OCR processing, after which,
2829
# tesseract will terminate and raise RuntimeError
29-
TESSERACT_TIMEOUT = int(os.environ.get("OCR_SERVICE_TESSERACT_TIMEOUT", 30))
30+
TESSERACT_TIMEOUT: int = int(os.environ.get("OCR_SERVICE_TESSERACT_TIMEOUT", 30))
3031

3132
# Tesseract language code string. Defaults to eng if not specified! Example for multiple languages: lang='eng+fra'
32-
TESSERACT_LANGUAGE = os.environ.get("OCR_SERVICE_TESSERACT_LANG", "eng+lat")
33+
TESSERACT_LANGUAGE: str = os.environ.get("OCR_SERVICE_TESSERACT_LANG", "eng+lat")
3334

3435
# Integer - modifies the processor priority for the Tesseract run. Not supported on Windows.
3536
# Nice adjusts the niceness of unix-like processes.
36-
TESSERACT_NICE = int(os.environ.get("OCR_SERVICE_TESSERACT_NICE", -18))
37+
TESSERACT_NICE: int = int(os.environ.get("OCR_SERVICE_TESSERACT_NICE", -18))
3738

3839
# Any additional custom configuration flags that are not available via the tesseract function.
3940
# For example: config='--psm 6'
40-
TESSERACT_CUSTOM_CONFIG_FLAGS = os.environ.get("OCR_SERVICE_TESSERACT_CUSTOM_CONFIG_FLAGS", "")
41+
TESSERACT_CUSTOM_CONFIG_FLAGS: str = os.environ.get("OCR_SERVICE_TESSERACT_CUSTOM_CONFIG_FLAGS", "")
4142

4243
# Controls both threads and cpus for one WEB SERVICE thread, basically, one request handler.
4344
# This is derived normally from the amount of threads Gunicorn is running with, for example:
4445
# - if we have OCR_SERVICE_THREADS = 4, the OCR service can handle at most 4 requests at the same time,
4546
# this means that you can not use all of your CPUS for OCR-ing for 1 request,
4647
# because that means the other requests are sitting idle while the first one uses all resources,
4748
# and so it is recommended to regulate the number of threads per request
48-
OCR_WEB_SERVICE_THREADS = int(os.environ.get("OCR_WEB_SERVICE_THREADS", 1))
49+
OCR_WEB_SERVICE_THREADS: int = int(os.environ.get("OCR_WEB_SERVICE_THREADS", 1))
4950

5051
# This controls the number of workers the OCR service may have; it is recommended to use this value
5152
# instead of OCR_WEB_SERVICE_THREADS if you want to process multiple requests in parallel
5253
# WARNING: using more than 1 workers assumes you have set the OCR_WEB_SERVICE_WORKER_CLASS setting to sync !
5354
# with the above mentioned,
5455
# setting OCR_WEB_SERVICE_WORKER_CLASS to sync means that a worker will use 1 THREAD only,
5556
# therefore OCR_WEB_SERVICE_THREADS is disregarded
56-
OCR_WEB_SERVICE_WORKERS = int(os.environ.get("OCR_WEB_SERVICE_WORKERS", 1))
57+
OCR_WEB_SERVICE_WORKERS: int = int(os.environ.get("OCR_WEB_SERVICE_WORKERS", 1))
5758

5859
# set this to control the number of threads used for OCR-ing per web request thread (check OCR_WEB_SERVICE_THREADS)
59-
CPU_THREADS = int(os.environ.get("OCR_SERVICE_CPU_THREADS", (multiprocessing.cpu_count() / OCR_WEB_SERVICE_WORKERS)))
60+
CPU_THREADS: int = int(os.environ.get("OCR_SERVICE_CPU_THREADS",
61+
(multiprocessing.cpu_count() / OCR_WEB_SERVICE_WORKERS)))
6062

6163
# conversion thread number for the pdf -> PIL img conversion, per web request thread (check OCR_WEB_SERVICE_THREADS)
62-
CONVERTER_THREAD_NUM = int(os.environ.get("OCR_SERVICE_CONVERTER_THREADS",
63-
(multiprocessing.cpu_count() / OCR_WEB_SERVICE_WORKERS)))
64+
CONVERTER_THREAD_NUM: int = int(os.environ.get("OCR_SERVICE_CONVERTER_THREADS",
65+
(multiprocessing.cpu_count() / OCR_WEB_SERVICE_WORKERS)))
6466

6567
# should we convert detected images to greyscale before OCR-ing
66-
OCR_CONVERT_GRAYSCALE_IMAGES = True
68+
OCR_CONVERT_GRAYSCALE_IMAGES: bool = True
6769

6870
# dpi used for images in TESSERACT and other stuff
69-
OCR_IMAGE_DPI = int(os.environ.get("OCR_SERVICE_IMAGE_DPI", 200))
71+
OCR_IMAGE_DPI: int = int(os.environ.get("OCR_SERVICE_IMAGE_DPI", 200))
7072

7173
# possible values: json (stringified output), dict (dict means no json.dumps() is applied to the output)
7274
OCR_SERVICE_RESPONSE_OUTPUT_TYPE: str = str(os.environ.get("OCR_SERVICE_RESPONSE_OUTPUT_TYPE", "json"))
7375

7476
# LIBRE OFFICE SECTION
7577

7678
# timeout (in seconds) before LibreOffice processes are terminated; overridable via the env var below
77-
LIBRE_OFFICE_PROCESS_TIMEOUT = int(os.environ.get("OCR_SERVICE_LIBRE_OFFICE_PROCESS_TIMEOUT", 20))
79+
LIBRE_OFFICE_PROCESS_TIMEOUT: int = int(os.environ.get("OCR_SERVICE_LIBRE_OFFICE_PROCESS_TIMEOUT", 100))
7880

7981
# This is the port for the background soffice listener service that gets started with the app
8082
# used internally for LibreOffice doc conversion
8183
# the service should start multiple libre office servers for doc conversions,
8284
# a libre office server will only use 1 CPU by default (not changeable), thus,
8385
# for handling multiple requests, we will have one service per OCR_WEB_SERVICE_THREAD
84-
DEFAULT_LIBRE_OFFICE_SERVER_PORT = 9900
86+
DEFAULT_LIBRE_OFFICE_SERVER_PORT: int = 9900
8587

86-
LIBRE_OFFICE_PORT_CAP = DEFAULT_LIBRE_OFFICE_SERVER_PORT + 1
88+
LIBRE_OFFICE_PORT_CAP: int = DEFAULT_LIBRE_OFFICE_SERVER_PORT + 1
8789

8890
if OCR_WEB_SERVICE_THREADS > 1:
8991
LIBRE_OFFICE_PORT_CAP = DEFAULT_LIBRE_OFFICE_SERVER_PORT + OCR_WEB_SERVICE_THREADS
9092
if OCR_WEB_SERVICE_WORKERS > 1:
9193
LIBRE_OFFICE_PORT_CAP = DEFAULT_LIBRE_OFFICE_SERVER_PORT + OCR_WEB_SERVICE_WORKERS
9294

93-
LIBRE_OFFICE_LISTENER_PORT_RANGE = os.environ.get("OCR_SERVICE_LIBRE_OFFICE_LISTENER_PORT_RANGE",
94-
range(DEFAULT_LIBRE_OFFICE_SERVER_PORT, LIBRE_OFFICE_PORT_CAP))
95+
LIBRE_OFFICE_LISTENER_PORT_RANGE: range | str = os.environ.get("OCR_SERVICE_LIBRE_OFFICE_LISTENER_PORT_RANGE",
96+
range(DEFAULT_LIBRE_OFFICE_SERVER_PORT,
97+
LIBRE_OFFICE_PORT_CAP))
9598

96-
LIBRE_OFFICE_NETWORK_INTERFACE = "localhost"
99+
LIBRE_OFFICE_NETWORK_INTERFACE: str = "localhost"
97100

98101

99102
# interval (in seconds) between checks for a possible port failure
100-
LIBRE_OFFICE_PROCESSES_LISTENER_INTERVAL = 10
103+
LIBRE_OFFICE_PROCESSES_LISTENER_INTERVAL: int = 10
101104

102105

103106
# DO NOT CHANGE THIS UNLESS YOU ARE DEVELOPING OR RUNNING THIS APP LOCALLY
@@ -106,9 +109,9 @@
106109
#
107110
# MacOS X: /Applications/LibreOffice.app/Contents/Resources/python
108111
# Windows: C:/Windows/py.exe
109-
# Linux(Ubuntu): /usr/bin/python3.11 (forcefully uses python3.11,
112+
# Linux(Ubuntu): /usr/bin/python3.12 (forcefully uses python3.12,
110113
# to point to the default python on your system just use /usr/bin/python3)
111-
LIBRE_OFFICE_PYTHON_PATH = "/Applications/LibreOffice.app/Contents/Resources/python"
114+
LIBRE_OFFICE_PYTHON_PATH: str = "/Applications/LibreOffice.app/Contents/Resources/python"
112115

113116
# DO NOT CHANGE THIS UNLESS YOU ARE DEVELOPING OR RUNNING THIS APP LOCALLY
114117
# Description: this sets the path to the LibreOffice executable,
@@ -118,11 +121,11 @@
118121
# MacOS X: /Applications/LibreOffice.app/Contents/MacOS/soffice
119122
# Windows: %ProgramFiles%/LibreOffice/Program/soffice
120123
# Linux(Ubuntu): /usr/bin/soffice
121-
LIBRE_OFFICE_EXEC_PATH = "/Applications/LibreOffice.app/Contents/MacOS/soffice"
124+
LIBRE_OFFICE_EXEC_PATH: str = "/Applications/LibreOffice.app/Contents/MacOS/soffice"
122125

123126
if platform == "linux" or platform == "linux2":
124127
LIBRE_OFFICE_EXEC_PATH = "/usr/bin/soffice"
125-
LIBRE_OFFICE_PYTHON_PATH = "/usr/bin/python3.11"
128+
LIBRE_OFFICE_PYTHON_PATH = "/usr/bin/python3.12"
126129

127130
# this is the path from the Docker image, Ubuntu Lunar, Noble too.
128131
TESSDATA_PREFIX = "/usr/share/tesseract-ocr/5/tessdata"

env/general.env

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,5 @@
55
# remove if this causes issues on any other platform
66
# possible values: amd64, arm64
77
CPU_ARCHITECTURE=amd64
8-
DOCKER_DEFAULT_PLATFORM=linux/${CPU_ARCHITECTURE:-amd64}
8+
DOCKER_DEFAULT_PLATFORM=linux/${CPU_ARCHITECTURE:-amd64}
9+
COMPOSE_BAKE=True

env/ocr_service.env

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@ OCR_WEB_SERVICE_WORKER_CLASS="sync"
1616
OCR_SERVICE_OPERATION_MODE=OCR
1717

1818
# 50 - CRITICAL, 40 - ERROR, 30 - WARNING, 20 - INFO, 10 - DEBUG, 0 - NOTSET
19-
OCR_SERVICE_LOG_LEVEL=10
19+
OCR_SERVICE_LOG_LEVEL=40
20+
2021

2122
# possible values: json (stringified output), dict (dict means no json.dumps() is applied to the output)
2223
OCR_SERVICE_RESPONSE_OUTPUT_TYPE="json"

ocr_service/app/app.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ def create_app():
7272
_loffice_processes.update(start_office_converter_servers())
7373

7474
app.register_blueprint(api)
75+
app.debug = DEBUG_MODE
7576

7677
# share processes for api call resource allocation
7778
api.processor = Processor()
@@ -115,4 +116,4 @@ def exit_handler(port_num: int):
115116

116117

117118
if __name__ == '__main__':
118-
atexit.register(exit_handler)
119+
atexit.register(exit_handler, port_num=get_assigned_port(os.getpid()))

ocr_service/processor/processor.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
from __future__ import annotations
22

3-
import json
43
import logging
54
import os
65
import sys
@@ -15,8 +14,8 @@
1514
import injector
1615
import pypdfium2 as pdfium
1716

18-
from tesserocr import get_languages, PyTessBaseAPI
19-
from filetype.types import DOCUMENT, IMAGE, archive, document, image
17+
from tesserocr import PyTessBaseAPI
18+
from filetype.types import DOCUMENT, IMAGE, archive
2019
from html2image import Html2Image
2120
from PIL import Image, ImageFile
2221

@@ -39,9 +38,8 @@ class Processor:
3938

4039
@injector.inject
4140
def __init__(self):
42-
app_log_level = os.getenv("LOG_LEVEL", LOG_LEVEL)
43-
self.log = setup_logging(component_name="processor", log_level=app_log_level)
44-
self.log.debug("log level set to : " + str(app_log_level))
41+
self.log = setup_logging(component_name="processor", log_level=LOG_LEVEL)
42+
self.log.debug("log level set to : " + str(LOG_LEVEL))
4543
self.loffice_process_list = {}
4644

4745
def _preprocess_html_to_img(self, stream: bytes, file_name: str) -> List[ImageFile.ImageFile]:
@@ -145,6 +143,9 @@ def _preprocess_doc(self, stream: bytes, file_name: str) -> bytes:
145143
"""
146144

147145
pdf_stream = None
146+
doc_file_path = ""
147+
pdf_file_path = ""
148+
used_port_num = None
148149

149150
try:
150151
# generate unique id
@@ -160,7 +161,6 @@ def _preprocess_doc(self, stream: bytes, file_name: str) -> bytes:
160161
conversion_time_start = time.time()
161162

162163
loffice_subprocess = None
163-
used_port_num = None
164164

165165
for port_num, loffice_process in self.loffice_process_list.items():
166166
if loffice_process["used"] is False:
@@ -266,7 +266,7 @@ def _process_image(self, img: PILImage, img_id: int, tess_api: PyTessBaseAPI) ->
266266
return output_str, img_id, tess_data
267267

268268
def _init_tesseract_api_worker(self):
269-
tess_api = PyTessBaseAPI(path=TESSDATA_PREFIX, lang=TESSERACT_LANGUAGE)
269+
tess_api = PyTessBaseAPI(path=TESSDATA_PREFIX, lang=TESSERACT_LANGUAGE) # type: ignore
270270
self.log.debug("Initialised pytesseract api worker for language:" + str(TESSERACT_LANGUAGE))
271271
return tess_api
272272

@@ -295,7 +295,7 @@ def _process(self, stream: bytes, file_name: str) -> tuple[str, dict]:
295295
doc_metadata: dict = {}
296296

297297
if file_type is not None:
298-
doc_metadata["content-type"] = str(file_type.mime)
298+
doc_metadata["content-type"] = str(file_type.mime) # type: ignore
299299
else:
300300
doc_metadata["content-type"] = "text/plain"
301301

@@ -373,7 +373,7 @@ def _process(self, stream: bytes, file_name: str) -> tuple[str, dict]:
373373
doc_metadata["pages"] = image_count
374374
doc_metadata["confidence"] = round(sum([page["confidence"] for page in tess_data]) / image_count, 4)
375375

376-
output_text = output_text.translate({'\\n': '', '\\t': '', '\n\n': '\n'})
376+
output_text = output_text.translate({'\\n': '', '\\t': '', '\n\n': '\n'}) # type: ignore
377377
except Exception:
378378
raise Exception("Failed to convert/generate image content: " + str(traceback.format_exc()))
379379

0 commit comments

Comments
 (0)