Skip to content

Commit ed51967

Browse files
committed
Multiprocessing improvement, gunicorn worker support.
1 parent f13c7fa commit ed51967

File tree

4 files changed

+44
-18
lines changed

4 files changed

+44
-18
lines changed

config.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,19 +30,26 @@
3030
# Any additional custom configuration flags that are not available via the tesseract function. For example: config='--psm 6'
3131
TESSERACT_CUSTOM_CONFIG_FLAGS = os.environ.get("OCR_SERVICE_TESSERACT_CUSTOM_CONFIG_FLAGS", "")
3232

33-
# controls both threads and cpus for one WEB SERVICE thread, basically, one request handler.
34-
# this is derived normally from the amount of threads Gunicorn is running with, for example:
33+
# Controls both threads and cpus for one WEB SERVICE thread, basically, one request handler.
34+
# This is derived normally from the amount of threads Gunicorn is running with, for example:
3535
# - if we have OCR_WEB_SERVICE_THREADS = 4, the OCR service can handle at most 4 requests at the same time,
3636
# this means that you cannot use all of your CPUs for OCR-ing a single request,
3737
# because that means the other requests are sitting idle while the first one uses all resources,
3838
# and so it is recommended to regulate the number of threads per request
39-
OCR_WEB_SERVICE_THREADS = int(os.environ.get("OCR_WEB_SERVICE_THREADS", multiprocessing.cpu_count()))
39+
OCR_WEB_SERVICE_THREADS = int(os.environ.get("OCR_WEB_SERVICE_THREADS", 1))
40+
41+
# This controls the number of workers the OCR service may have; it is recommended to use this value
42+
# instead of OCR_WEB_SERVICE_THREADS if you want to process multiple requests in parallel
43+
# WARNING: using more than 1 worker assumes you have set the OCR_WEB_SERVICE_WORKER_CLASS setting to sync!
44+
# With that in mind, setting OCR_WEB_SERVICE_WORKER_CLASS to sync means that each worker will use 1 THREAD only,
45+
# therefore OCR_WEB_SERVICE_THREADS is disregarded
46+
OCR_WEB_SERVICE_WORKERS = int(os.environ.get("OCR_WEB_SERVICE_WORKERS", 1))
4047

4148
# set this to control the number of threads used for OCR-ing per web request thread (check OCR_WEB_SERVICE_THREADS)
42-
CPU_THREADS = int(os.environ.get("OCR_SERVICE_CPU_THREADS", (multiprocessing.cpu_count() / OCR_WEB_SERVICE_THREADS)))
49+
CPU_THREADS = int(os.environ.get("OCR_SERVICE_CPU_THREADS", (multiprocessing.cpu_count() / OCR_WEB_SERVICE_WORKERS)))
4350

4451
# conversion thread number for the pdf -> PIL img conversion, per web request thread (check OCR_WEB_SERVICE_THREADS)
45-
CONVERTER_THREAD_NUM = int(os.environ.get("OCR_SERVICE_CONVERTER_THREADS", (multiprocessing.cpu_count() / OCR_WEB_SERVICE_THREADS)))
52+
CONVERTER_THREAD_NUM = int(os.environ.get("OCR_SERVICE_CONVERTER_THREADS", (multiprocessing.cpu_count() / OCR_WEB_SERVICE_WORKERS)))
4653

4754
# should we convert detected images to greyscale before OCR-ing
4855
OCR_CONVERT_GRAYSCALE_IMAGES = True
@@ -62,10 +69,12 @@
6269
# a libre office server will only use 1 CPU by default (not changeable), thus,
6370
# for handling multiple requests, we will have one service per OCR_WEB_SERVICE_THREAD
6471
DEFAULT_LIBRE_OFFICE_SERVER_PORT = 9900
65-
LIBRE_OFFICE_LISTENER_PORT_RANGE = range(DEFAULT_LIBRE_OFFICE_SERVER_PORT, DEFAULT_LIBRE_OFFICE_SERVER_PORT + OCR_WEB_SERVICE_THREADS)
72+
LIBRE_OFFICE_LISTENER_PORT_RANGE = range(DEFAULT_LIBRE_OFFICE_SERVER_PORT, DEFAULT_LIBRE_OFFICE_SERVER_PORT + OCR_WEB_SERVICE_THREADS + OCR_WEB_SERVICE_WORKERS)
6673

6774
LIBRE_OFFICE_NETWORK_INTERFACE = "localhost"
6875

76+
77+
# interval, in seconds, at which to check for a possible port failure
6978
LIBRE_OFFICE_PROCESSES_LISTENER_INTERVAL = 10
7079

7180

ocr_service/app/app.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from config import *
1313
from ocr_service.api import api
1414
from ocr_service.processor.processor import Processor
15-
from ocr_service.utils.utils import get_process_id_by_process_name
15+
from ocr_service.utils.utils import is_port_in_use
1616

1717
sys.path.append("..")
1818

@@ -48,12 +48,23 @@ def start_office_converter_servers():
4848

4949
port_count = 0
5050
for port_num in LIBRE_OFFICE_LISTENER_PORT_RANGE:
51-
if port_count < OCR_WEB_SERVICE_THREADS:
52-
port_count += 1
53-
if port_num not in list(loffice_processes.keys()):
54-
loffice_processes[port_num] = start_office_server(port_num)
51+
if OCR_WEB_SERVICE_WORKERS <= 1:
52+
if port_count < OCR_WEB_SERVICE_THREADS:
53+
port_count += 1
54+
if port_num not in list(loffice_processes.keys()):
55+
if is_port_in_use(port_num) == False:
56+
loffice_processes[port_num] = start_office_server(port_num)
57+
else:
58+
break
5559
else:
56-
break
60+
print("WOREKER TRYING PORT " + str(port_num))
61+
if is_port_in_use(port_num) == False and port_count < OCR_WEB_SERVICE_WORKERS - 1:
62+
loffice_processes[port_num] = start_office_server(port_num)
63+
port_count += 1
64+
else:
65+
break
66+
67+
5768
return loffice_processes
5869

5970
def create_app():

ocr_service/utils/utils.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import sys
33
import psutil
44
import logging
5+
import socket
56

67
from sys import platform
78
from typing import List
@@ -95,3 +96,8 @@ def get_process_id_by_process_name(process_name: str = "") -> int:
9596
break
9697

9798
return pid
99+
100+
def is_port_in_use(port: int) -> bool:
101+
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
102+
return s.connect_ex(('localhost', port)) == 0
103+

start_service_production.sh

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,13 @@ if [ -z ${OCR_SERVICE_PORT+x} ]; then
1212
echo "OCR_SERVICE_PORT is unset -- setting to default: $OCR_SERVICE_PORT"
1313
fi
1414

15-
if [ -z ${OCR_SERVICE_WORKERS+x} ]; then
16-
OCR_SERVICE_WORKERS=1
17-
echo "OCR_SERVICE_WORKERS is unset -- setting to default: $OCR_SERVICE_WORKERS"
15+
if [ -z ${OCR_WEB_SERVICE_WORKERS+x} ]; then
16+
OCR_WEB_SERVICE_WORKERS=4
17+
echo "OCR_WEB_SERVICE_WORKERS is unset -- setting to default: $OCR_WEB_SERVICE_WORKERS"
1818
fi
1919

2020
if [ -z ${OCR_WEB_SERVICE_THREADS+x} ]; then
21-
OCR_WEB_SERVICE_THREADS=4
21+
OCR_WEB_SERVICE_THREADS=1
2222
echo "OCR_WEB_SERVICE_THREADS is unset -- setting to default: $OCR_WEB_SERVICE_THREADS"
2323
fi
2424

@@ -33,7 +33,7 @@ if [ -z ${OCR_SERVICE_LOG_LEVEL+x} ]; then
3333
fi
3434

3535
if [ -z ${OCR_WEB_SERVICE_WORKER_CLASS+x} ]; then
36-
OCR_WEB_SERVICE_WORKER_CLASS="gthread"
36+
OCR_WEB_SERVICE_WORKER_CLASS="sync"
3737
echo "OCR_WEB_SERVICE_WORKER_CLASS is unset -- setting to default: $OCR_WEB_SERVICE_WORKER_CLASS"
3838
fi
3939

@@ -44,6 +44,6 @@ OCR_SERVICE_ACCESS_LOG_FORMAT="%(t)s [ACCESSS] %(h)s \"%(r)s\" %(s)s \"%(f)s\" \
4444
# start the OCR_SERVICE
4545
#
4646
echo "Starting up Flask app using gunicorn OCR_SERVICE ..."
47-
python3.11 -m gunicorn --bind $OCR_SERVICE_HOST:$OCR_SERVICE_PORT -w $OCR_SERVICE_WORKERS --threads=$OCR_WEB_SERVICE_THREADS --timeout=$OCR_SERVICE_WORKER_TIMEOUT \
47+
python3.11 -m gunicorn --bind $OCR_SERVICE_HOST:$OCR_SERVICE_PORT -w $OCR_WEB_SERVICE_WORKERS --threads=$OCR_WEB_SERVICE_THREADS --timeout=$OCR_SERVICE_WORKER_TIMEOUT \
4848
--access-logformat="$OCR_SERVICE_ACCESS_LOG_FORMAT" --access-logfile=./ocr_service.log --log-file=./ocr_service.log --log-level error --worker-class=$OCR_WEB_SERVICE_WORKER_CLASS \
4949
wsgi

0 commit comments

Comments
 (0)