Multiprocessing improvement, gunicorn worker support.

vladd-bit · vladd-bit · commit ed51967e6bad · 2023-05-09T19:17:27.000+01:00
diff --git a/config.py b/config.py
@@ -30,19 +30,26 @@
 # Any additional custom configuration flags that are not available via the tesseract function. For example: config='--psm 6'
 TESSERACT_CUSTOM_CONFIG_FLAGS = os.environ.get("OCR_SERVICE_TESSERACT_CUSTOM_CONFIG_FLAGS", "")
 
-# controls both threads and cpus for one WEB SERVICE thread, basically, one request handler.
-# this is derived normally from the amount of threads Gunicorn is running with, for example:
+# Controls both threads and cpus for one WEB SERVICE thread, basically, one request handler.
+# This is derived normally from the amount of threads Gunicorn is running with, for example:
 #  - if we have OCR_SERVICE_THREADS = 4, the OCR service can handle at most 4 requests at the same time,
 #    this means that you can not use all of your CPUS for OCR-ing for 1 request,
 #    because that means the other requests are sitting idle while the first one uses all resources,
 #    and so it is recommended to regulate the number of threads per request
-OCR_WEB_SERVICE_THREADS = int(os.environ.get("OCR_WEB_SERVICE_THREADS", multiprocessing.cpu_count()))
+OCR_WEB_SERVICE_THREADS = int(os.environ.get("OCR_WEB_SERVICE_THREADS", 1))
+
+# This controls the number of workers the OCR service may have; it is recommended to use this value
+#   instead of OCR_WEB_SERVICE_THREADS if you want to process multiple requests in parallel
+# WARNING: using more than 1 worker assumes you have set the OCR_WEB_SERVICE_WORKER_CLASS setting to sync!
+#          Given the above, note that setting OCR_WEB_SERVICE_WORKER_CLASS to sync means that a worker will use 1 THREAD only,
+#          therefore OCR_WEB_SERVICE_THREADS is disregarded
+OCR_WEB_SERVICE_WORKERS = int(os.environ.get("OCR_WEB_SERVICE_WORKERS", 1))
 
 # set this to control the number of threads used for OCR-ing per web request thread (check OCR_WEB_SERVICE_THREADS)
-CPU_THREADS = int(os.environ.get("OCR_SERVICE_CPU_THREADS", (multiprocessing.cpu_count() / OCR_WEB_SERVICE_THREADS)))
+CPU_THREADS = int(os.environ.get("OCR_SERVICE_CPU_THREADS", (multiprocessing.cpu_count() / OCR_WEB_SERVICE_WORKERS)))
 
 # conversion thread number for the pdf -> PIL img conversion, per web request thread (check OCR_WEB_SERVICE_THREADS)
-CONVERTER_THREAD_NUM = int(os.environ.get("OCR_SERVICE_CONVERTER_THREADS", (multiprocessing.cpu_count() / OCR_WEB_SERVICE_THREADS)))
+CONVERTER_THREAD_NUM = int(os.environ.get("OCR_SERVICE_CONVERTER_THREADS", (multiprocessing.cpu_count() / OCR_WEB_SERVICE_WORKERS)))
 
 # should we convert detected images to greyscale before OCR-ing
 OCR_CONVERT_GRAYSCALE_IMAGES = True
@@ -62,10 +69,12 @@
 # a libre office server will only use 1 CPU by default (not changable), thus,
 # for handling multiple requests, we will have one service per OCR_WEB_SERVICE_THREAD
 DEFAULT_LIBRE_OFFICE_SERVER_PORT = 9900
-LIBRE_OFFICE_LISTENER_PORT_RANGE = range(DEFAULT_LIBRE_OFFICE_SERVER_PORT, DEFAULT_LIBRE_OFFICE_SERVER_PORT + OCR_WEB_SERVICE_THREADS)
+LIBRE_OFFICE_LISTENER_PORT_RANGE = range(DEFAULT_LIBRE_OFFICE_SERVER_PORT, DEFAULT_LIBRE_OFFICE_SERVER_PORT + OCR_WEB_SERVICE_THREADS + OCR_WEB_SERVICE_WORKERS)
 
 LIBRE_OFFICE_NETWORK_INTERFACE = "localhost"
 
+
+# interval (in seconds) between checks for a possible port failure
 LIBRE_OFFICE_PROCESSES_LISTENER_INTERVAL = 10
 
 
diff --git a/ocr_service/app/app.py b/ocr_service/app/app.py
@@ -12,7 +12,7 @@
 from config import *
 from ocr_service.api import api
 from ocr_service.processor.processor import Processor
-from ocr_service.utils.utils import get_process_id_by_process_name
+from ocr_service.utils.utils import is_port_in_use
 
 sys.path.append("..")
 
@@ -48,12 +48,23 @@ def start_office_converter_servers():
 
     port_count = 0
     for port_num in LIBRE_OFFICE_LISTENER_PORT_RANGE:
-        if port_count < OCR_WEB_SERVICE_THREADS:
-            port_count += 1
-            if port_num not in list(loffice_processes.keys()):
-                loffice_processes[port_num] = start_office_server(port_num)
+        if OCR_WEB_SERVICE_WORKERS <= 1:
+            if port_count < OCR_WEB_SERVICE_THREADS:
+                port_count += 1
+                if port_num not in list(loffice_processes.keys()):
+                    if is_port_in_use(port_num) == False:
+                        loffice_processes[port_num] = start_office_server(port_num)
+            else:
+                break
         else:
-            break
+            print("WOREKER TRYING PORT " + str(port_num))
+            if is_port_in_use(port_num) == False and port_count < OCR_WEB_SERVICE_WORKERS - 1:
+                loffice_processes[port_num] = start_office_server(port_num)
+                port_count += 1
+            else:
+                break
+
+
     return loffice_processes
             
 def create_app():
diff --git a/ocr_service/utils/utils.py b/ocr_service/utils/utils.py
@@ -2,6 +2,7 @@
 import sys
 import psutil
 import logging
+import socket
 
 from sys import platform
 from typing import List
@@ -95,3 +96,8 @@ def get_process_id_by_process_name(process_name: str = "") -> int:
             break
 
     return pid
+
+def is_port_in_use(port: int) -> bool:  # True when a TCP connection to localhost:<port> succeeds, i.e. something is already listening there
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:  # the probing socket is closed automatically when the with-block exits
+        return s.connect_ex(('localhost', port)) == 0  # connect_ex reports failure via an error code instead of raising; 0 means the connect succeeded
+
diff --git a/start_service_production.sh b/start_service_production.sh
@@ -12,13 +12,13 @@ if [ -z ${OCR_SERVICE_PORT+x} ]; then
   echo "OCR_SERVICE_PORT is unset -- setting to default: $OCR_SERVICE_PORT"
 fi
 
-if [ -z ${OCR_SERVICE_WORKERS+x} ]; then
-  OCR_SERVICE_WORKERS=1
-  echo "OCR_SERVICE_WORKERS is unset -- setting to default: $OCR_SERVICE_WORKERS"
+if [ -z ${OCR_WEB_SERVICE_WORKERS+x} ]; then
+  OCR_WEB_SERVICE_WORKERS=4
+  echo "OCR_WEB_SERVICE_WORKERS is unset -- setting to default: $OCR_WEB_SERVICE_WORKERS"
 fi
 
 if [ -z ${OCR_WEB_SERVICE_THREADS+x} ]; then
-  OCR_WEB_SERVICE_THREADS=4
+  OCR_WEB_SERVICE_THREADS=1
   echo "OCR_WEB_SERVICE_THREADS is unset -- setting to default: $OCR_WEB_SERVICE_THREADS"
 fi
 
@@ -33,7 +33,7 @@ if [ -z ${OCR_SERVICE_LOG_LEVEL+x} ]; then
 fi
 
 if [ -z ${OCR_WEB_SERVICE_WORKER_CLASS+x} ]; then
-  OCR_WEB_SERVICE_WORKER_CLASS="gthread"
+  OCR_WEB_SERVICE_WORKER_CLASS="sync"
   echo "OCR_WEB_SERVICE_WORKER_CLASS is unset -- setting to default: $OCR_WEB_SERVICE_WORKER_CLASS"
 fi
 
@@ -44,6 +44,6 @@ OCR_SERVICE_ACCESS_LOG_FORMAT="%(t)s [ACCESSS] %(h)s \"%(r)s\" %(s)s \"%(f)s\" \
 # start the OCR_SERVICE
 #
 echo "Starting up Flask app using gunicorn OCR_SERVICE ..."
-python3.11 -m gunicorn --bind $OCR_SERVICE_HOST:$OCR_SERVICE_PORT -w $OCR_SERVICE_WORKERS --threads=$OCR_WEB_SERVICE_THREADS --timeout=$OCR_SERVICE_WORKER_TIMEOUT \
+python3.11 -m gunicorn --bind $OCR_SERVICE_HOST:$OCR_SERVICE_PORT -w $OCR_WEB_SERVICE_WORKERS --threads=$OCR_WEB_SERVICE_THREADS --timeout=$OCR_SERVICE_WORKER_TIMEOUT \
   --access-logformat="$OCR_SERVICE_ACCESS_LOG_FORMAT" --access-logfile=./ocr_service.log --log-file=./ocr_service.log --log-level error --worker-class=$OCR_WEB_SERVICE_WORKER_CLASS \
   wsgi