|
2 | 2 | import multiprocessing |
3 | 3 |
|
4 | 4 | from sys import platform |
| 5 | +from typing import Tuple |
5 | 6 |
|
6 | | -OCR_SERVICE_VERSION = "0.2.3" |
| 7 | +OCR_SERVICE_VERSION: str = "0.3.0" |
7 | 8 | # 50 - CRITICAL, 40 - ERROR, 30 - WARNING, 20 - INFO, 10 - DEBUG, 0 - NOTSET |
8 | | -LOG_LEVEL = int(os.environ.get("OCR_SERVICE_LOG_LEVEL", 40)) |
| 9 | +LOG_LEVEL: int = int(os.environ.get("OCR_SERVICE_LOG_LEVEL", 40)) |
9 | 10 |
|
10 | | -DEBUG_MODE = os.environ.get("OCR_SERVICE_DEBUG_MODE", False) |
| 11 | +DEBUG_MODE: bool = True if os.environ.get("OCR_SERVICE_DEBUG_MODE", False) in [True, "True", "true"] else False |
11 | 12 |
|
12 | | -ROOT_DIR = os.path.abspath(os.curdir) |
13 | | -TMP_FILE_DIR = os.path.join(ROOT_DIR, "tmp") |
14 | | -WORKER_PORT_MAP_FILE_PATH = os.path.join(TMP_FILE_DIR, './worker_process_data.txt') |
| 13 | +ROOT_DIR: str = os.path.abspath(os.curdir) |
| 14 | +TMP_FILE_DIR: str = os.path.join(ROOT_DIR, "tmp") |
| 15 | +WORKER_PORT_MAP_FILE_PATH: str = os.path.join(TMP_FILE_DIR, './worker_process_data.txt') |
15 | 16 |
|
16 | 17 |
|
17 | 18 | # Should we actually ocr or just extract text from PDFs ? |
18 | 19 | # NOTE: OCR IS STILL APPLIED TO IMAGES if detected | possible vals : "OCR", "NO_OCR" |
19 | | -OPERATION_MODE = os.environ.get("OCR_SERVICE_OPERATION_MODE", "OCR") |
| 20 | +OPERATION_MODE: str = os.environ.get("OCR_SERVICE_OPERATION_MODE", "OCR") |
20 | 21 |
|
21 | 22 | # basic app settings |
22 | | -OCR_SERVICE_PORT = os.environ.get("OCR_SERVICE_PORT", 8090) |
| 23 | +OCR_SERVICE_PORT: int = int(os.environ.get("OCR_SERVICE_PORT", 8090)) |
23 | 24 |
|
24 | 25 | # Tesseract model path: macos - /opt/homebrew/share/tessdata | linux - "/usr/local/share/tessdata" |
25 | | -TESSDATA_PREFIX = os.environ.get("OCR_TESSDATA_PREFIX", "/opt/homebrew/share/tessdata") |
| 26 | +TESSDATA_PREFIX: str = os.environ.get("OCR_TESSDATA_PREFIX", "/opt/homebrew/share/tessdata") |
26 | 27 |
|
27 | 28 | # Integer - duration in seconds for the OCR processing (the value is parsed with int()), after which, |
28 | 29 | # tesseract will terminate and raise RuntimeError |
29 | | -TESSERACT_TIMEOUT = int(os.environ.get("OCR_SERVICE_TESSERACT_TIMEOUT", 30)) |
| 30 | +TESSERACT_TIMEOUT: int = int(os.environ.get("OCR_SERVICE_TESSERACT_TIMEOUT", 30)) |
30 | 31 |
|
31 | 32 | # Tesseract language code string. Defaults to eng if not specified! Example for multiple languages: lang='eng+fra' |
32 | | -TESSERACT_LANGUAGE = os.environ.get("OCR_SERVICE_TESSERACT_LANG", "eng+lat") |
| 33 | +TESSERACT_LANGUAGE: str = os.environ.get("OCR_SERVICE_TESSERACT_LANG", "eng+lat") |
33 | 34 |
|
34 | 35 | # Integer - modifies the processor priority for the Tesseract run. Not supported on Windows. |
35 | 36 | # Nice adjusts the niceness of unix-like processes. |
36 | | -TESSERACT_NICE = int(os.environ.get("OCR_SERVICE_TESSERACT_NICE", -18)) |
| 37 | +TESSERACT_NICE: int = int(os.environ.get("OCR_SERVICE_TESSERACT_NICE", -18)) |
37 | 38 |
|
38 | 39 | # Any additional custom configuration flags that are not available via the tesseract function. |
39 | 40 | # For example: config='--psm 6' |
40 | | -TESSERACT_CUSTOM_CONFIG_FLAGS = os.environ.get("OCR_SERVICE_TESSERACT_CUSTOM_CONFIG_FLAGS", "") |
| 41 | +TESSERACT_CUSTOM_CONFIG_FLAGS: str = os.environ.get("OCR_SERVICE_TESSERACT_CUSTOM_CONFIG_FLAGS", "") |
41 | 42 |
|
42 | 43 | # Controls both threads and cpus for one WEB SERVICE thread, basically, one request handler. |
43 | 44 | # This is derived normally from the amount of threads Gunicorn is running with, for example: |
44 | 45 | # - if we have OCR_WEB_SERVICE_THREADS = 4, the OCR service can handle at most 4 requests at the same time, |
45 | 46 | # this means that you can not use all of your CPUS for OCR-ing for 1 request, |
46 | 47 | # because that means the other requests are sitting idle while the first one uses all resources, |
47 | 48 | # and so it is recommended to regulate the number of threads per request |
48 | | -OCR_WEB_SERVICE_THREADS = int(os.environ.get("OCR_WEB_SERVICE_THREADS", 1)) |
| 49 | +OCR_WEB_SERVICE_THREADS: int = int(os.environ.get("OCR_WEB_SERVICE_THREADS", 1)) |
49 | 50 |
|
50 | 51 | # This controls the number of workers the OCR service may have; it is recommended to use this value |
51 | 52 | # instead of OCR_WEB_SERVICE_THREADS if you want to process multiple requests in parallel |
52 | 53 | # WARNING: using more than 1 worker assumes you have set the OCR_WEB_SERVICE_WORKER_CLASS setting to sync ! |
53 | 54 | # with the above mentioned, |
54 | 55 | # setting OCR_WEB_SERVICE_WORKER_CLASS to sync means that a worker will use 1 THREAD only, |
55 | 56 | # therefore OCR_WEB_SERVICE_THREADS is disregarded |
56 | | -OCR_WEB_SERVICE_WORKERS = int(os.environ.get("OCR_WEB_SERVICE_WORKERS", 1)) |
| 57 | +OCR_WEB_SERVICE_WORKERS: int = int(os.environ.get("OCR_WEB_SERVICE_WORKERS", 1)) |
57 | 58 |
|
58 | 59 | # set this to control the number of threads used for OCR-ing per web request thread (check OCR_WEB_SERVICE_THREADS) |
59 | | -CPU_THREADS = int(os.environ.get("OCR_SERVICE_CPU_THREADS", (multiprocessing.cpu_count() / OCR_WEB_SERVICE_WORKERS))) |
| 60 | +CPU_THREADS: int = int(os.environ.get("OCR_SERVICE_CPU_THREADS", |
| 61 | + (multiprocessing.cpu_count() / OCR_WEB_SERVICE_WORKERS))) |
60 | 62 |
|
61 | 63 | # conversion thread number for the pdf -> PIL img conversion, per web request thread (check OCR_WEB_SERVICE_THREADS) |
62 | | -CONVERTER_THREAD_NUM = int(os.environ.get("OCR_SERVICE_CONVERTER_THREADS", |
63 | | - (multiprocessing.cpu_count() / OCR_WEB_SERVICE_WORKERS))) |
| 64 | +CONVERTER_THREAD_NUM: int = int(os.environ.get("OCR_SERVICE_CONVERTER_THREADS", |
| 65 | + (multiprocessing.cpu_count() / OCR_WEB_SERVICE_WORKERS))) |
64 | 66 |
|
65 | 67 | # should we convert detected images to greyscale before OCR-ing |
66 | | -OCR_CONVERT_GRAYSCALE_IMAGES = True |
| 68 | +OCR_CONVERT_GRAYSCALE_IMAGES: bool = True |
67 | 69 |
|
68 | 70 | # dpi used for images in TESSERACT and other stuff |
69 | | -OCR_IMAGE_DPI = int(os.environ.get("OCR_SERVICE_IMAGE_DPI", 200)) |
| 71 | +OCR_IMAGE_DPI: int = int(os.environ.get("OCR_SERVICE_IMAGE_DPI", 200)) |
70 | 72 |
|
71 | 73 | # possible values: json (stringified output), dict (dict means no json.dumps() is applied to the output) |
72 | 74 | OCR_SERVICE_RESPONSE_OUTPUT_TYPE: str = str(os.environ.get("OCR_SERVICE_RESPONSE_OUTPUT_TYPE", "json")) |
73 | 75 |
|
74 | 76 | # LIBRE OFFICE SECTION |
75 | 77 |
|
76 | 78 | # timeout in seconds before terminating processes (defaults to 100 when the env var is not set) |
77 | | -LIBRE_OFFICE_PROCESS_TIMEOUT = int(os.environ.get("OCR_SERVICE_LIBRE_OFFICE_PROCESS_TIMEOUT", 20)) |
| 79 | +LIBRE_OFFICE_PROCESS_TIMEOUT: int = int(os.environ.get("OCR_SERVICE_LIBRE_OFFICE_PROCESS_TIMEOUT", 100)) |
78 | 80 |
|
79 | 81 | # This is the port for the background soffice listener service that gets started with the app |
80 | 82 | # used internally for LibreOffice doc conversion |
81 | 83 | # the service should start multiple libre office servers for doc conversions, |
82 | 84 | # a libre office server will only use 1 CPU by default (not changeable), thus, |
83 | 85 | # for handling multiple requests, we will have one service per OCR_WEB_SERVICE_THREAD |
84 | | -DEFAULT_LIBRE_OFFICE_SERVER_PORT = 9900 |
| 86 | +DEFAULT_LIBRE_OFFICE_SERVER_PORT: int = 9900 |
85 | 87 |
|
86 | | -LIBRE_OFFICE_PORT_CAP = DEFAULT_LIBRE_OFFICE_SERVER_PORT + 1 |
| 88 | +LIBRE_OFFICE_PORT_CAP: int = DEFAULT_LIBRE_OFFICE_SERVER_PORT + 1 |
87 | 89 |
|
88 | 90 | if OCR_WEB_SERVICE_THREADS > 1: |
89 | 91 | LIBRE_OFFICE_PORT_CAP = DEFAULT_LIBRE_OFFICE_SERVER_PORT + OCR_WEB_SERVICE_THREADS |
90 | 92 | if OCR_WEB_SERVICE_WORKERS > 1: |
91 | 93 | LIBRE_OFFICE_PORT_CAP = DEFAULT_LIBRE_OFFICE_SERVER_PORT + OCR_WEB_SERVICE_WORKERS |
92 | 94 |
|
93 | | -LIBRE_OFFICE_LISTENER_PORT_RANGE = os.environ.get("OCR_SERVICE_LIBRE_OFFICE_LISTENER_PORT_RANGE", |
94 | | - range(DEFAULT_LIBRE_OFFICE_SERVER_PORT, LIBRE_OFFICE_PORT_CAP)) |
| 95 | +LIBRE_OFFICE_LISTENER_PORT_RANGE: range | str = os.environ.get("OCR_SERVICE_LIBRE_OFFICE_LISTENER_PORT_RANGE", |
| 96 | + range(DEFAULT_LIBRE_OFFICE_SERVER_PORT, |
| 97 | + LIBRE_OFFICE_PORT_CAP)) |
95 | 98 |
|
96 | | -LIBRE_OFFICE_NETWORK_INTERFACE = "localhost" |
| 99 | +LIBRE_OFFICE_NETWORK_INTERFACE: str = "localhost" |
97 | 100 |
|
98 | 101 |
|
99 | 102 | # seconds to check for possible failure of port |
100 | | -LIBRE_OFFICE_PROCESSES_LISTENER_INTERVAL = 10 |
| 103 | +LIBRE_OFFICE_PROCESSES_LISTENER_INTERVAL: int = 10 |
101 | 104 |
|
102 | 105 |
|
103 | 106 | # DO NOT CHANGE THIS UNLESS YOU ARE DEVELOPING OR RUNNING THIS APP LOCALLY |
|
106 | 109 | # |
107 | 110 | # MacOS X: /Applications/LibreOffice.app/Contents/Resources/python |
108 | 111 | # Windows: C:/Windows/py.exe |
109 | | -# Linux(Ubuntu): /usr/bin/python3.11 (forcefully uses python3.11, |
| 112 | +# Linux(Ubuntu): /usr/bin/python3.12 (forcefully uses python3.12, |
110 | 113 | # to point to the default python on your system just use /usr/bin/python3) |
111 | | -LIBRE_OFFICE_PYTHON_PATH = "/Applications/LibreOffice.app/Contents/Resources/python" |
| 114 | +LIBRE_OFFICE_PYTHON_PATH: str = "/Applications/LibreOffice.app/Contents/Resources/python" |
112 | 115 |
|
113 | 116 | # DO NOT CHANGE THIS UNLESS YOU ARE DEVELOPING OR RUNNING THIS APP LOCALLY |
114 | 117 | # Description: this sets the path to the LibreOffice executable, |
|
118 | 121 | # MacOS X: /Applications/LibreOffice.app/Contents/MacOS/soffice |
119 | 122 | # Windows: %ProgramFiles%/LibreOffice/Program/soffice |
120 | 123 | # Linux(Ubuntu): /usr/bin/soffice |
121 | | -LIBRE_OFFICE_EXEC_PATH = "/Applications/LibreOffice.app/Contents/MacOS/soffice" |
| 124 | +LIBRE_OFFICE_EXEC_PATH: str = "/Applications/LibreOffice.app/Contents/MacOS/soffice" |
122 | 125 |
|
123 | 126 | if platform == "linux" or platform == "linux2": |
124 | 127 | LIBRE_OFFICE_EXEC_PATH = "/usr/bin/soffice" |
125 | | - LIBRE_OFFICE_PYTHON_PATH = "/usr/bin/python3.11" |
| 128 | + LIBRE_OFFICE_PYTHON_PATH = "/usr/bin/python3.12" |
126 | 129 |
|
127 | 130 | # this is the path from the Docker image, Ubuntu Lunar, Noble too. |
128 | 131 | TESSDATA_PREFIX = "/usr/share/tesseract-ocr/5/tessdata" |
|
0 commit comments