Skip to content

Commit 32f7b4f

Browse files
committed
Reverted to Gunicorn implementation for performance reasons.
1 parent 2306d48 commit 32f7b4f

File tree

11 files changed

+123
-114
lines changed

11 files changed

+123
-114
lines changed

README.md

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ Supports most document formats: pdf, html, doc(x), rtf, odt and also the image f
9898

9999
Using `curl` to send the document to server instance running on localhost on `8090` port:
100100

101-
```curl -F file="@ocr_service/tests/resources/docs/generic/pat_id_1.rtf" http://localhost:8090/api/process | jq```
101+
```curl -F file="@ocr_service/tests/resources/docs/generic/pat_id_1.rtf" http://localhost:8090/api/process/ | jq```
102102

103103
output
104104

@@ -169,7 +169,9 @@ OCR_TESSDATA_PREFIX - default "/usr/share/tessdata", this is the path to the Tes
169169
170170
OCR_SERVICE_TESSERACT_LANG - default "eng", language we are trying to ocr, only English is tested within the unittest, therefore expect variable results with anything else
171171
172-
OCR_WEB_SERVICE_LIMIT_CONCURRENCY_TASKS - default to 1, how many requests can it process at one time, this is global, not per worker, but should always follow the number of WORKERS.
172+
OCR_WEB_SERVICE_WORKER_CLASS - default "gthread", "gthread" is best if you use multiple threads per worker, if you are only using 1 worker and 1 thread, max performance is achieved with "sync", note that with "sync" you can only ever have one thread per worker, the "OCR_WEB_SERVICE_THREADS" will be ignored.
173+
174+
OCR_WEB_SERVICE_THREADS - default 1, this is specifically used by the web service, this can now be set to a value greater than 1 to allow multiple requests to process at the same time, of course, with split CPU resources, see the OCR-ing scenarios section above
173175
174176
OCR_SERVICE_LOG_LEVEL - default 40, possible values : 50 - CRITICAL, 40 - ERROR, 30 - WARNING, 20 - INFO, 10 - DEBUG, 0 - NOTSET
175177
@@ -181,7 +183,7 @@ OCR_SERVICE_TESSERACT_NICE - default -18, this is just for Linux systems, we nee
181183
182184
OCR_SERVICE_TESSERACT_CUSTOM_CONFIG_FLAGS - extra parameters that you might want to pass to tesseract
183185
184-
OCR_SERVICE_CPU_THREADS - defaults to whatever the core count on the machine is divided by OCR_WEB_SERVICE_WORKERS, this variable is used by tesseract, each web thread will get access to a limited amount of CPUS so that resources are spread evenly
186+
OCR_SERVICE_CPU_THREADS - defaults to whatever the core count on the machine is divided by OCR_WEB_SERVICE_THREADS, this variable is used by tesseract, each web thread will get access to a limited amount of CPUs so that resources are spread evenly
185187
186188
OCR_SERVICE_CONVERTER_THREADS - defaults to whatever the core count on the machine is, this variable is used for converting pdf docs to images
187189
@@ -190,4 +192,4 @@ OCR_SERVICE_IMAGE_DPI - default 200 DPI, tesseract image DPI rendering resolutio
190192
OCR_SERVICE_LIBRE_OFFICE_PROCESS_TIMEOUT - default 10 seconds, used for converting docs to pdf.
191193
192194
OCR_WEB_SERVICE_WORKERS - number of worker threads (this means running multiple instances in parallel, be careful to balance the load out by setting the threads to evenly distribute themselves amongst workers)
193-
```
195+
```

asgi.py

Lines changed: 0 additions & 16 deletions
This file was deleted.

env/ocr_service.env

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
# The default images for ocr-service:
22
# - cogstacksystems/cogstack-ocr-service:latest
3-
# check Dockerfile_multilang for more information.
43
OCR_SERVICE_DOCKER_IMAGE="cogstacksystems/cogstack-ocr-service:latest-${CPU_ARCHITECTURE:-amd64}"
54

65
OCR_SERVICE_CPU_THREADS=1
@@ -10,14 +9,13 @@ OCR_SERVICE_CONVERTER_THREADS=1
109
OCR_SERVICE_OPERATION_MODE=OCR
1110

1211
# 50 - CRITICAL, 40 - ERROR, 30 - WARNING, 20 - INFO, 10 - DEBUG, 0 - NOTSET
13-
OCR_SERVICE_LOG_LEVEL=10
12+
OCR_SERVICE_LOG_LEVEL=20
1413
OCR_SERVICE_DEBUG_MODE=False
1514

1615
OCR_SERVICE_TESSERACT_TIMEOUT=30
1716

1817
# change this to whatever language you are trying to OCR, e.g. eng, deu, fra, ita, nld, ron, spa
1918
# please note that you need to have the corresponding language pack installed in the container
20-
# check Dockerfile_multilang for more information and look for tessaract-ocr-[lang] packages
2119
OCR_SERVICE_TESSERACT_LANG="eng"
2220

2321
OCR_SERVICE_TESSERACT_NICE=-18
@@ -35,12 +33,15 @@ OCR_SERVICE_HOST=0.0.0.0
3533

3634
# READ https://github.com/CogStack/ocr-service/blob/master/README.md on how to handle these settings,
3735
# it is important to divide CPU(s) between workers/threads
38-
OCR_WEB_SERVICE_WORKERS=1
36+
OCR_WEB_SERVICE_WORKERS=2
3937
OCR_WEB_SERVICE_THREADS=1
4038

41-
# Maximum number of concurrent connections or tasks to allow, before issuing HTTP 503 responses.
42-
OCR_WEB_SERVICE_LIMIT_CONCURRENCY_TASKS=1
43-
4439
OCR_SERVICE_PORT=8090
4540

46-
OCR_SERVICE_UVICORN_LOG_LEVEL="info"
41+
OCR_SERVICE_GUNICORN_LOG_LEVEL="info"
42+
43+
# OCR_SERVICE_WORKER_CLASS - possible values: ["gthread", "sync"], "gthread" is best if you use multiple
44+
# threads per worker, if you are only using 1 worker and 1 thread, max performance is achieved
45+
# with "sync", note that with "sync" you can only ever have one thread per worker,
46+
# the "OCR_WEB_SERVICE_THREADS" will be ignored.
47+
OCR_SERVICE_WORKER_CLASS="sync"

env/ocr_service_text_only.env

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,20 @@ OCR_SERVICE_CONVERTER_THREADS=1
77
OCR_SERVICE_OPERATION_MODE=NO_OCR
88

99
#######################################################################################################
10+
# USED in bash start_service_debug.sh/start_service_production.sh
11+
OCR_SERVICE_HOST=0.0.0.0
12+
1013
# READ https://github.com/CogStack/ocr-service/blob/master/README.md on how to handle these settings,
1114
# it is important to divide CPU(s) between workers/threads
1215
OCR_WEB_SERVICE_WORKERS=1
1316
OCR_WEB_SERVICE_THREADS=1
17+
18+
OCR_SERVICE_PORT=8090
19+
20+
OCR_SERVICE_GUNICORN_LOG_LEVEL="info"
21+
22+
# OCR_SERVICE_WORKER_CLASS - possible values: ["gthread", "sync"], "gthread" is best if you use multiple
23+
# threads per worker, if you are only using 1 worker and 1 thread, max performance is achieved
24+
# with "sync", note that with "sync" you can only ever have one thread per worker,
25+
# the "OCR_WEB_SERVICE_THREADS" will be ignored.
26+
OCR_SERVICE_WORKER_CLASS="sync"

ocr_service/api/api.py

Lines changed: 25 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
import base64
2-
import json
32
import logging
43
import sys
54
import traceback
5+
import orjson
66
import uuid
77
from multiprocessing import Pool
8-
from typing import Any, Optional
8+
from typing import Any, List, Optional
99

1010
from fastapi import APIRouter, File, Request, UploadFile
11-
from fastapi.responses import JSONResponse, Response
11+
from fastapi.responses import Response, ORJSONResponse
1212

1313
from config import CPU_THREADS, LOG_LEVEL, TESSERACT_TIMEOUT
1414
from ocr_service.processor.processor import Processor
@@ -21,13 +21,18 @@
2121
log = setup_logging("api", log_level=LOG_LEVEL)
2222

2323

24+
@api.get("/health", response_class=ORJSONResponse)
25+
def health() -> ORJSONResponse:
26+
return ORJSONResponse(content={"status": "ok"})
27+
28+
2429
@api.get("/info")
25-
def info() -> JSONResponse:
26-
return JSONResponse(content=get_app_info())
30+
def info() -> ORJSONResponse:
31+
return ORJSONResponse(content=get_app_info())
2732

2833

2934
@api.post("/process")
30-
async def process(request: Request, file: Optional[UploadFile] = File(default=None)) -> Response:
35+
def process(request: Request, file: Optional[UploadFile] = File(default=None)) -> ORJSONResponse:
3136
"""
3237
Processes raw binary input stream, file, or
3338
JSON containing the binary_data field in base64 format
@@ -44,15 +49,15 @@ async def process(request: Request, file: Optional[UploadFile] = File(default=No
4449

4550
if file:
4651
file_name = file.filename if file.filename else ""
47-
stream = await file.read()
52+
stream = file.file.read()
4853
log.info(f"Processing file given via 'file' parameter, file name: {file_name}")
4954
else:
5055
file_name = uuid.uuid4().hex
5156
log.info(f"Processing binary as data-binary, generated file name: {file_name}")
52-
raw_body = await request.body()
57+
raw_body = request._body
5358

5459
try:
55-
record = json.loads(raw_body)
60+
record = orjson.loads(raw_body)
5661
if isinstance(record, list) and len(record) > 0:
5762
record = record[0]
5863

@@ -70,7 +75,7 @@ async def process(request: Request, file: Optional[UploadFile] = File(default=No
7075
stream = raw_body
7176

7277
log.info("Stream contains valid JSON.")
73-
except json.JSONDecodeError:
78+
except orjson.JSONDecodeError:
7479
stream = raw_body
7580
log.warning("Stream does not contain valid JSON.")
7681

@@ -81,22 +86,16 @@ async def process(request: Request, file: Optional[UploadFile] = File(default=No
8186

8287
code = 200 if len(output_text) > 0 or not stream else 500
8388

84-
response: dict[Any, Any] | bytes | str = build_response(
85-
output_text,
86-
footer=footer,
87-
metadata=doc_metadata
88-
)
89+
response: dict[Any, Any] = {"result": build_response(output_text, footer=footer, metadata=doc_metadata)}
8990

90-
response = json.dumps({"result": response}, ensure_ascii=False).encode("utf-8")
91-
92-
return Response(content=response, status_code=code, media_type="application/json")
91+
return ORJSONResponse(content=response, status_code=code, media_type="application/json")
9392

9493

9594
@api.post("/process_file")
96-
async def process_file(request: Request, file: UploadFile = File(...)) -> Response:
95+
def process_file(request: Request, file: UploadFile = File(...)) -> ORJSONResponse:
9796

9897
file_name: str = file.filename if file.filename else ""
99-
stream: bytes = await file.read()
98+
stream: bytes = file.file.read()
10099
log.info(f"Processing file: {file_name}")
101100

102101
processor: Processor = request.app.state.processor
@@ -109,23 +108,18 @@ async def process_file(request: Request, file: UploadFile = File(...)) -> Respon
109108

110109
code = 200 if len(output_text) > 0 or not stream else 500
111110

112-
response: dict[Any, Any] | bytes | str = build_response(
113-
output_text,
114-
metadata=doc_metadata
115-
)
116-
117-
response = json.dumps({"result": response}, ensure_ascii=False).encode("utf-8")
111+
response: dict[Any, Any] = {"result": build_response(output_text, metadata=doc_metadata)}
118112

119-
return Response(content=response, status_code=code)
113+
return ORJSONResponse(content=response, status_code=code, media_type="application/json")
120114

121115

122116
@api.post("/process_bulk")
123-
async def process_bulk(request: Request) -> Response:
117+
def process_bulk(request: Request, files: List[UploadFile] = File(...)) -> Response:
124118
"""
125-
Processes multiple files in a single request.
119+
Processes multiple files in a single request (multipart/form-data with multiple 'files').
126120
"""
127121

128-
form = await request.form()
122+
form = request._form
129123
file_streams = {}
130124

131125
proc_results = list()
@@ -136,7 +130,7 @@ async def process_bulk(request: Request) -> Response:
136130
# collect uploaded files
137131
for name, file in form.items():
138132
if isinstance(file, UploadFile):
139-
content = await file.read()
133+
content = file.read()
140134
file_streams[file.filename] = content
141135

142136
with Pool(processes=CPU_THREADS) as process_pool:

ocr_service/app/app.py

Lines changed: 46 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,10 @@
33
import subprocess
44
import sys
55
import time
6-
from contextlib import asynccontextmanager
76
from threading import Event, Thread
87
from typing import Any
98

9+
from fastapi.responses import ORJSONResponse
1010
import psutil
1111
from fastapi import FastAPI
1212

@@ -19,6 +19,9 @@
1919

2020
sys.path.append("..")
2121

22+
# guard so LibreOffice startup runs only once per worker
23+
_started: bool = False
24+
2225

2326
def start_office_server(port_num) -> dict[str, Any]:
2427
"""
@@ -108,54 +111,60 @@ def monitor_office_processes(thread_event: Event, processor: Processor) -> None:
108111
time.sleep(LIBRE_OFFICE_PROCESSES_LISTENER_INTERVAL)
109112

110113

111-
@asynccontextmanager
112-
async def office_process_lifespan(app: FastAPI):
113-
"""
114-
:description: Lifespan context manager to start and stop LibreOffice unoserver processes
115-
:param app: FastAPI application instance
116-
"""
117-
118-
# start LibreOffice unoserver processes
119-
loffice_processes = start_office_converter_servers()
120-
processor = Processor()
121-
processor.loffice_process_list.update(loffice_processes)
122-
app.state.processor = processor
123-
124-
# start persistent background thread for monitoring
125-
thread_event: Event = Event()
126-
127-
proc_listener_thread = Thread(
128-
target=monitor_office_processes,
129-
args=(thread_event, processor),
130-
name="loffice_proc_listener",
131-
daemon=True
132-
)
133-
proc_listener_thread.start()
134-
135-
try:
136-
yield
137-
finally:
138-
# shutdown: kill processes & stop monitoring
139-
thread_event.set()
140-
for port, proc in processor.loffice_process_list.items():
141-
logging.info(f"shutting down libreoffice process on port {port}")
142-
proc["process"].kill()
143-
144-
145114
def create_app() -> FastAPI:
146115
"""
147116
:description: Creates FastAPI application with API router and starts libreoffice unoserver processes
148117
:return: FastAPI application instance
149118
"""
150119

120+
global _started
121+
151122
try:
152123
app = FastAPI(title="OCR Service",
153124
description="OCR Service API",
154125
version=OCR_SERVICE_VERSION,
155-
debug=DEBUG_MODE,
156-
lifespan=office_process_lifespan)
126+
default_response_class=ORJSONResponse,
127+
debug=DEBUG_MODE)
157128
app.include_router(api)
158129

130+
# start once per worker
131+
if not _started:
132+
_started = True
133+
# Start LibreOffice unoserver processes
134+
loffice_processes = start_office_converter_servers()
135+
processor = Processor()
136+
processor.loffice_process_list.update(loffice_processes)
137+
app.state.processor = processor
138+
139+
# Start monitor thread
140+
thread_event = Event()
141+
proc_listener_thread = Thread(
142+
target=monitor_office_processes,
143+
args=(thread_event, processor),
144+
name="loffice_proc_listener",
145+
daemon=True
146+
)
147+
proc_listener_thread.start()
148+
149+
import atexit
150+
151+
def cleanup():
152+
thread_event.set()
153+
if proc_listener_thread.is_alive():
154+
proc_listener_thread.join(timeout=5)
155+
for port, proc in processor.loffice_process_list.items():
156+
p = proc["process"]
157+
try:
158+
logging.info(f"shutting down libreoffice process on port {port}")
159+
p.terminate()
160+
p.wait(timeout=3)
161+
except Exception:
162+
try:
163+
p.kill()
164+
except Exception as e:
165+
logging.error("error in when shutting down libreoffice process: " + str(e))
166+
atexit.register(cleanup)
167+
159168
except Exception:
160169
raise
161170

0 commit comments

Comments
 (0)