Skip to content

Commit 32f7b4f

Browse files
committed
Reverted to Gunicorn implementation for performance reasons.
1 parent 2306d48 commit 32f7b4f

File tree

11 files changed

+123
-114
lines changed

11 files changed

+123
-114
lines changed

README.md

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ Supports most document formats: pdf, html, doc(x), rtf, odt and also the image f
9898

9999
Using `curl` to send the document to server instance running on localhost on `8090` port:
100100

101-
```curl -F file="@ocr_service/tests/resources/docs/generic/pat_id_1.rtf" http://localhost:8090/api/process | jq```
101+
```curl -F file="@ocr_service/tests/resources/docs/generic/pat_id_1.rtf" http://localhost:8090/api/process/ | jq```
102102

103103
output
104104

@@ -169,7 +169,9 @@ OCR_TESSDATA_PREFIX - default "/usr/share/tessdata", this is the path to the Tes
169169
170170
OCR_SERVICE_TESSERACT_LANG - default "eng", language we are trying to ocr, only English is tested within the unittest, therefore expect variable results with anything else
171171
172-
OCR_WEB_SERVICE_LIMIT_CONCURRENCY_TASKS - default to 1, how many requests can it process at one time, this is global, not per worker, but should always follow the number of WORKERS.
172+
OCR_WEB_SERVICE_WORKER_CLASS - default "gthread", "gthread" is best if you use multiple threads per worker, if you are only using 1 worker and 1 thread, max performance is achieved with "sync", note that with "sync" you can only ever have one thread per worker, the "OCR_WEB_SERVICE_THREADS" will be ignored.
173+
174+
OCR_WEB_SERVICE_THREADS - default 1, this is specifically used by the web service, this can now be set to a value greater than 1 to allow multiple requests to process at the same time, of course, with split CPU resources, see the OCR-ing scenarios section above
173175
174176
OCR_SERVICE_LOG_LEVEL - default 40, possible values : 50 - CRITICAL, 40 - ERROR, 30 - WARNING, 20 - INFO, 10 - DEBUG, 0 - NOTSET
175177
@@ -181,7 +183,7 @@ OCR_SERVICE_TESSERACT_NICE - default -18, this is just for Linux systems, we nee
181183
182184
OCR_SERVICE_TESSERACT_CUSTOM_CONFIG_FLAGS - extra parameters that you might want to pass to tesseract
183185
184-
OCR_SERVICE_CPU_THREADS - defaults to whatever the core count on the machine is divided by OCR_WEB_SERVICE_WORKERS, this variable is used by tesseract, each web thread will get access to a limited amount of CPUS so that resources are spread evenly
186+
OCR_SERVICE_CPU_THREADS - defaults to whatever the core count on the machine is divided by OCR_WEB_SERVICE_THREADS, this variable is used by tesseract, each web thread will get access to a limited amount of CPUs so that resources are spread evenly
185187
186188
OCR_SERVICE_CONVERTER_THREADS - defaults to whatever the core count on the machine is, this variable is used for converting pdf docs to images
187189
@@ -190,4 +192,4 @@ OCR_SERVICE_IMAGE_DPI - default 200 DPI, tesseract image DPI rendering resolutio
190192
OCR_SERVICE_LIBRE_OFFICE_PROCESS_TIMEOUT - default 10 seconds, used for converting docs to pdf.
191193
192194
OCR_WEB_SERVICE_WORKERS - number of worker threads (this means running multiple instances in parallel, be careful to balance the load out by setting the threads to evenly distribute themselves amongst workers)
193-
```
195+
```

asgi.py

Lines changed: 0 additions & 16 deletions
This file was deleted.

env/ocr_service.env

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
# The default images for ocr-service:
22
# - cogstacksystems/cogstack-ocr-service:latest
3-
# check Dockerfile_multilang for more information.
43
OCR_SERVICE_DOCKER_IMAGE="cogstacksystems/cogstack-ocr-service:latest-${CPU_ARCHITECTURE:-amd64}"
54

65
OCR_SERVICE_CPU_THREADS=1
@@ -10,14 +9,13 @@ OCR_SERVICE_CONVERTER_THREADS=1
109
OCR_SERVICE_OPERATION_MODE=OCR
1110

1211
# 50 - CRITICAL, 40 - ERROR, 30 - WARNING, 20 - INFO, 10 - DEBUG, 0 - NOTSET
13-
OCR_SERVICE_LOG_LEVEL=10
12+
OCR_SERVICE_LOG_LEVEL=20
1413
OCR_SERVICE_DEBUG_MODE=False
1514

1615
OCR_SERVICE_TESSERACT_TIMEOUT=30
1716

1817
# change this to whatever language you are trying to OCR, e.g. eng, deu, fra, ita, nld, ron, spa
1918
# please note that you need to have the corresponding language pack installed in the container
20-
# check Dockerfile_multilang for more information and look for tessaract-ocr-[lang] packages
2119
OCR_SERVICE_TESSERACT_LANG="eng"
2220

2321
OCR_SERVICE_TESSERACT_NICE=-18
@@ -35,12 +33,15 @@ OCR_SERVICE_HOST=0.0.0.0
3533

3634
# READ https://github.com/CogStack/ocr-service/blob/master/README.md on how to handle these settings,
3735
# it is important to divide CPU(s) between workers/threads
38-
OCR_WEB_SERVICE_WORKERS=1
36+
OCR_WEB_SERVICE_WORKERS=2
3937
OCR_WEB_SERVICE_THREADS=1
4038

41-
# Maximum number of concurrent connections or tasks to allow, before issuing HTTP 503 responses.
42-
OCR_WEB_SERVICE_LIMIT_CONCURRENCY_TASKS=1
43-
4439
OCR_SERVICE_PORT=8090
4540

46-
OCR_SERVICE_UVICORN_LOG_LEVEL="info"
41+
OCR_SERVICE_GUNICORN_LOG_LEVEL="info"
42+
43+
# OCR_SERVICE_WORKER_CLASS - possible values: ["gthread", "sync"], "gthread" is best if you use multiple
44+
# threads per worker, if you are only using 1 worker and 1 thread, max performance is achieved
45+
# with "sync", note that with "sync" you can only ever have one thread per worker,
46+
# the "OCR_WEB_SERVICE_THREADS" will be ignored.
47+
OCR_SERVICE_WORKER_CLASS="sync"

env/ocr_service_text_only.env

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,20 @@ OCR_SERVICE_CONVERTER_THREADS=1
77
OCR_SERVICE_OPERATION_MODE=NO_OCR
88

99
#######################################################################################################
10+
# USED in bash start_service_debug.sh/start_service_production.sh
11+
OCR_SERVICE_HOST=0.0.0.0
12+
1013
# READ https://github.com/CogStack/ocr-service/blob/master/README.md on how to handle these settings,
1114
# it is important to divide CPU(s) between workers/threads
1215
OCR_WEB_SERVICE_WORKERS=1
1316
OCR_WEB_SERVICE_THREADS=1
17+
18+
OCR_SERVICE_PORT=8090
19+
20+
OCR_SERVICE_GUNICORN_LOG_LEVEL="info"
21+
22+
# OCR_SERVICE_WORKER_CLASS - possible values: ["gthread", "sync"], "gthread" is best if you use multiple
23+
# threads per worker, if you are only using 1 worker and 1 thread, max performance is achieved
24+
# with "sync", note that with "sync" you can only ever have one thread per worker,
25+
# the "OCR_WEB_SERVICE_THREADS" will be ignored.
26+
OCR_SERVICE_WORKER_CLASS="sync"

ocr_service/api/api.py

Lines changed: 25 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
import base64
2-
import json
32
import logging
43
import sys
54
import traceback
5+
import orjson
66
import uuid
77
from multiprocessing import Pool
8-
from typing import Any, Optional
8+
from typing import Any, List, Optional
99

1010
from fastapi import APIRouter, File, Request, UploadFile
11-
from fastapi.responses import JSONResponse, Response
11+
from fastapi.responses import Response, ORJSONResponse
1212

1313
from config import CPU_THREADS, LOG_LEVEL, TESSERACT_TIMEOUT
1414
from ocr_service.processor.processor import Processor
@@ -21,13 +21,18 @@
2121
log = setup_logging("api", log_level=LOG_LEVEL)
2222

2323

24+
@api.get("/health", response_class=ORJSONResponse)
25+
def health() -> ORJSONResponse:
26+
return ORJSONResponse(content={"status": "ok"})
27+
28+
2429
@api.get("/info")
25-
def info() -> JSONResponse:
26-
return JSONResponse(content=get_app_info())
30+
def info() -> ORJSONResponse:
31+
return ORJSONResponse(content=get_app_info())
2732

2833

2934
@api.post("/process")
30-
async def process(request: Request, file: Optional[UploadFile] = File(default=None)) -> Response:
35+
def process(request: Request, file: Optional[UploadFile] = File(default=None)) -> ORJSONResponse:
3136
"""
3237
Processes raw binary input stream, file, or
3338
JSON containing the binary_data field in base64 format
@@ -44,15 +49,15 @@ async def process(request: Request, file: Optional[UploadFile] = File(default=No
4449

4550
if file:
4651
file_name = file.filename if file.filename else ""
47-
stream = await file.read()
52+
stream = file.file.read()
4853
log.info(f"Processing file given via 'file' parameter, file name: {file_name}")
4954
else:
5055
file_name = uuid.uuid4().hex
5156
log.info(f"Processing binary as data-binary, generated file name: {file_name}")
52-
raw_body = await request.body()
57+
raw_body = request._body
5358

5459
try:
55-
record = json.loads(raw_body)
60+
record = orjson.loads(raw_body)
5661
if isinstance(record, list) and len(record) > 0:
5762
record = record[0]
5863

@@ -70,7 +75,7 @@ async def process(request: Request, file: Optional[UploadFile] = File(default=No
7075
stream = raw_body
7176

7277
log.info("Stream contains valid JSON.")
73-
except json.JSONDecodeError:
78+
except orjson.JSONDecodeError:
7479
stream = raw_body
7580
log.warning("Stream does not contain valid JSON.")
7681

@@ -81,22 +86,16 @@ async def process(request: Request, file: Optional[UploadFile] = File(default=No
8186

8287
code = 200 if len(output_text) > 0 or not stream else 500
8388

84-
response: dict[Any, Any] | bytes | str = build_response(
85-
output_text,
86-
footer=footer,
87-
metadata=doc_metadata
88-
)
89+
response: dict[Any, Any] = {"result": build_response(output_text, footer=footer, metadata=doc_metadata)}
8990

90-
response = json.dumps({"result": response}, ensure_ascii=False).encode("utf-8")
91-
92-
return Response(content=response, status_code=code, media_type="application/json")
91+
return ORJSONResponse(content=response, status_code=code, media_type="application/json")
9392

9493

9594
@api.post("/process_file")
96-
async def process_file(request: Request, file: UploadFile = File(...)) -> Response:
95+
def process_file(request: Request, file: UploadFile = File(...)) -> ORJSONResponse:
9796

9897
file_name: str = file.filename if file.filename else ""
99-
stream: bytes = await file.read()
98+
stream: bytes = file.file.read()
10099
log.info(f"Processing file: {file_name}")
101100

102101
processor: Processor = request.app.state.processor
@@ -109,23 +108,18 @@ async def process_file(request: Request, file: UploadFile = File(...)) -> Respon
109108

110109
code = 200 if len(output_text) > 0 or not stream else 500
111110

112-
response: dict[Any, Any] | bytes | str = build_response(
113-
output_text,
114-
metadata=doc_metadata
115-
)
116-
117-
response = json.dumps({"result": response}, ensure_ascii=False).encode("utf-8")
111+
response: dict[Any, Any] = {"result": build_response(output_text, metadata=doc_metadata)}
118112

119-
return Response(content=response, status_code=code)
113+
return ORJSONResponse(content=response, status_code=code, media_type="application/json")
120114

121115

122116
@api.post("/process_bulk")
123-
async def process_bulk(request: Request) -> Response:
117+
def process_bulk(request: Request, files: List[UploadFile] = File(...)) -> Response:
124118
"""
125-
Processes multiple files in a single request.
119+
Processes multiple files in a single request (multipart/form-data with multiple 'files').
126120
"""
127121

128-
form = await request.form()
122+
form = request._form
129123
file_streams = {}
130124

131125
proc_results = list()
@@ -136,7 +130,7 @@ async def process_bulk(request: Request) -> Response:
136130
# collect uploaded files
137131
for name, file in form.items():
138132
if isinstance(file, UploadFile):
139-
content = await file.read()
133+
content = file.read()
140134
file_streams[file.filename] = content
141135

142136
with Pool(processes=CPU_THREADS) as process_pool:

ocr_service/app/app.py

Lines changed: 46 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,10 @@
33
import subprocess
44
import sys
55
import time
6-
from contextlib import asynccontextmanager
76
from threading import Event, Thread
87
from typing import Any
98

9+
from fastapi.responses import ORJSONResponse
1010
import psutil
1111
from fastapi import FastAPI
1212

@@ -19,6 +19,9 @@
1919

2020
sys.path.append("..")
2121

22+
# guard so LibreOffice startup runs only once per worker
23+
_started: bool = False
24+
2225

2326
def start_office_server(port_num) -> dict[str, Any]:
2427
"""
@@ -108,54 +111,60 @@ def monitor_office_processes(thread_event: Event, processor: Processor) -> None:
108111
time.sleep(LIBRE_OFFICE_PROCESSES_LISTENER_INTERVAL)
109112

110113

111-
@asynccontextmanager
112-
async def office_process_lifespan(app: FastAPI):
113-
"""
114-
:description: Lifespan context manager to start and stop LibreOffice unoserver processes
115-
:param app: FastAPI application instance
116-
"""
117-
118-
# start LibreOffice unoserver processes
119-
loffice_processes = start_office_converter_servers()
120-
processor = Processor()
121-
processor.loffice_process_list.update(loffice_processes)
122-
app.state.processor = processor
123-
124-
# start persistent background thread for monitoring
125-
thread_event: Event = Event()
126-
127-
proc_listener_thread = Thread(
128-
target=monitor_office_processes,
129-
args=(thread_event, processor),
130-
name="loffice_proc_listener",
131-
daemon=True
132-
)
133-
proc_listener_thread.start()
134-
135-
try:
136-
yield
137-
finally:
138-
# shutdown: kill processes & stop monitoring
139-
thread_event.set()
140-
for port, proc in processor.loffice_process_list.items():
141-
logging.info(f"shutting down libreoffice process on port {port}")
142-
proc["process"].kill()
143-
144-
145114
def create_app() -> FastAPI:
146115
"""
147116
:description: Creates FastAPI application with API router and starts libreoffice unoserver processes
148117
:return: FastAPI application instance
149118
"""
150119

120+
global _started
121+
151122
try:
152123
app = FastAPI(title="OCR Service",
153124
description="OCR Service API",
154125
version=OCR_SERVICE_VERSION,
155-
debug=DEBUG_MODE,
156-
lifespan=office_process_lifespan)
126+
default_response_class=ORJSONResponse,
127+
debug=DEBUG_MODE)
157128
app.include_router(api)
158129

130+
# start once per worker
131+
if not _started:
132+
_started = True
133+
# Start LibreOffice unoserver processes
134+
loffice_processes = start_office_converter_servers()
135+
processor = Processor()
136+
processor.loffice_process_list.update(loffice_processes)
137+
app.state.processor = processor
138+
139+
# Start monitor thread
140+
thread_event = Event()
141+
proc_listener_thread = Thread(
142+
target=monitor_office_processes,
143+
args=(thread_event, processor),
144+
name="loffice_proc_listener",
145+
daemon=True
146+
)
147+
proc_listener_thread.start()
148+
149+
import atexit
150+
151+
def cleanup():
152+
thread_event.set()
153+
if proc_listener_thread.is_alive():
154+
proc_listener_thread.join(timeout=5)
155+
for port, proc in processor.loffice_process_list.items():
156+
p = proc["process"]
157+
try:
158+
logging.info(f"shutting down libreoffice process on port {port}")
159+
p.terminate()
160+
p.wait(timeout=3)
161+
except Exception:
162+
try:
163+
p.kill()
164+
except Exception as e:
165+
logging.error("error in when shutting down libreoffice process: " + str(e))
166+
atexit.register(cleanup)
167+
159168
except Exception:
160169
raise
161170

0 commit comments

Comments
 (0)