Skip to content

Commit 62e9fca

Browse files
authored
updates for OCRWorker (#565)
1 parent 8e9fa56 commit 62e9fca

File tree

16 files changed

+230
-19
lines changed

16 files changed

+230
-19
lines changed

.dockerignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,4 @@ ui2/.vscode
1717
ui2/.pnp.cjs
1818
ui2/.pnp.loader.mjs
1919
ui2/.env.development.local
20+
ui2/public/papermerge-runtime-config.js

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,3 +38,4 @@ simplest.yml
3838
ui/public/runtime/config.js
3939
.ruff_cache/
4040
.pytest_cache/
41+
.enwardrc

docker/standard/Dockerfile

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,14 @@ ENV CORE_APP=/core_app
1414
ENV PAPERMERGE__DATABASE__URL=sqlite:////db/db.sqlite3
1515
ENV PAPERMERGE__AUTH__USERNAME=admin
1616
ENV PAPERMERGE__AUTH__EMAIL=admin@example.com
17-
ENV PAPERMERGE__OCR__DEFAULT_LANGUAGE=deu
1817
ENV PAPERMERGE__MAIN__API_PREFIX=""
18+
ENV PAPERMERGE__OCR__LANG_CODES="deu,eng,ron"
19+
ENV PAPERMERGE__OCR__DEFAULT_LANG_CODE="deu"
20+
ENV PAPERMERGE__OCR__AUTOMATIC="false"
1921

2022
RUN apk update && apk add linux-headers python3-dev \
2123
gcc \
24+
curl \
2225
libc-dev \
2326
supervisor \
2427
imagemagick \
@@ -27,6 +30,7 @@ RUN apk update && apk add linux-headers python3-dev \
2730
poppler-utils
2831

2932
RUN pip install --upgrade poetry roco==0.4.2
33+
RUN curl -L -o /bin/env2js https://github.com/papermerge/env2js/releases/download/0.2/env2js.amd64
3034

3135
COPY poetry.lock pyproject.toml README.md LICENSE ${CORE_APP}/
3236

@@ -37,6 +41,7 @@ COPY docker/standard/entrypoint.sh /entrypoint.sh
3741
COPY docker/standard/bundles/supervisor/* /etc/papermerge/
3842
COPY docker/standard/bundles/nginx/* /etc/nginx/
3943
COPY docker/standard/logging.yaml /etc/papermerge/
44+
COPY docker/standard/core.js.tmpl /${CORE_APP}/core.js.tmpl
4045
COPY ./papermerge ${CORE_APP}/papermerge/
4146
COPY alembic.ini ${CORE_APP}/
4247

docker/standard/core.js.tmpl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
window.__PAPERMERGE_RUNTIME_CONFIG__ = {
2+
ocr__lang_codes: "{{ .PAPERMERGE__OCR__LANG_CODES }}",
3+
ocr__default_lang_code: "{{ .PAPERMERGE__OCR__DEFAULT_LANG_CODE }}",
4+
ocr__automatic: {{ .PAPERMERGE__OCR__AUTOMATIC }}
5+
}

docker/standard/entrypoint.sh

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,15 @@ case $CMD in
5555
;;
5656
server)
5757
exec_init
58+
# TODO: replace roco with env2js
5859
roco > /usr/share/nginx/html/auth_server/papermerge-runtime-config.js
59-
roco > /usr/share/nginx/html/ui/papermerge-runtime-config.js
60+
/bin/env2js -f /core_app/core.js.tmpl > /usr/share/nginx/html/ui/papermerge-runtime-config.js
61+
exec /usr/bin/supervisord -c /etc/papermerge/supervisord.conf
62+
;;
63+
server_without_init)
64+
# TODO: replace roco with env2js
65+
roco > /usr/share/nginx/html/auth_server/papermerge-runtime-config.js
66+
/bin/env2js -f /core_app/core.js.tmpl > /usr/share/nginx/html/ui/papermerge-runtime-config.js
6067
exec /usr/bin/supervisord -c /etc/papermerge/supervisord.conf
6168
;;
6269
create_token.sh)

papermerge/core/config.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,15 @@ class Settings(BaseSettings):
1919
papermerge__main__cf_domain: str | None = None
2020
papermerge__database__url: str = "sqlite:////db/db.sqlite3"
2121
papermerge__redis__url: str | None = None
22-
papermerge__ocr__default_language: str = 'deu'
22+
papermerge__ocr__default_lang_code: str = 'deu'
23+
# When is OCR triggered ?
24+
# `ocr__automatic` = True means that OCR will be performed without
25+
# end user intervention, i.e. via a background scheduler such as celery
26+
# `ocr__automatic` = False means that OCR will be performed only
27+
# if requested by end user. In this case user can choose to
28+
# schedule OCR on upload; also in this case the user can choose to
29+
# schedule OCR later on any document.
30+
papermerge__ocr__automatic: bool = False
2331
papermerge__search__url: str | None = None
2432

2533

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import uuid
2+
import typer
3+
4+
from papermerge.core.tasks import send_task
5+
from papermerge.core.db.engine import Session
6+
7+
from papermerge.core import dbapi, constants, types
8+
9+
10+
app = typer.Typer(help="OCR tasks")
11+
12+
13+
@app.command()
14+
def schedule_ocr(node_id: uuid.UUID, force: bool = False, lang: str | None = None):
15+
"""Schedules OCR for given node ID"""
16+
with Session() as db_session:
17+
node_type: types.CType = dbapi.get_node_type(db_session, node_id)
18+
19+
if node_type == "document":
20+
if lang is None:
21+
lang = dbapi.get_document_lang(db_session, node_id)
22+
send_task(
23+
constants.WORKER_OCR_DOCUMENT,
24+
kwargs={
25+
"document_id": str(node_id),
26+
"lang": lang,
27+
},
28+
route_name="ocr",
29+
)
30+
else:
31+
# get all descendants of node_id
32+
pass

papermerge/core/features/document/db/api.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,11 @@
2727
)
2828
from papermerge.core.features.document.schema import DocumentCFVRow
2929
from papermerge.core.features.document.ordered_document_cfv import OrderedDocumentCFV
30+
from papermerge.core import config
3031

3132
from .selectors import select_doc_cfv, select_docs_by_type
3233

33-
34+
settings = config.get_settings()
3435

3536
logger = logging.getLogger(__name__)
3637

@@ -634,6 +635,17 @@ def upload(
634635
route_name="s3",
635636
)
636637

638+
if not settings.papermerge__ocr__automatic:
639+
if doc.ocr is True:
640+
# user chose "schedule OCR" when uploading document
641+
tasks.send_task(
642+
constants.WORKER_OCR_DOCUMENT,
643+
kwargs={
644+
"document_id": str(doc.id),
645+
"lang": doc.lang,
646+
},
647+
route_name="ocr",
648+
)
637649

638650
return validated_model, None
639651

ui2/index.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
<body>
1010
<div id="root"></div>
1111
<div id="modals"></div>
12+
<script type="module" src="/papermerge-runtime-config.js"></script>
1213
<script type="module" src="/src/main.tsx"></script>
1314
</body>
1415
</html>

ui2/papermerge-runtime-config.js

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
window.__PAPERMERGE_RUNTIME_CONFIG__ = {
2+
ocr__lang_codes: "deu,eng,ron, spa, ita, fra",
3+
ocr__default_lang_code: "eng",
4+
ocr__automatic: false
5+
}

0 commit comments

Comments
 (0)