Skip to content

Commit 2e870ab

Browse files
authored
add ocr-text endpoint to get document page(s) ocred text
add ocr-text endpoint to get document page(s) ocred text
1 parent 610fd4c commit 2e870ab

File tree

14 files changed

+358
-34
lines changed

14 files changed

+358
-34
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
.idea/
22
.papermerge.toml
33
.env
4+
.envrc
45
.env_shell
56
.env_services
67
config/settings/local/*

docker/dev/.envrc

Lines changed: 0 additions & 11 deletions
This file was deleted.

docker/dev/logging.yaml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,6 @@ handlers:
1212
formatter: verbose
1313

1414
loggers:
15-
django.db.backends:
16-
level: DEBUG
17-
handlers: [console]
18-
propagate: no
1915
papermerge:
2016
level: DEBUG
2117
handlers: [console]

papermerge/core/models/document_version.py

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,13 @@
1212

1313

1414
class DocumentVersion(models.Model):
15+
"""Document Version
1516
17+
Document can have one or multiple versions.
18+
Document has at least one version associated (the original).
19+
Each document version has a number - which starts with 1 (one) i.e.
20+
the original document version is document version 1 (one).
21+
"""
1622
id = models.UUIDField(primary_key=True, default=uuid.uuid4)
1723

1824
document = models.ForeignKey(
@@ -31,7 +37,7 @@ class DocumentVersion(models.Model):
3137
)
3238
# version number
3339
number = models.IntegerField(
34-
default=1,
40+
default=1, # Document versioning starts with 1
3541
verbose_name=_('Version number')
3642
)
3743
#: basename + ext of uploaded file.
@@ -64,8 +70,8 @@ class Meta:
6470
verbose_name = _('Document version')
6571
verbose_name_plural = _('Document versions')
6672

67-
def __str__(self):
68-
return f"id={self.pk} number={self.number}"
73+
def __repr__(self):
74+
return f"DocumentVersion(id={self.pk}, number={self.number})"
6975

7076
def abs_file_path(self):
7177
return abs_path(
@@ -169,3 +175,29 @@ def update_text_field(self, streams):
169175
self.save()
170176

171177
return self.has_combined_text
178+
179+
def get_ocred_text(
    self,
    page_numbers: list = (),
    page_ids: list = ()
) -> str:
    """
    Returns OCRed text of given pages.

    You can filter pages for which OCRed text is requested either by
    page numbers or by page ids. A page is included when it matches at
    least one of the two filters; texts of the matching pages are joined
    with a single space, in the order `self.pages.all()` yields them.

    If both `page_numbers` and `page_ids` are empty i.e. no filters, then
    `self.text` is returned and the pages are not loaded at all.

    :param page_numbers: page numbers (integers) to include
    :param page_ids: page ids to include; either strings or UUID objects
    :return: the resulting text with leading/trailing whitespace stripped
    """
    if not (page_ids or page_numbers):
        # when both filters are empty, return the `self.text` field;
        # returning early avoids iterating over the pages for nothing
        return self.text.strip()

    # Sets give constant-time membership tests. Ids are normalized to
    # strings so that UUID objects and their string form both match.
    wanted_numbers = set(page_numbers)
    wanted_ids = {str(page_id) for page_id in page_ids}

    pages_text = " ".join(
        page.text
        for page in self.pages.all()
        if page.number in wanted_numbers or str(page.pk) in wanted_ids
    )

    return pages_text.strip()

papermerge/core/serializers/__init__.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
from .automate import AutomateSerializer
22
from .document import DocumentSerializer
33
from .document import DocumentDetailsSerializer, DocumentsMergeSerializer
4-
from .document_version import DocumentVersionSerializer
4+
from .document_version import (
5+
DocumentVersionSerializer,
6+
DocumentVersionOcrTextSerializer
7+
)
58
from .folder import FolderSerializer
69
from .node import (
710
NodeSerializer,
@@ -42,6 +45,7 @@
4245
'DocumentDetailsSerializer',
4346
'DocumentsMergeSerializer',
4447
'DocumentVersionSerializer',
48+
'DocumentVersionOcrTextSerializer',
4549
'FolderSerializer',
4650
'NodeSerializer',
4751
'NodeMoveSerializer',

papermerge/core/serializers/document_version.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from rest_framework_json_api import serializers
2+
from rest_framework import serializers as rest_serializers
23
from papermerge.core.models import DocumentVersion
34

45

@@ -18,3 +19,8 @@ class Meta:
1819
'short_description',
1920
'document',
2021
)
22+
23+
24+
class DocumentVersionOcrTextSerializer(rest_serializers.Serializer):
    """Serializer for the OCRed text of a document"""
    # Optional field; an empty string is a valid value.
    text = rest_serializers.CharField(allow_blank=True, required=False)

papermerge/core/tasks.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,17 @@ def ocr_document_task(
7777

7878

7979
@shared_task
def post_ocr_document_task(document_id, namespace=None):
    """
    Task to run immediately after document OCR is complete

    Both post-OCR steps are executed inside this single task, one after
    the other, so `increment_document_version` always finishes before
    `update_document_pages` starts.
    """
    # The tuple order is the execution order - do not reorder.
    post_ocr_steps = (increment_document_version, update_document_pages)

    for step in post_ocr_steps:
        step(document_id, namespace)
89+
90+
8091
def increment_document_version(document_id, namespace=None):
8192
logger.debug(
8293
'increment_document_version: '
@@ -113,7 +124,6 @@ def increment_document_version(document_id, namespace=None):
113124
)
114125

115126

116-
@shared_task
117127
def update_document_pages(document_id, namespace=None):
118128
"""
119129
Updates document latest version's ``text`` field

papermerge/core/urls.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77

88
router = routers.DefaultRouter()
99

10-
router.register(r"automates", views.AutomatesViewSet, basename="automate")
1110
router.register(r"tokens", views.TokensViewSet, basename="token")
1211
router.register(r"tags", views.TagsViewSet, basename="tag")
1312
router.register("nodes", views.NodesViewSet, basename="node")
@@ -29,6 +28,11 @@
2928
views.DocumentUploadView.as_view(),
3029
name='documents_upload'
3130
),
31+
path(
32+
'documents/<uuid:pk>/ocr-text',
33+
views.DocumentOcrTextView.as_view(),
34+
name='document-ocr-text'
35+
),
3236
path(
3337
'documents/merge/',
3438
views.DocumentsMergeView.as_view(),

papermerge/core/views/__init__.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,11 @@
1515
InboxCountView,
1616
NodeTagsView,
1717
)
18-
from .documents import DocumentUploadView, DocumentsMergeView
18+
from .documents import (
19+
DocumentUploadView,
20+
DocumentsMergeView,
21+
DocumentOcrTextView
22+
)
1923
from .document_versions import DocumentVersionsDownloadView
2024
from .documents import DocumentDetailsViewSet
2125
from .folders import FoldersViewSet
@@ -46,6 +50,7 @@
4650
'NodesDownloadView',
4751
'DocumentUploadView',
4852
'DocumentsMergeView',
53+
'DocumentOcrTextView',
4954
'DocumentVersionsDownloadView',
5055
'DocumentDetailsViewSet',
5156
'FoldersViewSet',

papermerge/core/views/documents.py

Lines changed: 72 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,18 +11,21 @@
1111
from rest_framework.parsers import FileUploadParser
1212
from rest_framework_json_api.views import ModelViewSet
1313
from rest_framework_json_api.renderers import JSONRenderer
14-
from drf_spectacular.utils import extend_schema
14+
from drf_spectacular.utils import (
15+
extend_schema,
16+
OpenApiParameter
17+
)
1518

1619
from papermerge.core.serializers import (
1720
DocumentDetailsSerializer,
18-
DocumentsMergeSerializer
21+
DocumentsMergeSerializer,
22+
DocumentVersionOcrTextSerializer
1923
)
2024
from papermerge.core.storage import get_storage_instance
2125
from papermerge.core.models import Document
2226
from papermerge.core.tasks import (
2327
ocr_document_task,
24-
update_document_pages,
25-
increment_document_version
28+
post_ocr_document_task
2629
)
2730
from papermerge.core.exceptions import APIBadRequest
2831

@@ -71,8 +74,7 @@ def put(self, request, document_id, file_name):
7174
'user_id': str(request.user.id)
7275
},
7376
link=[
74-
increment_document_version.s(namespace),
75-
update_document_pages.s(namespace)
77+
post_ocr_document_task.s(namespace),
7678
]
7779
)
7880
except OperationalError as ex:
@@ -90,6 +92,70 @@ def put(self, request, document_id, file_name):
9092
return Response({}, status=status.HTTP_201_CREATED)
9193

9294

95+
class DocumentOcrTextView(RequireAuthMixin, GenericAPIView):
    """Read-only endpoint which returns OCRed text of a document."""
    serializer_class = DocumentVersionOcrTextSerializer
    parser_classes = (rest_framework_JSONParser,)
    renderer_classes = (rest_framework_JSONRenderer,)
    queryset = Document.objects.all()

    @extend_schema(
        operation_id="Document OCR Text",
        parameters=[
            OpenApiParameter(
                name='page_numbers[]',
                description=(
                    "Filter pages by provided page numbers"
                ),
                required=False,
                type={'type': 'array', 'items': {'type': 'number'}}
            ),
            OpenApiParameter(
                name='page_ids[]',
                description=(
                    "Filter pages by provided page ids"
                ),
                required=False,
                type={'type': 'array', 'items': {'type': 'string'}}
            ),
        ]
    )
    def get(self, request, pk, *args, **kwargs):
        """Retrieve OCRed text of the document

        You can filter pages for which OCRed text is to be received either by
        page numbers or by page ids. When both filters are empty - retrieve
        OCRed text of the whole document (i.e. of its last document version)
        """
        # Document instance
        instance = self.get_object()
        document_version = instance.versions.last()

        # For what page numbers does user want to get OCR text?
        # A value which is not an integer is rejected with HTTP 400.
        # Silently dropping it would leave both filters empty, and the
        # client would receive the text of the whole document instead of
        # the pages it asked for.
        raw_page_numbers = request.query_params.getlist('page_numbers[]')
        try:
            page_numbers = [int(number) for number in raw_page_numbers]
        except ValueError:
            return Response(
                {'page_numbers[]': ['A valid integer is required.']},
                status=status.HTTP_400_BAD_REQUEST
            )

        page_ids = request.query_params.getlist('page_ids[]')

        # If both filters are empty - get OCR text for all pages
        # of the document version
        text = document_version.get_ocred_text(
            page_numbers=page_numbers,
            page_ids=page_ids
        )
        serializer = self.get_serializer(data={'text': text})

        if not serializer.is_valid():
            return Response(
                serializer.errors,
                status=status.HTTP_400_BAD_REQUEST
            )

        return Response(data=serializer.data)
157+
158+
93159
class DocumentsMergeView(RequireAuthMixin, GenericAPIView):
94160
serializer_class = DocumentsMergeSerializer
95161
parser_classes = (rest_framework_JSONParser,)

0 commit comments

Comments
 (0)