Skip to content

Commit 30440a8

Browse files
authored
feat: Integrated Vectorization - adding OCR skill (Azure-Samples#1021)
1 parent f61045e commit 30440a8

File tree

6 files changed

+224
-13
lines changed

6 files changed

+224
-13
lines changed

code/backend/batch/utilities/integrated_vectorization/azure_search_indexer.py

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -27,11 +27,26 @@ def create_or_update_indexer(self, indexer_name: str, skillset_name: str):
2727
skillset_name=skillset_name,
2828
target_index_name=self.env_helper.AZURE_SEARCH_INDEX,
2929
data_source_name=self.env_helper.AZURE_SEARCH_DATASOURCE_NAME,
30+
parameters={
31+
"configuration": {
32+
"dataToExtract": "contentAndMetadata",
33+
"parsingMode": "default",
34+
"imageAction": "generateNormalizedImages",
35+
}
36+
},
3037
field_mappings=[
3138
FieldMapping(
3239
source_field_name="metadata_storage_path",
3340
target_field_name="source",
3441
),
42+
FieldMapping(
43+
source_field_name="/document/normalized_images/*/text",
44+
target_field_name="text",
45+
),
46+
FieldMapping(
47+
source_field_name="/document/normalized_images/*/layoutText",
48+
target_field_name="layoutText",
49+
),
3550
],
3651
)
3752
indexer_result = self.indexer_client.create_or_update_indexer(indexer)

code/backend/batch/utilities/integrated_vectorization/azure_search_skillset.py

Lines changed: 36 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,8 @@
44
InputFieldMappingEntry,
55
OutputFieldMappingEntry,
66
AzureOpenAIEmbeddingSkill,
7+
OcrSkill,
8+
MergeSkill,
79
SearchIndexerIndexProjections,
810
SearchIndexerIndexProjectionSelector,
911
SearchIndexerIndexProjectionsParameters,
@@ -39,14 +41,46 @@ def __init__(
3941
def create_skillset(self):
4042
skillset_name = f"{self.env_helper.AZURE_SEARCH_INDEX}-skillset"
4143

44+
ocr_skill = OcrSkill(
45+
description="Extract text (plain and structured) from image",
46+
context="/document/normalized_images/*",
47+
inputs=[
48+
InputFieldMappingEntry(
49+
name="image",
50+
source="/document/normalized_images/*",
51+
)
52+
],
53+
outputs=[
54+
OutputFieldMappingEntry(name="text", target_name="text"),
55+
OutputFieldMappingEntry(name="layoutText", target_name="layoutText"),
56+
],
57+
)
58+
59+
merge_skill = MergeSkill(
60+
description="Merge text from OCR and text from document",
61+
context="/document",
62+
inputs=[
63+
InputFieldMappingEntry(name="text", source="/document/content"),
64+
InputFieldMappingEntry(
65+
name="itemsToInsert", source="/document/normalized_images/*/text"
66+
),
67+
InputFieldMappingEntry(
68+
name="offsets", source="/document/normalized_images/*/contentOffset"
69+
),
70+
],
71+
outputs=[
72+
OutputFieldMappingEntry(name="mergedText", target_name="merged_content")
73+
],
74+
)
75+
4276
split_skill = SplitSkill(
4377
description="Split skill to chunk documents",
4478
text_split_mode="pages",
4579
context="/document",
4680
maximum_page_length=self.integrated_vectorization_config.max_page_length,
4781
page_overlap_length=self.integrated_vectorization_config.page_overlap_length,
4882
inputs=[
49-
InputFieldMappingEntry(name="text", source="/document/content"),
83+
InputFieldMappingEntry(name="text", source="/document/merged_content"),
5084
],
5185
outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")],
5286
)
@@ -98,7 +132,7 @@ def create_skillset(self):
98132
skillset = SearchIndexerSkillset(
99133
name=skillset_name,
100134
description="Skillset to chunk documents and generating embeddings",
101-
skills=[split_skill, embedding_skill],
135+
skills=[ocr_skill, merge_skill, split_skill, embedding_skill],
102136
index_projections=index_projections,
103137
)
104138

code/tests/functional/conftest.py

Lines changed: 68 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -143,23 +143,83 @@ def setup_default_mocking(httpserver: HTTPServer, app_config: AppConfig):
143143
).respond_with_json(
144144
{
145145
"name": f"{app_config.get('AZURE_SEARCH_INDEX')}-skillset",
146-
"description": "Extract entities, detect language and extract key-phrases",
146+
"description": "Skillset to chunk documents and generating embeddings",
147147
"skills": [
148148
{
149-
"@odata.type": "#Microsoft.Skills.Text.SplitSkill",
150-
"name": "#3",
151-
"description": None,
152-
"context": None,
149+
"@odata.type": "#Microsoft.Skills.Vision.OcrSkill",
150+
"description": "Extract text (plain and structured) from image",
151+
"context": "/document/normalized_images/*",
152+
"inputs": [
153+
{"name": "image", "source": "/document/normalized_images/*"}
154+
],
155+
"outputs": [
156+
{"name": "text", "targetName": "text"},
157+
{"name": "layoutText", "targetName": "layoutText"},
158+
],
159+
"detectOrientation": False,
160+
},
161+
{
162+
"@odata.type": "#Microsoft.Skills.Text.MergeSkill",
163+
"description": "Merge text from OCR and text from document",
164+
"context": "/document",
153165
"inputs": [
154166
{"name": "text", "source": "/document/content"},
155-
{"name": "languageCode", "source": "/document/languageCode"},
167+
{
168+
"name": "itemsToInsert",
169+
"source": "/document/normalized_images/*/text",
170+
},
171+
{
172+
"name": "offsets",
173+
"source": "/document/normalized_images/*/contentOffset",
174+
},
156175
],
176+
"outputs": [{"name": "mergedText", "targetName": "merged_content"}],
177+
"insertPreTag": " ",
178+
"insertPostTag": " ",
179+
},
180+
{
181+
"@odata.type": "#Microsoft.Skills.Text.SplitSkill",
182+
"description": "Split skill to chunk documents",
183+
"context": "/document",
184+
"inputs": [{"name": "text", "source": "/document/merged_content"}],
157185
"outputs": [{"name": "textItems", "targetName": "pages"}],
158-
"defaultLanguageCode": None,
159186
"textSplitMode": "pages",
160-
"maximumPageLength": 4000,
187+
"maximumPageLength": 800,
188+
"pageOverlapLength": 100,
189+
},
190+
{
191+
"@odata.type": "#Microsoft.Skills.Text.AzureOpenAIEmbeddingSkill",
192+
"description": "Skill to generate embeddings via Azure OpenAI",
193+
"context": "/document/pages/*",
194+
"inputs": [{"name": "text", "source": "/document/pages/*"}],
195+
"outputs": [{"name": "embedding", "targetName": "content_vector"}],
196+
"resourceUri": f"https://localhost:{httpserver.port}/",
197+
"deploymentId": f"{app_config.get('AZURE_OPENAI_EMBEDDING_MODEL')}",
198+
"apiKey": f"{app_config.get('AZURE_OPENAI_API_KEY')}",
161199
},
162200
],
201+
"indexProjections": {
202+
"selectors": [
203+
{
204+
"targetIndexName": f"{app_config.get('AZURE_SEARCH_INDEX')}",
205+
"parentKeyFieldName": "id",
206+
"sourceContext": "/document/pages/*",
207+
"mappings": [
208+
{"name": "content", "source": "/document/pages/*"},
209+
{
210+
"name": "content_vector",
211+
"source": "/document/pages/*/content_vector",
212+
},
213+
{"name": "title", "source": "/document/title"},
214+
{
215+
"name": "source",
216+
"source": "/document/metadata_storage_path",
217+
},
218+
],
219+
}
220+
],
221+
"parameters": {"projectionMode": "skipIndexingParentDocuments"},
222+
},
163223
},
164224
status=201,
165225
)

code/tests/functional/tests/functions/integrated_vectorization/test_integrated_vectorization_resource_creation.py

Lines changed: 86 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -284,6 +284,92 @@ def test_integrated_vectorization_skillset_created(
284284
method="PUT",
285285
query_string="api-version=2023-10-01-Preview",
286286
times=1,
287+
json={
288+
"name": f"{app_config.get('AZURE_SEARCH_INDEX')}-skillset",
289+
"description": "Skillset to chunk documents and generating embeddings",
290+
"skills": [
291+
{
292+
"@odata.type": "#Microsoft.Skills.Vision.OcrSkill",
293+
"description": "Extract text (plain and structured) from image",
294+
"context": "/document/normalized_images/*",
295+
"inputs": [
296+
{"name": "image", "source": "/document/normalized_images/*"}
297+
],
298+
"outputs": [
299+
{"name": "text", "targetName": "text"},
300+
{"name": "layoutText", "targetName": "layoutText"},
301+
],
302+
"detectOrientation": False,
303+
},
304+
{
305+
"@odata.type": "#Microsoft.Skills.Text.MergeSkill",
306+
"description": "Merge text from OCR and text from document",
307+
"context": "/document",
308+
"inputs": [
309+
{"name": "text", "source": "/document/content"},
310+
{
311+
"name": "itemsToInsert",
312+
"source": "/document/normalized_images/*/text",
313+
},
314+
{
315+
"name": "offsets",
316+
"source": "/document/normalized_images/*/contentOffset",
317+
},
318+
],
319+
"outputs": [
320+
{"name": "mergedText", "targetName": "merged_content"}
321+
],
322+
"insertPreTag": " ",
323+
"insertPostTag": " ",
324+
},
325+
{
326+
"@odata.type": "#Microsoft.Skills.Text.SplitSkill",
327+
"description": "Split skill to chunk documents",
328+
"context": "/document",
329+
"inputs": [
330+
{"name": "text", "source": "/document/merged_content"}
331+
],
332+
"outputs": [{"name": "textItems", "targetName": "pages"}],
333+
"textSplitMode": "pages",
334+
"maximumPageLength": 800,
335+
"pageOverlapLength": 100,
336+
},
337+
{
338+
"@odata.type": "#Microsoft.Skills.Text.AzureOpenAIEmbeddingSkill",
339+
"description": "Skill to generate embeddings via Azure OpenAI",
340+
"context": "/document/pages/*",
341+
"inputs": [{"name": "text", "source": "/document/pages/*"}],
342+
"outputs": [
343+
{"name": "embedding", "targetName": "content_vector"}
344+
],
345+
"resourceUri": f"https://localhost:{httpserver.port}/",
346+
"deploymentId": f"{app_config.get('AZURE_OPENAI_EMBEDDING_MODEL')}",
347+
"apiKey": f"{app_config.get('AZURE_OPENAI_API_KEY')}",
348+
},
349+
],
350+
"indexProjections": {
351+
"selectors": [
352+
{
353+
"targetIndexName": f"{app_config.get('AZURE_SEARCH_INDEX')}",
354+
"parentKeyFieldName": "id",
355+
"sourceContext": "/document/pages/*",
356+
"mappings": [
357+
{"name": "content", "source": "/document/pages/*"},
358+
{
359+
"name": "content_vector",
360+
"source": "/document/pages/*/content_vector",
361+
},
362+
{"name": "title", "source": "/document/title"},
363+
{
364+
"name": "source",
365+
"source": "/document/metadata_storage_path",
366+
},
367+
],
368+
}
369+
],
370+
"parameters": {"projectionMode": "skipIndexingParentDocuments"},
371+
},
372+
},
287373
),
288374
)
289375

code/tests/utilities/integrated_vectorization/test_azure_search_indexer.py

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -61,6 +61,13 @@ def test_create_or_update_indexer_keys(
6161
skillset_name="skillset_name",
6262
target_index_name=env_helper_mock.AZURE_SEARCH_INDEX,
6363
data_source_name=env_helper_mock.AZURE_SEARCH_DATASOURCE_NAME,
64+
parameters={
65+
"configuration": {
66+
"dataToExtract": "contentAndMetadata",
67+
"parsingMode": "default",
68+
"imageAction": "generateNormalizedImages",
69+
}
70+
},
6471
field_mappings=ANY,
6572
)
6673

@@ -88,6 +95,13 @@ def test_create_or_update_indexer_rbac(
8895
skillset_name="skillset_name",
8996
target_index_name=env_helper_mock.AZURE_SEARCH_INDEX,
9097
data_source_name=env_helper_mock.AZURE_SEARCH_DATASOURCE_NAME,
98+
parameters={
99+
"configuration": {
100+
"dataToExtract": "contentAndMetadata",
101+
"parsingMode": "default",
102+
"imageAction": "generateNormalizedImages",
103+
}
104+
},
91105
field_mappings=ANY,
92106
)
93107

code/tests/utilities/integrated_vectorization/test_azure_search_skillset.py

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,8 @@
66
from azure.search.documents.indexes.models import (
77
SearchIndexerSkillset,
88
SplitSkill,
9+
OcrSkill,
10+
MergeSkill,
911
AzureOpenAIEmbeddingSkill,
1012
SearchIndexerIndexProjections,
1113
)
@@ -43,7 +45,7 @@ def search_indexer_client_mock():
4345
indexer_client.create_or_update_skillset.return_value = SearchIndexerSkillset(
4446
name="skillset_name",
4547
description="Skillset to chunk documents and generating embeddings",
46-
skills=[SplitSkill, AzureOpenAIEmbeddingSkill],
48+
skills=[OcrSkill, MergeSkill, SplitSkill, AzureOpenAIEmbeddingSkill],
4749
index_projections=SearchIndexerIndexProjections,
4850
)
4951
yield mock
@@ -62,7 +64,7 @@ def test_create_skillset_keys(
6264

6365
# then
6466
assert create_or_update_skillset.name == "skillset_name"
65-
assert len(create_or_update_skillset.skills) == 2
67+
assert len(create_or_update_skillset.skills) == 4
6668
assert create_or_update_skillset.index_projections is not None
6769
search_indexer_client_mock.return_value.create_or_update_skillset.assert_called_once()
6870

@@ -82,6 +84,6 @@ def test_create_skillset_rbac(
8284

8385
# then
8486
assert create_or_update_skillset.name == "skillset_name"
85-
assert len(create_or_update_skillset.skills) == 2
87+
assert len(create_or_update_skillset.skills) == 4
8688
assert create_or_update_skillset.index_projections is not None
8789
search_indexer_client_mock.return_value.create_or_update_skillset.assert_called_once()

0 commit comments

Comments
 (0)