feat: Integrated Vectorization - adding OCR skill (Azure-Samples#1021)

komalg1 · web-flow · commit 30440a8e3d65 · 2024-06-20T13:24:09.000Z
diff --git a/code/backend/batch/utilities/integrated_vectorization/azure_search_indexer.py b/code/backend/batch/utilities/integrated_vectorization/azure_search_indexer.py
@@ -27,11 +27,26 @@ def create_or_update_indexer(self, indexer_name: str, skillset_name: str):
             skillset_name=skillset_name,
             target_index_name=self.env_helper.AZURE_SEARCH_INDEX,
             data_source_name=self.env_helper.AZURE_SEARCH_DATASOURCE_NAME,
+            parameters={
+                "configuration": {
+                    "dataToExtract": "contentAndMetadata",
+                    "parsingMode": "default",
+                    "imageAction": "generateNormalizedImages",
+                }
+            },
             field_mappings=[
                 FieldMapping(
                     source_field_name="metadata_storage_path",
                     target_field_name="source",
                 ),
+                FieldMapping(
+                    source_field_name="/document/normalized_images/*/text",
+                    target_field_name="text",
+                ),
+                FieldMapping(
+                    source_field_name="/document/normalized_images/*/layoutText",
+                    target_field_name="layoutText",
+                ),
             ],
         )
         indexer_result = self.indexer_client.create_or_update_indexer(indexer)
diff --git a/code/backend/batch/utilities/integrated_vectorization/azure_search_skillset.py b/code/backend/batch/utilities/integrated_vectorization/azure_search_skillset.py
@@ -4,6 +4,8 @@
     InputFieldMappingEntry,
     OutputFieldMappingEntry,
     AzureOpenAIEmbeddingSkill,
+    OcrSkill,
+    MergeSkill,
     SearchIndexerIndexProjections,
     SearchIndexerIndexProjectionSelector,
     SearchIndexerIndexProjectionsParameters,
@@ -39,14 +41,46 @@ def __init__(
     def create_skillset(self):
         skillset_name = f"{self.env_helper.AZURE_SEARCH_INDEX}-skillset"
 
+        ocr_skill = OcrSkill(
+            description="Extract text (plain and structured) from image",
+            context="/document/normalized_images/*",
+            inputs=[
+                InputFieldMappingEntry(
+                    name="image",
+                    source="/document/normalized_images/*",
+                )
+            ],
+            outputs=[
+                OutputFieldMappingEntry(name="text", target_name="text"),
+                OutputFieldMappingEntry(name="layoutText", target_name="layoutText"),
+            ],
+        )
+
+        merge_skill = MergeSkill(
+            description="Merge text from OCR and text from document",
+            context="/document",
+            inputs=[
+                InputFieldMappingEntry(name="text", source="/document/content"),
+                InputFieldMappingEntry(
+                    name="itemsToInsert", source="/document/normalized_images/*/text"
+                ),
+                InputFieldMappingEntry(
+                    name="offsets", source="/document/normalized_images/*/contentOffset"
+                ),
+            ],
+            outputs=[
+                OutputFieldMappingEntry(name="mergedText", target_name="merged_content")
+            ],
+        )
+
         split_skill = SplitSkill(
             description="Split skill to chunk documents",
             text_split_mode="pages",
             context="/document",
             maximum_page_length=self.integrated_vectorization_config.max_page_length,
             page_overlap_length=self.integrated_vectorization_config.page_overlap_length,
             inputs=[
-                InputFieldMappingEntry(name="text", source="/document/content"),
+                InputFieldMappingEntry(name="text", source="/document/merged_content"),
             ],
             outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")],
         )
@@ -98,7 +132,7 @@ def create_skillset(self):
         skillset = SearchIndexerSkillset(
             name=skillset_name,
             description="Skillset to chunk documents and generating embeddings",
-            skills=[split_skill, embedding_skill],
+            skills=[ocr_skill, merge_skill, split_skill, embedding_skill],
             index_projections=index_projections,
         )
 
diff --git a/code/tests/functional/conftest.py b/code/tests/functional/conftest.py
@@ -143,23 +143,83 @@ def setup_default_mocking(httpserver: HTTPServer, app_config: AppConfig):
     ).respond_with_json(
         {
             "name": f"{app_config.get('AZURE_SEARCH_INDEX')}-skillset",
-            "description": "Extract entities, detect language and extract key-phrases",
+            "description": "Skillset to chunk documents and generating embeddings",
             "skills": [
                 {
-                    "@odata.type": "#Microsoft.Skills.Text.SplitSkill",
-                    "name": "#3",
-                    "description": None,
-                    "context": None,
+                    "@odata.type": "#Microsoft.Skills.Vision.OcrSkill",
+                    "description": "Extract text (plain and structured) from image",
+                    "context": "/document/normalized_images/*",
+                    "inputs": [
+                        {"name": "image", "source": "/document/normalized_images/*"}
+                    ],
+                    "outputs": [
+                        {"name": "text", "targetName": "text"},
+                        {"name": "layoutText", "targetName": "layoutText"},
+                    ],
+                    "detectOrientation": False,
+                },
+                {
+                    "@odata.type": "#Microsoft.Skills.Text.MergeSkill",
+                    "description": "Merge text from OCR and text from document",
+                    "context": "/document",
                     "inputs": [
                         {"name": "text", "source": "/document/content"},
-                        {"name": "languageCode", "source": "/document/languageCode"},
+                        {
+                            "name": "itemsToInsert",
+                            "source": "/document/normalized_images/*/text",
+                        },
+                        {
+                            "name": "offsets",
+                            "source": "/document/normalized_images/*/contentOffset",
+                        },
                     ],
+                    "outputs": [{"name": "mergedText", "targetName": "merged_content"}],
+                    "insertPreTag": " ",
+                    "insertPostTag": " ",
+                },
+                {
+                    "@odata.type": "#Microsoft.Skills.Text.SplitSkill",
+                    "description": "Split skill to chunk documents",
+                    "context": "/document",
+                    "inputs": [{"name": "text", "source": "/document/merged_content"}],
                     "outputs": [{"name": "textItems", "targetName": "pages"}],
-                    "defaultLanguageCode": None,
                     "textSplitMode": "pages",
-                    "maximumPageLength": 4000,
+                    "maximumPageLength": 800,
+                    "pageOverlapLength": 100,
+                },
+                {
+                    "@odata.type": "#Microsoft.Skills.Text.AzureOpenAIEmbeddingSkill",
+                    "description": "Skill to generate embeddings via Azure OpenAI",
+                    "context": "/document/pages/*",
+                    "inputs": [{"name": "text", "source": "/document/pages/*"}],
+                    "outputs": [{"name": "embedding", "targetName": "content_vector"}],
+                    "resourceUri": f"https://localhost:{httpserver.port}/",
+                    "deploymentId": f"{app_config.get('AZURE_OPENAI_EMBEDDING_MODEL')}",
+                    "apiKey": f"{app_config.get('AZURE_OPENAI_API_KEY')}",
                 },
             ],
+            "indexProjections": {
+                "selectors": [
+                    {
+                        "targetIndexName": f"{app_config.get('AZURE_SEARCH_INDEX')}",
+                        "parentKeyFieldName": "id",
+                        "sourceContext": "/document/pages/*",
+                        "mappings": [
+                            {"name": "content", "source": "/document/pages/*"},
+                            {
+                                "name": "content_vector",
+                                "source": "/document/pages/*/content_vector",
+                            },
+                            {"name": "title", "source": "/document/title"},
+                            {
+                                "name": "source",
+                                "source": "/document/metadata_storage_path",
+                            },
+                        ],
+                    }
+                ],
+                "parameters": {"projectionMode": "skipIndexingParentDocuments"},
+            },
         },
         status=201,
     )
diff --git a/code/tests/functional/tests/functions/integrated_vectorization/test_integrated_vectorization_resource_creation.py b/code/tests/functional/tests/functions/integrated_vectorization/test_integrated_vectorization_resource_creation.py
@@ -284,6 +284,92 @@ def test_integrated_vectorization_skillset_created(
             method="PUT",
             query_string="api-version=2023-10-01-Preview",
             times=1,
+            json={
+                "name": f"{app_config.get('AZURE_SEARCH_INDEX')}-skillset",
+                "description": "Skillset to chunk documents and generating embeddings",
+                "skills": [
+                    {
+                        "@odata.type": "#Microsoft.Skills.Vision.OcrSkill",
+                        "description": "Extract text (plain and structured) from image",
+                        "context": "/document/normalized_images/*",
+                        "inputs": [
+                            {"name": "image", "source": "/document/normalized_images/*"}
+                        ],
+                        "outputs": [
+                            {"name": "text", "targetName": "text"},
+                            {"name": "layoutText", "targetName": "layoutText"},
+                        ],
+                        "detectOrientation": False,
+                    },
+                    {
+                        "@odata.type": "#Microsoft.Skills.Text.MergeSkill",
+                        "description": "Merge text from OCR and text from document",
+                        "context": "/document",
+                        "inputs": [
+                            {"name": "text", "source": "/document/content"},
+                            {
+                                "name": "itemsToInsert",
+                                "source": "/document/normalized_images/*/text",
+                            },
+                            {
+                                "name": "offsets",
+                                "source": "/document/normalized_images/*/contentOffset",
+                            },
+                        ],
+                        "outputs": [
+                            {"name": "mergedText", "targetName": "merged_content"}
+                        ],
+                        "insertPreTag": " ",
+                        "insertPostTag": " ",
+                    },
+                    {
+                        "@odata.type": "#Microsoft.Skills.Text.SplitSkill",
+                        "description": "Split skill to chunk documents",
+                        "context": "/document",
+                        "inputs": [
+                            {"name": "text", "source": "/document/merged_content"}
+                        ],
+                        "outputs": [{"name": "textItems", "targetName": "pages"}],
+                        "textSplitMode": "pages",
+                        "maximumPageLength": 800,
+                        "pageOverlapLength": 100,
+                    },
+                    {
+                        "@odata.type": "#Microsoft.Skills.Text.AzureOpenAIEmbeddingSkill",
+                        "description": "Skill to generate embeddings via Azure OpenAI",
+                        "context": "/document/pages/*",
+                        "inputs": [{"name": "text", "source": "/document/pages/*"}],
+                        "outputs": [
+                            {"name": "embedding", "targetName": "content_vector"}
+                        ],
+                        "resourceUri": f"https://localhost:{httpserver.port}/",
+                        "deploymentId": f"{app_config.get('AZURE_OPENAI_EMBEDDING_MODEL')}",
+                        "apiKey": f"{app_config.get('AZURE_OPENAI_API_KEY')}",
+                    },
+                ],
+                "indexProjections": {
+                    "selectors": [
+                        {
+                            "targetIndexName": f"{app_config.get('AZURE_SEARCH_INDEX')}",
+                            "parentKeyFieldName": "id",
+                            "sourceContext": "/document/pages/*",
+                            "mappings": [
+                                {"name": "content", "source": "/document/pages/*"},
+                                {
+                                    "name": "content_vector",
+                                    "source": "/document/pages/*/content_vector",
+                                },
+                                {"name": "title", "source": "/document/title"},
+                                {
+                                    "name": "source",
+                                    "source": "/document/metadata_storage_path",
+                                },
+                            ],
+                        }
+                    ],
+                    "parameters": {"projectionMode": "skipIndexingParentDocuments"},
+                },
+            },
         ),
     )
 
diff --git a/code/tests/utilities/integrated_vectorization/test_azure_search_indexer.py b/code/tests/utilities/integrated_vectorization/test_azure_search_indexer.py
@@ -61,6 +61,13 @@ def test_create_or_update_indexer_keys(
         skillset_name="skillset_name",
         target_index_name=env_helper_mock.AZURE_SEARCH_INDEX,
         data_source_name=env_helper_mock.AZURE_SEARCH_DATASOURCE_NAME,
+        parameters={
+            "configuration": {
+                "dataToExtract": "contentAndMetadata",
+                "parsingMode": "default",
+                "imageAction": "generateNormalizedImages",
+            }
+        },
         field_mappings=ANY,
     )
 
@@ -88,6 +95,13 @@ def test_create_or_update_indexer_rbac(
         skillset_name="skillset_name",
         target_index_name=env_helper_mock.AZURE_SEARCH_INDEX,
         data_source_name=env_helper_mock.AZURE_SEARCH_DATASOURCE_NAME,
+        parameters={
+            "configuration": {
+                "dataToExtract": "contentAndMetadata",
+                "parsingMode": "default",
+                "imageAction": "generateNormalizedImages",
+            }
+        },
         field_mappings=ANY,
     )
 
diff --git a/code/tests/utilities/integrated_vectorization/test_azure_search_skillset.py b/code/tests/utilities/integrated_vectorization/test_azure_search_skillset.py
@@ -6,6 +6,8 @@
 from azure.search.documents.indexes.models import (
     SearchIndexerSkillset,
     SplitSkill,
+    OcrSkill,
+    MergeSkill,
     AzureOpenAIEmbeddingSkill,
     SearchIndexerIndexProjections,
 )
@@ -43,7 +45,7 @@ def search_indexer_client_mock():
         indexer_client.create_or_update_skillset.return_value = SearchIndexerSkillset(
             name="skillset_name",
             description="Skillset to chunk documents and generating embeddings",
-            skills=[SplitSkill, AzureOpenAIEmbeddingSkill],
+            skills=[OcrSkill, MergeSkill, SplitSkill, AzureOpenAIEmbeddingSkill],
             index_projections=SearchIndexerIndexProjections,
         )
         yield mock
@@ -62,7 +64,7 @@ def test_create_skillset_keys(
 
     # then
     assert create_or_update_skillset.name == "skillset_name"
-    assert len(create_or_update_skillset.skills) == 2
+    assert len(create_or_update_skillset.skills) == 4
     assert create_or_update_skillset.index_projections is not None
     search_indexer_client_mock.return_value.create_or_update_skillset.assert_called_once()
 
@@ -82,6 +84,6 @@ def test_create_skillset_rbac(
 
     # then
     assert create_or_update_skillset.name == "skillset_name"
-    assert len(create_or_update_skillset.skills) == 2
+    assert len(create_or_update_skillset.skills) == 4
     assert create_or_update_skillset.index_projections is not None
     search_indexer_client_mock.return_value.create_or_update_skillset.assert_called_once()