From 5078f3b2be334668b10c985f1c2eb584c3baae52 Mon Sep 17 00:00:00 2001 From: "Priyanka Singhal (Persistent Systems Inc)" Date: Fri, 24 Jan 2025 16:16:00 +0530 Subject: [PATCH 1/6] Resolve chunking issue during deployment when enabling advanced image processing --- code/backend/batch/utilities/helpers/config/config_helper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/code/backend/batch/utilities/helpers/config/config_helper.py b/code/backend/batch/utilities/helpers/config/config_helper.py index bc16287ce..5c64bee71 100644 --- a/code/backend/batch/utilities/helpers/config/config_helper.py +++ b/code/backend/batch/utilities/helpers/config/config_helper.py @@ -308,10 +308,10 @@ def clear_config(): @staticmethod def _append_advanced_image_processors(): image_file_types = ["jpeg", "jpg", "png", "tiff", "bmp"] - ConfigHelper._remove_processors_for_file_types(image_file_types) + # ConfigHelper._remove_processors_for_file_types(image_file_types) ConfigHelper._default_config["document_processors"].extend( [ - {"document_type": file_type, "use_advanced_image_processing": True} + {"document_type": file_type, "chunking" : ConfigHelper._default_config["document_processors"][0]["chunking"], "loading" : ConfigHelper._default_config["document_processors"][0]["loading"], "use_advanced_image_processing": True} for file_type in image_file_types ] ) From 0d4e962d7c3713e38b583b209c5b0ab4e44e4f59 Mon Sep 17 00:00:00 2001 From: Harmanpreet Kaur Date: Mon, 21 Apr 2025 10:48:08 +0530 Subject: [PATCH 2/6] updated test case --- .../utilities/helpers/test_config_helper.py | 42 +++++++++++++++---- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/code/tests/utilities/helpers/test_config_helper.py b/code/tests/utilities/helpers/test_config_helper.py index c643269af..213d74d92 100644 --- a/code/tests/utilities/helpers/test_config_helper.py +++ b/code/tests/utilities/helpers/test_config_helper.py @@ -176,8 +176,6 @@ def test_default_config_is_cached(): # then assert default_config_one is default_config_two - - def test_default_config_when_use_advanced_image_processing(env_helper_mock): # given env_helper_mock.return_value.USE_ADVANCED_IMAGE_PROCESSING = True @@ -187,11 +185,13 @@ def test_default_config_when_use_advanced_image_processing(env_helper_mock): # then expected_chunking = {"strategy": "layout", "size": 500, "overlap": 100} + expected_loading = {"strategy": "layout"} + assert config["document_processors"] == [ { "document_type": "pdf", "chunking": expected_chunking, - "loading": {"strategy": "layout"}, + "loading": expected_loading, }, { "document_type": "txt", @@ -228,14 +228,38 @@ def test_default_config_when_use_advanced_image_processing(env_helper_mock): "chunking": {"strategy": "json", "size": 500, "overlap": 100}, "loading": {"strategy": "web"}, }, - {"document_type": "jpeg", "use_advanced_image_processing": True}, - {"document_type": "jpg", "use_advanced_image_processing": True}, - {"document_type": "png", "use_advanced_image_processing": True}, - {"document_type": "tiff", "use_advanced_image_processing": True}, - {"document_type": "bmp", "use_advanced_image_processing": True}, + { + "document_type": "jpg", + "chunking": expected_chunking, + "loading": expected_loading, + "use_advanced_image_processing": True, + }, + { + "document_type": "jpeg", + "chunking": expected_chunking, + "loading": expected_loading, + "use_advanced_image_processing": True, + }, + { + "document_type": "png", + "chunking": expected_chunking, + "loading": expected_loading, + "use_advanced_image_processing": True, + }, + { + "document_type": "tiff", + "chunking": expected_chunking, + "loading": expected_loading, + "use_advanced_image_processing": True, + }, + { + "document_type": "bmp", + "chunking": expected_chunking, + "loading": expected_loading, + "use_advanced_image_processing": True, + }, ] - def test_get_config_from_azure( AzureBlobStorageClientMock: MagicMock, blob_client_mock: MagicMock, From c47b64d9c39708386fa237f1624e22020553dcf6 Mon Sep 17 00:00:00 2001 From: Harmanpreet Kaur Date: Mon, 21 Apr 2025 11:43:23 +0530 Subject: [PATCH 3/6] test case updated2 --- .../utilities/helpers/test_config_helper.py | 83 +++---------------- 1 file changed, 13 insertions(+), 70 deletions(-) diff --git a/code/tests/utilities/helpers/test_config_helper.py b/code/tests/utilities/helpers/test_config_helper.py index 213d74d92..2f383c9b4 100644 --- a/code/tests/utilities/helpers/test_config_helper.py +++ b/code/tests/utilities/helpers/test_config_helper.py @@ -188,76 +188,19 @@ def test_default_config_when_use_advanced_image_processing(env_helper_mock): expected_loading = {"strategy": "layout"} assert config["document_processors"] == [ - { - "document_type": "pdf", - "chunking": expected_chunking, - "loading": expected_loading, - }, - { - "document_type": "txt", - "chunking": expected_chunking, - "loading": {"strategy": "web"}, - }, - { - "document_type": "url", - "chunking": expected_chunking, - "loading": {"strategy": "web"}, - }, - { - "document_type": "md", - "chunking": expected_chunking, - "loading": {"strategy": "web"}, - }, - { - "document_type": "html", - "chunking": expected_chunking, - "loading": {"strategy": "web"}, - }, - { - "document_type": "htm", - "chunking": expected_chunking, - "loading": {"strategy": "web"}, - }, - { - "document_type": "docx", - "chunking": expected_chunking, - "loading": {"strategy": "docx"}, - }, - { - "document_type": "json", - "chunking": {"strategy": "json", "size": 500, "overlap": 100}, - "loading": {"strategy": "web"}, - }, - { - "document_type": "jpg", - "chunking": expected_chunking, - "loading": expected_loading, - "use_advanced_image_processing": True, - }, - { - "document_type": "jpeg", - "chunking": expected_chunking, - "loading": expected_loading, - "use_advanced_image_processing": True, - }, - { - "document_type": "png", - "chunking": expected_chunking, - "loading": expected_loading, - "use_advanced_image_processing": True, - }, - { - "document_type": "tiff", - "chunking": expected_chunking, - "loading": expected_loading, - "use_advanced_image_processing": True, - }, - { - "document_type": "bmp", - "chunking": expected_chunking, - "loading": expected_loading, - "use_advanced_image_processing": True, - }, + {"document_type": "pdf", "chunking": expected_chunking, "loading": expected_loading}, + {"document_type": "txt", "chunking": expected_chunking, "loading": {"strategy": "web"}}, + {"document_type": "url", "chunking": expected_chunking, "loading": {"strategy": "web"}}, + {"document_type": "md", "chunking": expected_chunking, "loading": {"strategy": "web"}}, + {"document_type": "html", "chunking": expected_chunking, "loading": {"strategy": "web"}}, + {"document_type": "htm", "chunking": expected_chunking, "loading": {"strategy": "web"}}, + {"document_type": "docx", "chunking": expected_chunking, "loading": {"strategy": "docx"}}, + {"document_type": "json", "chunking": {"strategy": "json", "size": 500, "overlap": 100}, "loading": {"strategy": "web"}}, + {"document_type": "jpg", "chunking": expected_chunking, "loading": expected_loading, "use_advanced_image_processing": True}, + {"document_type": "jpeg", "chunking": expected_chunking, "loading": expected_loading, "use_advanced_image_processing": True}, + {"document_type": "png", "chunking": expected_chunking, "loading": expected_loading, "use_advanced_image_processing": True}, + {"document_type": "tiff", "chunking": expected_chunking, "loading": expected_loading, "use_advanced_image_processing": True}, + {"document_type": "bmp", "chunking": expected_chunking, "loading": expected_loading, "use_advanced_image_processing": True}, ] def test_get_config_from_azure( From 799ae9a53926d49fe90db4f6b5b3371f29311629 Mon Sep 17 00:00:00 2001 From: Harmanpreet Kaur Date: Mon, 21 Apr 2025 11:56:45 +0530 Subject: [PATCH 4/6] test update 2 --- .../utilities/helpers/test_config_helper.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/code/tests/utilities/helpers/test_config_helper.py b/code/tests/utilities/helpers/test_config_helper.py index 2f383c9b4..4f6950d27 100644 --- a/code/tests/utilities/helpers/test_config_helper.py +++ b/code/tests/utilities/helpers/test_config_helper.py @@ -186,16 +186,17 @@ def test_default_config_when_use_advanced_image_processing(env_helper_mock): # then expected_chunking = {"strategy": "layout", "size": 500, "overlap": 100} expected_loading = {"strategy": "layout"} + expected_web_loading = {"strategy": "web"} - assert config["document_processors"] == [ + expected_document_processors = [ {"document_type": "pdf", "chunking": expected_chunking, "loading": expected_loading}, - {"document_type": "txt", "chunking": expected_chunking, "loading": {"strategy": "web"}}, - {"document_type": "url", "chunking": expected_chunking, "loading": {"strategy": "web"}}, - {"document_type": "md", "chunking": expected_chunking, "loading": {"strategy": "web"}}, - {"document_type": "html", "chunking": expected_chunking, "loading": {"strategy": "web"}}, - {"document_type": "htm", "chunking": expected_chunking, "loading": {"strategy": "web"}}, + {"document_type": "txt", "chunking": expected_chunking, "loading": expected_web_loading}, + {"document_type": "url", "chunking": expected_chunking, "loading": expected_web_loading}, + {"document_type": "md", "chunking": expected_chunking, "loading": expected_web_loading}, + {"document_type": "html", "chunking": expected_chunking, "loading": expected_web_loading}, + {"document_type": "htm", "chunking": expected_chunking, "loading": expected_web_loading}, {"document_type": "docx", "chunking": expected_chunking, "loading": {"strategy": "docx"}}, - {"document_type": "json", "chunking": {"strategy": "json", "size": 500, "overlap": 100}, "loading": {"strategy": "web"}}, + {"document_type": "json", "chunking": {"strategy": "json", "size": 500, "overlap": 100}, "loading": expected_web_loading}, {"document_type": "jpg", "chunking": expected_chunking, "loading": expected_loading, "use_advanced_image_processing": True}, {"document_type": "jpeg", "chunking": expected_chunking, "loading": expected_loading, "use_advanced_image_processing": True}, {"document_type": "png", "chunking": expected_chunking, "loading": expected_loading, "use_advanced_image_processing": True}, @@ -203,6 +204,8 @@ def test_default_config_when_use_advanced_image_processing(env_helper_mock): {"document_type": "bmp", "chunking": expected_chunking, "loading": expected_loading, "use_advanced_image_processing": True}, ] + assert config["document_processors"] == expected_document_processors + def test_get_config_from_azure( AzureBlobStorageClientMock: MagicMock, blob_client_mock: MagicMock, From 0569b0ee13f420634cca3853b9d962abb6bbd6bb Mon Sep 17 00:00:00 2001 From: Harmanpreet Kaur Date: Mon, 21 Apr 2025 14:33:07 +0530 Subject: [PATCH 5/6] updated the test file --- .../utilities/helpers/test_config_helper.py | 46 +++++++++++++------ 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/code/tests/utilities/helpers/test_config_helper.py b/code/tests/utilities/helpers/test_config_helper.py index 4f6950d27..c36ba2607 100644 --- a/code/tests/utilities/helpers/test_config_helper.py +++ b/code/tests/utilities/helpers/test_config_helper.py @@ -176,8 +176,13 @@ def test_default_config_is_cached(): # then assert default_config_one is default_config_two + + + +@patch("backend.batch.utilities.helpers.config.config_helper.EnvHelper") def test_default_config_when_use_advanced_image_processing(env_helper_mock): # given + ConfigHelper._default_config = None env_helper_mock.return_value.USE_ADVANCED_IMAGE_PROCESSING = True # when @@ -186,26 +191,39 @@ def test_default_config_when_use_advanced_image_processing(env_helper_mock): # then expected_chunking = {"strategy": "layout", "size": 500, "overlap": 100} expected_loading = {"strategy": "layout"} - expected_web_loading = {"strategy": "web"} + expected_image_processor = { + "chunking": expected_chunking, + "loading": expected_loading, + "use_advanced_image_processing": True, + } + + actual_processors = config["document_processors"] - expected_document_processors = [ + expected_processors = [ {"document_type": "pdf", "chunking": expected_chunking, "loading": expected_loading}, - {"document_type": "txt", "chunking": expected_chunking, "loading": expected_web_loading}, - {"document_type": "url", "chunking": expected_chunking, "loading": expected_web_loading}, - {"document_type": "md", "chunking": expected_chunking, "loading": expected_web_loading}, - {"document_type": "html", "chunking": expected_chunking, "loading": expected_web_loading}, - {"document_type": "htm", "chunking": expected_chunking, "loading": expected_web_loading}, + {"document_type": "txt", "chunking": expected_chunking, "loading": {"strategy": "web"}}, + {"document_type": "url", "chunking": expected_chunking, "loading": {"strategy": "web"}}, + {"document_type": "md", "chunking": expected_chunking, "loading": {"strategy": "web"}}, + {"document_type": "html", "chunking": expected_chunking, "loading": {"strategy": "web"}}, + {"document_type": "htm", "chunking": expected_chunking, "loading": {"strategy": "web"}}, {"document_type": "docx", "chunking": expected_chunking, "loading": {"strategy": "docx"}}, - {"document_type": "json", "chunking": {"strategy": "json", "size": 500, "overlap": 100}, "loading": expected_web_loading}, - {"document_type": "jpg", "chunking": expected_chunking, "loading": expected_loading, "use_advanced_image_processing": True}, - {"document_type": "jpeg", "chunking": expected_chunking, "loading": expected_loading, "use_advanced_image_processing": True}, - {"document_type": "png", "chunking": expected_chunking, "loading": expected_loading, "use_advanced_image_processing": True}, - {"document_type": "tiff", "chunking": expected_chunking, "loading": expected_loading, "use_advanced_image_processing": True}, - {"document_type": "bmp", "chunking": expected_chunking, "loading": expected_loading, "use_advanced_image_processing": True}, + { + "document_type": "json", + "chunking": {"strategy": "json", "size": 500, "overlap": 100}, + "loading": {"strategy": "web"}, + }, + {"document_type": "jpg", "chunking": expected_chunking, "loading": expected_loading}, + {"document_type": "jpeg", "chunking": expected_chunking, "loading": expected_loading}, + {"document_type": "png", "chunking": expected_chunking, "loading": expected_loading}, + {"document_type": "jpeg", **expected_image_processor}, + {"document_type": "jpg", **expected_image_processor}, + {"document_type": "png", **expected_image_processor}, + {"document_type": "tiff", **expected_image_processor}, + {"document_type": "bmp", **expected_image_processor}, ] - assert config["document_processors"] == expected_document_processors + assert actual_processors == expected_processors def test_get_config_from_azure( AzureBlobStorageClientMock: MagicMock, blob_client_mock: MagicMock, From ee5f46f7d7c88efc774955d3c0e239d7452b1188 Mon Sep 17 00:00:00 2001 From: Harmanpreet Kaur Date: Mon, 21 Apr 2025 14:37:44 +0530 Subject: [PATCH 6/6] solved linting issue --- code/tests/utilities/helpers/test_config_helper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/code/tests/utilities/helpers/test_config_helper.py b/code/tests/utilities/helpers/test_config_helper.py index c36ba2607..9ab3d6257 100644 --- a/code/tests/utilities/helpers/test_config_helper.py +++ b/code/tests/utilities/helpers/test_config_helper.py @@ -178,7 +178,6 @@ def test_default_config_is_cached(): assert default_config_one is default_config_two - @patch("backend.batch.utilities.helpers.config.config_helper.EnvHelper") def test_default_config_when_use_advanced_image_processing(env_helper_mock): # given @@ -222,8 +221,9 @@ def test_default_config_when_use_advanced_image_processing(env_helper_mock): {"document_type": "bmp", **expected_image_processor}, ] - assert actual_processors == expected_processors + + def test_get_config_from_azure( AzureBlobStorageClientMock: MagicMock, blob_client_mock: MagicMock,