diff --git a/code/backend/batch/utilities/helpers/config/config_helper.py b/code/backend/batch/utilities/helpers/config/config_helper.py
index 3687dac8b..78e36fa37 100644
--- a/code/backend/batch/utilities/helpers/config/config_helper.py
+++ b/code/backend/batch/utilities/helpers/config/config_helper.py
@@ -309,10 +309,10 @@ def clear_config():
     @staticmethod
     def _append_advanced_image_processors():
         image_file_types = ["jpeg", "jpg", "png", "tiff", "bmp"]
-        ConfigHelper._remove_processors_for_file_types(image_file_types)
+        # ConfigHelper._remove_processors_for_file_types(image_file_types)
         ConfigHelper._default_config["document_processors"].extend(
             [
-                {"document_type": file_type, "use_advanced_image_processing": True}
+                {"document_type": file_type, "chunking" : ConfigHelper._default_config["document_processors"][0]["chunking"], "loading" : ConfigHelper._default_config["document_processors"][0]["loading"], "use_advanced_image_processing": True}
                 for file_type in image_file_types
             ]
         )
diff --git a/code/tests/utilities/helpers/test_config_helper.py b/code/tests/utilities/helpers/test_config_helper.py
index c643269af..9ab3d6257 100644
--- a/code/tests/utilities/helpers/test_config_helper.py
+++ b/code/tests/utilities/helpers/test_config_helper.py
@@ -178,8 +178,10 @@ def test_default_config_is_cached():
     assert default_config_one is default_config_two
 
 
+@patch("backend.batch.utilities.helpers.config.config_helper.EnvHelper")
 def test_default_config_when_use_advanced_image_processing(env_helper_mock):
     # given
+    ConfigHelper._default_config = None
     env_helper_mock.return_value.USE_ADVANCED_IMAGE_PROCESSING = True
 
     # when
@@ -187,54 +189,40 @@ def test_default_config_when_use_advanced_image_processing(env_helper_mock):
 
     # then
     expected_chunking = {"strategy": "layout", "size": 500, "overlap": 100}
-    assert config["document_processors"] == [
-        {
-            "document_type": "pdf",
-            "chunking": expected_chunking,
-            "loading": {"strategy": "layout"},
-        },
-        {
-            "document_type": "txt",
-            "chunking": expected_chunking,
-            "loading": {"strategy": "web"},
-        },
-        {
-            "document_type": "url",
-            "chunking": expected_chunking,
-            "loading": {"strategy": "web"},
-        },
-        {
-            "document_type": "md",
-            "chunking": expected_chunking,
-            "loading": {"strategy": "web"},
-        },
-        {
-            "document_type": "html",
-            "chunking": expected_chunking,
-            "loading": {"strategy": "web"},
-        },
-        {
-            "document_type": "htm",
-            "chunking": expected_chunking,
-            "loading": {"strategy": "web"},
-        },
-        {
-            "document_type": "docx",
-            "chunking": expected_chunking,
-            "loading": {"strategy": "docx"},
-        },
+    expected_loading = {"strategy": "layout"}
+    expected_image_processor = {
+        "chunking": expected_chunking,
+        "loading": expected_loading,
+        "use_advanced_image_processing": True,
+    }
+
+    actual_processors = config["document_processors"]
+
+    expected_processors = [
+        {"document_type": "pdf", "chunking": expected_chunking, "loading": expected_loading},
+        {"document_type": "txt", "chunking": expected_chunking, "loading": {"strategy": "web"}},
+        {"document_type": "url", "chunking": expected_chunking, "loading": {"strategy": "web"}},
+        {"document_type": "md", "chunking": expected_chunking, "loading": {"strategy": "web"}},
+        {"document_type": "html", "chunking": expected_chunking, "loading": {"strategy": "web"}},
+        {"document_type": "htm", "chunking": expected_chunking, "loading": {"strategy": "web"}},
+        {"document_type": "docx", "chunking": expected_chunking, "loading": {"strategy": "docx"}},
         {
             "document_type": "json",
             "chunking": {"strategy": "json", "size": 500, "overlap": 100},
             "loading": {"strategy": "web"},
         },
-        {"document_type": "jpeg", "use_advanced_image_processing": True},
-        {"document_type": "jpg", "use_advanced_image_processing": True},
-        {"document_type": "png", "use_advanced_image_processing": True},
-        {"document_type": "tiff", "use_advanced_image_processing": True},
-        {"document_type": "bmp", "use_advanced_image_processing": True},
+        {"document_type": "jpg", "chunking": expected_chunking, "loading": expected_loading},
+        {"document_type": "jpeg", "chunking": expected_chunking, "loading": expected_loading},
+        {"document_type": "png", "chunking": expected_chunking, "loading": expected_loading},
+        {"document_type": "jpeg", **expected_image_processor},
+        {"document_type": "jpg", **expected_image_processor},
+        {"document_type": "png", **expected_image_processor},
+        {"document_type": "tiff", **expected_image_processor},
+        {"document_type": "bmp", **expected_image_processor},
     ]
 
+    assert actual_processors == expected_processors
+
 
 def test_get_config_from_azure(
     AzureBlobStorageClientMock: MagicMock,