class JSONDocumentChunking(DocumentChunkingBase):
    """Chunking strategy that splits a JSON document with ``RecursiveJsonSplitter``."""

    def __init__(self) -> None:
        pass

    def chunk(
        self, documents: List[SourceDocument], chunking: ChunkingSettings
    ) -> List[SourceDocument]:
        """Split the JSON carried by ``documents`` into smaller source documents.

        The ``content`` of every input document is concatenated (in order),
        parsed as a single JSON value and handed to ``RecursiveJsonSplitter``
        with ``chunking.chunk_size`` as the maximum chunk size. One
        ``SourceDocument`` is created per resulting chunk; all of them reuse
        the ``source`` of the first input document.

        Args:
            documents: Parts of one JSON document, in reading order.
            chunking: Settings object; only ``chunk_size`` is read here.

        Returns:
            The chunked documents, or an empty list when ``documents`` is
            empty. Each chunk stores its running character offset under
            ``metadata["offset"]``.

        Raises:
            json.JSONDecodeError: If the concatenated content is not valid JSON.
        """
        # Nothing to chunk; also avoids an IndexError on documents[0] below.
        if not documents:
            return []

        document_url = documents[0].source
        full_document_content = "".join(
            str(document.content) for document in documents
        )
        json_data = json.loads(full_document_content)

        splitter = RecursiveJsonSplitter(max_chunk_size=chunking.chunk_size)

        # Create a document for each chunk.
        chunked_documents: List[SourceDocument] = []
        chunk_offset = 0
        for idx, chunked_content in enumerate(splitter.split_json(json_data)):
            # NOTE(review): str() of the chunk yields a Python repr (single
            # quotes), not JSON text. Kept as-is because existing tests pin
            # this format; consider json.dumps() in a follow-up.
            content = str(chunked_content)
            chunked_documents.append(
                SourceDocument.from_metadata(
                    content=content,
                    document_url=document_url,
                    metadata={"offset": chunk_offset},
                    idx=idx,
                )
            )
            # Advance by the length of the emitted text. Taking len() of the
            # chunk object itself would count its top-level keys instead.
            chunk_offset += len(content)
        return chunked_documents
b/code/backend/batch/utilities/document_chunking/strategies.py index e3e0533c3..e70c36aac 100644 --- a/code/backend/batch/utilities/document_chunking/strategies.py +++ b/code/backend/batch/utilities/document_chunking/strategies.py @@ -3,6 +3,7 @@ from .page import PageDocumentChunking from .fixed_size_overlap import FixedSizeOverlapDocumentChunking from .paragraph import ParagraphDocumentChunking +from .json import JSONDocumentChunking def get_document_chunker(chunking_strategy: str): @@ -14,5 +15,7 @@ def get_document_chunker(chunking_strategy: str): return FixedSizeOverlapDocumentChunking() elif chunking_strategy == ChunkingStrategy.PARAGRAPH.value: return ParagraphDocumentChunking() + elif chunking_strategy == ChunkingStrategy.JSON.value: + return JSONDocumentChunking() else: raise Exception(f"Unknown chunking strategy: {chunking_strategy}") diff --git a/code/backend/batch/utilities/helpers/config/config_helper.py b/code/backend/batch/utilities/helpers/config/config_helper.py index bc16287ce..3687dac8b 100644 --- a/code/backend/batch/utilities/helpers/config/config_helper.py +++ b/code/backend/batch/utilities/helpers/config/config_helper.py @@ -68,6 +68,7 @@ def get_available_document_types(self) -> list[str]: "jpg", "png", "docx", + "json" } if self.env_helper.USE_ADVANCED_IMAGE_PROCESSING: document_types.update(ADVANCED_IMAGE_PROCESSING_FILE_TYPES) diff --git a/code/backend/batch/utilities/helpers/config/default.json b/code/backend/batch/utilities/helpers/config/default.json index f91924c0a..137a6eec4 100644 --- a/code/backend/batch/utilities/helpers/config/default.json +++ b/code/backend/batch/utilities/helpers/config/default.json @@ -97,6 +97,17 @@ "strategy": "docx" } }, + { + "document_type": "json", + "chunking": { + "strategy": "json", + "size": 500, + "overlap": 100 + }, + "loading": { + "strategy": "web" + } + }, { "document_type": "jpg", "chunking": { diff --git a/code/tests/utilities/helpers/test_config_helper.py 
b/code/tests/utilities/helpers/test_config_helper.py index 214d5ef16..c643269af 100644 --- a/code/tests/utilities/helpers/test_config_helper.py +++ b/code/tests/utilities/helpers/test_config_helper.py @@ -223,6 +223,11 @@ def test_default_config_when_use_advanced_image_processing(env_helper_mock): "chunking": expected_chunking, "loading": {"strategy": "docx"}, }, + { + "document_type": "json", + "chunking": {"strategy": "json", "size": 500, "overlap": 100}, + "loading": {"strategy": "web"}, + }, {"document_type": "jpeg", "use_advanced_image_processing": True}, {"document_type": "jpg", "use_advanced_image_processing": True}, {"document_type": "png", "use_advanced_image_processing": True}, @@ -420,7 +425,7 @@ def test_get_available_document_types(config: Config): # then assert sorted(document_types) == sorted( - ["txt", "pdf", "url", "html", "htm", "md", "jpeg", "jpg", "png", "docx"] + ["txt", "pdf", "url", "html", "htm", "md", "jpeg", "jpg", "png", "docx", "json"] ) @@ -448,6 +453,7 @@ def test_get_available_document_types_when_advanced_image_processing_enabled( "docx", "tiff", "bmp", + "json" ] ) @@ -471,6 +477,7 @@ def test_get_available_chunking_strategies(config: Config): "page", "fixed_size_overlap", "paragraph", + "json" ] ) diff --git a/code/tests/utilities/helpers/test_document_chunking_helper.py b/code/tests/utilities/helpers/test_document_chunking_helper.py index fd6a1541c..d0bf443b7 100644 --- a/code/tests/utilities/helpers/test_document_chunking_helper.py +++ b/code/tests/utilities/helpers/test_document_chunking_helper.py @@ -109,3 +109,37 @@ def test_document_chunking_fixed_size_overlap(): chunked_documents[6].content == " shows how the different chunking strategies work now!" 
def test_document_chunking_json():
    """The JSON strategy yields one chunk per top-level object of the sample."""
    # given: a single JSON document and a 175-character chunk budget
    settings = ChunkingSettings(
        {"strategy": ChunkingStrategy.JSON, "size": 175, "overlap": 0}
    )
    sample = SourceDocument(
        content=""" { "window":{ "title":"Sample Widget", "name":"main_window", "width":500, "height":500 }, "image":{ "src":"Images/Sun.png", "name":"sun1", "hOffset":250, "vOffset":250, "alignment":"center" } } """,
        source="https://example.com/sample_document.json",
    )
    expected_contents = [
        "{'window': {'title': 'Sample Widget', 'name': 'main_window', 'width': 500, 'height': 500}}",
        "{'image': {'src': 'Images/Sun.png', 'name': 'sun1', 'hOffset': 250, 'vOffset': 250, 'alignment': 'center'}}",
    ]

    # when
    chunks = DocumentChunking().chunk([sample], settings)

    # then: exactly two chunks, each holding one top-level object
    assert len(chunks) == 2
    first_chunk, second_chunk = chunks
    assert first_chunk.content == expected_contents[0]
    assert second_chunk.content == expected_contents[1]