Azure-Samples · Roopan-Microsoft · Apr 16, 2025 · Feb 8, 2025 · Feb 10, 2025 · Feb 13, 2025
@@ -6,6 +6,7 @@ class ChunkingStrategy(Enum):
     PAGE = "page"
     FIXED_SIZE_OVERLAP = "fixed_size_overlap"
     PARAGRAPH = "paragraph"
+    JSON = "json"
 
 
 class ChunkingSettings:

@@ -0,0 +1,37 @@
+import json
+from typing import List
+from .document_chunking_base import DocumentChunkingBase
+from langchain.text_splitter import RecursiveJsonSplitter
+from .chunking_strategy import ChunkingSettings
+from ..common.source_document import SourceDocument
+
+
+class JSONDocumentChunking(DocumentChunkingBase):
+    """Chunking strategy that splits JSON content with ``RecursiveJsonSplitter``."""
+
+    def __init__(self) -> None:
+        pass
+
+    def chunk(
+        self, documents: List[SourceDocument], chunking: ChunkingSettings
+    ) -> List[SourceDocument]:
+        """Split the combined JSON content of ``documents`` into chunks.
+
+        The ``content`` of every input document is concatenated, parsed as a
+        single JSON value, and handed to ``RecursiveJsonSplitter`` with
+        ``max_chunk_size=chunking.chunk_size``.
+
+        Args:
+            documents: Source documents whose concatenated content forms one
+                JSON value. The ``source`` of the first one is used as the
+                URL of every produced chunk.
+            chunking: Settings providing ``chunk_size``.
+
+        Returns:
+            One ``SourceDocument`` per split piece. ``content`` is ``str()``
+            of the piece, and ``metadata["offset"]`` is the summed length of
+            the content strings of all preceding chunks.
+
+        Raises:
+            IndexError: If ``documents`` is empty.
+            json.JSONDecodeError: If the combined content is not valid JSON.
+        """
+        full_document_content = "".join(
+            str(document.content) for document in documents
+        )
+        document_url = documents[0].source
+        json_data = json.loads(full_document_content)
+        splitter = RecursiveJsonSplitter(max_chunk_size=chunking.chunk_size)
+
+        # Build one document per split piece without shadowing the input list.
+        chunked_documents = []
+        chunk_offset = 0
+        for idx, chunk in enumerate(splitter.split_json(json_data)):
+            # Stringify once so the stored content and the offset stay in sync.
+            chunk_content = str(chunk)
+            chunked_documents.append(
+                SourceDocument.from_metadata(
+                    content=chunk_content,
+                    document_url=document_url,
+                    metadata={"offset": chunk_offset},
+                    idx=idx,
+                )
+            )
+            # Advance by the character length of the emitted content; len() of
+            # the raw split piece would count its top-level keys instead.
+            chunk_offset += len(chunk_content)
+        return chunked_documents
@@ -3,6 +3,7 @@
 from .page import PageDocumentChunking
 from .fixed_size_overlap import FixedSizeOverlapDocumentChunking
 from .paragraph import ParagraphDocumentChunking
+from .json import JSONDocumentChunking
 
 
 def get_document_chunker(chunking_strategy: str):
@@ -14,5 +15,7 @@ def get_document_chunker(chunking_strategy: str):
         return FixedSizeOverlapDocumentChunking()
     elif chunking_strategy == ChunkingStrategy.PARAGRAPH.value:
         return ParagraphDocumentChunking()
+    elif chunking_strategy == ChunkingStrategy.JSON.value:
+        return JSONDocumentChunking()
     else:
         raise Exception(f"Unknown chunking strategy: {chunking_strategy}")
@@ -68,6 +68,7 @@ def get_available_document_types(self) -> list[str]:
             "jpg",
             "png",
             "docx",
+            "json"
         }
         if self.env_helper.USE_ADVANCED_IMAGE_PROCESSING:
             document_types.update(ADVANCED_IMAGE_PROCESSING_FILE_TYPES)

@@ -97,6 +97,17 @@
         "strategy": "docx"
       }
     },
+    {
+      "document_type": "json",
+      "chunking": {
+        "strategy": "json",
+        "size": 500,
+        "overlap": 100
+      },
+      "loading": {
+        "strategy": "web"
+      }
+    },
     {
       "document_type": "jpg",
       "chunking": {

@@ -223,6 +223,11 @@ def test_default_config_when_use_advanced_image_processing(env_helper_mock):
             "chunking": expected_chunking,
             "loading": {"strategy": "docx"},
         },
+        {
+            "document_type": "json",
+            "chunking": {"strategy": "json", "size": 500, "overlap": 100},
+            "loading": {"strategy": "web"},
+        },
         {"document_type": "jpeg", "use_advanced_image_processing": True},
         {"document_type": "jpg", "use_advanced_image_processing": True},
         {"document_type": "png", "use_advanced_image_processing": True},
@@ -420,7 +425,7 @@ def test_get_available_document_types(config: Config):
 
     # then
     assert sorted(document_types) == sorted(
-        ["txt", "pdf", "url", "html", "htm", "md", "jpeg", "jpg", "png", "docx"]
+        ["txt", "pdf", "url", "html", "htm", "md", "jpeg", "jpg", "png", "docx", "json"]
     )
 
 
@@ -448,6 +453,7 @@ def test_get_available_document_types_when_advanced_image_processing_enabled(
             "docx",
             "tiff",
             "bmp",
+            "json"
         ]
     )
 
@@ -471,6 +477,7 @@ def test_get_available_chunking_strategies(config: Config):
             "page",
             "fixed_size_overlap",
             "paragraph",
+            "json"
         ]
     )
 

@@ -109,3 +109,37 @@ def test_document_chunking_fixed_size_overlap():
         chunked_documents[6].content
         == " shows how the different chunking strategies work now!"
     )
+
+
+def test_document_chunking_json():
+    """The JSON strategy splits the sample into its 'window' and 'image' objects."""
+    # given
+    settings = ChunkingSettings(
+        {"strategy": ChunkingStrategy.JSON, "size": 175, "overlap": 0}
+    )
+    sample_document = SourceDocument(
+        content="""
+            {
+                "window":{
+                    "title":"Sample Widget",
+                    "name":"main_window",
+                    "width":500,
+                    "height":500
+                },
+                "image":{
+                    "src":"Images/Sun.png",
+                    "name":"sun1",
+                    "hOffset":250,
+                    "vOffset":250,
+                    "alignment":"center"
+                }
+            }
+            """,
+        source="https://example.com/sample_document.json",
+    )
+    # Chunk content is the Python repr of each split piece, hence single quotes.
+    expected_contents = [
+        "{'window': {'title': 'Sample Widget', 'name': 'main_window', "
+        "'width': 500, 'height': 500}}",
+        "{'image': {'src': 'Images/Sun.png', 'name': 'sun1', "
+        "'hOffset': 250, 'vOffset': 250, 'alignment': 'center'}}",
+    ]
+
+    # when
+    chunks = DocumentChunking().chunk([sample_document], settings)
+
+    # then
+    assert len(chunks) == 2
+    assert chunks[0].content == expected_contents[0]
+    assert chunks[1].content == expected_contents[1]
@@ -12,3 +12,4 @@ Out-of-the-box, you can upload the following file types:
 * HTML
 * MD (Markdown)
 * DOCX
+* JSON
-Original file line number
+Diff line change
@@ Expand Up / @@ -68,6 +68,7 @@ def get_available_document_types(self) -> list[str]: @@
                 "jpg",
                 "png",
                 "docx",
+                "json"
             }
             if self.env_helper.USE_ADVANCED_IMAGE_PROCESSING:
                 document_types.update(ADVANCED_IMAGE_PROCESSING_FILE_TYPES)
@@ Expand Down @@