From cd4fcd9c16e9a7901adc0fc5492d28d572affc99 Mon Sep 17 00:00:00 2001
From: Martin O'Hanlon <martin.ohanlon@neo4j.com>
Date: Tue, 14 Oct 2025 12:10:46 +0100
Subject: [PATCH 1/3] schema extract clean json

---
 .../experimental/components/schema.py         | 13 +++
 .../experimental/components/test_schema.py    | 80 +++++++++++++++++++
 2 files changed, 93 insertions(+)

diff --git a/src/neo4j_graphrag/experimental/components/schema.py b/src/neo4j_graphrag/experimental/components/schema.py
index 90ed970d..ad97f1fc 100644
--- a/src/neo4j_graphrag/experimental/components/schema.py
+++ b/src/neo4j_graphrag/experimental/components/schema.py
@@ -15,6 +15,7 @@
 from __future__ import annotations
 
 import json
+import re
 
 import neo4j
 import logging
@@ -554,6 +555,15 @@ def _filter_relationships_without_labels(
             relationship_types, "relationship type"
         )
 
+    def _clean_json_content(self, content: str) -> str:
+        """Return ``content`` with Markdown code-fence markers removed."""
+        # Responses may arrive wrapped in ```json ... ``` fences; the language
+        # tag is matched case-insensitively so "```JSON" is stripped as well.
+        flags = re.MULTILINE | re.IGNORECASE
+        content = re.sub(r"^```(?:json)?\s*", "", content.strip(), flags=flags)
+        content = re.sub(r"```\s*$", "", content, flags=flags)
+        return content.strip()
+
     @validate_call
     async def run(self, text: str, examples: str = "", **kwargs: Any) -> GraphSchema:
         """
@@ -575,6 +585,9 @@ async def run(self, text: str, examples: str = "", **kwargs: Any) -> GraphSchema
             # Re-raise the LLMGenerationError
             raise LLMGenerationError("Failed to generate schema from text") from e
 
+        # Clean response
+        content = self._clean_json_content(content)
+
         try:
             extracted_schema: Dict[str, Any] = json.loads(content)
 
diff --git a/tests/unit/experimental/components/test_schema.py b/tests/unit/experimental/components/test_schema.py
index a85a88a2..48eddbc3 100644
--- a/tests/unit/experimental/components/test_schema.py
+++ b/tests/unit/experimental/components/test_schema.py
@@ -960,6 +960,86 @@ async def test_schema_from_text_filters_relationships_without_labels(
     assert ("Person", "MANAGES", "Organization") in schema.patterns
 
 
+@pytest.fixture
+def valid_schema_json_with_markdown() -> str:
+    return """```json
+{
+    "node_types": [
+        {
+            "label": "Person",
+            "properties": [
+                {"name": "name", "type": "STRING"}
+            ]
+        },
+        {
+            "label": "Organization",
+            "properties": [
+                {"name": "name", "type": "STRING"}
+            ]
+        }
+    ],
+    "relationship_types": [
+        {
+            "label": "WORKS_FOR",
+            "properties": [
+                {"name": "since", "type": "DATE"}
+            ]
+        }
+    ],
+    "patterns": [
+        ["Person", "WORKS_FOR", "Organization"]
+    ]
+}
+```"""
+
+
+@pytest.fixture
+def valid_schema_json_with_markdown_no_language() -> str:
+    return """```
+{
+    "node_types": [
+        {
+            "label": "Person",
+            "properties": [
+                {"name": "name", "type": "STRING"}
+            ]
+        }
+    ]
+}
+```"""
+
+
+def test_clean_json_content_markdown_with_json_language(
+    schema_from_text: SchemaFromTextExtractor,
+) -> None:
+    """A fence tagged ``json`` is stripped, leaving only the JSON payload."""
+    expected = '{"node_types": [{"label": "Person"}]}'
+    fenced = """```json
+{"node_types": [{"label": "Person"}]}
+```"""
+    assert schema_from_text._clean_json_content(fenced) == expected
+
+
+def test_clean_json_content_markdown_without_language(
+    schema_from_text: SchemaFromTextExtractor,
+) -> None:
+    """A fence with no language tag is stripped, leaving the JSON payload."""
+    expected = '{"node_types": [{"label": "Person"}]}'
+    fenced = """```
+{"node_types": [{"label": "Person"}]}
+```"""
+    assert schema_from_text._clean_json_content(fenced) == expected
+
+
+def test_clean_json_content_plain_json(
+    schema_from_text: SchemaFromTextExtractor,
+) -> None:
+    """Input that carries no fence markers comes back unchanged."""
+    plain = '{"node_types": [{"label": "Person"}]}'
+    # Compare against the literal payload: cleaning must be a no-op here.
+    assert schema_from_text._clean_json_content(plain) == '{"node_types": [{"label": "Person"}]}'
+
+
 @pytest.mark.asyncio
 @patch("neo4j_graphrag.experimental.components.schema.get_structured_schema")
 async def test_schema_from_existing_graph(mock_get_structured_schema: Mock) -> None:

From 5d7524f90d67f87d592c894a022b0c31efa5e5ea Mon Sep 17 00:00:00 2001
From: Martin O'Hanlon <martin.ohanlon@neo4j.com>
Date: Wed, 15 Oct 2025 18:03:19 +0100
Subject: [PATCH 2/3] remove fixtures

---
 .../experimental/components/test_schema.py    | 49 -------------------
 1 file changed, 49 deletions(-)

diff --git a/tests/unit/experimental/components/test_schema.py b/tests/unit/experimental/components/test_schema.py
index 48eddbc3..352f43a3 100644
--- a/tests/unit/experimental/components/test_schema.py
+++ b/tests/unit/experimental/components/test_schema.py
@@ -960,55 +960,6 @@ async def test_schema_from_text_filters_relationships_without_labels(
     assert ("Person", "MANAGES", "Organization") in schema.patterns
 
 
-@pytest.fixture
-def valid_schema_json_with_markdown() -> str:
-    return """```json
-{
-    "node_types": [
-        {
-            "label": "Person",
-            "properties": [
-                {"name": "name", "type": "STRING"}
-            ]
-        },
-        {
-            "label": "Organization",
-            "properties": [
-                {"name": "name", "type": "STRING"}
-            ]
-        }
-    ],
-    "relationship_types": [
-        {
-            "label": "WORKS_FOR",
-            "properties": [
-                {"name": "since", "type": "DATE"}
-            ]
-        }
-    ],
-    "patterns": [
-        ["Person", "WORKS_FOR", "Organization"]
-    ]
-}
-```"""
-
-
-@pytest.fixture
-def valid_schema_json_with_markdown_no_language() -> str:
-    return """```
-{
-    "node_types": [
-        {
-            "label": "Person",
-            "properties": [
-                {"name": "name", "type": "STRING"}
-            ]
-        }
-    ]
-}
-```"""
-
-
 def test_clean_json_content_markdown_with_json_language(
     schema_from_text: SchemaFromTextExtractor,
 ) -> None:

From c7eb2f685e9e82d7bc400bc4f4c6e66507a3b940 Mon Sep 17 00:00:00 2001
From: Martin O'Hanlon <martin.ohanlon@neo4j.com>
Date: Thu, 16 Oct 2025 11:43:02 +0100
Subject: [PATCH 3/3] updated CHANGELOG

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 19f1f399..b1013f0f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@
 - Document node is now always created when running SimpleKGPipeline, even if `from_pdf=False`.
 - Document metadata is exposed in SimpleKGPipeline run method.
 - Added automatic rate limiting with retry logic and exponential backoff for all Embedding providers using tenacity. The `RateLimitHandler` interface allows for custom rate limiting strategies, including the ability to disable rate limiting entirely.
+- JSON response returned to `SchemaFromTextExtractor` is stripped of any Markdown code fence markers before being parsed.
 
 ### Fixed