From cd4fcd9c16e9a7901adc0fc5492d28d572affc99 Mon Sep 17 00:00:00 2001 From: Martin O'Hanlon Date: Tue, 14 Oct 2025 12:10:46 +0100 Subject: [PATCH 1/3] schema extract clean json --- .../experimental/components/schema.py | 13 +++ .../experimental/components/test_schema.py | 80 +++++++++++++++++++ 2 files changed, 93 insertions(+) diff --git a/src/neo4j_graphrag/experimental/components/schema.py b/src/neo4j_graphrag/experimental/components/schema.py index 90ed970d..ad97f1fc 100644 --- a/src/neo4j_graphrag/experimental/components/schema.py +++ b/src/neo4j_graphrag/experimental/components/schema.py @@ -15,6 +15,7 @@ from __future__ import annotations import json +import re import neo4j import logging @@ -554,6 +555,15 @@ def _filter_relationships_without_labels( relationship_types, "relationship type" ) + def _clean_json_content(self, content: str) -> str: + content = content.strip() + + # Remove markdown code block markers if present + content = re.sub(r'^```(?:json)?\s*', '', content, flags=re.MULTILINE) + content = re.sub(r'```\s*$', '', content, flags=re.MULTILINE) + + return content.strip() + @validate_call async def run(self, text: str, examples: str = "", **kwargs: Any) -> GraphSchema: """ @@ -575,6 +585,9 @@ async def run(self, text: str, examples: str = "", **kwargs: Any) -> GraphSchema # Re-raise the LLMGenerationError raise LLMGenerationError("Failed to generate schema from text") from e + # Clean response + content = self._clean_json_content(content) + try: extracted_schema: Dict[str, Any] = json.loads(content) diff --git a/tests/unit/experimental/components/test_schema.py b/tests/unit/experimental/components/test_schema.py index a85a88a2..48eddbc3 100644 --- a/tests/unit/experimental/components/test_schema.py +++ b/tests/unit/experimental/components/test_schema.py @@ -960,6 +960,86 @@ async def test_schema_from_text_filters_relationships_without_labels( assert ("Person", "MANAGES", "Organization") in schema.patterns +@pytest.fixture +def 
valid_schema_json_with_markdown() -> str: + return """```json +{ + "node_types": [ + { + "label": "Person", + "properties": [ + {"name": "name", "type": "STRING"} + ] + }, + { + "label": "Organization", + "properties": [ + {"name": "name", "type": "STRING"} + ] + } + ], + "relationship_types": [ + { + "label": "WORKS_FOR", + "properties": [ + {"name": "since", "type": "DATE"} + ] + } + ], + "patterns": [ + ["Person", "WORKS_FOR", "Organization"] + ] +} +```""" + + +@pytest.fixture +def valid_schema_json_with_markdown_no_language() -> str: + return """``` +{ + "node_types": [ + { + "label": "Person", + "properties": [ + {"name": "name", "type": "STRING"} + ] + } + ] +} +```""" + + +def test_clean_json_content_markdown_with_json_language( + schema_from_text: SchemaFromTextExtractor, +) -> None: + content = """```json +{"node_types": [{"label": "Person"}]} +```""" + + cleaned = schema_from_text._clean_json_content(content) + assert cleaned == '{"node_types": [{"label": "Person"}]}' + + +def test_clean_json_content_markdown_without_language( + schema_from_text: SchemaFromTextExtractor, +) -> None: + content = """``` +{"node_types": [{"label": "Person"}]} +```""" + + cleaned = schema_from_text._clean_json_content(content) + assert cleaned == '{"node_types": [{"label": "Person"}]}' + + +def test_clean_json_content_plain_json( + schema_from_text: SchemaFromTextExtractor, +) -> None: + content = '{"node_types": [{"label": "Person"}]}' + + cleaned = schema_from_text._clean_json_content(content) + assert cleaned == '{"node_types": [{"label": "Person"}]}' + + @pytest.mark.asyncio @patch("neo4j_graphrag.experimental.components.schema.get_structured_schema") async def test_schema_from_existing_graph(mock_get_structured_schema: Mock) -> None: From 5d7524f90d67f87d592c894a022b0c31efa5e5ea Mon Sep 17 00:00:00 2001 From: Martin O'Hanlon Date: Wed, 15 Oct 2025 18:03:19 +0100 Subject: [PATCH 2/3] remove fixtures --- .../experimental/components/test_schema.py | 49 ------------------- 
1 file changed, 49 deletions(-) diff --git a/tests/unit/experimental/components/test_schema.py b/tests/unit/experimental/components/test_schema.py index 48eddbc3..352f43a3 100644 --- a/tests/unit/experimental/components/test_schema.py +++ b/tests/unit/experimental/components/test_schema.py @@ -960,55 +960,6 @@ async def test_schema_from_text_filters_relationships_without_labels( assert ("Person", "MANAGES", "Organization") in schema.patterns -@pytest.fixture -def valid_schema_json_with_markdown() -> str: - return """```json -{ - "node_types": [ - { - "label": "Person", - "properties": [ - {"name": "name", "type": "STRING"} - ] - }, - { - "label": "Organization", - "properties": [ - {"name": "name", "type": "STRING"} - ] - } - ], - "relationship_types": [ - { - "label": "WORKS_FOR", - "properties": [ - {"name": "since", "type": "DATE"} - ] - } - ], - "patterns": [ - ["Person", "WORKS_FOR", "Organization"] - ] -} -```""" - - -@pytest.fixture -def valid_schema_json_with_markdown_no_language() -> str: - return """``` -{ - "node_types": [ - { - "label": "Person", - "properties": [ - {"name": "name", "type": "STRING"} - ] - } - ] -} -```""" - - def test_clean_json_content_markdown_with_json_language( schema_from_text: SchemaFromTextExtractor, ) -> None: From c7eb2f685e9e82d7bc400bc4f4c6e66507a3b940 Mon Sep 17 00:00:00 2001 From: Martin O'Hanlon Date: Thu, 16 Oct 2025 11:43:02 +0100 Subject: [PATCH 3/3] updated CHANGELOG --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 19f1f399..b1013f0f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ - Document node is now always created when running SimpleKGPipeline, even if `from_pdf=False`. - Document metadata is exposed in SimpleKGPipeline run method. - Added automatic rate limiting with retry logic and exponential backoff for all Embedding providers using tenacity. 
The `RateLimitHandler` interface allows for custom rate limiting strategies, including the ability to disable rate limiting entirely. +- JSON response returned to `SchemaFromTextExtractor` is stripped of any markdown code fence markers before being parsed. ### Fixed