diff --git a/CHANGELOG.md b/CHANGELOG.md index 19f1f399..b1013f0f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ - Document node is now always created when running SimpleKGPipeline, even if `from_pdf=False`. - Document metadata is exposed in SimpleKGPipeline run method. - Added automatic rate limiting with retry logic and exponential backoff for all Embedding providers using tenacity. The `RateLimitHandler` interface allows for custom rate limiting strategies, including the ability to disable rate limiting entirely. +- JSON response returned to `SchemaFromTextExtractor` is cleansed of any markdown code blocks before being loaded. ### Fixed diff --git a/src/neo4j_graphrag/experimental/components/schema.py b/src/neo4j_graphrag/experimental/components/schema.py index 90ed970d..ad97f1fc 100644 --- a/src/neo4j_graphrag/experimental/components/schema.py +++ b/src/neo4j_graphrag/experimental/components/schema.py @@ -15,6 +15,7 @@ from __future__ import annotations import json +import re import neo4j import logging @@ -554,6 +555,15 @@ def _filter_relationships_without_labels( relationship_types, "relationship type" ) + def _clean_json_content(self, content: str) -> str: + content = content.strip() + + # Remove markdown code block markers if present + content = re.sub(r'^```(?:json)?\s*', '', content, flags=re.MULTILINE) + content = re.sub(r'```\s*$', '', content, flags=re.MULTILINE) + + return content.strip() + @validate_call async def run(self, text: str, examples: str = "", **kwargs: Any) -> GraphSchema: """ @@ -575,6 +585,9 @@ async def run(self, text: str, examples: str = "", **kwargs: Any) -> GraphSchema # Re-raise the LLMGenerationError raise LLMGenerationError("Failed to generate schema from text") from e + # Clean response + content = self._clean_json_content(content) + try: extracted_schema: Dict[str, Any] = json.loads(content) diff --git a/tests/unit/experimental/components/test_schema.py b/tests/unit/experimental/components/test_schema.py index 
def test_clean_json_content_markdown_with_json_language(
    schema_from_text: SchemaFromTextExtractor,
) -> None:
    """A fenced block tagged ``json`` is reduced to the bare JSON payload."""
    expected = '{"node_types": [{"label": "Person"}]}'
    fenced = "\n".join(["```json", expected, "```"])

    assert schema_from_text._clean_json_content(fenced) == expected


def test_clean_json_content_markdown_without_language(
    schema_from_text: SchemaFromTextExtractor,
) -> None:
    """A fenced block without a language tag is reduced to the bare payload."""
    expected = '{"node_types": [{"label": "Person"}]}'
    fenced = "\n".join(["```", expected, "```"])

    assert schema_from_text._clean_json_content(fenced) == expected


def test_clean_json_content_plain_json(
    schema_from_text: SchemaFromTextExtractor,
) -> None:
    """JSON that is not wrapped in a fence passes through unchanged."""
    expected = '{"node_types": [{"label": "Person"}]}'

    assert schema_from_text._clean_json_content(expected) == expected