Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
- Document node is now always created when running SimpleKGPipeline, even if `from_pdf=False`.
- Document metadata is exposed in SimpleKGPipeline run method.
- Added automatic rate limiting with retry logic and exponential backoff for all Embedding providers using tenacity. The `RateLimitHandler` interface allows for custom rate limiting strategies, including the ability to disable rate limiting entirely.
- The JSON response returned to `SchemaFromTextExtractor` is now stripped of Markdown code fence markers (with or without a `json` language tag) before being parsed.

### Fixed

Expand Down
13 changes: 13 additions & 0 deletions src/neo4j_graphrag/experimental/components/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from __future__ import annotations

import json
import re

import neo4j
import logging
Expand Down Expand Up @@ -554,6 +555,15 @@ def _filter_relationships_without_labels(
relationship_types, "relationship type"
)

def _clean_json_content(self, content: str) -> str:
    """Remove Markdown code-fence wrapping from a response expected to be JSON.

    Two shapes are handled:

    1. A complete fenced block whose opening and closing fences each sit on
       their own line. The block body is returned, so any prose before or
       after the block is discarded and the language tag (``json``,
       ``JSON``, or none) is ignored.
    2. Anything else: fence markers at the start or end of a line are
       stripped, which covers unterminated or single-line fences.

    Content without fences is returned unchanged apart from surrounding
    whitespace. Backticks that appear inside a JSON string value are left
    untouched because they neither start a line nor end one.

    Args:
        content: Raw text to clean.

    Returns:
        The text with fence markers removed and outer whitespace stripped.
    """
    content = content.strip()

    # Preferred path: a well-formed fenced block. Both fences are anchored
    # to line boundaries so inline backticks inside the payload never match.
    # ``[^\S\n]`` means "whitespace except newline", which also tolerates
    # the ``\r`` of Windows line endings.
    fenced_block = re.search(
        r"^```[^\S\n]*[\w+-]*[^\S\n]*\n(.*?)\n[^\S\n]*```[^\S\n]*$",
        content,
        flags=re.MULTILINE | re.DOTALL,
    )
    if fenced_block is not None:
        return fenced_block.group(1).strip()

    # Fallback: strip whatever fence markers are present. The language tag
    # is matched case-insensitively so "```JSON" does not leave a stray
    # "JSON" token in front of the payload.
    content = re.sub(
        r"^```(?:json)?\s*",
        "",
        content,
        flags=re.MULTILINE | re.IGNORECASE,
    )
    content = re.sub(r"```\s*$", "", content, flags=re.MULTILINE)

    return content.strip()

@validate_call
async def run(self, text: str, examples: str = "", **kwargs: Any) -> GraphSchema:
"""
Expand All @@ -575,6 +585,9 @@ async def run(self, text: str, examples: str = "", **kwargs: Any) -> GraphSchema
# Re-raise the LLMGenerationError
raise LLMGenerationError("Failed to generate schema from text") from e

# Clean response
content = self._clean_json_content(content)

try:
extracted_schema: Dict[str, Any] = json.loads(content)

Expand Down
31 changes: 31 additions & 0 deletions tests/unit/experimental/components/test_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -960,6 +960,37 @@ async def test_schema_from_text_filters_relationships_without_labels(
assert ("Person", "MANAGES", "Organization") in schema.patterns


def test_clean_json_content_markdown_with_json_language(
    schema_from_text: SchemaFromTextExtractor,
) -> None:
    """A response wrapped in a json-tagged code fence is reduced to the bare JSON."""
    expected = '{"node_types": [{"label": "Person"}]}'
    fenced_response = """```json
{"node_types": [{"label": "Person"}]}
```"""

    assert schema_from_text._clean_json_content(fenced_response) == expected


def test_clean_json_content_markdown_without_language(
    schema_from_text: SchemaFromTextExtractor,
) -> None:
    """A response wrapped in an untagged code fence is reduced to the bare JSON."""
    expected = '{"node_types": [{"label": "Person"}]}'
    fenced_response = """```
{"node_types": [{"label": "Person"}]}
```"""

    assert schema_from_text._clean_json_content(fenced_response) == expected


def test_clean_json_content_plain_json(
    schema_from_text: SchemaFromTextExtractor,
) -> None:
    """JSON that carries no code fence passes through the cleaner unchanged."""
    plain_json = '{"node_types": [{"label": "Person"}]}'

    # Input and expected output are the same string: nothing should be removed.
    assert schema_from_text._clean_json_content(plain_json) == plain_json


@pytest.mark.asyncio
@patch("neo4j_graphrag.experimental.components.schema.get_structured_schema")
async def test_schema_from_existing_graph(mock_get_structured_schema: Mock) -> None:
Expand Down
Loading