From 9436f0e08eb8f14a26704d7746d0348f5eea201b Mon Sep 17 00:00:00 2001 From: estelle Date: Mon, 28 Apr 2025 09:40:58 +0200 Subject: [PATCH 1/4] Strict mode: if node/relationship is in schema but no properties are defined, do not filter allowed properties --- .../components/entity_relation_extractor.py | 10 ++++++++-- .../components/test_entity_relation_extractor.py | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/neo4j_graphrag/experimental/components/entity_relation_extractor.py b/src/neo4j_graphrag/experimental/components/entity_relation_extractor.py index f21a22fed..9967ef901 100644 --- a/src/neo4j_graphrag/experimental/components/entity_relation_extractor.py +++ b/src/neo4j_graphrag/experimental/components/entity_relation_extractor.py @@ -404,7 +404,10 @@ def _enforce_nodes( if not schema_entity: continue allowed_props = schema_entity.get("properties", []) - filtered_props = self._enforce_properties(node.properties, allowed_props) + if allowed_props: + filtered_props = self._enforce_properties(node.properties, allowed_props) + else: + filtered_props = node.properties if filtered_props: valid_nodes.append( Neo4jNode( @@ -469,7 +472,10 @@ def _enforce_relationships( continue allowed_props = schema_relation.get("properties", []) - filtered_props = self._enforce_properties(rel.properties, allowed_props) + if allowed_props: + filtered_props = self._enforce_properties(rel.properties, allowed_props) + else: + filtered_props = rel.properties valid_rels.append( Neo4jRelationship( diff --git a/tests/unit/experimental/components/test_entity_relation_extractor.py b/tests/unit/experimental/components/test_entity_relation_extractor.py index 21fe8807d..cebc3d31e 100644 --- a/tests/unit/experimental/components/test_entity_relation_extractor.py +++ b/tests/unit/experimental/components/test_entity_relation_extractor.py @@ -374,7 +374,7 @@ async def test_extractor_schema_enforcement_valid_nodes_with_empty_props() -> No result: Neo4jGraph = await extractor.run(chunks, schema=schema) - assert len(result.nodes) == 0 + assert len(result.nodes) == 1 @pytest.mark.asyncio From db9c16934728eb9d911d01ac4424165981afddd0 Mon Sep 17 00:00:00 2001 From: estelle Date: Mon, 28 Apr 2025 09:45:39 +0200 Subject: [PATCH 2/4] Ruff --- .../experimental/components/entity_relation_extractor.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/neo4j_graphrag/experimental/components/entity_relation_extractor.py b/src/neo4j_graphrag/experimental/components/entity_relation_extractor.py index 9967ef901..112d8b2c6 100644 --- a/src/neo4j_graphrag/experimental/components/entity_relation_extractor.py +++ b/src/neo4j_graphrag/experimental/components/entity_relation_extractor.py @@ -405,7 +405,9 @@ def _enforce_nodes( continue allowed_props = schema_entity.get("properties", []) if allowed_props: - filtered_props = self._enforce_properties(node.properties, allowed_props) + filtered_props = self._enforce_properties( + node.properties, allowed_props + ) else: filtered_props = node.properties if filtered_props: From cd07f358f4fe58931d83e20a31b894867cac99a9 Mon Sep 17 00:00:00 2001 From: estelle Date: Fri, 9 May 2025 10:28:41 +0200 Subject: [PATCH 3/4] Deal with missing relationships, add tests --- .../components/entity_relation_extractor.py | 11 +-- .../experimental/components/schema.py | 2 +- .../test_entity_relation_extractor.py | 68 +++++++++++++++++++ 3 files changed, 75 insertions(+), 6 deletions(-) diff --git a/src/neo4j_graphrag/experimental/components/entity_relation_extractor.py b/src/neo4j_graphrag/experimental/components/entity_relation_extractor.py index 112d8b2c6..d041d78e4 100644 --- a/src/neo4j_graphrag/experimental/components/entity_relation_extractor.py +++ b/src/neo4j_graphrag/experimental/components/entity_relation_extractor.py @@ -403,7 +403,7 @@ def _enforce_nodes( schema_entity = schema.entities.get(node.label) if not schema_entity: continue - allowed_props = schema_entity.get("properties", []) + allowed_props = schema_entity.get("properties") if allowed_props: filtered_props = self._enforce_properties( node.properties, allowed_props @@ -439,6 +439,9 @@ def _enforce_relationships( if self.enforce_schema != SchemaEnforcementMode.STRICT: return extracted_relationships + if schema.relations is None: + return extracted_relationships + valid_rels = [] valid_nodes = {node.id: node.label for node in filtered_nodes} @@ -446,9 +449,7 @@ def _enforce_relationships( potential_schema = schema.potential_schema for rel in extracted_relationships: - schema_relation = ( - schema.relations.get(rel.type) if schema.relations else None - ) + schema_relation = schema.relations.get(rel.type) if not schema_relation: continue @@ -473,7 +474,7 @@ def _enforce_relationships( if not tuple_valid and not reverse_tuple_valid: continue - allowed_props = schema_relation.get("properties", []) + allowed_props = schema_relation.get("properties") if allowed_props: filtered_props = self._enforce_properties(rel.properties, allowed_props) else: diff --git a/src/neo4j_graphrag/experimental/components/schema.py b/src/neo4j_graphrag/experimental/components/schema.py index a58b0b105..2e0641c87 100644 --- a/src/neo4j_graphrag/experimental/components/schema.py +++ b/src/neo4j_graphrag/experimental/components/schema.py @@ -109,7 +109,7 @@ class SchemaConfig(DataModel): @model_validator(mode="before") def check_schema(cls, data: Dict[str, Any]) -> Dict[str, Any]: entities = data.get("entities", {}).keys() - relations = data.get("relations", {}).keys() + relations = (data.get("relations") or {}).keys() potential_schema = data.get("potential_schema", []) if potential_schema: diff --git a/tests/unit/experimental/components/test_entity_relation_extractor.py b/tests/unit/experimental/components/test_entity_relation_extractor.py index cebc3d31e..70c115fea 100644 --- a/tests/unit/experimental/components/test_entity_relation_extractor.py +++ b/tests/unit/experimental/components/test_entity_relation_extractor.py @@ -564,6 +564,74 @@ async def test_extractor_schema_enforcement_inverted_relation_direction() -> Non assert result.relationships[0].end_node_id.split(":")[1] == "2" +@pytest.mark.asyncio +async def test_extractor_schema_enforcement_none_relationships_in_schema() -> None: + llm = MagicMock(spec=LLMInterface) + llm.ainvoke.return_value = LLMResponse( + content='{"nodes":[{"id":"1","label":"Person","properties":' + '{"name":"Alice"}},{"id":"2","label":"Person","properties":' + '{"name":"Bob"}}],' + '"relationships":[{"start_node_id":"1","end_node_id":"2",' + '"type":"FRIENDS_WITH","properties":{}}]}' + ) + + extractor = LLMEntityRelationExtractor( + llm=llm, create_lexical_graph=False, enforce_schema=SchemaEnforcementMode.STRICT + ) + + schema = SchemaConfig( + entities={ + "Person": { + "label": "Person", + "properties": [{"name": "name", "type": "STRING"}], + } + }, + relations=None, + potential_schema=None, + ) + + chunks = TextChunks(chunks=[TextChunk(text="some text", index=0)]) + + result: Neo4jGraph = await extractor.run(chunks, schema=schema) + + assert len(result.nodes) == 2 + assert len(result.relationships) == 1 + assert result.relationships[0].type == "FRIENDS_WITH" + + +@pytest.mark.asyncio +async def test_extractor_schema_enforcement_empty_relationships_in_schema() -> None: + llm = MagicMock(spec=LLMInterface) + llm.ainvoke.return_value = LLMResponse( + content='{"nodes":[{"id":"1","label":"Person","properties":' + '{"name":"Alice"}},{"id":"2","label":"Person","properties":' + '{"name":"Bob"}}],' + '"relationships":[{"start_node_id":"1","end_node_id":"2",' + '"type":"FRIENDS_WITH","properties":{}}]}' + ) + + extractor = LLMEntityRelationExtractor( + llm=llm, create_lexical_graph=False, enforce_schema=SchemaEnforcementMode.STRICT + ) + + schema = SchemaConfig( + entities={ + "Person": { + "label": "Person", + "properties": [{"name": "name", "type": "STRING"}], + } + }, + relations={}, + potential_schema=None, + ) + + chunks = TextChunks(chunks=[TextChunk(text="some text", index=0)]) + + result: Neo4jGraph = await extractor.run(chunks, schema=schema) + + assert len(result.relationships) == 0 + + def test_fix_invalid_json_empty_result() -> None: json_string = "invalid json" From b9d26daf066078d63a69182fe44b49c0339527bd Mon Sep 17 00:00:00 2001 From: estelle Date: Tue, 13 May 2025 13:45:52 +0200 Subject: [PATCH 4/4] Update CHANGELOG and doc --- CHANGELOG.md | 6 +++++- docs/source/user_guide_kg_builder.rst | 7 +++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index df446f7da..b2fc5bcd9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,12 +4,16 @@ ### Added -- Added support for automatic schema extraction from text using LLMs. In the `SimpleKGPipeline`, when the user provides no schema, the automatic schema extraction is enabled by default. +- Added support for automatic schema extraction from text using LLMs. In the `SimpleKGPipeline`, when the user provides no schema, the automatic schema extraction is enabled by default. ### Fixed - Fixed a bug where `spacy` and `rapidfuzz` needed to be installed even if not using the relevant entity resolvers. +### Changed + +- Strict mode in `SimpleKGPipeline`: now properties and relationships are pruned only if they are defined in the input schema. + ## 1.7.0 diff --git a/docs/source/user_guide_kg_builder.rst b/docs/source/user_guide_kg_builder.rst index 30d478667..3d02b47af 100644 --- a/docs/source/user_guide_kg_builder.rst +++ b/docs/source/user_guide_kg_builder.rst @@ -901,6 +901,13 @@ Any relation whose start node or end node does not conform to the provided tuple If a relation start/end nodes are valid but the direction is incorrect, the latter will be inverted. If a node is left with no properties, it will be also pruned. +.. note:: + + If the input schema lacks a certain type of information, pruning is skipped. + For example, if an entity is defined only by a label and has no properties, + property pruning is not performed and all properties returned by the LLM are kept. + + .. warning:: Note that if the schema enforcement mode is on but the schema is not provided, no schema enforcement will be applied.