Add schema enforcement modes and strict mode behaviour

NathalieCharbel · NathalieCharbel · commit a9f47d7705b4 · 2025-03-06T12:39:51.000+01:00
diff --git a/src/neo4j_graphrag/experimental/components/entity_relation_extractor.py b/src/neo4j_graphrag/experimental/components/entity_relation_extractor.py
@@ -19,7 +19,7 @@
 import enum
 import json
 import logging
-from typing import Any, List, Optional, Union, cast
+from typing import Any, List, Optional, Union, cast, Dict
 
 import json_repair
 from pydantic import ValidationError, validate_call
@@ -31,8 +31,11 @@
     DocumentInfo,
     LexicalGraphConfig,
     Neo4jGraph,
+    Neo4jNode,
+    Neo4jRelationship,
     TextChunk,
     TextChunks,
+    SchemaEnforcementMode,
 )
 from neo4j_graphrag.experimental.pipeline.component import Component
 from neo4j_graphrag.experimental.pipeline.exceptions import InvalidJSONError
@@ -168,6 +171,7 @@ class LLMEntityRelationExtractor(EntityRelationExtractor):
         llm (LLMInterface): The language model to use for extraction.
         prompt_template (ERExtractionTemplate | str): A custom prompt template to use for extraction.
         create_lexical_graph (bool): Whether to include the text chunks in the graph in addition to the extracted entities and relations. Defaults to True.
+        enforce_schema (SchemaEnforcementMode): Whether to validate or not the extracted entities/rels against the provided schema. Defaults to None.
         on_error (OnError): What to do when an error occurs during extraction. Defaults to raising an error.
         max_concurrency (int): The maximum number of concurrent tasks which can be used to make requests to the LLM.
 
@@ -192,11 +196,13 @@ def __init__(
         llm: LLMInterface,
         prompt_template: ERExtractionTemplate | str = ERExtractionTemplate(),
         create_lexical_graph: bool = True,
+        enforce_schema: SchemaEnforcementMode = SchemaEnforcementMode.NONE,
         on_error: OnError = OnError.RAISE,
         max_concurrency: int = 5,
     ) -> None:
         super().__init__(on_error=on_error, create_lexical_graph=create_lexical_graph)
         self.llm = llm  # with response_format={ "type": "json_object" },
+        self.enforce_schema = enforce_schema
         self.max_concurrency = max_concurrency
         if isinstance(prompt_template, str):
             template = PromptTemplate(prompt_template, expected_inputs=[])
@@ -275,15 +281,16 @@ async def run_for_chunk(
         examples: str,
         lexical_graph_builder: Optional[LexicalGraphBuilder] = None,
     ) -> Neo4jGraph:
-        """Run extraction and post processing for a single chunk"""
+        """Run extraction, validation and post processing for a single chunk"""
         async with sem:
             chunk_graph = await self.extract_for_chunk(schema, examples, chunk)
+            final_chunk_graph = self.validate_chunk(chunk_graph, schema)
             await self.post_process_chunk(
-                chunk_graph,
+                final_chunk_graph,
                 chunk,
                 lexical_graph_builder,
             )
-            return chunk_graph
+            return final_chunk_graph
 
     @validate_call
     async def run(
@@ -306,7 +313,7 @@ async def run(
             chunks (TextChunks): List of text chunks to extract entities and relations from.
             document_info (Optional[DocumentInfo], optional): Document the chunks are coming from. Used in the lexical graph creation step.
             lexical_graph_config (Optional[LexicalGraphConfig], optional): Lexical graph configuration to customize node labels and relationship types in the lexical graph.
-            schema (SchemaConfig | None): Definition of the schema to guide the LLM in its extraction. Caution: at the moment, there is no guarantee that the extracted entities and relations will strictly obey the schema.
+            schema (SchemaConfig | None): Definition of the schema to guide the LLM in its extraction.
             examples (str): Examples for few-shot learning in the prompt.
         """
         lexical_graph_builder = None
@@ -337,3 +344,147 @@ async def run(
         graph = self.combine_chunk_graphs(lexical_graph, chunk_graphs)
         logger.debug(f"Extracted graph: {prettify(graph)}")
         return graph
+
+    def validate_chunk(
+            self,
+            chunk_graph: Neo4jGraph,
+            schema: SchemaConfig
+    ) -> Neo4jGraph:
+        """
+         Perform validation after entity and relation extraction:
+         - Enforce schema if schema enforcement mode is on and schema is provided
+        """
+        # if enforcing_schema is on and schema is provided, clean the graph
+        return (
+            self._clean_graph(chunk_graph, schema)
+            if self.enforce_schema != SchemaEnforcementMode.NONE and schema.entities
+            else chunk_graph
+        )
+
+    def _clean_graph(
+            self,
+            graph: Neo4jGraph,
+            schema: SchemaConfig,
+    ) -> Neo4jGraph:
+        """
+        Verify that the graph conforms to the provided schema.
+
+        Remove invalid entities,relationships, and properties.
+        If an entity is removed, all of its relationships are also removed.
+        If no valid properties remain for an entity, remove that entity.
+        """
+        # enforce nodes (remove invalid labels, strip invalid properties)
+        filtered_nodes = self._enforce_nodes(graph.nodes, schema)
+
+        # enforce relationships (remove those referencing invalid nodes or with invalid
+        # types or with start/end nodes not conforming to the schema, and strip invalid
+        # properties)
+        filtered_rels = self._enforce_relationships(
+            graph.relationships, filtered_nodes, schema
+        )
+
+        return Neo4jGraph(nodes=filtered_nodes, relationships=filtered_rels)
+
+    def _enforce_nodes(
+            self,
+            extracted_nodes: List[Neo4jNode],
+            schema: SchemaConfig
+    ) -> List[Neo4jNode]:
+        """
+            Filter extracted nodes to be conformant to the schema.
+
+            Keep only those whose label is in schema.
+            For each valid node, filter out properties not present in the schema.
+            Remove a node if it ends up with no valid properties.
+            """
+        valid_nodes = []
+        if self.enforce_schema == SchemaEnforcementMode.STRICT:
+            for node in extracted_nodes:
+                if node.label in schema.entities:
+                    schema_entity = schema.entities[node.label]
+                    filtered_props = self._enforce_properties(node.properties,
+                                                              schema_entity)
+                    if filtered_props:
+                        # keep node only if it has at least one valid property
+                        new_node = Neo4jNode(
+                            id=node.id,
+                            label=node.label,
+                            properties=filtered_props,
+                            embedding_properties=node.embedding_properties,
+                        )
+                        valid_nodes.append(new_node)
+        # elif self.enforce_schema == SchemaEnforcementMode.OPEN:
+            # future logic
+        return valid_nodes
+
+    def _enforce_relationships(
+            self,
+            extracted_relationships: List[Neo4jRelationship],
+            filtered_nodes: List[Neo4jNode],
+            schema: SchemaConfig
+    ) -> List[Neo4jRelationship]:
+        """
+        Filter extracted nodes to be conformant to the schema.
+
+        Keep only those whose types are in schema, start/end node conform to schema,
+        and start/end nodes are in filtered nodes (i.e., kept after node enforcement).
+        For each valid relationship, filter out properties not present in the schema.
+        """
+        valid_rels = []
+        if self.enforce_schema == SchemaEnforcementMode.STRICT:
+            valid_node_ids = {node.id for node in filtered_nodes}
+            for rel in extracted_relationships:
+                # keep relationship if it conforms with the schema
+                if rel.type in schema.relations:
+                    if (rel.start_node_id in valid_node_ids and
+                            rel.end_node_id in valid_node_ids):
+                        start_node_label = self._get_node_label(rel.start_node_id,
+                                                                filtered_nodes)
+                        end_node_label = self._get_node_label(rel.end_node_id,
+                                                              filtered_nodes)
+                        if (not schema.potential_schema or
+                                (start_node_label, rel.type, end_node_label) in
+                                schema.potential_schema):
+                            schema_relation = schema.relations[rel.type]
+                            filtered_props = self._enforce_properties(rel.properties,
+                                                                      schema_relation)
+                            new_rel = Neo4jRelationship(
+                                start_node_id=rel.start_node_id,
+                                end_node_id=rel.end_node_id,
+                                type=rel.type,
+                                properties=filtered_props,
+                                embedding_properties=rel.embedding_properties,
+                            )
+                            valid_rels.append(new_rel)
+        # elif self.enforce_schema == SchemaEnforcementMode.OPEN:
+        # future logic
+        return valid_rels
+
+    def _enforce_properties(
+            self,
+            properties: Dict[str, Any],
+            valid_properties: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        """
+        Filter properties.
+        Keep only those that exist in schema (i.e., valid properties).
+        """
+        return {
+            key: value
+            for key, value in properties.items()
+            if key in valid_properties
+        }
+
+    def _get_node_label(
+            self,
+            node_id: str,
+            nodes: List[Neo4jNode]
+    ) -> str:
+        """
+        Given a list of nodes, get the label of the node whose id matches the provided
+        node id.
+        """
+        for node in nodes:
+            if node.id == node_id:
+                return node.label
+        return ""
diff --git a/src/neo4j_graphrag/experimental/components/types.py b/src/neo4j_graphrag/experimental/components/types.py
@@ -15,6 +15,7 @@
 from __future__ import annotations
 
 import uuid
+from enum import Enum
 from typing import Any, Dict, Optional
 
 from pydantic import BaseModel, Field, field_validator
@@ -170,3 +171,10 @@ def lexical_graph_node_labels(self) -> tuple[str, ...]:
 class GraphResult(DataModel):
     graph: Neo4jGraph
     config: LexicalGraphConfig
+
+
+class SchemaEnforcementMode(str, Enum):
+    NONE = "none"
+    STRICT = "strict"
+    # future possibility: OPEN = "open" -> ensure conformance of nodes/props/rels that
+    # were listed in the schema but leave room for extras