Allow schema parameter in SimpleKGBuilderConfig and refactor code

NathalieCharbel · NathalieCharbel · commit 41d359de64f6 · 2025-04-29T08:52:15.000+03:00
diff --git a/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py b/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py
@@ -12,9 +12,10 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
-from typing import Any, ClassVar, Literal, Optional, Sequence, Union
+from typing import Any, ClassVar, Literal, Optional, Sequence, Union, TypeVar
+import logging
 
-from pydantic import ConfigDict
+from pydantic import ConfigDict, model_validator, Field
 
 from neo4j_graphrag.experimental.components.embedder import TextChunkEmbedder
 from neo4j_graphrag.experimental.components.entity_relation_extractor import (
@@ -33,6 +34,7 @@
     SchemaEntity,
     SchemaRelation,
     SchemaFromText,
+    SchemaConfig,
 )
 from neo4j_graphrag.experimental.components.text_splitters.base import TextSplitter
 from neo4j_graphrag.experimental.components.text_splitters.fixed_size_splitter import (
@@ -55,6 +57,9 @@
 )
 from neo4j_graphrag.generation.prompts import ERExtractionTemplate
 
+logger = logging.getLogger(__name__)
+
+T = TypeVar('T', bound='SimpleKGPipelineConfig')
 
 class SimpleKGPipelineConfig(TemplatePipelineConfig):
     COMPONENTS: ClassVar[list[str]] = [
@@ -75,6 +80,7 @@ class SimpleKGPipelineConfig(TemplatePipelineConfig):
     entities: Sequence[EntityInputType] = []
     relations: Sequence[RelationInputType] = []
     potential_schema: Optional[list[tuple[str, str, str]]] = None
+    schema: Optional[Union[SchemaConfig, dict[str, list]]] = None
     enforce_schema: SchemaEnforcementMode = SchemaEnforcementMode.NONE
     on_error: OnError = OnError.IGNORE
     prompt_template: Union[ERExtractionTemplate, str] = ERExtractionTemplate()
@@ -88,10 +94,40 @@ class SimpleKGPipelineConfig(TemplatePipelineConfig):
     text_splitter: Optional[ComponentType] = None
 
     model_config = ConfigDict(arbitrary_types_allowed=True)
+    
+    @model_validator(mode='after')
+    def handle_schema_precedence(self) -> T:
+        """Handle schema precedence and warnings"""
+        self._process_schema_parameters()
+        return self
+    
+    def _process_schema_parameters(self) -> None:
+        """
+        Process schema parameters and handle precedence between 'schema' parameter and individual components.
+        Also logs warnings for deprecated usage.
+        """
+        # check if both schema and individual components are provided
+        has_individual_schema_components = any([self.entities, self.relations, self.potential_schema])
+        
+        if has_individual_schema_components and self.schema is not None:
+            logger.warning(
+                "Both 'schema' and individual schema components (entities, relations, potential_schema) "
+                "were provided. The 'schema' parameter takes precedence. In the future, individual "
+                "components will be removed. Please use only the 'schema' parameter.",
+                stacklevel=2
+            )
+            
+        elif has_individual_schema_components:
+            logger.warning(
+                "The 'entities', 'relations', and 'potential_schema' parameters are deprecated "
+                "and will be removed in a future version. "
+                "Please use the 'schema' parameter instead.",
+                stacklevel=2
+            )
 
     def has_user_provided_schema(self) -> bool:
         """Check if the user has provided schema information"""
-        return bool(self.entities or self.relations or self.potential_schema)
+        return bool(self.entities or self.relations or self.potential_schema or self.schema is not None)
 
     def _get_pdf_loader(self) -> Optional[PdfLoader]:
         if not self.from_pdf:
@@ -129,16 +165,48 @@ def _get_schema(self) -> Union[SchemaBuilder, SchemaFromText]:
             return SchemaFromText(llm=self.get_default_llm())
         return SchemaBuilder()
 
+    def _process_schema_with_precedence(self) -> tuple[list[SchemaEntity], list[SchemaRelation], Optional[list[tuple[str, str, str]]]]:
+        """
+        Process schema inputs according to precedence rules:
+        1. If schema is provided as SchemaConfig object, use it
+        2. If schema is provided as dictionary, extract from it
+        3. Otherwise, use individual schema components
+        
+        Returns:
+            Tuple of (entities, relations, potential_schema)
+        """
+        if self.schema is not None:
+            # schema takes precedence over individual components
+            if isinstance(self.schema, SchemaConfig):
+                # extract components from SchemaConfig
+                entities = list(self.schema.entities.values())
+                relations = list(self.schema.relations.values())
+                potential_schema = self.schema.potential_schema
+            else:
+                # extract from dictionary
+                entities = [SchemaEntity.from_text_or_dict(e) for e in self.schema.get("entities", [])]
+                relations = [SchemaRelation.from_text_or_dict(r) for r in self.schema.get("relations", [])]
+                potential_schema = self.schema.get("potential_schema")
+        else:
+            # use individual components
+            entities = [SchemaEntity.from_text_or_dict(e) for e in self.entities] if self.entities else []
+            relations = [SchemaRelation.from_text_or_dict(r) for r in self.relations] if self.relations else []
+            potential_schema = self.potential_schema
+        
+        return entities, relations, potential_schema
+
     def _get_run_params_for_schema(self) -> dict[str, Any]:
         if self.auto_schema_extraction and not self.has_user_provided_schema():
             # for automatic extraction, the text parameter is needed (will flow through the pipeline connections)
             return {}
         else:
-            # for manual schema, use the provided entities/relations/potential_schema
+            # process schema components according to precedence rules
+            entities, relations, potential_schema = self._process_schema_with_precedence()
+            
             return {
-                "entities": [SchemaEntity.from_text_or_dict(e) for e in self.entities],
-                "relations": [SchemaRelation.from_text_or_dict(r) for r in self.relations],
-                "potential_schema": self.potential_schema,
+                "entities": entities,
+                "relations": relations,
+                "potential_schema": potential_schema,
             }
 
     def _get_extractor(self) -> EntityRelationExtractor:
diff --git a/src/neo4j_graphrag/experimental/pipeline/kg_builder.py b/src/neo4j_graphrag/experimental/pipeline/kg_builder.py
@@ -17,7 +17,6 @@
 
 from typing import List, Optional, Sequence, Union
 import logging
-import warnings
 
 import neo4j
 from pydantic import ValidationError
@@ -44,7 +43,7 @@
 )
 from neo4j_graphrag.generation.prompts import ERExtractionTemplate
 from neo4j_graphrag.llm.base import LLMInterface
-from neo4j_graphrag.experimental.components.schema import SchemaConfig, SchemaBuilder
+from neo4j_graphrag.experimental.components.schema import SchemaConfig
 
 logger = logging.getLogger(__name__)
 
@@ -104,65 +103,16 @@ def __init__(
         lexical_graph_config: Optional[LexicalGraphConfig] = None,
         neo4j_database: Optional[str] = None,
     ):
-        # deprecation warnings for old parameters
-        if any([entities, relations, potential_schema]) and schema is not None:
-            logger.warning(
-                "Both 'schema' and individual schema components (entities, relations, potential_schema) "
-                "were provided. The 'schema' parameter takes precedence. In the future, individual "
-                "components will be removed. Please use only the 'schema' parameter."
-            )
-            # emit a DeprecationWarning for tools that might be monitoring for it
-            warnings.warn(
-                "Both 'schema' and individual schema components are provided. Use only 'schema'.",
-                DeprecationWarning,
-                stacklevel=2,
-            )
-        elif any([entities, relations, potential_schema]):
-            logger.warning(
-                "The 'entities', 'relations', and 'potential_schema' parameters are deprecated "
-                "and will be removed in a future version. "
-                "Please use the 'schema' parameter instead."
-            )
-            warnings.warn(
-                "The 'entities', 'relations', and 'potential_schema' parameters are deprecated.",
-                DeprecationWarning,
-                stacklevel=2,
-            )
-            
-        # handle schema precedence over individual schema components
-        schema_entities = []
-        schema_relations = []
-        schema_potential = None
-        
-        if schema is not None:
-            # schema takes precedence over individual components
-            if isinstance(schema, SchemaConfig):
-                # use the SchemaConfig directly
-                pass
-            else:
-                # convert dictionary to entity/relation lists
-                schema_entities = schema.get("entities", [])
-                schema_relations = schema.get("relations", [])
-                schema_potential = schema.get("potential_schema")
-        else:
-            # Use the individual components if provided
-            schema_entities = entities or []
-            schema_relations = relations or []
-            schema_potential = potential_schema
-        
-        # determine if automatic schema extraction should be performed
-        has_schema = bool(schema_entities or schema_relations or schema_potential or isinstance(schema, SchemaConfig))
-        auto_schema_extraction = not has_schema
-            
         try:
             config = SimpleKGPipelineConfig(
                 # argument type are fixed in the Config object
                 llm_config=llm,  # type: ignore[arg-type]
                 neo4j_config=driver,  # type: ignore[arg-type]
                 embedder_config=embedder,  # type: ignore[arg-type]
-                entities=schema_entities,
-                relations=schema_relations,
-                potential_schema=schema_potential,
+                entities=entities or [],
+                relations=relations or [],
+                potential_schema=potential_schema,
+                schema=schema,
                 enforce_schema=SchemaEnforcementMode(enforce_schema),
                 from_pdf=from_pdf,
                 pdf_loader=ComponentType(pdf_loader) if pdf_loader else None,
@@ -173,7 +123,7 @@ def __init__(
                 perform_entity_resolution=perform_entity_resolution,
                 lexical_graph_config=lexical_graph_config,
                 neo4j_database=neo4j_database,
-                auto_schema_extraction=auto_schema_extraction,
+                auto_schema_extraction=not bool(schema or entities or relations or potential_schema),
             )
         except (ValidationError, ValueError) as e:
             raise PipelineDefinitionError() from e