diff --git a/docs/source/user_guide_kg_builder.rst b/docs/source/user_guide_kg_builder.rst index 366f285c9..4da75e4b1 100644 --- a/docs/source/user_guide_kg_builder.rst +++ b/docs/source/user_guide_kg_builder.rst @@ -131,8 +131,19 @@ This schema information can be provided to the `SimpleKGBuilder` as demonstrated # ... ) -.. note:: - By default, if no schema is provided to the SimpleKGPipeline, automatic schema extraction will be performed using the LLM (See the :ref:`Automatic Schema Extraction`). + +Schema Parameter Behavior +------------------------- + +The ``schema`` parameter controls how entity and relation extraction is performed: + +* **EXTRACTED**: ``schema="EXTRACTED"`` or ``schema=None`` (the default value) + The schema is automatically extracted from the input text once using the LLM. This guiding schema is then used to structure entity and relation extraction for all chunks. This guarantees that all chunks share the same guiding schema. + (See :ref:`Automatic Schema Extraction`) + +* **FREE**: ``schema="FREE"`` or an empty schema (``{"node_types": ()}``) + No schema extraction is performed. Entity and relation extraction proceed without a predefined or derived schema, resulting in unguided entity and relation extraction. Use this to bypass automatic schema extraction. 
+ Extra configurations -------------------- diff --git a/src/neo4j_graphrag/experimental/components/schema.py b/src/neo4j_graphrag/experimental/components/schema.py index 6357ac56a..87d7ecf93 100644 --- a/src/neo4j_graphrag/experimental/components/schema.py +++ b/src/neo4j_graphrag/experimental/components/schema.py @@ -226,6 +226,10 @@ def node_type_from_label(self, label: str) -> Optional[NodeType]: def relationship_type_from_label(self, label: str) -> Optional[RelationshipType]: return self._relationship_type_index.get(label) + @classmethod + def create_empty(cls) -> Self: + return cls(node_types=tuple()) + def save( self, file_path: Union[str, Path], diff --git a/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py b/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py index 15bbd53df..7d0458ae5 100644 --- a/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py +++ b/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py @@ -22,10 +22,9 @@ Sequence, Union, ) -import logging import warnings -from pydantic import ConfigDict, Field, model_validator +from pydantic import ConfigDict, Field, model_validator, field_validator from typing_extensions import Self from neo4j_graphrag.experimental.components.embedder import TextChunkEmbedder @@ -66,8 +65,6 @@ ) from neo4j_graphrag.generation.prompts import ERExtractionTemplate -logger = logging.getLogger(__name__) - class SimpleKGPipelineConfig(TemplatePipelineConfig): COMPONENTS: ClassVar[list[str]] = [ @@ -102,6 +99,15 @@ class SimpleKGPipelineConfig(TemplatePipelineConfig): model_config = ConfigDict(arbitrary_types_allowed=True) + @field_validator("schema_", mode="before") + @classmethod + def validate_schema_literal(cls, v: Any) -> Any: + if v == "FREE": # same as "empty" schema, no guiding schema + return GraphSchema.create_empty() + if v == "EXTRACTED": # same as no schema, schema will be 
extracted by LLM + return None + return v + @model_validator(mode="after") def handle_schema_precedence(self) -> Self: """Handle schema precedence and warnings""" diff --git a/src/neo4j_graphrag/experimental/pipeline/kg_builder.py b/src/neo4j_graphrag/experimental/pipeline/kg_builder.py index 891f57e04..68f579c8b 100644 --- a/src/neo4j_graphrag/experimental/pipeline/kg_builder.py +++ b/src/neo4j_graphrag/experimental/pipeline/kg_builder.py @@ -15,7 +15,7 @@ from __future__ import annotations -from typing import List, Optional, Sequence, Union, Any +from typing import List, Optional, Sequence, Union, Any, Literal import logging import neo4j @@ -99,7 +99,13 @@ def __init__( entities: Optional[Sequence[EntityInputType]] = None, relations: Optional[Sequence[RelationInputType]] = None, potential_schema: Optional[List[tuple[str, str, str]]] = None, - schema: Optional[Union[GraphSchema, dict[str, list[Any]]]] = None, + schema: Optional[ + Union[ + GraphSchema, + dict[str, list[Any]], + Literal["FREE", "EXTRACTED"], + ], + ] = None, from_pdf: bool = True, text_splitter: Optional[TextSplitter] = None, pdf_loader: Optional[DataLoader] = None, diff --git a/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py b/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py index 766469048..40f5dae34 100644 --- a/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py +++ b/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py @@ -138,6 +138,14 @@ def test_simple_kg_pipeline_config_manual_schema() -> None: assert isinstance(config._get_schema(), SchemaBuilder) +def test_simple_kg_pipeline_config_literal_schema_validation() -> None: + config = SimpleKGPipelineConfig(schema="FREE") # type: ignore + assert config.schema_ == GraphSchema.create_empty() + + config = SimpleKGPipelineConfig(schema="EXTRACTED") # type: ignore + assert config.schema_ is None + + def 
test_simple_kg_pipeline_config_schema_run_params() -> None: config = SimpleKGPipelineConfig( entities=["Person"],