From 76779b1b8b9dbcb3e26f9724117fe7db55c3f8b6 Mon Sep 17 00:00:00 2001 From: estelle Date: Tue, 1 Jul 2025 10:38:26 +0200 Subject: [PATCH 1/2] Explicit configuration of schema extraction --- docs/source/user_guide_kg_builder.rst | 15 +++++++++++++-- .../experimental/components/schema.py | 4 ++++ .../config/template_pipeline/simple_kg_builder.py | 14 ++++++++++---- .../experimental/pipeline/kg_builder.py | 10 ++++++++-- .../template_pipeline/test_simple_kg_builder.py | 8 ++++++++ 5 files changed, 43 insertions(+), 8 deletions(-) diff --git a/docs/source/user_guide_kg_builder.rst b/docs/source/user_guide_kg_builder.rst index 366f285c9..d3930edce 100644 --- a/docs/source/user_guide_kg_builder.rst +++ b/docs/source/user_guide_kg_builder.rst @@ -131,8 +131,19 @@ This schema information can be provided to the `SimpleKGBuilder` as demonstrated # ... ) -.. note:: - By default, if no schema is provided to the SimpleKGPipeline, automatic schema extraction will be performed using the LLM (See the :ref:`Automatic Schema Extraction`). + +Schema Parameter Behavior +------------------------- + +The `schema` parameter controls how entity and relation extraction is performed: + +* **AUTO_EXTRACTION**: ``schema="AUTO_EXTRACTION"`` or (``schema=None``) + The schema is automatically extracted from the input text once. This guiding schema is then used to structure entity and relation extraction for all chunks. This guarantees all chunks have the same guiding schema. + (See :ref:`Automatic Schema Extraction`) + +* **NO_EXTRACTION**: ``schema="NO_EXTRACTION"`` or empty schema (``{"node_types": ()}``) + No schema extraction is performed. Entity and relation extraction proceed without a predefined or derived schema, resulting in unguided extraction. 
+ Extra configurations -------------------- diff --git a/src/neo4j_graphrag/experimental/components/schema.py b/src/neo4j_graphrag/experimental/components/schema.py index 6357ac56a..87d7ecf93 100644 --- a/src/neo4j_graphrag/experimental/components/schema.py +++ b/src/neo4j_graphrag/experimental/components/schema.py @@ -226,6 +226,10 @@ def node_type_from_label(self, label: str) -> Optional[NodeType]: def relationship_type_from_label(self, label: str) -> Optional[RelationshipType]: return self._relationship_type_index.get(label) + @classmethod + def create_empty(cls) -> Self: + return cls(node_types=tuple()) + def save( self, file_path: Union[str, Path], diff --git a/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py b/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py index 15bbd53df..87a36ac04 100644 --- a/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py +++ b/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py @@ -22,10 +22,9 @@ Sequence, Union, ) -import logging import warnings -from pydantic import ConfigDict, Field, model_validator +from pydantic import ConfigDict, Field, model_validator, field_validator from typing_extensions import Self from neo4j_graphrag.experimental.components.embedder import TextChunkEmbedder @@ -66,8 +65,6 @@ ) from neo4j_graphrag.generation.prompts import ERExtractionTemplate -logger = logging.getLogger(__name__) - class SimpleKGPipelineConfig(TemplatePipelineConfig): COMPONENTS: ClassVar[list[str]] = [ @@ -102,6 +99,15 @@ class SimpleKGPipelineConfig(TemplatePipelineConfig): model_config = ConfigDict(arbitrary_types_allowed=True) + @field_validator("schema_", mode="before") + @classmethod + def validate_schema_literal(cls, v: Any) -> Any: + if v == "NO_EXTRACTION": # same as "empty" schema + return GraphSchema.create_empty() + if v == "AUTO_EXTRACTION": # same as no schema + return None + 
return v + @model_validator(mode="after") def handle_schema_precedence(self) -> Self: """Handle schema precedence and warnings""" diff --git a/src/neo4j_graphrag/experimental/pipeline/kg_builder.py b/src/neo4j_graphrag/experimental/pipeline/kg_builder.py index 891f57e04..e499ca39c 100644 --- a/src/neo4j_graphrag/experimental/pipeline/kg_builder.py +++ b/src/neo4j_graphrag/experimental/pipeline/kg_builder.py @@ -15,7 +15,7 @@ from __future__ import annotations -from typing import List, Optional, Sequence, Union, Any +from typing import List, Optional, Sequence, Union, Any, Literal import logging import neo4j @@ -99,7 +99,13 @@ def __init__( entities: Optional[Sequence[EntityInputType]] = None, relations: Optional[Sequence[RelationInputType]] = None, potential_schema: Optional[List[tuple[str, str, str]]] = None, - schema: Optional[Union[GraphSchema, dict[str, list[Any]]]] = None, + schema: Optional[ + Union[ + GraphSchema, + dict[str, list[Any]], + Literal["NO_EXTRACTION", "AUTO_EXTRACTION"], + ], + ] = None, from_pdf: bool = True, text_splitter: Optional[TextSplitter] = None, pdf_loader: Optional[DataLoader] = None, diff --git a/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py b/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py index 766469048..06fc68790 100644 --- a/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py +++ b/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py @@ -138,6 +138,14 @@ def test_simple_kg_pipeline_config_manual_schema() -> None: assert isinstance(config._get_schema(), SchemaBuilder) +def test_simple_kg_pipeline_config_literal_schema_validation() -> None: + config = SimpleKGPipelineConfig(schema="NO_EXTRACTION") # type: ignore + assert config.schema_ == GraphSchema.create_empty() + + config = SimpleKGPipelineConfig(schema="AUTO_EXTRACTION") # type: ignore + assert config.schema_ is None + + def 
test_simple_kg_pipeline_config_schema_run_params() -> None: config = SimpleKGPipelineConfig( entities=["Person"], From 482df71453c23a231f354fb383f137c92aa3c57e Mon Sep 17 00:00:00 2001 From: estelle Date: Tue, 1 Jul 2025 14:13:26 +0200 Subject: [PATCH 2/2] Renaming --- docs/source/user_guide_kg_builder.rst | 8 ++++---- .../config/template_pipeline/simple_kg_builder.py | 4 ++-- src/neo4j_graphrag/experimental/pipeline/kg_builder.py | 2 +- .../config/template_pipeline/test_simple_kg_builder.py | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/source/user_guide_kg_builder.rst b/docs/source/user_guide_kg_builder.rst index d3930edce..4da75e4b1 100644 --- a/docs/source/user_guide_kg_builder.rst +++ b/docs/source/user_guide_kg_builder.rst @@ -137,12 +137,12 @@ Schema Parameter Behavior The `schema` parameter controls how entity and relation extraction is performed: -* **AUTO_EXTRACTION**: ``schema="AUTO_EXTRACTION"`` or (``schema=None``) - The schema is automatically extracted from the input text once. This guiding schema is then used to structure entity and relation extraction for all chunks. This guarantees all chunks have the same guiding schema. +* **EXTRACTED**: ``schema="EXTRACTED"`` or ``schema=None`` (the default) + The schema is automatically extracted from the input text once using an LLM. This guiding schema is then used to structure entity and relation extraction for all chunks. This guarantees all chunks have the same guiding schema. (See :ref:`Automatic Schema Extraction`) -* **NO_EXTRACTION**: ``schema="NO_EXTRACTION"`` or empty schema (``{"node_types": ()}``) - No schema extraction is performed. +* **FREE**: ``schema="FREE"`` or empty schema (``{"node_types": ()}``) + No schema extraction is performed.
Entity and relation extraction proceed without a predefined or derived schema, resulting in unguided entity and relation extraction. Use this to bypass automatic schema extraction. Extra configurations diff --git a/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py b/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py index 87a36ac04..7d0458ae5 100644 --- a/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py +++ b/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py @@ -102,9 +102,9 @@ class SimpleKGPipelineConfig(TemplatePipelineConfig): @field_validator("schema_", mode="before") @classmethod def validate_schema_literal(cls, v: Any) -> Any: - if v == "NO_EXTRACTION": # same as "empty" schema + if v == "FREE": # same as "empty" schema, no guiding schema return GraphSchema.create_empty() - if v == "AUTO_EXTRACTION": # same as no schema + if v == "EXTRACTED": # same as no schema, schema will be extracted by LLM return None return v diff --git a/src/neo4j_graphrag/experimental/pipeline/kg_builder.py b/src/neo4j_graphrag/experimental/pipeline/kg_builder.py index e499ca39c..68f579c8b 100644 --- a/src/neo4j_graphrag/experimental/pipeline/kg_builder.py +++ b/src/neo4j_graphrag/experimental/pipeline/kg_builder.py @@ -103,7 +103,7 @@ def __init__( Union[ GraphSchema, dict[str, list[Any]], - Literal["NO_EXTRACTION", "AUTO_EXTRACTION"], + Literal["FREE", "EXTRACTED"], ], ] = None, from_pdf: bool = True, diff --git a/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py b/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py index 06fc68790..40f5dae34 100644 --- a/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py +++ b/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py @@ -139,10 +139,10 @@ def 
test_simple_kg_pipeline_config_manual_schema() -> None: def test_simple_kg_pipeline_config_literal_schema_validation() -> None: - config = SimpleKGPipelineConfig(schema="NO_EXTRACTION") # type: ignore + config = SimpleKGPipelineConfig(schema="FREE") # type: ignore assert config.schema_ == GraphSchema.create_empty() - config = SimpleKGPipelineConfig(schema="AUTO_EXTRACTION") # type: ignore + config = SimpleKGPipelineConfig(schema="EXTRACTED") # type: ignore assert config.schema_ is None