Add SimpleKGPipeline class (#165)

willtai · web-flow · commit be56247b7439 · 2024-10-03T16:37:55.000+01:00
* Add KnowledgeGraphBuilder class

* Simplify schema and relations

* Add kg_writer to pydantic data validation

* Refactor pipe_input to be passed to run and async run of KnowledgeGraphBuilder

* Fixed mypy errors

* Refactor KnowledgeGraphBuilder class

* Fix build_pipeline

* Removed SimpleKGPipeline from init

* Update README

* Handle event loop creation for python 3.9

* Fix typo in SchemaProperty docstring

* Use PipelineDefinitionError for SimpleKGPipeline

* PipelineDefinitionError in tests

* Update kg_builder.run() in README

* Allow users to pass strings instead of enums

* Pass OnError as string in example

* Update test to async

* Fixed OnError mypy errors

* Fixed test case for IGNORE OnError

* Update SimpleKGPipeline example in README
diff --git a/README.md b/README.md
@@ -36,6 +36,47 @@ Follow installation instructions [here](https://pygraphviz.github.io/documentati
 
 ## Examples
 
+### Knowledge graph construction
+
+```python
+from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline
+from neo4j_graphrag.llm.openai_llm import OpenAILLM
+
+# Define the entity labels, relation labels and potential schema
+entities = ["PERSON", "ORGANIZATION", "LOCATION"]
+relations = ["SITUATED_AT", "INTERACTS", "LED_BY"]
+potential_schema = [
+    ("PERSON", "SITUATED_AT", "LOCATION"),
+    ("PERSON", "INTERACTS", "PERSON"),
+    ("ORGANIZATION", "LED_BY", "PERSON"),
+]
+
+# Instantiate the LLM
+llm = OpenAILLM(
+    model_name="gpt-4o",
+    model_params={
+        "max_tokens": 2000,
+        "response_format": {"type": "json_object"},
+    },
+)
+
+# Create an instance of the SimpleKGPipeline that takes raw text as input
+kg_builder = SimpleKGPipeline(
+    llm=llm,
+    driver=driver,
+    entities=entities,
+    relations=relations,
+    potential_schema=potential_schema,
+    from_pdf=False,
+)
+
+await kg_builder.run_async(text="""
+    Albert Einstein was a German physicist born in 1879 who wrote many groundbreaking
+    papers especially about general relativity and quantum mechanics.
+""")
+```
+
+
+
 ### Creating a vector index
 
 When creating a vector index, make sure you match the number of dimensions in the index with the number of dimensions the embeddings have.
diff --git a/examples/pipeline/kg_builder_example.py b/examples/pipeline/kg_builder_example.py
@@ -0,0 +1,86 @@
+#  Copyright (c) "Neo4j"
+#  Neo4j Sweden AB [https://neo4j.com]
+#  #
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#  #
+#      https://www.apache.org/licenses/LICENSE-2.0
+#  #
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+from __future__ import annotations
+
+import asyncio
+import logging
+
+import neo4j
+from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline
+from neo4j_graphrag.llm.openai_llm import OpenAILLM
+
+logging.basicConfig(level=logging.INFO)
+
+
+async def main(neo4j_driver: neo4j.Driver) -> None:
+    # Instantiate Entity and Relation objects
+    entities = ["PERSON", "ORGANIZATION", "HORCRUX", "LOCATION"]
+    relations = ["SITUATED_AT", "INTERACTS", "OWNS", "LED_BY"]
+    potential_schema = [
+        ("PERSON", "SITUATED_AT", "LOCATION"),
+        ("PERSON", "INTERACTS", "PERSON"),
+        ("PERSON", "OWNS", "HORCRUX"),
+        ("ORGANIZATION", "LED_BY", "PERSON"),
+    ]
+
+    # Instantiate the LLM
+    llm = OpenAILLM(
+        model_name="gpt-4o",
+        model_params={
+            "max_tokens": 2000,
+            "response_format": {"type": "json_object"},
+        },
+    )
+
+    # Create an instance of the SimpleKGPipeline
+    kg_builder_pdf = SimpleKGPipeline(
+        llm=llm,
+        driver=neo4j_driver,
+        entities=entities,
+        relations=relations,
+        potential_schema=potential_schema,
+        from_pdf=True,
+        on_error="RAISE",
+    )
+
+    # Run the knowledge graph building process asynchronously
+    pdf_file_path = "examples/pipeline/Harry Potter and the Death Hallows Summary.pdf"
+    pdf_result = await kg_builder_pdf.run_async(file_path=pdf_file_path)
+    print(f"PDF Processing Result: {pdf_result}")
+
+    # Create an instance of the SimpleKGPipeline for text input
+    kg_builder_text = SimpleKGPipeline(
+        llm=llm,
+        driver=neo4j_driver,
+        entities=entities,
+        relations=relations,
+        potential_schema=potential_schema,
+        from_pdf=False,
+        on_error="RAISE",
+    )
+
+    # Run the knowledge graph building process with text input
+    text_input = "John Doe lives in New York City."
+    text_result = await kg_builder_text.run_async(text=text_input)
+    print(f"Text Processing Result: {text_result}")
+
+    await llm.async_client.close()
+
+
+if __name__ == "__main__":
+    with neo4j.GraphDatabase.driver(
+        "bolt://localhost:7687", auth=("neo4j", "password")
+    ) as driver:
+        asyncio.run(main(driver))
diff --git a/src/neo4j_graphrag/experimental/components/pdf_loader.py b/src/neo4j_graphrag/experimental/components/pdf_loader.py
@@ -25,7 +25,7 @@
 from fsspec.implementations.local import LocalFileSystem
 
 from neo4j_graphrag.exceptions import PdfLoaderError
-from neo4j_graphrag.experimental.pipeline import Component, DataModel
+from neo4j_graphrag.experimental.pipeline.component import Component, DataModel
 
 
 class DocumentInfo(DataModel):
diff --git a/src/neo4j_graphrag/experimental/components/schema.py b/src/neo4j_graphrag/experimental/components/schema.py
@@ -19,7 +19,7 @@
 from pydantic import BaseModel, ValidationError, model_validator, validate_call
 
 from neo4j_graphrag.exceptions import SchemaValidationError
-from neo4j_graphrag.experimental.pipeline import Component, DataModel
+from neo4j_graphrag.experimental.pipeline.component import Component, DataModel
 
 
 class SchemaProperty(BaseModel):
diff --git a/src/neo4j_graphrag/experimental/pipeline/kg_builder.py b/src/neo4j_graphrag/experimental/pipeline/kg_builder.py
@@ -0,0 +1,230 @@
+#  Copyright (c) "Neo4j"
+#  Neo4j Sweden AB [https://neo4j.com]
+#  #
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#  #
+#      https://www.apache.org/licenses/LICENSE-2.0
+#  #
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+from __future__ import annotations
+
+from typing import Any, List, Optional, Union
+
+import neo4j
+from pydantic import BaseModel, ConfigDict, Field
+
+from neo4j_graphrag.experimental.components.entity_relation_extractor import (
+    LLMEntityRelationExtractor,
+    OnError,
+)
+from neo4j_graphrag.experimental.components.kg_writer import Neo4jWriter
+from neo4j_graphrag.experimental.components.pdf_loader import PdfLoader
+from neo4j_graphrag.experimental.components.schema import (
+    SchemaBuilder,
+    SchemaEntity,
+    SchemaRelation,
+)
+from neo4j_graphrag.experimental.components.text_splitters.fixed_size_splitter import (
+    FixedSizeSplitter,
+)
+from neo4j_graphrag.experimental.pipeline.exceptions import PipelineDefinitionError
+from neo4j_graphrag.experimental.pipeline.pipeline import Pipeline, PipelineResult
+from neo4j_graphrag.generation.prompts import ERExtractionTemplate
+from neo4j_graphrag.llm.base import LLMInterface
+
+
+class SimpleKGPipelineConfig(BaseModel):
+    """Pydantic model used by SimpleKGPipeline to validate its constructor arguments.
+
+    SimpleKGPipeline builds one instance of this model in its `__init__` and
+    then reads the validated values back from it.
+    """
+
+    # Required settings.
+    llm: LLMInterface
+    driver: neo4j.Driver
+    from_pdf: bool
+    # Schema description; each defaults to an empty list.
+    entities: list[SchemaEntity] = Field(default_factory=list)
+    relations: list[SchemaRelation] = Field(default_factory=list)
+    potential_schema: list[tuple[str, str, str]] = Field(default_factory=list)
+    # Optional component overrides. They are typed as Any, so no type check is
+    # applied here; when left as None, SimpleKGPipeline substitutes a default.
+    pdf_loader: Any = None
+    kg_writer: Any = None
+    text_splitter: Any = None
+    on_error: OnError = OnError.RAISE
+    prompt_template: Union[ERExtractionTemplate, str] = ERExtractionTemplate()
+
+    # NOTE(review): presumably needed because fields such as `driver` are
+    # annotated with classes that are not pydantic models - confirm.
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+
+class SimpleKGPipeline:
+    """
+    A class to simplify the process of building a knowledge graph from text documents.
+    It abstracts away the complexity of setting up the pipeline and its components.
+
+    Args:
+        llm (LLMInterface): An instance of an LLM to use for entity and relation extraction.
+        driver (neo4j.Driver): A Neo4j driver instance for database connection.
+        entities (Optional[List[str]]): A list of entity labels as strings.
+        relations (Optional[List[str]]): A list of relation labels as strings.
+        potential_schema (Optional[List[tuple]]): A list of potential schema relationships.
+        from_pdf (bool): Determines whether to include the PdfLoader in the pipeline.
+                         If True, expects `file_path` input in `run` methods.
+                         If False, expects `text` input in `run` methods.
+        text_splitter (Optional[Any]): A text splitter component. Defaults to FixedSizeSplitter().
+        pdf_loader (Optional[Any]): A PDF loader component. Defaults to PdfLoader().
+        kg_writer (Optional[Any]): A knowledge graph writer component. Defaults to Neo4jWriter().
+        on_error (OnError): Error handling strategy. Defaults to OnError.RAISE.
+    """
+
+    def __init__(
+        self,
+        llm: LLMInterface,
+        driver: neo4j.Driver,
+        entities: Optional[List[str]] = None,
+        relations: Optional[List[str]] = None,
+        potential_schema: Optional[List[tuple[str, str, str]]] = None,
+        from_pdf: bool = True,
+        text_splitter: Optional[Any] = None,
+        pdf_loader: Optional[Any] = None,
+        kg_writer: Optional[Any] = None,
+        on_error: str = "RAISE",
+        prompt_template: Union[ERExtractionTemplate, str] = ERExtractionTemplate(),
+    ):
+        self.entities = [SchemaEntity(label=label) for label in entities or []]
+        self.relations = [SchemaRelation(label=label) for label in relations or []]
+        self.potential_schema = potential_schema if potential_schema is not None else []
+
+        try:
+            on_error_enum = OnError(on_error)
+        except ValueError:
+            raise PipelineDefinitionError(
+                f"Invalid value for on_error: {on_error}. Expected 'RAISE' or 'CONTINUE'."
+            )
+
+        config = SimpleKGPipelineConfig(
+            llm=llm,
+            driver=driver,
+            entities=self.entities,
+            relations=self.relations,
+            potential_schema=self.potential_schema,
+            from_pdf=from_pdf,
+            pdf_loader=pdf_loader,
+            kg_writer=kg_writer,
+            text_splitter=text_splitter,
+            on_error=on_error_enum,
+            prompt_template=prompt_template,
+        )
+
+        self.from_pdf = config.from_pdf
+        self.llm = config.llm
+        self.driver = config.driver
+        self.text_splitter = config.text_splitter or FixedSizeSplitter()
+        self.on_error = config.on_error
+        self.pdf_loader = config.pdf_loader if pdf_loader is not None else PdfLoader()
+        self.kg_writer = (
+            config.kg_writer if kg_writer is not None else Neo4jWriter(driver)
+        )
+        self.prompt_template = config.prompt_template
+
+        self.pipeline = self._build_pipeline()
+
+    def _build_pipeline(self) -> Pipeline:
+        pipe = Pipeline()
+
+        pipe.add_component(self.text_splitter, "splitter")
+        pipe.add_component(SchemaBuilder(), "schema")
+        pipe.add_component(
+            LLMEntityRelationExtractor(
+                llm=self.llm,
+                on_error=self.on_error,
+                prompt_template=self.prompt_template,
+            ),
+            "extractor",
+        )
+        pipe.add_component(self.kg_writer, "writer")
+
+        if self.from_pdf:
+            pipe.add_component(self.pdf_loader, "pdf_loader")
+
+            pipe.connect(
+                "pdf_loader",
+                "splitter",
+                input_config={"text": "pdf_loader.text"},
+            )
+
+            pipe.connect(
+                "schema",
+                "extractor",
+                input_config={
+                    "schema": "schema",
+                    "document_info": "pdf_loader.document_info",
+                },
+            )
+        else:
+            pipe.connect(
+                "schema",
+                "extractor",
+                input_config={
+                    "schema": "schema",
+                },
+            )
+
+        pipe.connect(
+            "splitter",
+            "extractor",
+            input_config={"chunks": "splitter"},
+        )
+
+        # Connect extractor to writer
+        pipe.connect(
+            "extractor",
+            "writer",
+            input_config={"graph": "extractor"},
+        )
+
+        return pipe
+
+    async def run_async(
+        self, file_path: Optional[str] = None, text: Optional[str] = None
+    ) -> PipelineResult:
+        """
+        Asynchronously runs the knowledge graph building process.
+
+        Args:
+            file_path (Optional[str]): The path to the PDF file to process. Required if `from_pdf` is True.
+            text (Optional[str]): The text content to process. Required if `from_pdf` is False.
+
+        Returns:
+            PipelineResult: The result of the pipeline execution.
+        """
+        pipe_inputs = self._prepare_inputs(file_path=file_path, text=text)
+        return await self.pipeline.run(pipe_inputs)
+
+    def _prepare_inputs(
+        self, file_path: Optional[str], text: Optional[str]
+    ) -> dict[str, Any]:
+        if self.from_pdf:
+            if file_path is None or text is not None:
+                raise PipelineDefinitionError(
+                    "Expected 'file_path' argument when 'from_pdf' is True."
+                )
+        else:
+            if text is None or file_path is not None:
+                raise PipelineDefinitionError(
+                    "Expected 'text' argument when 'from_pdf' is False."
+                )
+
+        pipe_inputs: dict[str, Any] = {
+            "schema": {
+                "entities": self.entities,
+                "relations": self.relations,
+                "potential_schema": self.potential_schema,
+            },
+        }
+
+        if self.from_pdf:
+            pipe_inputs["pdf_loader"] = {"filepath": file_path}
+        else:
+            pipe_inputs["splitter"] = {"text": text}
+
+        return pipe_inputs
diff --git a/tests/unit/experimental/pipeline/test_kg_builder.py b/tests/unit/experimental/pipeline/test_kg_builder.py