neo4j
diff --git a/‎CHANGELOG.md
Lines changed: 2 additions & 0 deletions b/‎CHANGELOG.md
Lines changed: 2 additions & 0 deletions
diff --git a/‎docs/source/api.rst
Lines changed: 8 additions & 0 deletions b/‎docs/source/api.rst
Lines changed: 8 additions & 0 deletions
diff --git a/‎docs/source/user_guide_kg_builder.rst
Lines changed: 42 additions & 2 deletions b/‎docs/source/user_guide_kg_builder.rst
Lines changed: 42 additions & 2 deletions
diff --git a/‎examples/pipeline/Harry Potter and the Chamber of Secrets Summary.pdf
31.4 KB b/‎examples/pipeline/Harry Potter and the Chamber of Secrets Summary.pdf
31.4 KB
diff --git a/‎examples/pipeline/kg_builder_from_pdf.py
Lines changed: 24 additions & 13 deletions b/‎examples/pipeline/kg_builder_from_pdf.py
Lines changed: 24 additions & 13 deletions
diff --git a/‎examples/pipeline/kg_builder_from_text.py
Lines changed: 24 additions & 34 deletions b/‎examples/pipeline/kg_builder_from_text.py
Lines changed: 24 additions & 34 deletions
diff --git a/‎examples/pipeline/kg_builder_two_documents_entity_resolution.py
Lines changed: 156 additions & 0 deletions b/‎examples/pipeline/kg_builder_two_documents_entity_resolution.py
Lines changed: 156 additions & 0 deletions
@@ -2,6 +2,8 @@
 
 ## Next
 
+- Added the `SinglePropertyExactMatchResolver` component, which merges entities that share the exact same value for a given property (e.g. name).
+
 ## 0.7.0
 
 ### Added
 
@@ -71,6 +71,14 @@ LLMEntityRelationExtractor
     :members: run
 
 
+SinglePropertyExactMatchResolver
+================================
+
+.. autoclass:: neo4j_graphrag.experimental.components.resolver.SinglePropertyExactMatchResolver
+    :members: run
+
+
+
 .. _pipeline-section:
 
 ********
 
@@ -11,8 +11,6 @@ unstructured data.
 
     This feature is still experimental. API changes and bug fixes are expected.
 
-    It is not recommended to use it in production yet.
-
 
 ******************
 Pipeline Structure
@@ -26,6 +24,7 @@ A Knowledge Graph (KG) construction pipeline requires a few components:
 - **Schema builder**: provide a schema to ground the LLM extracted entities and relations and obtain an easily navigable KG.
 - **Entity and relation extractor**: extract relevant entities and relations from the text.
 - **Knowledge Graph writer**: save the identified entities and relations.
+- **Entity resolver**: merge similar entities into a single node.
 
 .. image:: images/kg_builder_pipeline.png
   :alt: KG Builder pipeline
@@ -426,3 +425,44 @@ It is possible to create a custom writer using the `KGWriter` interface:
 
 
 See :ref:`kgwritermodel` and :ref:`kgwriter` in API reference.
+
+
+Entity Resolver
+===============
+
+The KG Writer component creates new nodes for each identified entity
+without making assumptions about entity similarity. The Entity Resolver
+is responsible for refining the created knowledge graph by merging entity
+nodes that represent the same real-world object.
+
+In practice, this package implements a single resolver that merges nodes
+with the same label and identical "name" property.
+
+.. warning::
+
+    The `SinglePropertyExactMatchResolver` **replaces** the nodes created by the KG writer.
+
+
+It can be used like this:
+
+.. code:: python
+
+    from neo4j_graphrag.experimental.components.resolver import (
+        SinglePropertyExactMatchResolver,
+    )
+    resolver = SinglePropertyExactMatchResolver(driver)
+    res = await resolver.run()
+
+.. warning::
+
+    By default, all nodes with the __Entity__ label will be resolved.
+    To exclude specific nodes, a filter_query can be added to the query.
+    For example, if a `:Resolved` label has been applied to already resolved entities
+    in the graph, these entities can be excluded with the following approach:
+
+    .. code:: python
+
+        from neo4j_graphrag.experimental.components.resolver import (
+            SinglePropertyExactMatchResolver,
+        )
+        resolver = SinglePropertyExactMatchResolver(driver, filter_query="WHERE not entity:Resolved")
+        res = await resolver.run()
@@ -33,12 +33,14 @@
     FixedSizeSplitter,
 )
 from neo4j_graphrag.experimental.pipeline.pipeline import PipelineResult
-from neo4j_graphrag.llm import OpenAILLM
+from neo4j_graphrag.llm import LLMInterface, OpenAILLM
 
 logging.basicConfig(level=logging.INFO)
 
 
-async def main(neo4j_driver: neo4j.Driver) -> PipelineResult:
+async def define_and_run_pipeline(
+    neo4j_driver: neo4j.AsyncDriver, llm: LLMInterface
+) -> PipelineResult:
     from neo4j_graphrag.experimental.pipeline import Pipeline
 
     # Instantiate Entity and Relation objects
@@ -86,13 +88,7 @@ async def main(neo4j_driver: neo4j.Driver) -> PipelineResult:
     pipe.add_component(SchemaBuilder(), "schema")
     pipe.add_component(
         LLMEntityRelationExtractor(
-            llm=OpenAILLM(
-                model_name="gpt-4o",
-                model_params={
-                    "max_tokens": 2000,
-                    "response_format": {"type": "json_object"},
-                },
-            ),
+            llm=llm,
             on_error=OnError.RAISE,
         ),
         "extractor",
@@ -127,8 +123,23 @@ async def main(neo4j_driver: neo4j.Driver) -> PipelineResult:
     return await pipe.run(pipe_inputs)
 
 
-if __name__ == "__main__":
-    with neo4j.GraphDatabase.driver(
+async def main() -> PipelineResult:
+    llm = OpenAILLM(
+        model_name="gpt-4o",
+        model_params={
+            "max_tokens": 2000,
+            "response_format": {"type": "json_object"},
+        },
+    )
+    driver = neo4j.AsyncGraphDatabase.driver(
         "bolt://localhost:7687", auth=("neo4j", "password")
-    ) as driver:
-        print(asyncio.run(main(driver)))
+    )
+    res = await define_and_run_pipeline(driver, llm)
+    await driver.close()
+    await llm.async_client.close()
+    return res
+
+
+if __name__ == "__main__":
+    res = asyncio.run(main())
+    print(res)
@@ -15,7 +15,6 @@
 from __future__ import annotations
 
 import asyncio
-import logging.config
 
 import neo4j
 from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings
@@ -36,30 +35,12 @@
 )
 from neo4j_graphrag.experimental.pipeline import Pipeline
 from neo4j_graphrag.experimental.pipeline.pipeline import PipelineResult
-from neo4j_graphrag.llm import OpenAILLM
-
-# set log level to DEBUG for all neo4j_graphrag.* loggers
-logging.config.dictConfig(
-    {
-        "version": 1,
-        "handlers": {
-            "console": {
-                "class": "logging.StreamHandler",
-            }
-        },
-        "loggers": {
-            "root": {
-                "handlers": ["console"],
-            },
-            "neo4j_graphrag": {
-                "level": "DEBUG",
-            },
-        },
-    }
-)
+from neo4j_graphrag.llm import LLMInterface, OpenAILLM
 
 
-async def main(neo4j_driver: neo4j.Driver) -> PipelineResult:
+async def define_and_run_pipeline(
+    neo4j_driver: neo4j.AsyncDriver, llm: LLMInterface
+) -> PipelineResult:
     """This is where we define and run the KG builder pipeline, instantiating a few
     components:
     - Text Splitter: in this example we use the fixed size text splitter
@@ -83,13 +64,7 @@ async def main(neo4j_driver: neo4j.Driver) -> PipelineResult:
     pipe.add_component(SchemaBuilder(), "schema")
     pipe.add_component(
         LLMEntityRelationExtractor(
-            llm=OpenAILLM(
-                model_name="gpt-4o",
-                model_params={
-                    "max_tokens": 1000,
-                    "response_format": {"type": "json_object"},
-                },
-            ),
+            llm=llm,
             on_error=OnError.RAISE,
         ),
         "extractor",
@@ -164,8 +139,23 @@ async def main(neo4j_driver: neo4j.Driver) -> PipelineResult:
     return await pipe.run(pipe_inputs)
 
 
-if __name__ == "__main__":
-    with neo4j.GraphDatabase.driver(
+async def main() -> PipelineResult:
+    llm = OpenAILLM(
+        model_name="gpt-4o",
+        model_params={
+            "max_tokens": 1000,
+            "response_format": {"type": "json_object"},
+        },
+    )
+    driver = neo4j.AsyncGraphDatabase.driver(
         "bolt://localhost:7687", auth=("neo4j", "password")
-    ) as driver:
-        print(asyncio.run(main(driver)))
+    )
+    res = await define_and_run_pipeline(driver, llm)
+    await driver.close()
+    await llm.async_client.close()
+    return res
+
+
+if __name__ == "__main__":
+    res = asyncio.run(main())
+    print(res)
@@ -0,0 +1,156 @@
+#  Copyright (c) "Neo4j"
+#  Neo4j Sweden AB [https://neo4j.com]
+#  #
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#  #
+#      https://www.apache.org/licenses/LICENSE-2.0
+#  #
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+from __future__ import annotations
+
+import asyncio
+
+import neo4j
+from neo4j_graphrag.experimental.components.entity_relation_extractor import (
+    LLMEntityRelationExtractor,
+    OnError,
+)
+from neo4j_graphrag.experimental.components.kg_writer import Neo4jWriter
+from neo4j_graphrag.experimental.components.pdf_loader import PdfLoader
+from neo4j_graphrag.experimental.components.resolver import (
+    SinglePropertyExactMatchResolver,
+)
+from neo4j_graphrag.experimental.components.schema import (
+    SchemaBuilder,
+    SchemaEntity,
+    SchemaProperty,
+    SchemaRelation,
+)
+from neo4j_graphrag.experimental.components.text_splitters.fixed_size_splitter import (
+    FixedSizeSplitter,
+)
+from neo4j_graphrag.experimental.pipeline import Pipeline
+from neo4j_graphrag.llm import LLMInterface, OpenAILLM
+
+
+async def define_and_run_pipeline(
+    neo4j_driver: neo4j.AsyncDriver, llm: LLMInterface
+) -> None:
+    """Define the KG builder pipeline and run it once for each input document.
+
+    The pipeline instantiates the following components:
+    - PDF Loader: loads the document found at the ``filepath`` given as input
+    - Text Splitter: in this example we use the fixed size text splitter
+    - Schema Builder: this component takes a list of entities, relationships and
+        possible triplets as inputs, validates them and returns a schema ready to
+        use for the rest of the pipeline
+    - LLM Entity Relation Extractor is an LLM-based entity and relation extractor:
+        based on the provided schema, the LLM will do its best to identify these
+        entities and their relations within the provided text
+    - KG writer: once entities and relations are extracted, they can be written
+        to a Neo4j database
+    - Entity resolver: runs after the writer, using the same Neo4j driver
+
+    Args:
+        neo4j_driver: async Neo4j driver handed to both the writer and the resolver.
+        llm: LLM used by the entity and relation extractor.
+    """
+    pipe = Pipeline()
+    # define the components
+    pipe.add_component(PdfLoader(), "loader")
+    pipe.add_component(
+        FixedSizeSplitter(),
+        "splitter",
+    )
+    pipe.add_component(SchemaBuilder(), "schema")
+    pipe.add_component(
+        LLMEntityRelationExtractor(
+            llm=llm,
+            on_error=OnError.IGNORE,
+        ),
+        "extractor",
+    )
+    pipe.add_component(Neo4jWriter(neo4j_driver), "writer")
+    pipe.add_component(SinglePropertyExactMatchResolver(neo4j_driver), "resolver")
+    # define the execution order of the components
+    # and how the output of previous components must be used
+    pipe.connect("loader", "splitter", {"text": "loader.text"})
+    pipe.connect("splitter", "extractor", input_config={"chunks": "splitter"})
+    pipe.connect(
+        "schema",
+        "extractor",
+        input_config={"schema": "schema", "document_info": "loader.document_info"},
+    )
+    pipe.connect(
+        "extractor",
+        "writer",
+        input_config={"graph": "extractor"},
+    )
+    # empty input config: no output of the writer is forwarded to the resolver,
+    # the connection only places the resolver after the writer
+    pipe.connect("writer", "resolver", {})
+    # user input:
+    # the file to load (its path is filled in below, once per document)
+    # and the list of entities and relations we are looking for
+    pipe_inputs = {
+        "loader": {},
+        "schema": {
+            "entities": [
+                SchemaEntity(
+                    label="Person",
+                    properties=[
+                        SchemaProperty(name="name", type="STRING"),
+                        SchemaProperty(name="place_of_birth", type="STRING"),
+                        SchemaProperty(name="date_of_birth", type="DATE"),
+                    ],
+                ),
+                SchemaEntity(
+                    label="Organization",
+                    properties=[
+                        SchemaProperty(name="name", type="STRING"),
+                        SchemaProperty(name="country", type="STRING"),
+                    ],
+                ),
+            ],
+            "relations": [
+                SchemaRelation(
+                    label="WORKED_FOR",
+                ),
+                SchemaRelation(
+                    label="FRIEND",
+                ),
+                SchemaRelation(
+                    label="ENEMY",
+                ),
+            ],
+            "potential_schema": [
+                ("Person", "WORKED_FOR", "Organization"),
+                ("Person", "FRIEND", "Person"),
+                ("Person", "ENEMY", "Person"),
+            ],
+        },
+    }
+    # run the pipeline once per document: the same inputs are reused,
+    # only the loader's filepath changes between runs
+    for document in [
+        "examples/pipeline/Harry Potter and the Chamber of Secrets Summary.pdf",
+        "examples/pipeline/Harry Potter and the Death Hallows Summary.pdf",
+    ]:
+        pipe_inputs["loader"]["filepath"] = document
+        await pipe.run(pipe_inputs)
+
+
+async def main() -> None:
+    """Create the LLM and the Neo4j driver, run the pipeline, then release both.
+
+    The driver and the LLM client are closed even when the pipeline raises,
+    so a failed run does not leave connections open.
+    """
+    llm = OpenAILLM(
+        model_name="gpt-4o",
+        model_params={
+            "max_tokens": 1000,
+            "response_format": {"type": "json_object"},
+        },
+    )
+    try:
+        # the async driver is an async context manager: leaving the block
+        # closes it whether or not the pipeline succeeded
+        async with neo4j.AsyncGraphDatabase.driver(
+            "bolt://localhost:7687", auth=("neo4j", "password")
+        ) as driver:
+            await define_and_run_pipeline(driver, llm)
+    finally:
+        # closed after the driver, matching the original shutdown order
+        await llm.async_client.close()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())