Skip to content

Commit ddc78ce

Browse files
authored
Rename ID property and more cleaning (neo4j#366)
* Allow an ID property by renaming our internal ID to _kg_builder_id * Prune nodes with invalid label/id and relationships with invalid type before inserting (APOC raises errors on them) * Do not pollute the graph with internal/temp properties * Fix e2e * Improve e2e test * Improve e2e test * Rename property, clean db * Doc update * Review comments + e2e test
1 parent 76463f3 commit ddc78ce

File tree

14 files changed

+273
-167
lines changed

14 files changed

+273
-167
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,12 @@
2626
- The `SchemaProperty` model has been renamed `PropertyType`.
2727
- `SchemaConfig` has been removed in favor of `GraphSchema` (used in the `SchemaBuilder` and `EntityRelationExtractor` classes). `entities`, `relations` and `potential_schema` fields have also been renamed `node_types`, `relationship_types` and `patterns` respectively.
2828

29+
#### Other
30+
31+
- The reserved `id` property on `__KGBuilder__` nodes is removed.
32+
- The `chunk_index` property on `__Entity__` nodes is removed. Use the `FROM_CHUNK` relationship instead.
33+
- The `__entity__id` index is not used anymore and can be dropped from the database (it has been replaced by `__entity__tmp_internal_id`).
34+
2935

3036
## 1.7.0
3137

docs/source/user_guide_kg_builder.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1048,8 +1048,10 @@ _________________
10481048
In addition to the user-defined configuration options described above,
10491049
the `GraphPruning` component performs the following cleanup operations:
10501050

1051+
- Nodes with empty label or ID are pruned.
10511052
- Nodes with missing required properties are pruned.
10521053
- Nodes with no remaining properties are pruned.
1054+
- Relationships with empty type are pruned.
10531055
- Relationships with invalid source or target nodes (i.e., nodes no longer present in the graph) are pruned.
10541056
- Relationships with incorrect direction have their direction corrected.
10551057

@@ -1078,6 +1080,11 @@ to a Neo4j database:
10781080
Adjust the batch_size parameter of `Neo4jWriter` to optimize insert performance.
10791081
This parameter controls the number of nodes or relationships inserted per batch, with a default value of 1000.
10801082

1083+
.. note:: Index
1084+
1085+
In order to improve the ingestion performances, an index called `__entity__tmp_internal_id` is automatically added to the database.
1086+
1087+
10811088
See :ref:`neo4jgraph`.
10821089

10831090

examples/customize/build_graph/components/pruners/graph_pruner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
"""This example demonstrates how to use the GraphPruner component."""
1+
"""This example demonstrates how to use the GraphPruning component."""
22

33
import asyncio
44

src/neo4j_graphrag/experimental/components/entity_relation_extractor.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -146,12 +146,11 @@ def update_ids(
146146
"""Make node IDs unique across chunks, document and pipeline runs
147147
by prefixing them with a unique prefix.
148148
"""
149-
prefix = f"{chunk.chunk_id}"
149+
prefix = chunk.chunk_id
150150
for node in graph.nodes:
151151
node.id = f"{prefix}:{node.id}"
152152
if node.properties is None:
153153
node.properties = {}
154-
node.properties.update({"chunk_index": chunk.index})
155154
for rel in graph.relationships:
156155
rel.start_node_id = f"{prefix}:{rel.start_node_id}"
157156
rel.end_node_id = f"{prefix}:{rel.end_node_id}"

src/neo4j_graphrag/experimental/components/graph_pruning.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ class PruningReason(str, enum.Enum):
4141
NO_PROPERTY_LEFT = "NO_PROPERTY_LEFT"
4242
INVALID_START_OR_END_NODE = "INVALID_START_OR_END_NODE"
4343
INVALID_PATTERN = "INVALID_PATTERN"
44+
MISSING_LABEL = "MISSING_LABEL"
4445

4546

4647
ItemType = TypeVar("ItemType")
@@ -198,6 +199,17 @@ def _validate_node(
198199
schema_entity: Optional[NodeType],
199200
additional_node_types: bool,
200201
) -> Optional[Neo4jNode]:
202+
if not node.label:
203+
pruning_stats.add_pruned_node(node, reason=PruningReason.MISSING_LABEL)
204+
return None
205+
if not node.id:
206+
pruning_stats.add_pruned_node(
207+
node,
208+
reason=PruningReason.MISSING_REQUIRED_PROPERTY,
209+
missing_required_properties=["id"],
210+
details="The node was extracted without a valid ID.",
211+
)
212+
return None
201213
if not schema_entity:
202214
# node type not declared in the schema
203215
if additional_node_types:
@@ -262,6 +274,11 @@ def _validate_relationship(
262274
patterns: tuple[tuple[str, str, str], ...],
263275
additional_patterns: bool,
264276
) -> Optional[Neo4jRelationship]:
277+
if not rel.type:
278+
pruning_stats.add_pruned_relationship(
279+
rel, reason=PruningReason.MISSING_LABEL
280+
)
281+
return None
265282
# validate start/end node IDs are valid nodes
266283
if rel.start_node_id not in valid_nodes or rel.end_node_id not in valid_nodes:
267284
logger.debug(

src/neo4j_graphrag/experimental/components/kg_writer.py

Lines changed: 45 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,9 @@
2929
)
3030
from neo4j_graphrag.experimental.pipeline.component import Component, DataModel
3131
from neo4j_graphrag.neo4j_queries import (
32-
UPSERT_NODE_QUERY,
33-
UPSERT_NODE_QUERY_VARIABLE_SCOPE_CLAUSE,
34-
UPSERT_RELATIONSHIP_QUERY,
35-
UPSERT_RELATIONSHIP_QUERY_VARIABLE_SCOPE_CLAUSE,
32+
upsert_node_query,
33+
upsert_relationship_query,
34+
db_cleaning_query,
3635
)
3736
from neo4j_graphrag.utils.version_utils import (
3837
get_version,
@@ -117,20 +116,19 @@ def __init__(
117116
driver: neo4j.Driver,
118117
neo4j_database: Optional[str] = None,
119118
batch_size: int = 1000,
119+
clean_db: bool = True,
120120
):
121121
self.driver = driver_config.override_user_agent(driver)
122122
self.neo4j_database = neo4j_database
123123
self.batch_size = batch_size
124+
self._clean_db = clean_db
124125
version_tuple, _, _ = get_version(self.driver, self.neo4j_database)
125126
self.is_version_5_23_or_above = is_version_5_23_or_above(version_tuple)
126127

127128
def _db_setup(self) -> None:
128-
# create index on __KGBuilder__.id
129-
# used when creating the relationships
130-
self.driver.execute_query(
131-
"CREATE INDEX __entity__id IF NOT EXISTS FOR (n:__KGBuilder__) ON (n.id)",
132-
database_=self.neo4j_database,
133-
)
129+
self.driver.execute_query("""
130+
CREATE INDEX __entity__tmp_internal_id IF NOT EXISTS FOR (n:__KGBuilder__) ON (n.__tmp_internal_id)
131+
""")
134132

135133
@staticmethod
136134
def _nodes_to_rows(
@@ -149,44 +147,51 @@ def _nodes_to_rows(
149147
def _upsert_nodes(
150148
self, nodes: list[Neo4jNode], lexical_graph_config: LexicalGraphConfig
151149
) -> None:
152-
"""Upserts a single node into the Neo4j database."
150+
"""Upserts a batch of nodes into the Neo4j database.
153151
154152
Args:
155153
nodes (list[Neo4jNode]): The nodes batch to upsert into the database.
156154
"""
157155
parameters = {"rows": self._nodes_to_rows(nodes, lexical_graph_config)}
158-
if self.is_version_5_23_or_above:
159-
self.driver.execute_query(
160-
UPSERT_NODE_QUERY_VARIABLE_SCOPE_CLAUSE,
161-
parameters_=parameters,
162-
database_=self.neo4j_database,
163-
)
164-
else:
165-
self.driver.execute_query(
166-
UPSERT_NODE_QUERY,
167-
parameters_=parameters,
168-
database_=self.neo4j_database,
169-
)
156+
query = upsert_node_query(
157+
support_variable_scope_clause=self.is_version_5_23_or_above
158+
)
159+
self.driver.execute_query(
160+
query,
161+
parameters_=parameters,
162+
database_=self.neo4j_database,
163+
)
164+
return None
165+
166+
@staticmethod
167+
def _relationships_to_rows(
168+
relationships: list[Neo4jRelationship],
169+
) -> list[dict[str, Any]]:
170+
return [relationship.model_dump() for relationship in relationships]
170171

171172
def _upsert_relationships(self, rels: list[Neo4jRelationship]) -> None:
172-
"""Upserts a single relationship into the Neo4j database.
173+
"""Upserts a batch of relationships into the Neo4j database.
173174
174175
Args:
175176
rels (list[Neo4jRelationship]): The relationships batch to upsert into the database.
176177
"""
177-
parameters = {"rows": [rel.model_dump() for rel in rels]}
178-
if self.is_version_5_23_or_above:
179-
self.driver.execute_query(
180-
UPSERT_RELATIONSHIP_QUERY_VARIABLE_SCOPE_CLAUSE,
181-
parameters_=parameters,
182-
database_=self.neo4j_database,
183-
)
184-
else:
185-
self.driver.execute_query(
186-
UPSERT_RELATIONSHIP_QUERY,
187-
parameters_=parameters,
188-
database_=self.neo4j_database,
189-
)
178+
parameters = {"rows": self._relationships_to_rows(rels)}
179+
query = upsert_relationship_query(
180+
support_variable_scope_clause=self.is_version_5_23_or_above
181+
)
182+
self.driver.execute_query(
183+
query,
184+
parameters_=parameters,
185+
database_=self.neo4j_database,
186+
)
187+
188+
def _db_cleaning(self) -> None:
189+
query = db_cleaning_query(
190+
support_variable_scope_clause=self.is_version_5_23_or_above,
191+
batch_size=self.batch_size,
192+
)
193+
with self.driver.session() as session:
194+
session.run(query)
190195

191196
@validate_call
192197
async def run(
@@ -209,6 +214,9 @@ async def run(
209214
for batch in batched(graph.relationships, self.batch_size):
210215
self._upsert_relationships(batch)
211216

217+
if self._clean_db:
218+
self._db_cleaning()
219+
212220
return KGWriterModel(
213221
status="SUCCESS",
214222
metadata={

src/neo4j_graphrag/experimental/components/types.py

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,18 @@
1414
# limitations under the License.
1515
from __future__ import annotations
1616

17+
import logging
1718
import uuid
1819
from typing import Any, Dict, Optional
1920

20-
from pydantic import BaseModel, Field, field_validator
21+
from pydantic import BaseModel, Field
2122

2223
from neo4j_graphrag.experimental.pipeline.component import DataModel
2324

2425

26+
logger = logging.getLogger(__name__)
27+
28+
2529
class DocumentInfo(DataModel):
2630
"""A document loaded by a DataLoader.
2731
@@ -79,7 +83,7 @@ class Neo4jNode(BaseModel):
7983
"""Represents a Neo4j node.
8084
8185
Attributes:
82-
id (str): The element ID of the node.
86+
id (str): The ID of the node. This ID is used to refer to the node for relationship creation.
8387
label (str): The label of the node.
8488
properties (dict[str, Any]): A dictionary of properties attached to the node.
8589
embedding_properties (Optional[dict[str, list[float]]]): A list of embedding properties attached to the node.
@@ -90,15 +94,6 @@ class Neo4jNode(BaseModel):
9094
properties: dict[str, Any] = {}
9195
embedding_properties: Optional[dict[str, list[float]]] = None
9296

93-
@field_validator("properties", "embedding_properties")
94-
@classmethod
95-
def check_for_id_properties(
96-
cls, v: Optional[dict[str, Any]]
97-
) -> Optional[dict[str, Any]]:
98-
if v and "id" in v.keys():
99-
raise TypeError("'id' as a property name is not allowed")
100-
return v
101-
10297
@property
10398
def token(self) -> str:
10499
return self.label

0 commit comments

Comments
 (0)