Skip to content

Commit a8bbcb8

Browse files
committed
Do not pollute the graph with internal/temp properties
1 parent ed7e34d commit a8bbcb8

File tree

6 files changed

+118
-55
lines changed

6 files changed

+118
-55
lines changed

CHANGELOG.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@
2828

2929
#### Other
3030

31-
- The node internal `id` property that's used to create relationships between nodes has been renamed to `__kg_builder_id`. This releases the `id` name for domain-specific meaningful `id` property.
32-
31+
- The `id` property on `__KGBuilder__` nodes is removed.
32+
- The `chunk_index` property on `__Entity__` nodes is removed. Use the `FROM_CHUNK` relationship instead.
3333

3434
## 1.7.0
3535

src/neo4j_graphrag/experimental/components/entity_relation_extractor.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -146,12 +146,11 @@ def update_ids(
146146
"""Make node IDs unique across chunks, document and pipeline runs
147147
by prefixing them with a unique prefix.
148148
"""
149-
prefix = f"{chunk.chunk_id}"
149+
prefix = chunk.chunk_id
150150
for node in graph.nodes:
151151
node.id = f"{prefix}:{node.id}"
152152
if node.properties is None:
153153
node.properties = {}
154-
node.properties.update({"chunk_index": chunk.index})
155154
for rel in graph.relationships:
156155
rel.start_node_id = f"{prefix}:{rel.start_node_id}"
157156
rel.end_node_id = f"{prefix}:{rel.end_node_id}"

src/neo4j_graphrag/experimental/components/kg_writer.py

Lines changed: 51 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -125,12 +125,8 @@ def __init__(
125125
self.is_version_5_23_or_above = is_version_5_23_or_above(version_tuple)
126126

127127
def _db_setup(self) -> None:
128-
# create index on __KGBuilder__.id
129-
# used when creating the relationships
130-
self.driver.execute_query(
131-
"CREATE INDEX __entity__id IF NOT EXISTS FOR (n:__KGBuilder__) ON (n.id)",
132-
database_=self.neo4j_database,
133-
)
128+
# not used for now
129+
pass
134130

135131
@staticmethod
136132
def _nodes_to_rows(
@@ -148,45 +144,62 @@ def _nodes_to_rows(
148144

149145
def _upsert_nodes(
150146
self, nodes: list[Neo4jNode], lexical_graph_config: LexicalGraphConfig
151-
) -> None:
147+
) -> dict[str, str]:
152148
"""Upserts a batch of nodes into the Neo4j database.
153149
154150
Args:
155151
nodes (list[Neo4jNode]): The nodes batch to upsert into the database.
156152
"""
157153
parameters = {"rows": self._nodes_to_rows(nodes, lexical_graph_config)}
158-
if self.is_version_5_23_or_above:
159-
self.driver.execute_query(
160-
UPSERT_NODE_QUERY_VARIABLE_SCOPE_CLAUSE,
161-
parameters_=parameters,
162-
database_=self.neo4j_database,
163-
)
164-
else:
165-
self.driver.execute_query(
166-
UPSERT_NODE_QUERY,
167-
parameters_=parameters,
168-
database_=self.neo4j_database,
169-
)
154+
query = (
155+
UPSERT_NODE_QUERY_VARIABLE_SCOPE_CLAUSE
156+
if self.is_version_5_23_or_above
157+
else UPSERT_NODE_QUERY
158+
)
159+
records, _, _ = self.driver.execute_query(
160+
query,
161+
parameters_=parameters,
162+
database_=self.neo4j_database,
163+
)
164+
print("RECORDS", records)
165+
return {r["_internal_id"]: r["element_id"] for r in records}
170166

171-
def _upsert_relationships(self, rels: list[Neo4jRelationship]) -> None:
167+
@staticmethod
168+
def _relationships_to_rows(
169+
relationships: list[Neo4jRelationship], node_id_mapping: dict[str, str]
170+
) -> list[dict[str, Any]]:
171+
return [
172+
{
173+
**relationship.model_dump(),
174+
"start_node_element_id": node_id_mapping.get(
175+
relationship.start_node_id, ""
176+
),
177+
"end_node_element_id": node_id_mapping.get(
178+
relationship.end_node_id, ""
179+
),
180+
}
181+
for relationship in relationships
182+
]
183+
184+
def _upsert_relationships(
185+
self, rels: list[Neo4jRelationship], node_id_mapping: dict[str, str]
186+
) -> None:
172187
"""Upserts a batch of relationships into the Neo4j database.
173188
174189
Args:
175190
rels (list[Neo4jRelationship]): The relationships batch to upsert into the database.
176191
"""
177-
parameters = {"rows": [rel.model_dump() for rel in rels]}
178-
if self.is_version_5_23_or_above:
179-
self.driver.execute_query(
180-
UPSERT_RELATIONSHIP_QUERY_VARIABLE_SCOPE_CLAUSE,
181-
parameters_=parameters,
182-
database_=self.neo4j_database,
183-
)
184-
else:
185-
self.driver.execute_query(
186-
UPSERT_RELATIONSHIP_QUERY,
187-
parameters_=parameters,
188-
database_=self.neo4j_database,
189-
)
192+
parameters = {"rows": self._relationships_to_rows(rels, node_id_mapping)}
193+
query = (
194+
UPSERT_RELATIONSHIP_QUERY_VARIABLE_SCOPE_CLAUSE
195+
if self.is_version_5_23_or_above
196+
else UPSERT_RELATIONSHIP_QUERY
197+
)
198+
self.driver.execute_query(
199+
query,
200+
parameters_=parameters,
201+
database_=self.neo4j_database,
202+
)
190203

191204
@validate_call
192205
async def run(
@@ -203,11 +216,14 @@ async def run(
203216
try:
204217
self._db_setup()
205218

219+
node_id_mapping = {}
220+
206221
for batch in batched(graph.nodes, self.batch_size):
207-
self._upsert_nodes(batch, lexical_graph_config)
222+
batch_mapping = self._upsert_nodes(batch, lexical_graph_config)
223+
node_id_mapping.update(batch_mapping)
208224

209225
for batch in batched(graph.relationships, self.batch_size):
210-
self._upsert_relationships(batch)
226+
self._upsert_relationships(batch, node_id_mapping)
211227

212228
return KGWriterModel(
213229
status="SUCCESS",

src/neo4j_graphrag/neo4j_queries.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@
5454

5555
UPSERT_NODE_QUERY = (
5656
"UNWIND $rows AS row "
57-
"CREATE (n:__KGBuilder__ {__kg_builder_id: row.id}) "
57+
"CREATE (n:__KGBuilder__) "
5858
"SET n += row.properties "
5959
"WITH n, row CALL apoc.create.addLabels(n, row.labels) YIELD node "
6060
"WITH node as n, row CALL { "
@@ -63,12 +63,12 @@
6363
"CALL db.create.setNodeVectorProperty(n, emb, row.embedding_properties[emb]) "
6464
"RETURN count(*) as nbEmb "
6565
"} "
66-
"RETURN elementId(n)"
66+
"RETURN row.id as _internal_id, elementId(n) as element_id"
6767
)
6868

6969
UPSERT_NODE_QUERY_VARIABLE_SCOPE_CLAUSE = (
7070
"UNWIND $rows AS row "
71-
"CREATE (n:__KGBuilder__ {__kg_builder_id: row.id}) "
71+
"CREATE (n:__KGBuilder__) "
7272
"SET n += row.properties "
7373
"WITH n, row CALL apoc.create.addLabels(n, row.labels) YIELD node "
7474
"WITH node as n, row CALL (n, row) { "
@@ -77,13 +77,13 @@
7777
"CALL db.create.setNodeVectorProperty(n, emb, row.embedding_properties[emb]) "
7878
"RETURN count(*) as nbEmb "
7979
"} "
80-
"RETURN elementId(n)"
80+
"RETURN row.id as _internal_id, elementId(n) as element_id"
8181
)
8282

8383
UPSERT_RELATIONSHIP_QUERY = (
8484
"UNWIND $rows as row "
85-
"MATCH (start:__KGBuilder__ {__kg_builder_id: row.start_node_id}) "
86-
"MATCH (end:__KGBuilder__ {__kg_builder_id: row.end_node_id}) "
85+
"MATCH (start:__KGBuilder__), (end:__KGBuilder__) "
86+
"WHERE elementId(start) = row.start_node_element_id AND elementId(end) = row.end_node_element_id "
8787
"WITH start, end, row "
8888
"CALL apoc.merge.relationship(start, row.type, {}, row.properties, end, row.properties) YIELD rel "
8989
"WITH rel, row CALL { "
@@ -96,8 +96,8 @@
9696

9797
UPSERT_RELATIONSHIP_QUERY_VARIABLE_SCOPE_CLAUSE = (
9898
"UNWIND $rows as row "
99-
"MATCH (start:__KGBuilder__ {__kg_builder_id: row.start_node_id}) "
100-
"MATCH (end:__KGBuilder__ {__kg_builder_id: row.end_node_id}) "
99+
"MATCH (start:__KGBuilder__), (end:__KGBuilder__) "
100+
"WHERE elementId(start) = row.start_node_element_id AND elementId(end) = row.end_node_element_id "
101101
"WITH start, end, row "
102102
"CALL apoc.merge.relationship(start, row.type, {}, row.properties, end, row.properties) YIELD rel "
103103
"WITH rel, row CALL (rel, row) { "

tests/unit/experimental/components/test_entity_relation_extractor.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,6 @@ async def test_extractor_happy_path_non_empty_result() -> None:
109109
entity = result.nodes[2]
110110
assert entity.id == f"{chunk_entity.id}:0"
111111
assert entity.label == "Person"
112-
assert entity.properties == {"chunk_index": 0}
113112
assert len(result.relationships) == 2
114113
assert result.relationships[0].type == "FROM_DOCUMENT"
115114
assert result.relationships[0].start_node_id == f"{chunk_entity.id}"
@@ -213,7 +212,6 @@ async def test_extractor_llm_badly_formatted_json_gets_fixed() -> None:
213212

214213
assert len(res.nodes) == 1
215214
assert res.nodes[0].label == "Person"
216-
assert res.nodes[0].properties == {"chunk_index": 0}
217215
assert res.nodes[0].embedding_properties is None
218216
assert res.relationships == []
219217

tests/unit/experimental/components/test_kg_writer.py

Lines changed: 56 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,17 @@ def test_batched() -> None:
5656
return_value=None,
5757
)
5858
def test_upsert_nodes(_: Mock, driver: MagicMock) -> None:
59+
driver.execute_query.return_value = (
60+
[{"_internal_id": "1", "element_id": "#1"}],
61+
None,
62+
None,
63+
)
5964
neo4j_writer = Neo4jWriter(driver=driver)
6065
node = Neo4jNode(id="1", label="Label", properties={"key": "value"})
61-
neo4j_writer._upsert_nodes(nodes=[node], lexical_graph_config=LexicalGraphConfig())
66+
result = neo4j_writer._upsert_nodes(
67+
nodes=[node], lexical_graph_config=LexicalGraphConfig()
68+
)
69+
assert result == {"1": "#1"}
6270
driver.execute_query.assert_called_once_with(
6371
UPSERT_NODE_QUERY,
6472
parameters_={
@@ -88,14 +96,18 @@ def test_upsert_nodes_with_embedding(
8896
_: Mock,
8997
driver: MagicMock,
9098
) -> None:
99+
driver.execute_query.return_value = (
100+
[{"_internal_id": "1", "element_id": "#1"}],
101+
None,
102+
None,
103+
)
91104
neo4j_writer = Neo4jWriter(driver=driver)
92105
node = Neo4jNode(
93106
id="1",
94107
label="Label",
95108
properties={"key": "value"},
96109
embedding_properties={"embeddingProp": [1.0, 2.0, 3.0]},
97110
)
98-
driver.execute_query.return_value.records = [{"elementId(n)": 1}]
99111
neo4j_writer._upsert_nodes(nodes=[node], lexical_graph_config=LexicalGraphConfig())
100112
driver.execute_query.assert_any_call(
101113
UPSERT_NODE_QUERY,
@@ -130,7 +142,9 @@ def test_upsert_relationship(_: Mock, driver: MagicMock) -> None:
130142
type="RELATIONSHIP",
131143
properties={"key": "value"},
132144
)
133-
neo4j_writer._upsert_relationships(rels=[rel])
145+
neo4j_writer._upsert_relationships(
146+
rels=[rel], node_id_mapping={"1": "#1", "2": "#2"}
147+
)
134148
parameters = {
135149
"rows": [
136150
{
@@ -139,6 +153,8 @@ def test_upsert_relationship(_: Mock, driver: MagicMock) -> None:
139153
"end_node_id": "2",
140154
"properties": {"key": "value"},
141155
"embedding_properties": None,
156+
"start_node_element_id": "#1",
157+
"end_node_element_id": "#2",
142158
}
143159
]
144160
}
@@ -167,7 +183,9 @@ def test_upsert_relationship_with_embedding(_: Mock, driver: MagicMock) -> None:
167183
embedding_properties={"embeddingProp": [1.0, 2.0, 3.0]},
168184
)
169185
driver.execute_query.return_value.records = [{"elementId(r)": "rel_elem_id"}]
170-
neo4j_writer._upsert_relationships(rels=[rel])
186+
neo4j_writer._upsert_relationships(
187+
rels=[rel], node_id_mapping={"1": "#1", "2": "#2"}
188+
)
171189
parameters = {
172190
"rows": [
173191
{
@@ -176,6 +194,8 @@ def test_upsert_relationship_with_embedding(_: Mock, driver: MagicMock) -> None:
176194
"end_node_id": "2",
177195
"properties": {"key": "value"},
178196
"embedding_properties": {"embeddingProp": [1.0, 2.0, 3.0]},
197+
"start_node_element_id": "#1",
198+
"end_node_element_id": "#2",
179199
}
180200
]
181201
}
@@ -196,6 +216,14 @@ def test_upsert_relationship_with_embedding(_: Mock, driver: MagicMock) -> None:
196216
return_value=None,
197217
)
198218
async def test_run(_: Mock, driver: MagicMock) -> None:
219+
driver.execute_query.return_value = (
220+
[
221+
{"_internal_id": "1", "element_id": "#1"},
222+
{"_internal_id": "2", "element_id": "#2"},
223+
],
224+
None,
225+
None,
226+
)
199227
neo4j_writer = Neo4jWriter(driver=driver)
200228
node = Neo4jNode(id="1", label="Label")
201229
rel = Neo4jRelationship(start_node_id="1", end_node_id="2", type="RELATIONSHIP")
@@ -224,6 +252,8 @@ async def test_run(_: Mock, driver: MagicMock) -> None:
224252
"end_node_id": "2",
225253
"properties": {},
226254
"embedding_properties": None,
255+
"start_node_element_id": "#1",
256+
"end_node_element_id": "#2",
227257
}
228258
]
229259
}
@@ -242,7 +272,14 @@ async def test_run(_: Mock, driver: MagicMock) -> None:
242272
async def test_run_is_version_below_5_23(_: Mock) -> None:
243273
driver = MagicMock()
244274
driver.execute_query = Mock(
245-
return_value=([{"versions": ["5.22.0"], "edition": "enterprise"}], None, None)
275+
side_effect=(
276+
# get_version
277+
([{"versions": ["5.22.0"], "edition": "enterpise"}], None, None),
278+
# upsert nodes
279+
([{"_internal_id": "1", "element_id": "#1"}], None, None),
280+
# upsert relationships
281+
(None, None, None),
282+
)
246283
)
247284

248285
neo4j_writer = Neo4jWriter(driver=driver)
@@ -252,6 +289,8 @@ async def test_run_is_version_below_5_23(_: Mock) -> None:
252289
graph = Neo4jGraph(nodes=[node], relationships=[rel])
253290
await neo4j_writer.run(graph=graph)
254291

292+
print(driver.execute_query.call_args_list)
293+
255294
driver.execute_query.assert_any_call(
256295
UPSERT_NODE_QUERY,
257296
parameters_={
@@ -275,6 +314,8 @@ async def test_run_is_version_below_5_23(_: Mock) -> None:
275314
"end_node_id": "2",
276315
"properties": {},
277316
"embedding_properties": None,
317+
"start_node_element_id": "#1",
318+
"end_node_element_id": "",
278319
}
279320
]
280321
}
@@ -293,7 +334,14 @@ async def test_run_is_version_below_5_23(_: Mock) -> None:
293334
async def test_run_is_version_5_23_or_above(_: Mock) -> None:
294335
driver = MagicMock()
295336
driver.execute_query = Mock(
296-
return_value=([{"versions": ["5.23.0"], "edition": "enterpise"}], None, None)
337+
side_effect=(
338+
# get_version
339+
([{"versions": ["5.23.0"], "edition": "enterpise"}], None, None),
340+
# upsert nodes
341+
([{"_internal_id": "1", "element_id": "#1"}], None, None),
342+
# upsert relationships
343+
(None, None, None),
344+
)
297345
)
298346

299347
neo4j_writer = Neo4jWriter(driver=driver)
@@ -327,6 +375,8 @@ async def test_run_is_version_5_23_or_above(_: Mock) -> None:
327375
"end_node_id": "2",
328376
"properties": {},
329377
"embedding_properties": None,
378+
"start_node_element_id": "#1",
379+
"end_node_element_id": "",
330380
}
331381
]
332382
}

0 commit comments

Comments
 (0)