
Commit 378c98d

Save Document node in lexical graph (#116)
* WIP
* Add document info model - move chunk index to TextChunk model
* Update tests to test DocumentInfo
* Update examples
* Update e2e tests, documentation, CHANGELOG
* Remove print
* Add docstrings, remove another print
* Fix tests after merge
1 parent 53fbc1a commit 378c98d

File tree

14 files changed, +292 -129 lines changed

CHANGELOG.md (+3)
```diff
@@ -2,6 +2,9 @@
 
 ## Next
 
+### Changed
+- When saving the lexical graph in a KG creation pipeline, the document is also saved as a specific node, together with relationships between each chunk and the document it was created from.
+
 ## 0.5.0
 
 ### Fixed
```
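To see the effect of this change after a pipeline run, something like the following could be used. This is a minimal sketch, assuming a local Neo4j instance, placeholder credentials, and the official `neo4j` Python driver; the query only uses the `Document`, `Chunk`, and `FROM_DOCUMENT` names introduced by this commit.

```python
# Sketch: list each Document node and how many chunks point to it through
# the new FROM_DOCUMENT relationship. URI and credentials are placeholders.
import neo4j

driver = neo4j.GraphDatabase.driver(
    "neo4j://localhost:7687", auth=("neo4j", "password")
)
records, _, _ = driver.execute_query(
    "MATCH (chunk:Chunk)-[:FROM_DOCUMENT]->(doc:Document) "
    "RETURN doc.path AS path, count(chunk) AS chunk_count"
)
for record in records:
    print(record["path"], record["chunk_count"])
driver.close()
```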

docs/source/user_guide_kg_builder.rst (+2)
```diff
@@ -289,9 +289,11 @@ Lexical Graph
 
 By default, the `LLMEntityRelationExtractor` adds some extra nodes and relationships to the extracted graph:
 
+- `Document` node: represents the processed document and has a `path` property.
 - `Chunk` nodes: represent the text chunks. They have a `text` property and, if computed, an `embedding` property.
 - `NEXT_CHUNK` relationships between one chunk node and the next one in the document. It can be used to enhance the context in a RAG application.
 - `FROM_CHUNK` relationship between any extracted entity and the chunk it was identified in.
+- `FROM_DOCUMENT` relationship between each chunk and the document it was built from.
 
 If this 'lexical graph' is not desired, set `create_lexical_graph` to `False` in the extractor constructor:
 
```
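The RST above leads into a constructor example; a minimal sketch of such a call might look like this, assuming the keyword argument is `create_lexical_graph` and that the import paths are as shown (both are assumptions, not confirmed by the diff):

```python
# Sketch: build the extractor without the extra lexical-graph nodes.
# Import paths assumed; OpenAILLM usage mirrors the PDF example below.
from neo4j_genai.experimental.components.entity_relation_extractor import (
    LLMEntityRelationExtractor,
)
from neo4j_genai.llm import OpenAILLM

extractor = LLMEntityRelationExtractor(
    llm=OpenAILLM(
        model_name="gpt-4o",
        model_params={"response_format": {"type": "json_object"}},
    ),
    create_lexical_graph=False,
)
```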

examples/pipeline/kg_builder_from_pdf.py (+13 -9)
```diff
@@ -16,7 +16,7 @@
 
 import asyncio
 import logging
-from typing import Any
+from typing import Any, Dict, List
 
 import neo4j
 from langchain_text_splitters import CharacterTextSplitter
@@ -62,13 +62,13 @@ class Neo4jGraph(DataModel):
 
 
 class ERExtractor(Component):
-    async def _process_chunk(self, chunk: str, schema: str) -> dict[str, Any]:
+    async def _process_chunk(self, chunk: str, schema: str) -> Dict[str, Any]:
         return {
             "entities": [{"label": "Person", "properties": {"name": "John Doe"}}],
             "relations": [],
         }
 
-    async def run(self, chunks: list[str], schema: str) -> Neo4jGraph:
+    async def run(self, chunks: List[str], schema: str) -> Neo4jGraph:
         tasks = [self._process_chunk(chunk, schema) for chunk in chunks]
         result = await asyncio.gather(*tasks)
         merged_result: dict[str, Any] = {"entities": [], "relations": []}
@@ -141,10 +141,7 @@ async def main(neo4j_driver: neo4j.Driver) -> dict[str, Any]:
     pipe = Pipeline()
     pipe.add_component(PdfLoader(), "pdf_loader")
     pipe.add_component(
-        LangChainTextSplitterAdapter(
-            # chunk_size=50 for the sake of this demo
-            CharacterTextSplitter(chunk_size=50, chunk_overlap=10, separator=".")
-        ),
+        LangChainTextSplitterAdapter(CharacterTextSplitter(separator=". \n")),
         "splitter",
     )
     pipe.add_component(SchemaBuilder(), "schema")
@@ -153,7 +150,7 @@ async def main(neo4j_driver: neo4j.Driver) -> dict[str, Any]:
         llm=OpenAILLM(
             model_name="gpt-4o",
             model_params={
-                "max_tokens": 1000,
+                "max_tokens": 2000,
                 "response_format": {"type": "json_object"},
             },
         ),
@@ -164,7 +161,14 @@ async def main(neo4j_driver: neo4j.Driver) -> dict[str, Any]:
     pipe.add_component(Neo4jWriter(neo4j_driver), "writer")
     pipe.connect("pdf_loader", "splitter", input_config={"text": "pdf_loader.text"})
     pipe.connect("splitter", "extractor", input_config={"chunks": "splitter"})
-    pipe.connect("schema", "extractor", input_config={"schema": "schema"})
+    pipe.connect(
+        "schema",
+        "extractor",
+        input_config={
+            "schema": "schema",
+            "document_info": "pdf_loader.document_info",
+        },
+    )
     pipe.connect(
         "extractor",
         "writer",
```

examples/pipeline/kg_builder_from_text.py (+5)
```diff
@@ -154,6 +154,11 @@ async def main(neo4j_driver: neo4j.Driver) -> dict[str, Any]:
                 ("Person", "WORKED_FOR", "Organization"),
             ],
         },
+        "extractor": {
+            "document_info": {
+                "path": "my text",
+            }
+        },
     }
     # run the pipeline
     return await pipe.run(pipe_inputs)
```
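The `pipe_inputs` addition above feeds the extractor a `document_info` value directly, since this text-based example has no loader component to produce one. Judging from that dict and the documented `path` property on `Document` nodes, the new document info model is roughly shaped as follows; this is a sketch of the implied interface, not the library's actual definition.

```python
# Rough sketch of the shape implied by pipe_inputs above: a model with at
# least a `path` field. Any fields beyond `path` would be assumptions.
from pydantic import BaseModel


class DocumentInfo(BaseModel):
    path: str


info = DocumentInfo(path="my text")
print(info.path)
```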

src/neo4j_genai/experimental/components/embedder.py (+3 -1)
```diff
@@ -56,7 +56,9 @@ def _embed_chunk(self, text_chunk: TextChunk) -> TextChunk:
         embedding = self._embedder.embed_query(text_chunk.text)
         metadata = text_chunk.metadata if text_chunk.metadata else {}
         metadata["embedding"] = embedding
-        return TextChunk(text=text_chunk.text, metadata=metadata)
+        return TextChunk(
+            text=text_chunk.text, index=text_chunk.index, metadata=metadata
+        )
 
     @validate_call
     async def run(self, text_chunks: TextChunks) -> TextChunks:
```
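This fix matters because `TextChunk` now carries the chunk index (the commit message notes the index moved onto the `TextChunk` model), and rebuilding the chunk without it while attaching the embedding would silently drop the chunk's position in the document. A minimal sketch of the fields the diff relies on, with types and optionality assumed:

```python
# Sketch of the TextChunk fields the embedder diff depends on. The real
# model lives in the library; only text/index/metadata usage is shown here.
from typing import Any, Dict, Optional

from pydantic import BaseModel


class TextChunk(BaseModel):
    text: str
    index: int
    metadata: Optional[Dict[str, Any]] = None
```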
