Skip to content

Commit 83b2f56

Browse files
alexthomas93 authored and willtai committed
Adds a Text Splitter (#82)
* Added text splitter adapter class
* Added copyright header to new files
* Added __future__ import to text_splitters.py for backwards compatibility of type hints
* Moved text splitter file and tests
* Split text splitter adapter into 2 adapters
* Added optional metadata to text chunks
* Fixed typos
* Moved text splitters inside of the components folder
* Fixed Component import
1 parent a1d5259 commit 83b2f56

File tree

8 files changed

+836
-2
lines changed

8 files changed

+836
-2
lines changed

poetry.lock

Lines changed: 611 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ scipy = [
6262
{version = "^1", python = "<3.12"},
6363
{version = "^1.7.0", python = ">=3.12"}
6464
]
65+
llama-index = "^0.10.55"
6566
pytest-asyncio = "^0.23.8"
6667

6768
[tool.poetry.extras]

src/neo4j_genai/components/text_splitters/__init__.py

Whitespace-only changes.
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# Neo4j Sweden AB [https://neo4j.com]
2+
# #
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
# #
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
# #
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
from __future__ import annotations
15+
16+
from abc import abstractmethod
17+
from typing import Any, Optional
18+
19+
from neo4j_genai.pipeline.component import Component, DataModel
20+
from pydantic import BaseModel
21+
22+
23+
class TextChunk(BaseModel):
    """A chunk of text split from a document by a text splitter.

    Attributes:
        text (str): The raw chunk text. Required.
        metadata (Optional[dict[str, Any]]): Optional metadata associated with
            this chunk, such as the id of the next chunk in the original
            document. Defaults to None when no metadata is supplied.
    """

    # The chunk's text content; this field has no default.
    text: str
    # Free-form, string-keyed metadata; None means no metadata was attached.
    metadata: Optional[dict[str, Any]] = None
33+
34+
35+
class TextChunks(DataModel):
    """A collection of text chunks returned from a text splitter.

    This is the return type of ``TextSplitter.run``.

    Attributes:
        chunks (list[TextChunk]): A list of text chunks. Required.
    """

    # The chunks held by this collection; this field has no default.
    chunks: list[TextChunk]
43+
44+
45+
class TextSplitter(Component):
    """Interface that text splitter components implement.

    Subclasses provide ``run``, which turns a single string into a
    ``TextChunks`` collection.
    """

    @abstractmethod
    async def run(self, text: str) -> TextChunks:
        """Split ``text`` into chunks.

        Args:
            text (str): The text to split.

        Returns:
            TextChunks: The text split into chunks.
        """
        ...
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# Neo4j Sweden AB [https://neo4j.com]
2+
# #
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
# #
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
# #
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
from __future__ import annotations
15+
16+
from langchain_text_splitters import TextSplitter as LangChainTextSplitter
17+
from neo4j_genai.components.text_splitters.base import (
18+
TextChunk,
19+
TextChunks,
20+
TextSplitter,
21+
)
22+
23+
24+
class LangChainTextSplitterAdapter(TextSplitter):
    """Wraps a LangChain text splitter behind the ``TextSplitter`` interface.

    Allows LangChain splitters to be used in the knowledge graph builder
    pipeline.

    Args:
        text_splitter (LangChainTextSplitter): The LangChain TextSplitter
            instance that performs the actual splitting.
    """

    def __init__(self, text_splitter: LangChainTextSplitter) -> None:
        self.text_splitter = text_splitter

    async def run(self, text: str) -> TextChunks:
        """Split ``text`` using the wrapped LangChain splitter.

        Args:
            text (str): The text to split.

        Returns:
            TextChunks: One ``TextChunk`` per piece returned by the wrapped
            splitter's ``split_text``; chunk metadata is left unset.
        """
        pieces = self.text_splitter.split_text(text)
        wrapped = [TextChunk(text=piece) for piece in pieces]
        return TextChunks(chunks=wrapped)
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# Neo4j Sweden AB [https://neo4j.com]
2+
# #
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
# #
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
# #
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
from __future__ import annotations
15+
16+
from llama_index.core.node_parser import TextSplitter as LlamaIndexTextSplitter
17+
from neo4j_genai.components.text_splitters.base import (
18+
TextChunk,
19+
TextChunks,
20+
TextSplitter,
21+
)
22+
23+
24+
class LlamaIndexTextSplitterAdapter(TextSplitter):
    """Wraps a LlamaIndex text splitter behind the ``TextSplitter`` interface.

    Allows LlamaIndex splitters to be used in the knowledge graph builder
    pipeline.

    Args:
        text_splitter (LlamaIndexTextSplitter): The LlamaIndex TextSplitter
            instance that performs the actual splitting.
    """

    def __init__(self, text_splitter: LlamaIndexTextSplitter) -> None:
        self.text_splitter = text_splitter

    async def run(self, text: str) -> TextChunks:
        """Split ``text`` using the wrapped LlamaIndex splitter.

        Args:
            text (str): The text to split.

        Returns:
            TextChunks: One ``TextChunk`` per piece returned by the wrapped
            splitter's ``split_text``; chunk metadata is left unset.
        """
        pieces = self.text_splitter.split_text(text)
        wrapped = [TextChunk(text=piece) for piece in pieces]
        return TextChunks(chunks=wrapped)
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Neo4j Sweden AB [https://neo4j.com]
2+
# #
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
# #
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
# #
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
import pytest
15+
from langchain_text_splitters import RecursiveCharacterTextSplitter
16+
from neo4j_genai.components.text_splitters.base import TextChunk, TextChunks
17+
from neo4j_genai.components.text_splitters.langchain import LangChainTextSplitterAdapter
18+
19+
text = """
20+
Lorem ipsum dolor sit amet, consectetur adipiscing elit.
21+
In cursus erat quis ornare condimentum. Ut sollicitudin libero nec quam vestibulum, non tristique augue tempor.
22+
Nulla fringilla, augue ac fermentum ultricies, mauris tellus tempor orci, at tincidunt purus arcu vitae nisl.
23+
Nunc suscipit neque vitae ipsum viverra, eu interdum tortor iaculis.
24+
Suspendisse sit amet quam non ipsum molestie euismod finibus eu nisi. Quisque sit amet aliquet leo, vel auctor dolor.
25+
Sed auctor enim at tempus eleifend. Suspendisse potenti. Suspendisse congue tellus id justo bibendum, at commodo sapien porta.
26+
Nam sagittis nisl vitae nibh pellentesque, et convallis turpis ultrices.
27+
"""
28+
29+
30+
@pytest.mark.asyncio
async def test_langchain_adapter() -> None:
    """Check that the LangChain adapter returns well-formed, non-empty chunks.

    Runs the adapter around a default ``RecursiveCharacterTextSplitter`` on
    the module-level sample ``text`` and verifies the result type, that at
    least one chunk is produced, and that every chunk is a ``TextChunk``
    whose text is a substring of the input.
    """
    text_splitter = LangChainTextSplitterAdapter(RecursiveCharacterTextSplitter())
    text_chunks = await text_splitter.run(text)
    assert isinstance(text_chunks, TextChunks)
    # Guard against a vacuous pass: with an empty result the per-chunk
    # assertions in the loop below would never execute.
    assert text_chunks.chunks
    for text_chunk in text_chunks.chunks:
        assert isinstance(text_chunk, TextChunk)
        assert text_chunk.text in text
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# Neo4j Sweden AB [https://neo4j.com]
2+
# #
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
# #
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
# #
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
import pytest
15+
from llama_index.core.node_parser.text.sentence import SentenceSplitter
16+
from neo4j_genai.components.text_splitters.base import TextChunk, TextChunks
17+
from neo4j_genai.components.text_splitters.llamaindex import (
18+
LlamaIndexTextSplitterAdapter,
19+
)
20+
21+
text = """
22+
Lorem ipsum dolor sit amet, consectetur adipiscing elit.
23+
In cursus erat quis ornare condimentum. Ut sollicitudin libero nec quam vestibulum, non tristique augue tempor.
24+
Nulla fringilla, augue ac fermentum ultricies, mauris tellus tempor orci, at tincidunt purus arcu vitae nisl.
25+
Nunc suscipit neque vitae ipsum viverra, eu interdum tortor iaculis.
26+
Suspendisse sit amet quam non ipsum molestie euismod finibus eu nisi. Quisque sit amet aliquet leo, vel auctor dolor.
27+
Sed auctor enim at tempus eleifend. Suspendisse potenti. Suspendisse congue tellus id justo bibendum, at commodo sapien porta.
28+
Nam sagittis nisl vitae nibh pellentesque, et convallis turpis ultrices.
29+
"""
30+
31+
32+
@pytest.mark.asyncio
async def test_llamaindex_adapter() -> None:
    """Check that the LlamaIndex adapter returns well-formed, non-empty chunks.

    Runs the adapter around a default ``SentenceSplitter`` on the
    module-level sample ``text`` and verifies the result type, that at least
    one chunk is produced, and that every chunk is a ``TextChunk`` whose text
    is a substring of the input.
    """
    text_splitter = LlamaIndexTextSplitterAdapter(SentenceSplitter())
    text_chunks = await text_splitter.run(text)
    assert isinstance(text_chunks, TextChunks)
    # Guard against a vacuous pass: with an empty result the per-chunk
    # assertions in the loop below would never execute.
    assert text_chunks.chunks
    for text_chunk in text_chunks.chunks:
        assert isinstance(text_chunk, TextChunk)
        assert text_chunk.text in text

0 commit comments

Comments
 (0)