Skip to content

Commit 745e718

Browse files
ricken07 and tzolov

authored and committed
Add Builder pattern and tests for TokenTextSplitter
- Convert TokenTextSplitter configuration parameters to final fields - Add static Builder class for fluent construction - Define default constants for configuration values - Add unit tests validating builder and default configurations - Test metadata handling and content splitting behavior
1 parent a1980ec commit 745e718

File tree

2 files changed

+170
-9
lines changed

2 files changed

+170
-9
lines changed

spring-ai-core/src/main/java/org/springframework/ai/transformer/splitter/TokenTextSplitter.java

Lines changed: 72 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -29,37 +29,49 @@
2929
/**
3030
* @author Raphael Yu
3131
* @author Christian Tzolov
32+
* @author Ricken Bazolo
3233
*/
3334
public class TokenTextSplitter extends TextSplitter {
3435

3536
private final EncodingRegistry registry = Encodings.newLazyEncodingRegistry();
3637

3738
private final Encoding encoding = registry.getEncoding(EncodingType.CL100K_BASE);
3839

40+
private final static int DEFAULT_CHUNK_SIZE = 800;
41+
42+
private final static int MIN_CHUNK_SIZE_CHARS = 350;
43+
44+
private final static int MIN_CHUNK_LENGTH_TO_EMBED = 5;
45+
46+
private final static int MAX_NUM_CHUNKS = 10000;
47+
48+
private final static boolean KEEP_SEPARATOR = true;
49+
3950
// The target size of each text chunk in tokens
40-
private int defaultChunkSize = 800;
51+
private final int chunkSize;
4152

4253
// The minimum size of each text chunk in characters
43-
private int minChunkSizeChars = 350;
54+
private final int minChunkSizeChars;
4455

4556
// Discard chunks shorter than this
46-
private int minChunkLengthToEmbed = 5;
57+
private final int minChunkLengthToEmbed;
4758

4859
// The maximum number of chunks to generate from a text
49-
private int maxNumChunks = 10000;
60+
private final int maxNumChunks;
5061

51-
private boolean keepSeparator = true;
62+
private final boolean keepSeparator;
5263

5364
public TokenTextSplitter() {
65+
this(DEFAULT_CHUNK_SIZE, MIN_CHUNK_SIZE_CHARS, MIN_CHUNK_LENGTH_TO_EMBED, MAX_NUM_CHUNKS, KEEP_SEPARATOR);
5466
}
5567

5668
public TokenTextSplitter(boolean keepSeparator) {
57-
this.keepSeparator = keepSeparator;
69+
this(DEFAULT_CHUNK_SIZE, MIN_CHUNK_SIZE_CHARS, MIN_CHUNK_LENGTH_TO_EMBED, MAX_NUM_CHUNKS, keepSeparator);
5870
}
5971

60-
public TokenTextSplitter(int defaultChunkSize, int minChunkSizeChars, int minChunkLengthToEmbed, int maxNumChunks,
72+
public TokenTextSplitter(int chunkSize, int minChunkSizeChars, int minChunkLengthToEmbed, int maxNumChunks,
6173
boolean keepSeparator) {
62-
this.defaultChunkSize = defaultChunkSize;
74+
this.chunkSize = chunkSize;
6375
this.minChunkSizeChars = minChunkSizeChars;
6476
this.minChunkLengthToEmbed = minChunkLengthToEmbed;
6577
this.maxNumChunks = maxNumChunks;
@@ -68,7 +80,7 @@ public TokenTextSplitter(int defaultChunkSize, int minChunkSizeChars, int minChu
6880

6981
@Override
7082
protected List<String> splitText(String text) {
71-
return doSplit(text, this.defaultChunkSize);
83+
return doSplit(text, this.chunkSize);
7284
}
7385

7486
protected List<String> doSplit(String text, int chunkSize) {
@@ -133,4 +145,55 @@ private String decodeTokens(List<Integer> tokens) {
133145
return this.encoding.decode(tokensIntArray);
134146
}
135147

148+
public static Builder builder() {
149+
return new Builder();
150+
}
151+
152+
public static class Builder {
153+
154+
private int chunkSize;
155+
156+
private int minChunkSizeChars;
157+
158+
private int minChunkLengthToEmbed;
159+
160+
private int maxNumChunks;
161+
162+
private boolean keepSeparator;
163+
164+
private Builder() {
165+
}
166+
167+
public Builder withChunkSize(int chunkSize) {
168+
this.chunkSize = chunkSize;
169+
return this;
170+
}
171+
172+
public Builder withMinChunkSizeChars(int minChunkSizeChars) {
173+
this.minChunkSizeChars = minChunkSizeChars;
174+
return this;
175+
}
176+
177+
public Builder withMinChunkLengthToEmbed(int minChunkLengthToEmbed) {
178+
this.minChunkLengthToEmbed = minChunkLengthToEmbed;
179+
return this;
180+
}
181+
182+
public Builder withMaxNumChunks(int maxNumChunks) {
183+
this.maxNumChunks = maxNumChunks;
184+
return this;
185+
}
186+
187+
public Builder withKeepSeparator(boolean keepSeparator) {
188+
this.keepSeparator = keepSeparator;
189+
return this;
190+
}
191+
192+
public TokenTextSplitter build() {
193+
return new TokenTextSplitter(this.chunkSize, this.minChunkSizeChars, this.minChunkLengthToEmbed,
194+
this.maxNumChunks, this.keepSeparator);
195+
}
196+
197+
}
198+
136199
}
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
package org.springframework.ai.transformer.splitter;
2+
3+
import org.junit.jupiter.api.Test;
4+
import org.springframework.ai.document.DefaultContentFormatter;
5+
import org.springframework.ai.document.Document;
6+
7+
import java.util.List;
8+
import java.util.Map;
9+
10+
import static org.assertj.core.api.Assertions.assertThat;
11+
12+
/**
13+
* @author Ricken Bazolo
14+
*/
15+
public class TokenTextSplitterTest {
16+
17+
@Test
18+
public void testTokenTextSplitterBuilderWithDefaultValues() {
19+
20+
var contentFormatter1 = DefaultContentFormatter.defaultConfig();
21+
var contentFormatter2 = DefaultContentFormatter.defaultConfig();
22+
23+
assertThat(contentFormatter1).isNotSameAs(contentFormatter2);
24+
25+
var doc1 = new Document("In the end, writing arises when man realizes that memory is not enough.",
26+
Map.of("key1", "value1", "key2", "value2"));
27+
doc1.setContentFormatter(contentFormatter1);
28+
29+
var doc2 = new Document("The most oppressive thing about the labyrinth is that you are constantly "
30+
+ "being forced to choose. It isn’t the lack of an exit, but the abundance of exits that is so disorienting.",
31+
Map.of("key2", "value22", "key3", "value3"));
32+
doc2.setContentFormatter(contentFormatter2);
33+
34+
var tokenTextSplitter = new TokenTextSplitter();
35+
36+
var chunks = tokenTextSplitter.apply(List.of(doc1, doc2));
37+
38+
assertThat(chunks.size()).isEqualTo(2);
39+
40+
// Doc 1
41+
assertThat(chunks.get(0).getContent())
42+
.isEqualTo("In the end, writing arises when man realizes that memory is not enough.");
43+
// Doc 2
44+
assertThat(chunks.get(1).getContent()).isEqualTo(
45+
"The most oppressive thing about the labyrinth is that you are constantly being forced to choose. It isn’t the lack of an exit, but the abundance of exits that is so disorienting.");
46+
47+
assertThat(chunks.get(0).getMetadata()).containsKeys("key1", "key2").doesNotContainKeys("key3");
48+
assertThat(chunks.get(1).getMetadata()).containsKeys("key2", "key3").doesNotContainKeys("key1");
49+
}
50+
51+
@Test
52+
public void testTokenTextSplitterBuilderWithAllFields() {
53+
54+
var contentFormatter1 = DefaultContentFormatter.defaultConfig();
55+
var contentFormatter2 = DefaultContentFormatter.defaultConfig();
56+
57+
assertThat(contentFormatter1).isNotSameAs(contentFormatter2);
58+
59+
var doc1 = new Document("In the end, writing arises when man realizes that memory is not enough.",
60+
Map.of("key1", "value1", "key2", "value2"));
61+
doc1.setContentFormatter(contentFormatter1);
62+
63+
var doc2 = new Document("The most oppressive thing about the labyrinth is that you are constantly "
64+
+ "being forced to choose. It isn’t the lack of an exit, but the abundance of exits that is so disorienting.",
65+
Map.of("key2", "value22", "key3", "value3"));
66+
doc2.setContentFormatter(contentFormatter2);
67+
68+
var tokenTextSplitter = TokenTextSplitter.builder()
69+
.withChunkSize(10)
70+
.withMinChunkSizeChars(5)
71+
.withMinChunkLengthToEmbed(3)
72+
.withMaxNumChunks(50)
73+
.withKeepSeparator(true)
74+
.build();
75+
76+
var chunks = tokenTextSplitter.apply(List.of(doc1, doc2));
77+
78+
assertThat(chunks.size()).isEqualTo(6);
79+
80+
// Doc 1
81+
assertThat(chunks.get(0).getContent()).isEqualTo("In the end, writing arises when man realizes that");
82+
assertThat(chunks.get(1).getContent()).isEqualTo("memory is not enough.");
83+
84+
// Doc 2
85+
assertThat(chunks.get(2).getContent()).isEqualTo("The most oppressive thing about the labyrinth is that you");
86+
assertThat(chunks.get(3).getContent()).isEqualTo("are constantly being forced to choose.");
87+
assertThat(chunks.get(4).getContent()).isEqualTo("It isn’t the lack of an exit, but");
88+
assertThat(chunks.get(5).getContent()).isEqualTo("the abundance of exits that is so disorienting");
89+
90+
// Verify that the same, merged metadata is copied to all chunks.
91+
assertThat(chunks.get(0).getMetadata()).isEqualTo(chunks.get(1).getMetadata());
92+
assertThat(chunks.get(2).getMetadata()).isEqualTo(chunks.get(3).getMetadata());
93+
94+
assertThat(chunks.get(0).getMetadata()).containsKeys("key1", "key2").doesNotContainKeys("key3");
95+
assertThat(chunks.get(2).getMetadata()).containsKeys("key2", "key3").doesNotContainKeys("key1");
96+
}
97+
98+
}

0 commit comments

Comments (0)