29
29
/**
30
30
* @author Raphael Yu
31
31
* @author Christian Tzolov
32
+ * @author Ricken Bazolo
32
33
*/
33
34
public class TokenTextSplitter extends TextSplitter {
34
35
35
36
private final EncodingRegistry registry = Encodings .newLazyEncodingRegistry ();
36
37
37
38
private final Encoding encoding = registry .getEncoding (EncodingType .CL100K_BASE );
38
39
40
+ private final static int DEFAULT_CHUNK_SIZE = 800 ;
41
+
42
+ private final static int MIN_CHUNK_SIZE_CHARS = 350 ;
43
+
44
+ private final static int MIN_CHUNK_LENGTH_TO_EMBED = 5 ;
45
+
46
+ private final static int MAX_NUM_CHUNKS = 10000 ;
47
+
48
+ private final static boolean KEEP_SEPARATOR = true ;
49
+
39
50
// The target size of each text chunk in tokens
40
- private int defaultChunkSize = 800 ;
51
+ private final int chunkSize ;
41
52
42
53
// The minimum size of each text chunk in characters
43
- private int minChunkSizeChars = 350 ;
54
+ private final int minChunkSizeChars ;
44
55
45
56
// Discard chunks shorter than this
46
- private int minChunkLengthToEmbed = 5 ;
57
+ private final int minChunkLengthToEmbed ;
47
58
48
59
// The maximum number of chunks to generate from a text
49
- private int maxNumChunks = 10000 ;
60
+ private final int maxNumChunks ;
50
61
51
- private boolean keepSeparator = true ;
62
+ private final boolean keepSeparator ;
52
63
53
64
/**
 * Creates a splitter that uses the class defaults for every setting
 * ({@code DEFAULT_CHUNK_SIZE}, {@code MIN_CHUNK_SIZE_CHARS},
 * {@code MIN_CHUNK_LENGTH_TO_EMBED}, {@code MAX_NUM_CHUNKS} and
 * {@code KEEP_SEPARATOR}).
 */
public TokenTextSplitter() {
    // The boolean constructor fills in the remaining defaults itself.
    this(KEEP_SEPARATOR);
}
55
67
56
68
/**
 * Creates a splitter with the default sizing limits and the given
 * keep-separator flag.
 * @param keepSeparator value for the keep-separator flag
 */
public TokenTextSplitter(boolean keepSeparator) {
    this(
            DEFAULT_CHUNK_SIZE,
            MIN_CHUNK_SIZE_CHARS,
            MIN_CHUNK_LENGTH_TO_EMBED,
            MAX_NUM_CHUNKS,
            keepSeparator);
}
59
71
60
- public TokenTextSplitter (int defaultChunkSize , int minChunkSizeChars , int minChunkLengthToEmbed , int maxNumChunks ,
72
+ public TokenTextSplitter (int chunkSize , int minChunkSizeChars , int minChunkLengthToEmbed , int maxNumChunks ,
61
73
boolean keepSeparator ) {
62
- this .defaultChunkSize = defaultChunkSize ;
74
+ this .chunkSize = chunkSize ;
63
75
this .minChunkSizeChars = minChunkSizeChars ;
64
76
this .minChunkLengthToEmbed = minChunkLengthToEmbed ;
65
77
this .maxNumChunks = maxNumChunks ;
@@ -68,7 +80,7 @@ public TokenTextSplitter(int defaultChunkSize, int minChunkSizeChars, int minChu
68
80
69
81
@ Override
70
82
protected List <String > splitText (String text ) {
71
- return doSplit (text , this .defaultChunkSize );
83
+ return doSplit (text , this .chunkSize );
72
84
}
73
85
74
86
protected List <String > doSplit (String text , int chunkSize ) {
@@ -133,4 +145,55 @@ private String decodeTokens(List<Integer> tokens) {
133
145
return this .encoding .decode (tokensIntArray );
134
146
}
135
147
148
+ public static Builder builder () {
149
+ return new Builder ();
150
+ }
151
+
152
+ public static class Builder {
153
+
154
+ private int chunkSize ;
155
+
156
+ private int minChunkSizeChars ;
157
+
158
+ private int minChunkLengthToEmbed ;
159
+
160
+ private int maxNumChunks ;
161
+
162
+ private boolean keepSeparator ;
163
+
164
+ private Builder () {
165
+ }
166
+
167
+ public Builder withChunkSize (int chunkSize ) {
168
+ this .chunkSize = chunkSize ;
169
+ return this ;
170
+ }
171
+
172
+ public Builder withMinChunkSizeChars (int minChunkSizeChars ) {
173
+ this .minChunkSizeChars = minChunkSizeChars ;
174
+ return this ;
175
+ }
176
+
177
+ public Builder withMinChunkLengthToEmbed (int minChunkLengthToEmbed ) {
178
+ this .minChunkLengthToEmbed = minChunkLengthToEmbed ;
179
+ return this ;
180
+ }
181
+
182
+ public Builder withMaxNumChunks (int maxNumChunks ) {
183
+ this .maxNumChunks = maxNumChunks ;
184
+ return this ;
185
+ }
186
+
187
+ public Builder withKeepSeparator (boolean keepSeparator ) {
188
+ this .keepSeparator = keepSeparator ;
189
+ return this ;
190
+ }
191
+
192
+ public TokenTextSplitter build () {
193
+ return new TokenTextSplitter (this .chunkSize , this .minChunkSizeChars , this .minChunkLengthToEmbed ,
194
+ this .maxNumChunks , this .keepSeparator );
195
+ }
196
+
197
+ }
198
+
136
199
}
0 commit comments