Skip to content

Commit 8b1882b

Browse files
ogbozoyan authored and tzolov committed
Filter out null keys and values from document metadata for TextSplitter
- Update TextSplitter to filter out metadata entries with null keys or values
- Simplify text substring call in TextSplitterTests
- Add test case for document splitting with null metadata values
1 parent 740dd18 commit 8b1882b

File tree

2 files changed

+34
-2
lines changed

2 files changed

+34
-2
lines changed

spring-ai-core/src/main/java/org/springframework/ai/transformer/splitter/TextSplitter.java

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -88,7 +88,8 @@ private List<Document> createDocuments(List<String> texts, List<ContentFormatter
8888
// only primitive values are in here -
8989
Map<String, Object> metadataCopy = metadata.entrySet()
9090
.stream()
91-
.collect(Collectors.toMap(e -> e.getKey(), e -> e.getValue()));
91+
.filter(e -> e.getKey() != null && e.getValue() != null)
92+
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
9293
Document newDoc = new Document(chunk, metadataCopy);
9394

9495
if (this.copyContentFormatter) {

spring-ai-core/src/test/java/org/springframework/ai/transformer/splitter/TextSplitterTests.java

Lines changed: 32 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -42,7 +42,7 @@ protected List<String> splitText(String text) {
4242
List<String> chunks = new ArrayList<>();
4343

4444
chunks.add(text.substring(0, chuckSize));
45-
chunks.add(text.substring(chuckSize, text.length()));
45+
chunks.add(text.substring(chuckSize));
4646

4747
return chunks;
4848
}
@@ -213,4 +213,35 @@ public void pageWithChunkSplit() {
213213
() -> assertThat(splitedDocument.get(3).getMetadata().get("page_number")).isEqualTo(3));
214214
}
215215

216+
@Test
217+
public void testSplitTextWithNullMetadata() {
218+
219+
var contentFormatter = DefaultContentFormatter.defaultConfig();
220+
221+
var doc = new Document("In the end, writing arises when man realizes that memory is not enough.");
222+
223+
doc.getMetadata().put("key1", "value1");
224+
doc.getMetadata().put("key2", null);
225+
226+
doc.setContentFormatter(contentFormatter);
227+
228+
List<Document> chunks = testTextSplitter.apply(List.of(doc));
229+
230+
assertThat(testTextSplitter.isCopyContentFormatter()).isTrue();
231+
232+
assertThat(chunks).hasSize(2);
233+
234+
// Doc chunks:
235+
assertThat(chunks.get(0).getContent()).isEqualTo("In the end, writing arises when man");
236+
assertThat(chunks.get(1).getContent()).isEqualTo(" realizes that memory is not enough.");
237+
238+
// Verify that the same, merged metadata is copied to all chunks.
239+
assertThat(chunks.get(0).getMetadata()).isEqualTo(chunks.get(1).getMetadata());
240+
assertThat(chunks.get(1).getMetadata()).containsKeys("key1");
241+
242+
// Verify that the content formatters are copied from the parents to the chunks.
243+
assertThat(chunks.get(0).getContentFormatter()).isSameAs(contentFormatter);
244+
assertThat(chunks.get(1).getContentFormatter()).isSameAs(contentFormatter);
245+
}
246+
216247
}

0 commit comments

Comments (0)