Skip to content

Commit a8d0440

Browse files
committed
Fixed bug with splitting text when the source document's format is not JSON or XML (e.g. text files read with an unknown format)
1 parent 3bba4e5 commit a8d0440

File tree

3 files changed

+32
-7
lines changed

3 files changed

+32
-7
lines changed

marklogic-langchain4j/src/main/java/com/marklogic/langchain4j/splitter/AbstractChunkDocumentProducer.java

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,10 @@ abstract class AbstractChunkDocumentProducer implements Iterator<DocumentWriteOp
2929
this.textSegments = textSegments;
3030
this.chunkConfig = chunkConfig;
3131

32-
// Chunks cannot be written to a TEXT document. So if maxChunks is zero, and we have a text document, we will
33-
// instead write all the chunks to a separate document.
34-
this.maxChunksPerDocument = Format.TEXT.equals(sourceDocumentFormat) && chunkConfig.getMaxChunks() == 0 ?
32+
// Chunks cannot be written to the source document unless its format is JSON or XML. So if maxChunks is zero and
33+
// we don't have a JSON or XML document, all chunks will be written to a separate document.
34+
boolean cannotAddChunksToSourceDocument = !Format.JSON.equals(sourceDocumentFormat) && !Format.XML.equals(sourceDocumentFormat);
35+
this.maxChunksPerDocument = cannotAddChunksToSourceDocument && chunkConfig.getMaxChunks() == 0 ?
3536
textSegments.size() :
3637
chunkConfig.getMaxChunks();
3738
}

marklogic-langchain4j/src/main/java/com/marklogic/langchain4j/splitter/DefaultChunkAssembler.java

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,13 +56,12 @@ private Format determineSourceDocumentFormat(DocumentWriteOperation sourceDocume
5656
}
5757

5858
private Format determineChunkDocumentFormat(Format sourceDocumentFormat) {
59-
final boolean addChunksToSourceDocument = !Format.TEXT.equals(sourceDocumentFormat) && chunkConfig.getMaxChunks() == 0;
60-
if (addChunksToSourceDocument) {
59+
final boolean canAddChunksToSourceDocument = Format.XML.equals(sourceDocumentFormat) || Format.JSON.equals(sourceDocumentFormat);
60+
if (canAddChunksToSourceDocument && chunkConfig.getMaxChunks() == 0) {
6161
return sourceDocumentFormat;
6262
}
6363

64-
final String documentType = chunkConfig.getDocumentType();
65-
if (documentType != null || Format.TEXT.equals(sourceDocumentFormat)) {
64+
if (chunkConfig.getDocumentType() != null || !canAddChunksToSourceDocument) {
6665
return "xml".equalsIgnoreCase(chunkConfig.getDocumentType()) ? Format.XML : Format.JSON;
6766
}
6867

marklogic-spark-connector/src/test/java/com/marklogic/spark/writer/splitter/SplitTextDocumentTest.java

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
import com.marklogic.spark.AbstractIntegrationTest;
1010
import com.marklogic.spark.Options;
1111
import org.apache.spark.sql.DataFrameWriter;
12+
import org.apache.spark.sql.Dataset;
13+
import org.apache.spark.sql.Row;
1214
import org.apache.spark.sql.SaveMode;
1315
import org.jdom2.Namespace;
1416
import org.junit.jupiter.api.Test;
@@ -56,6 +58,29 @@ void xmlChunks() {
5658
tester.assertReadPermissionExists("spark-user-role");
5759
}
5860

61+
@Test
62+
void inputDocumentHasUnknownFormat() {
63+
Dataset<Row> dataset = newSparkSession().read().format(CONNECTOR_IDENTIFIER)
64+
.load("src/test/resources/mixed-files/hello.txt");
65+
66+
assertTrue(dataset.collectAsList().get(0).isNullAt(2),
67+
"The connector is not expected to determine document type when reading files.");
68+
69+
dataset.write().format(CONNECTOR_IDENTIFIER)
70+
.option(Options.CLIENT_URI, makeClientUri())
71+
.option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
72+
.option(Options.WRITE_SPLITTER_TEXT, true)
73+
.option(Options.WRITE_URI_TEMPLATE, "/test/hello.txt")
74+
.mode(SaveMode.Append)
75+
.save();
76+
77+
JsonNode doc = readJsonDocument("/test/hello.txt-chunks-1.json");
78+
assertEquals("hello world", doc.get("chunks").get(0).get("text").asText(),
79+
"When the input document format is UNKNOWN and max chunks is zero, the connector should realize it " +
80+
"cannot add chunks to a document with format=UNKNOWN and thus it should create a separate chunks " +
81+
"document containing all the chunks.");
82+
}
83+
5984
@Test
6085
void maxChunksOfThree() {
6186
prepareToWriteChunkDocuments()

0 commit comments

Comments
 (0)