Skip to content

Commit 755e92b

Browse files
authored
Merge pull request #376 from marklogic/feature/splitter-xml-namespace
MLE-18275 Added default namespace for Flux XML
2 parents 41ac480 + 1a972b2 commit 755e92b

File tree

15 files changed

+99
-60
lines changed

15 files changed

+99
-60
lines changed

marklogic-langchain4j/src/main/java/com/marklogic/langchain4j/Util.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ public interface Util {
2020
*/
2121
Logger LANGCHAIN4J_LOGGER = LoggerFactory.getLogger("com.marklogic.langchain4j");
2222

23+
String DEFAULT_XML_NAMESPACE = "http://marklogic.com/appservices/model";
24+
2325
static JsonNode getJsonFromHandle(AbstractWriteHandle writeHandle) {
2426
if (writeHandle instanceof JacksonHandle) {
2527
return ((JacksonHandle) writeHandle).get();

marklogic-langchain4j/src/main/java/com/marklogic/langchain4j/embedding/XmlChunkConfig.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,14 @@
33
*/
44
package com.marklogic.langchain4j.embedding;
55

6+
import com.marklogic.langchain4j.Util;
7+
68
import javax.xml.namespace.NamespaceContext;
79

10+
/**
11+
* Captures configuration settings for the existing chunks in XML documents. These settings are then used to add
12+
* embeddings to each chunk.
13+
*/
814
public class XmlChunkConfig {
915

1016
// The default expression ignores the namespace so that if a user is e.g. constructing a new XML document with a
@@ -24,7 +30,7 @@ public XmlChunkConfig() {
2430
public XmlChunkConfig(String textExpression, String embeddingName, String embeddingNamespace, NamespaceContext namespaceContext) {
2531
this.textExpression = textExpression != null ? textExpression : DEFAULT_TEXT_EXPRESSION;
2632
this.embeddingName = embeddingName != null ? embeddingName : "embedding";
27-
this.embeddingNamespace = embeddingNamespace;
33+
this.embeddingNamespace = embeddingNamespace != null ? embeddingNamespace : Util.DEFAULT_XML_NAMESPACE;
2834
this.namespaceContext = namespaceContext;
2935
}
3036

marklogic-langchain4j/src/main/java/com/marklogic/langchain4j/splitter/ChunkConfig.java

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,12 @@
44
package com.marklogic.langchain4j.splitter;
55

66
import com.marklogic.client.io.DocumentMetadataHandle;
7+
import com.marklogic.langchain4j.Util;
78

9+
/**
10+
* Captures configuration settings for producing chunks, either in a source document or in separate
11+
* sidecar documents.
12+
*/
813
public class ChunkConfig {
914

1015
private final DocumentMetadataHandle metadata;
@@ -30,7 +35,7 @@ public static class Builder {
3035
private int maxChunks;
3136
private String documentType;
3237
private String rootName;
33-
private String xmlNamespace;
38+
private String xmlNamespace = Util.DEFAULT_XML_NAMESPACE;
3439
private String uriPrefix;
3540
private String uriSuffix;
3641

@@ -59,7 +64,9 @@ public Builder withRootName(String rootName) {
5964
}
6065

6166
public Builder withXmlNamespace(String xmlNamespace) {
62-
this.xmlNamespace = xmlNamespace;
67+
if (xmlNamespace != null) {
68+
this.xmlNamespace = xmlNamespace;
69+
}
6370
return this;
6471
}
6572

marklogic-langchain4j/src/main/java/com/marklogic/langchain4j/splitter/XmlChunkDocumentProducer.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import com.marklogic.client.impl.DocumentWriteOperationImpl;
88
import com.marklogic.client.io.DOMHandle;
99
import com.marklogic.client.io.Format;
10+
import com.marklogic.langchain4j.Util;
1011
import com.marklogic.langchain4j.dom.DOMHelper;
1112
import com.marklogic.langchain4j.embedding.Chunk;
1213
import com.marklogic.langchain4j.embedding.DOMChunk;
@@ -92,7 +93,7 @@ private void addChunk(Document doc, TextSegment textSegment, Element chunksEleme
9293
}
9394

9495
private String determineChunksElementName(Document doc) {
95-
return doc.getDocumentElement().getElementsByTagName(DEFAULT_CHUNKS_ELEMENT_NAME).getLength() == 0 ?
96+
return doc.getDocumentElement().getElementsByTagNameNS(Util.DEFAULT_XML_NAMESPACE, DEFAULT_CHUNKS_ELEMENT_NAME).getLength() == 0 ?
9697
DEFAULT_CHUNKS_ELEMENT_NAME : "splitter-chunks";
9798
}
9899
}

marklogic-spark-connector/src/test/java/com/marklogic/langchain4j/embedding/EmbedderTest.java

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import com.marklogic.spark.writer.XmlUtil;
2020
import dev.langchain4j.data.document.splitter.DocumentSplitters;
2121
import dev.langchain4j.model.embedding.onnx.allminilml6v2.AllMiniLmL6V2EmbeddingModel;
22+
import org.jdom2.Namespace;
2223
import org.junit.jupiter.api.Test;
2324

2425
import java.util.Iterator;
@@ -88,9 +89,10 @@ void xml() {
8889

8990
docs.forEachRemaining(doc -> {
9091
XmlNode node = new XmlNode(XmlUtil.extractDocument(doc.getContent()));
91-
node.assertElementCount("/root/chunks/chunk", 2);
92-
node.assertElementExists("/root/chunks/chunk[1]/embedding");
93-
node.assertElementExists("/root/chunks/chunk[2]/embedding");
92+
node.setNamespaces(new Namespace[]{Namespace.getNamespace("model", "http://marklogic.com/appservices/model")});
93+
node.assertElementCount("/model:root/model:chunks/model:chunk", 2);
94+
node.assertElementExists("/model:root/model:chunks/model:chunk[1]/model:embedding");
95+
node.assertElementExists("/model:root/model:chunks/model:chunk[2]/model:embedding");
9496
});
9597
}
9698

marklogic-spark-connector/src/test/java/com/marklogic/langchain4j/splitter/SplitterTest.java

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import com.marklogic.junit5.XmlNode;
1616
import com.marklogic.spark.AbstractIntegrationTest;
1717
import dev.langchain4j.data.document.splitter.DocumentSplitters;
18+
import org.jdom2.Namespace;
1819
import org.junit.jupiter.api.Test;
1920

2021
import java.util.Map;
@@ -34,16 +35,16 @@ void textPath() {
3435

3536
doc.assertElementCount(
3637
"Expecting the default splitter to split the 'text' element into 4 chunks, each having its own 'text' element.",
37-
"/root/chunks/chunk[text/text()]", 4);
38+
"/root/model:chunks/model:chunk[model:text/text()]", 4);
3839
}
3940

4041
@Test
4142
void elementPath() {
4243
XmlNode doc = splitTextDocument("/root/nested");
4344
doc.assertElementCount("Only expecting one chunk since the root/nested/text element has very little text",
44-
"/root/chunks/chunk", 1);
45+
"/root/model:chunks/model:chunk", 1);
4546

46-
String value = doc.getElementValue("/root/chunks/chunk/text");
47+
String value = doc.getElementValue("/root/model:chunks/model:chunk/model:text");
4748
assertEquals("This is for testing.", value, "With our DOM-based implementation, we can easily return the " +
4849
"text content of any node selected by the user's path. We may eventually support an option to instead " +
4950
"serialize the selected node into a string.");
@@ -52,11 +53,11 @@ void elementPath() {
5253
@Test
5354
void attributePath() {
5455
XmlNode doc = splitTextDocument("/root/attribute-test/@text");
55-
doc.assertElementCount("/root/chunks/chunk", 1);
56+
doc.assertElementCount("/root/model:chunks/model:chunk", 1);
5657
doc.assertElementValue("It should be rare that a user wants to split the text in an attribute, but it should " +
5758
"be feasible. We don't have a way though of preserving the attribute name in some sort of serialization " +
5859
"with JDOM2; we can only get the attribute value.",
59-
"/root/chunks/chunk/text",
60+
"/root/model:chunks/model:chunk/model:text",
6061
"Some attribute text."
6162
);
6263
}
@@ -74,11 +75,11 @@ void multipleMatchingElements() {
7475

7576
doc.assertElementCount(
7677
"Should have text from 2 elements, but that's small enough for 1 chunk",
77-
"/root/chunks/chunk", 1);
78+
"/root/model:chunks/model:chunk", 1);
7879

7980
doc.assertElementValue(
8081
"The single chunk should have the concatenation of the two selected elements, joined with a space.",
81-
"/root/chunks/chunk/text", "https://docs.marklogic.com/guide/java/intro This is for testing.");
82+
"/root/model:chunks/model:chunk/model:text", "https://docs.marklogic.com/guide/java/intro This is for testing.");
8283
}
8384

8485
@Test
@@ -138,7 +139,7 @@ private XmlNode splitTextDocument(String xpath) {
138139
DocumentWriteOperation sourceDocument = readXmlDocument();
139140
DocumentWriteOperation output = newXmlSplitter(xpath).apply(sourceDocument).next();
140141
String xml = HandleAccessor.contentAsString(output.getContent());
141-
return new XmlNode(xml);
142+
return new XmlNode(xml, Namespace.getNamespace("model", "http://marklogic.com/appservices/model"));
142143
}
143144

144145
private DocumentTextSplitter newJsonSplitter(String... jsonPointers) {

marklogic-spark-connector/src/test/java/com/marklogic/spark/AbstractIntegrationTest.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,4 +175,15 @@ protected XmlNode readDocumentProperties(String uri) {
175175
props.setNamespaces(new Namespace[]{PROPERTIES_NAMESPACE});
176176
return props;
177177
}
178+
179+
@Override
180+
protected XmlNode readXmlDocument(String uri) {
181+
// Registers two frequently used namespaces in tests.
182+
return readXmlDocument(uri,
183+
Namespace.getNamespace("model", "http://marklogic.com/appservices/model"),
184+
Namespace.getNamespace("ex", "org:example")
185+
);
186+
}
187+
188+
178189
}

marklogic-spark-connector/src/test/java/com/marklogic/spark/writer/embedding/AddEmbeddingsFromTextTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ void xmlSidecarDocuments() {
5252
for (String uri : uris) {
5353
assertTrue(uri.endsWith(".xml"));
5454
XmlNode doc = readXmlDocument(uri);
55-
doc.assertElementCount("/node()/chunks/chunk", 1);
55+
doc.assertElementCount("/node()/model:chunks/model:chunk", 1);
5656
}
5757
}
5858

marklogic-spark-connector/src/test/java/com/marklogic/spark/writer/embedding/AddEmbeddingsToXmlTest.java

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -85,8 +85,14 @@ void sidecarWithNamespace() {
8585
.mode(SaveMode.Append)
8686
.save();
8787

88-
verifyChunksInNamespacedSidecar();
89-
verifyEachChunkIsReturnedByAVectorQuery("namespaced_xml_chunks");
88+
XmlNode doc = readXmlDocument("/split-test.xml-chunks-1.xml");
89+
doc.assertElementCount("/ex:sidecar/ex:chunks/ex:chunk", 4);
90+
for (XmlNode chunk : doc.getXmlNodes("/ex:sidecar/ex:chunks/ex:chunk")) {
91+
chunk.assertElementExists("/ex:chunk/ex:text");
92+
chunk.assertElementExists("The embedding should default to the MarkLogic-specific namespace when the " +
93+
"user does not specify one, which better distinguishes it from the user's " +
94+
"content.", "/ex:chunk/model:embedding");
95+
}
9096
}
9197

9298
/**
@@ -133,10 +139,9 @@ void customChunks() {
133139

134140
XmlNode doc = readXmlDocument("/split-test.xml");
135141
doc.assertElementCount("Each of the 2 custom chunks should have an 'embedding' element.",
136-
"/envelope/my-chunks/my-chunk[my-text and embedding]", 2);
142+
"/envelope/my-chunks/my-chunk[my-text and model:embedding]", 2);
137143
}
138144

139-
140145
@Test
141146
void namespacedCustomChunks() {
142147
readDocument("/marklogic-docs/namespaced-custom-chunks.xml")
@@ -151,9 +156,9 @@ void namespacedCustomChunks() {
151156
.mode(SaveMode.Append)
152157
.save();
153158

154-
XmlNode doc = readXmlDocument("/split-test.xml", Namespace.getNamespace("ex", "org:example"));
159+
XmlNode doc = readXmlDocument("/split-test.xml");
155160
doc.assertElementCount("Each of the 2 custom chunks should have an 'embedding' element.",
156-
"/ex:envelope/ex:my-chunks/ex:my-chunk[ex:my-text and embedding]", 2);
161+
"/ex:envelope/ex:my-chunks/ex:my-chunk[ex:my-text and model:embedding]", 2);
157162
}
158163

159164
@Test
@@ -243,7 +248,7 @@ private void verifyEachChunkOnDocumentHasAnEmbedding(String uri) {
243248
XmlNode doc = readXmlDocument(uri);
244249
doc.getXmlNodes("/node()/chunks/chunk").forEach(chunk -> {
245250
chunk.assertElementExists("/chunk/text");
246-
chunk.assertElementExists("/chunk/embedding");
251+
chunk.assertElementExists("/chunk/model:embedding");
247252
});
248253
}
249254

@@ -270,13 +275,12 @@ private void verifyEachChunkIsReturnedByAVectorQuery(String viewName) {
270275
}
271276

272277
private void verifyChunksInNamespacedSidecar() {
273-
XmlNode doc = readXmlDocument("/split-test.xml-chunks-1.xml", Namespace.getNamespace("ex", "org:example"));
278+
XmlNode doc = readXmlDocument("/split-test.xml-chunks-1.xml");
274279
doc.assertElementCount("/ex:sidecar/ex:chunks/ex:chunk", 4);
275280
for (XmlNode chunk : doc.getXmlNodes("/ex:sidecar/ex:chunks/ex:chunk")) {
276281
chunk.assertElementExists("/ex:chunk/ex:text");
277-
chunk.assertElementExists("For now, the embedding still defaults to the empty namespace. We may change " +
278-
"this soon to be a MarkLogic-specific namespace to better distinguish it from the users " +
279-
"content.", "/ex:chunk/embedding");
282+
chunk.assertElementExists("The embedding should default to the MarkLogic-specific namespace when not " +
283+
"specified by the user.", "/ex:chunk/model:embedding");
280284
}
281285
}
282286
}

marklogic-spark-connector/src/test/java/com/marklogic/spark/writer/splitter/SplitJsonDocumentTest.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -244,10 +244,10 @@ void xmlChunks() {
244244
assertCollectionSize("chunks", 2);
245245

246246
XmlNode doc = readXmlDocument("/split-test.json-chunks-1.xml");
247-
doc.assertElementCount("/root/chunks/chunk", 2);
247+
doc.assertElementCount("/model:root/model:chunks/model:chunk", 2);
248248

249249
doc = readXmlDocument("/split-test.json-chunks-2.xml");
250-
doc.assertElementCount("/root/chunks/chunk", 2);
250+
doc.assertElementCount("/model:root/model:chunks/model:chunk", 2);
251251
}
252252

253253
@Test

0 commit comments

Comments
 (0)