Skip to content

Commit d1d4c2c

Browse files
authored
Merge pull request #378 from marklogic/feature/error-messages
Fix for specifying both namespaces
2 parents 9481130 + 785e426 commit d1d4c2c

File tree

9 files changed

+93
-13
lines changed

9 files changed

+93
-13
lines changed

marklogic-langchain4j/src/main/java/com/marklogic/langchain4j/embedding/XmlChunkConfig.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@ public class XmlChunkConfig {
2323
private final NamespaceContext namespaceContext;
2424

2525
// Defaults to the config used by the connector's splitter feature.
26-
public XmlChunkConfig() {
27-
this(null, null, null, null);
26+
public XmlChunkConfig(String embeddingNamespace) {
27+
this(null, null, embeddingNamespace, null);
2828
}
2929

3030
public XmlChunkConfig(String textExpression, String embeddingName, String embeddingNamespace, NamespaceContext namespaceContext) {

marklogic-langchain4j/src/main/java/com/marklogic/langchain4j/splitter/ChunkConfig.java

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,15 +17,20 @@ public class ChunkConfig {
1717
private final String documentType;
1818
private final String rootName;
1919
private final String xmlNamespace;
20+
private final String embeddingXmlNamespace;
2021
private final String uriPrefix;
2122
private final String uriSuffix;
2223

23-
private ChunkConfig(DocumentMetadataHandle metadata, int maxChunks, String documentType, String rootName, String xmlNamespace, String uriPrefix, String uriSuffix) {
24+
// Ignoring Sonar warning about too many constructor args, as that's mitigated via the builder.
25+
@SuppressWarnings("java:S107")
26+
private ChunkConfig(DocumentMetadataHandle metadata, int maxChunks, String documentType, String rootName,
27+
String xmlNamespace, String embeddingXmlNamespace, String uriPrefix, String uriSuffix) {
2428
this.metadata = metadata;
2529
this.maxChunks = maxChunks;
2630
this.documentType = documentType;
2731
this.rootName = rootName;
2832
this.xmlNamespace = xmlNamespace;
33+
this.embeddingXmlNamespace = embeddingXmlNamespace;
2934
this.uriPrefix = uriPrefix;
3035
this.uriSuffix = uriSuffix;
3136
}
@@ -36,11 +41,17 @@ public static class Builder {
3641
private String documentType;
3742
private String rootName;
3843
private String xmlNamespace = Util.DEFAULT_XML_NAMESPACE;
44+
private String embeddingXmlNamespace;
3945
private String uriPrefix;
4046
private String uriSuffix;
4147

4248
public ChunkConfig build() {
43-
return new ChunkConfig(metadata, maxChunks, documentType, rootName, xmlNamespace, uriPrefix, uriSuffix);
49+
String tempNamespace = embeddingXmlNamespace;
50+
if (tempNamespace == null) {
51+
// If no embedding XML namespace is specified, default to the chunk namespace if one is defined, else to the default XML namespace.
52+
tempNamespace = xmlNamespace != null ? xmlNamespace : Util.DEFAULT_XML_NAMESPACE;
53+
}
54+
return new ChunkConfig(metadata, maxChunks, documentType, rootName, xmlNamespace, tempNamespace, uriPrefix, uriSuffix);
4455
}
4556

4657
public Builder withMetadata(DocumentMetadataHandle metadata) {
@@ -70,6 +81,13 @@ public Builder withXmlNamespace(String xmlNamespace) {
7081
return this;
7182
}
7283

84+
public Builder withEmbeddingXmlNamespace(String embeddingXmlNamespace) {
85+
if (embeddingXmlNamespace != null) {
86+
this.embeddingXmlNamespace = embeddingXmlNamespace;
87+
}
88+
return this;
89+
}
90+
7391
public Builder withUriPrefix(String uriPrefix) {
7492
this.uriPrefix = uriPrefix;
7593
return this;
@@ -108,4 +126,8 @@ public String getUriSuffix() {
108126
public String getXmlNamespace() {
109127
return xmlNamespace;
110128
}
129+
130+
public String getEmbeddingXmlNamespace() {
131+
return embeddingXmlNamespace;
132+
}
111133
}

marklogic-langchain4j/src/main/java/com/marklogic/langchain4j/splitter/XmlChunkDocumentProducer.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ class XmlChunkDocumentProducer extends AbstractChunkDocumentProducer {
3535

3636
// Namespaces aren't needed for producing chunks.
3737
this.domHelper = new DOMHelper(null);
38-
this.xmlChunkConfig = new XmlChunkConfig();
38+
this.xmlChunkConfig = new XmlChunkConfig(chunkConfig.getEmbeddingXmlNamespace());
3939
}
4040

4141
@Override

marklogic-spark-connector/src/main/resources/marklogic-spark-messages.properties

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ spark.marklogic.write.uriTemplate=
1818
spark.marklogic.write.xmlRootName=
1919
spark.marklogic.write.splitter.maxChunkSize=
2020
spark.marklogic.write.splitter.maxOverlapSize=
21+
spark.marklogic.write.splitter.sidecar.maxChunks=
2122
spark.marklogic.write.embedder.chunks.jsonPointer=
2223
spark.marklogic.write.embedder.chunks.xpath=
2324
spark.marklogic.write.embedder.batchSize=

marklogic-spark-connector/src/test/java/com/marklogic/spark/AbstractIntegrationTest.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,8 @@ protected XmlNode readXmlDocument(String uri) {
181181
// Registers two frequently used namespaces in tests.
182182
return readXmlDocument(uri,
183183
Namespace.getNamespace("model", "http://marklogic.com/appservices/model"),
184-
Namespace.getNamespace("ex", "org:example")
184+
Namespace.getNamespace("ex", "org:example"),
185+
Namespace.getNamespace("acme", "org:acme")
185186
);
186187
}
187188

marklogic-spark-connector/src/test/java/com/marklogic/spark/writer/embedding/AddEmbeddingsToXmlTest.java

Lines changed: 49 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,6 @@ void embeddingsInSourceDocument() {
6666
verifyEachChunkIsReturnedByAVectorQuery("xml_chunks");
6767
}
6868

69-
@ExtendWith(RequiresMarkLogic12.class)
7069
@Test
7170
void sidecarWithNamespace() {
7271
readDocument("/marklogic-docs/namespaced-java-client-intro.xml")
@@ -89,9 +88,8 @@ void sidecarWithNamespace() {
8988
doc.assertElementCount("/ex:sidecar/ex:chunks/ex:chunk", 4);
9089
for (XmlNode chunk : doc.getXmlNodes("/ex:sidecar/ex:chunks/ex:chunk")) {
9190
chunk.assertElementExists("/ex:chunk/ex:text");
92-
chunk.assertElementExists("For now, the embedding still defaults to the empty namespace. We may change " +
93-
"this soon to be a MarkLogic-specific namespace to better distinguish it from the users " +
94-
"content.", "/ex:chunk/model:embedding");
91+
chunk.assertElementExists("When a namespace is specified for the sidecar XML document, that should " +
92+
"override the default namespace for the embedding element.", "/ex:chunk/ex:embedding");
9593
}
9694
}
9795

@@ -107,7 +105,6 @@ void sidecarWithCustomNamespace() {
107105
readDocument("/marklogic-docs/java-client-intro.xml")
108106
.write().format(CONNECTOR_IDENTIFIER)
109107
.option(Options.CLIENT_URI, makeClientUri())
110-
.option(Options.XPATH_NAMESPACE_PREFIX + "ex", "org:example")
111108
.option(Options.WRITE_SPLITTER_XPATH, "/node()/text/text()")
112109
.option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
113110
.option(Options.WRITE_URI_TEMPLATE, "/split-test.xml")
@@ -117,13 +114,60 @@ void sidecarWithCustomNamespace() {
117114
.option(Options.WRITE_SPLITTER_SIDECAR_XML_NAMESPACE, "org:example")
118115
.option(Options.WRITE_SPLITTER_SIDECAR_COLLECTIONS, "namespaced-xml-vector-chunks")
119116
.option(Options.WRITE_EMBEDDER_MODEL_FUNCTION_CLASS_NAME, TEST_EMBEDDING_FUNCTION_CLASS)
117+
.option(Options.WRITE_EMBEDDER_EMBEDDING_NAMESPACE, "http://marklogic.com/appservices/model")
120118
.mode(SaveMode.Append)
121119
.save();
122120

123121
verifyChunksInNamespacedSidecar();
124122
verifyEachChunkIsReturnedByAVectorQuery("namespaced_xml_chunks");
125123
}
126124

125+
@Test
126+
void sidecarWithCustomNamespaceAndCustomEmbeddingNamespace() {
127+
readDocument("/marklogic-docs/java-client-intro.xml")
128+
.write().format(CONNECTOR_IDENTIFIER)
129+
.option(Options.CLIENT_URI, makeClientUri())
130+
.option(Options.WRITE_SPLITTER_XPATH, "/node()/text/text()")
131+
.option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
132+
.option(Options.WRITE_URI_TEMPLATE, "/split-test.xml")
133+
.option(Options.WRITE_SPLITTER_SIDECAR_MAX_CHUNKS, 100)
134+
.option(Options.WRITE_SPLITTER_SIDECAR_XML_NAMESPACE, "org:example")
135+
.option(Options.WRITE_EMBEDDER_MODEL_FUNCTION_CLASS_NAME, TEST_EMBEDDING_FUNCTION_CLASS)
136+
.option(Options.WRITE_EMBEDDER_EMBEDDING_NAMESPACE, "org:acme")
137+
.mode(SaveMode.Append)
138+
.save();
139+
140+
XmlNode doc = readXmlDocument("/split-test.xml-chunks-1.xml");
141+
doc.assertElementValue("/ex:root/ex:source-uri", "/split-test.xml");
142+
doc.assertElementExists("/ex:root/ex:chunks/ex:chunk[1]/ex:text");
143+
doc.assertElementExists(
144+
"When splitting and adding embeddings, the user can specify a namespace both for the sidecar document " +
145+
"and a separate namespace for the embedding.",
146+
"/ex:root/ex:chunks/ex:chunk[1]/acme:embedding"
147+
);
148+
}
149+
150+
@Test
151+
void sidecarWithNoNamespace() {
152+
readDocument("/marklogic-docs/java-client-intro.xml")
153+
.write().format(CONNECTOR_IDENTIFIER)
154+
.option(Options.CLIENT_URI, makeClientUri())
155+
.option(Options.WRITE_SPLITTER_XPATH, "/node()/text/text()")
156+
.option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
157+
.option(Options.WRITE_URI_TEMPLATE, "/split-test.xml")
158+
.option(Options.WRITE_SPLITTER_SIDECAR_MAX_CHUNKS, 100)
159+
.option(Options.WRITE_SPLITTER_SIDECAR_XML_NAMESPACE, "")
160+
.option(Options.WRITE_EMBEDDER_MODEL_FUNCTION_CLASS_NAME, TEST_EMBEDDING_FUNCTION_CLASS)
161+
.mode(SaveMode.Append)
162+
.save();
163+
164+
XmlNode doc = readXmlDocument("/split-test.xml-chunks-1.xml");
165+
doc.assertElementValue("/root/source-uri", "/split-test.xml");
166+
doc.assertElementExists("/root/chunks/chunk[1]/text");
167+
doc.assertElementExists("Since a namespace is specified for the document - no namespace - it should be " +
168+
"applied to the embedding element too.", "/root/chunks/chunk[1]/embedding");
169+
}
170+
127171
@Test
128172
void customChunks() {
129173
readDocument("/marklogic-docs/custom-chunks.xml")

marklogic-spark-connector/src/test/java/com/marklogic/spark/writer/splitter/SplitJsonDocumentTest.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,17 @@ void maxChunksOfThree() {
174174
tester.assertUpdatePermissionExists("spark-user-role");
175175
}
176176

177+
@Test
178+
void invalidMaxChunksValue() {
179+
DataFrameWriter writer = prepareToWriteChunkDocuments()
180+
.option(Options.WRITE_SPLITTER_SIDECAR_MAX_CHUNKS, -1)
181+
.mode(SaveMode.Append);
182+
183+
SparkException ex = assertThrows(SparkException.class, () -> writer.save());
184+
assertTrue(ex.getMessage().contains("The value of 'spark.marklogic.write.splitter.sidecar.maxChunks' must be 0 or greater."),
185+
"Unexpected error: " + ex.getMessage());
186+
}
187+
177188
@Test
178189
void maxChunksWithCustomPermissions() {
179190
prepareToWriteChunkDocuments()

marklogic-spark-langchain4j/src/main/java/com/marklogic/spark/langchain4j/DocumentTextSplitterFactory.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,8 @@ private static ChunkAssembler makeChunkAssembler(Context context) {
8888
.withRootName(context.getStringOption(Options.WRITE_SPLITTER_SIDECAR_ROOT_NAME))
8989
.withUriPrefix(context.getStringOption(Options.WRITE_SPLITTER_SIDECAR_URI_PREFIX))
9090
.withUriSuffix(context.getStringOption(Options.WRITE_SPLITTER_SIDECAR_URI_SUFFIX))
91-
.withXmlNamespace(context.getStringOption(Options.WRITE_SPLITTER_SIDECAR_XML_NAMESPACE))
91+
.withXmlNamespace(context.getProperties().get(Options.WRITE_SPLITTER_SIDECAR_XML_NAMESPACE))
92+
.withEmbeddingXmlNamespace(context.getProperties().get(Options.WRITE_EMBEDDER_EMBEDDING_NAMESPACE))
9293
.build()
9394
);
9495
}

marklogic-spark-langchain4j/src/main/java/com/marklogic/spark/langchain4j/EmbeddingAdderFactory.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ private static ChunkSelector makeXmlChunkSelector(Context context) {
8282
XmlChunkConfig xmlChunkConfig = new XmlChunkConfig(
8383
context.getStringOption(Options.WRITE_EMBEDDER_TEXT_XPATH),
8484
context.getStringOption(Options.WRITE_EMBEDDER_EMBEDDING_NAME),
85-
context.getStringOption(Options.WRITE_EMBEDDER_EMBEDDING_NAMESPACE),
85+
context.getProperties().get(Options.WRITE_EMBEDDER_EMBEDDING_NAMESPACE),
8686
NamespaceContextFactory.makeNamespaceContext(context.getProperties())
8787
);
8888
return new DOMChunkSelector(

0 commit comments

Comments
 (0)