Skip to content

Commit 75266fe

Browse files
authored
Merge pull request #359 from marklogic/feature/more-refactor
Refactor: Removed Spark connector references from langchain package
2 parents 5edcb23 + 791d64d commit 75266fe

22 files changed

+118
-79
lines changed
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
/*
2+
* Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
3+
*/
4+
package com.marklogic.langchain4j;
5+
6+
/**
 * Unchecked exception type for the {@code com.marklogic.langchain4j} package.
 * <p>
 * Extends {@link RuntimeException}, so callers are not forced to declare or catch it. It adds no state or
 * behavior of its own; both constructors delegate directly to the corresponding superclass constructor.
 */
public class MarkLogicLangchainException extends RuntimeException {
7+
8+
/**
 * Creates an exception with a message and no cause.
 *
 * @param message description of the failure, passed through to {@link RuntimeException}
 */
public MarkLogicLangchainException(String message) {
9+
super(message);
10+
}
11+
12+
/**
 * Creates an exception with a message and the underlying cause.
 *
 * @param message description of the failure, passed through to {@link RuntimeException}
 * @param cause   the exception that triggered this one; retained so the original stack trace is not lost
 */
public MarkLogicLangchainException(String message, Throwable cause) {
13+
super(message, cause);
14+
}
15+
16+
}

src/main/java/com/marklogic/langchain4j/JsonUtil.java renamed to src/main/java/com/marklogic/langchain4j/Util.java

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,16 @@
99
import com.marklogic.client.impl.HandleAccessor;
1010
import com.marklogic.client.io.JacksonHandle;
1111
import com.marklogic.client.io.marker.AbstractWriteHandle;
12-
import com.marklogic.spark.ConnectorException;
12+
import org.slf4j.Logger;
13+
import org.slf4j.LoggerFactory;
1314

14-
public interface JsonUtil {
15+
public interface Util {
16+
17+
/**
18+
* Intended for log messages pertaining to the langchain4j features (embedding and splitting). Uses a separate logger so that it can be enabled
19+
* at the info/debug level without enabling any other log messages.
20+
*/
21+
Logger LANGCHAIN4J_LOGGER = LoggerFactory.getLogger("com.marklogic.langchain4j");
1522

1623
static JsonNode getJsonFromHandle(AbstractWriteHandle writeHandle) {
1724
if (writeHandle instanceof JacksonHandle) {
@@ -21,7 +28,7 @@ static JsonNode getJsonFromHandle(AbstractWriteHandle writeHandle) {
2128
try {
2229
return new ObjectMapper().readTree(json);
2330
} catch (JsonProcessingException e) {
24-
throw new ConnectorException(String.format(
31+
throw new MarkLogicLangchainException(String.format(
2532
"Unable to read JSON from content handle; cause: %s", e.getMessage()), e);
2633
}
2734
}

src/main/java/com/marklogic/langchain4j/dom/DOMHelper.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import com.marklogic.client.impl.HandleAccessor;
88
import com.marklogic.client.io.DOMHandle;
99
import com.marklogic.client.io.marker.AbstractWriteHandle;
10-
import com.marklogic.spark.ConnectorException;
10+
import com.marklogic.langchain4j.MarkLogicLangchainException;
1111
import org.w3c.dom.Document;
1212
import org.xml.sax.InputSource;
1313

@@ -50,7 +50,7 @@ public Document extractDocument(DocumentWriteOperation sourceDocument) {
5050
try {
5151
return getDocumentBuilder().parse(new InputSource(new StringReader(xml)));
5252
} catch (Exception e) {
53-
throw new ConnectorException(String.format("Unable to parse XML for document with URI: %s; cause: %s",
53+
throw new MarkLogicLangchainException(String.format("Unable to parse XML for document with URI: %s; cause: %s",
5454
sourceDocument.getUri(), e.getMessage()), e);
5555
}
5656
}
@@ -69,7 +69,7 @@ public XPathExpression compileXPath(String xpathExpression, String purposeForErr
6969
return xpath.compile(xpathExpression);
7070
} catch (XPathExpressionException e) {
7171
String message = massageXPathCompilationError(e.getMessage());
72-
throw new ConnectorException(String.format(
72+
throw new MarkLogicLangchainException(String.format(
7373
"Unable to compile XPath expression for %s: %s; cause: %s",
7474
purposeForErrorMessage, xpathExpression, message), e
7575
);
@@ -89,7 +89,7 @@ private DocumentBuilder getDocumentBuilder() {
8989
try {
9090
this.documentBuilder = this.documentBuilderFactory.newDocumentBuilder();
9191
} catch (ParserConfigurationException e) {
92-
throw new ConnectorException(String.format("Unable to create XML document; cause: %s", e.getMessage()), e);
92+
throw new MarkLogicLangchainException(String.format("Unable to create XML document; cause: %s", e.getMessage()), e);
9393
}
9494
}
9595
return this.documentBuilder;

src/main/java/com/marklogic/langchain4j/dom/XPathNamespaceContext.java

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,27 +3,17 @@
33
*/
44
package com.marklogic.langchain4j.dom;
55

6-
import com.marklogic.spark.Options;
7-
86
import javax.xml.XMLConstants;
97
import javax.xml.namespace.NamespaceContext;
10-
import java.util.HashMap;
118
import java.util.Iterator;
129
import java.util.Map;
1310

1411
public class XPathNamespaceContext implements NamespaceContext {
1512

1613
private final Map<String, String> prefixesToNamespaces;
1714

18-
public XPathNamespaceContext(Map<String, String> properties) {
19-
prefixesToNamespaces = new HashMap<>();
20-
properties.keySet().stream()
21-
.filter(key -> key.startsWith(Options.XPATH_NAMESPACE_PREFIX))
22-
.forEach(key -> {
23-
String prefix = key.substring(Options.XPATH_NAMESPACE_PREFIX.length());
24-
String namespace = properties.get(key);
25-
prefixesToNamespaces.put(prefix, namespace);
26-
});
15+
public XPathNamespaceContext(Map<String, String> prefixesToNamespaces) {
16+
this.prefixesToNamespaces = prefixesToNamespaces;
2717
}
2818

2919
@Override

src/main/java/com/marklogic/langchain4j/embedding/DOMChunk.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
*/
44
package com.marklogic.langchain4j.embedding;
55

6-
import com.marklogic.spark.ConnectorException;
6+
import com.marklogic.langchain4j.MarkLogicLangchainException;
77
import dev.langchain4j.data.embedding.Embedding;
88
import org.w3c.dom.Document;
99
import org.w3c.dom.Element;
@@ -48,7 +48,7 @@ public String getEmbeddingText() {
4848
try {
4949
embeddingTextNodes = (NodeList) xpath.evaluate(textExpression, chunkElement, XPathConstants.NODESET);
5050
} catch (XPathExpressionException e) {
51-
throw new ConnectorException(String.format("Unable to evaluate XPath expression: %s; cause: %s",
51+
throw new MarkLogicLangchainException(String.format("Unable to evaluate XPath expression: %s; cause: %s",
5252
textExpression, e.getMessage()), e);
5353
}
5454

src/main/java/com/marklogic/langchain4j/embedding/DOMChunkSelector.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@
66
import com.marklogic.client.document.DocumentWriteOperation;
77
import com.marklogic.client.impl.DocumentWriteOperationImpl;
88
import com.marklogic.client.io.DOMHandle;
9+
import com.marklogic.langchain4j.MarkLogicLangchainException;
910
import com.marklogic.langchain4j.dom.DOMHelper;
10-
import com.marklogic.spark.ConnectorException;
1111
import org.w3c.dom.Document;
1212
import org.w3c.dom.Element;
1313
import org.w3c.dom.Node;
@@ -55,7 +55,7 @@ private NodeList selectChunkNodes(Document doc) {
5555
try {
5656
return (NodeList) chunksExpression.evaluate(doc, XPathConstants.NODESET);
5757
} catch (XPathExpressionException e) {
58-
throw new ConnectorException(String.format(
58+
throw new MarkLogicLangchainException(String.format(
5959
"Unable to evaluate XPath expression for selecting chunks: %s; cause: %s", chunksExpression, e.getMessage()), e);
6060
}
6161
}
@@ -65,7 +65,7 @@ private List<Chunk> makeChunks(DocumentWriteOperation sourceDocument, Document d
6565
for (int i = 0; i < chunkNodes.getLength(); i++) {
6666
Node node = chunkNodes.item(i);
6767
if (node.getNodeType() != Node.ELEMENT_NODE) {
68-
throw new ConnectorException(String.format("XPath expression for selecting chunks must only " +
68+
throw new MarkLogicLangchainException(String.format("XPath expression for selecting chunks must only " +
6969
"select elements; XPath: %s; document URI: %s", chunksExpression, sourceDocument.getUri()));
7070
}
7171
chunks.add(new DOMChunk(sourceDocument.getUri(), document, (Element) node, xmlChunkConfig, xpathFactory));

src/main/java/com/marklogic/langchain4j/embedding/EmbeddingAdder.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
package com.marklogic.langchain4j.embedding;
55

66
import com.marklogic.client.document.DocumentWriteOperation;
7+
import com.marklogic.langchain4j.Util;
78
import com.marklogic.langchain4j.splitter.DocumentTextSplitter;
8-
import com.marklogic.spark.Util;
99

1010
import java.util.ArrayList;
1111
import java.util.Iterator;
@@ -56,8 +56,8 @@ public Iterator<DocumentWriteOperation> get() {
5656
// Return any pending source documents - i.e. those with chunks that didn't add up to the embedding generator's
5757
// batch size, and thus embeddings haven't been added.
5858
if (pendingSourceDocuments != null && !pendingSourceDocuments.isEmpty()) {
59-
if (Util.EMBEDDER_LOGGER.isInfoEnabled()) {
60-
Util.EMBEDDER_LOGGER.info("Pending source document count: {}; generating embeddings for each document.",
59+
if (Util.LANGCHAIN4J_LOGGER.isInfoEnabled()) {
60+
Util.LANGCHAIN4J_LOGGER.info("Pending source document count: {}; generating embeddings for each document.",
6161
pendingSourceDocuments.size());
6262
}
6363
embeddingGenerator.generateEmbeddingsForPendingChunks();

src/main/java/com/marklogic/langchain4j/embedding/EmbeddingGenerator.java

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
*/
44
package com.marklogic.langchain4j.embedding;
55

6-
import com.marklogic.spark.Util;
6+
import com.marklogic.langchain4j.Util;
77
import dev.langchain4j.data.document.Metadata;
88
import dev.langchain4j.data.embedding.Embedding;
99
import dev.langchain4j.data.segment.TextSegment;
@@ -68,8 +68,8 @@ boolean addEmbeddings(DocumentAndChunks documentAndChunks) {
6868

6969
void generateEmbeddingsForPendingChunks() {
7070
if (!pendingChunks.isEmpty()) {
71-
if (Util.EMBEDDER_LOGGER.isDebugEnabled()) {
72-
Util.EMBEDDER_LOGGER.debug("Generating embeddings for pending chunks; count: {}.", pendingChunks.size());
71+
if (Util.LANGCHAIN4J_LOGGER.isDebugEnabled()) {
72+
Util.LANGCHAIN4J_LOGGER.debug("Generating embeddings for pending chunks; count: {}.", pendingChunks.size());
7373
}
7474
addEmbeddingsToChunks(pendingChunks);
7575
pendingChunks.clear();
@@ -80,8 +80,8 @@ private void addChunkToPendingChunks(Chunk chunk) {
8080
String text = chunk.getEmbeddingText();
8181
if (text != null && text.trim().length() > 0) {
8282
pendingChunks.add(chunk);
83-
} else if (Util.EMBEDDER_LOGGER.isDebugEnabled()) {
84-
Util.EMBEDDER_LOGGER.debug("Not generating embedding for chunk in URI {}; could not find text to use for generating an embedding.",
83+
} else if (Util.LANGCHAIN4J_LOGGER.isDebugEnabled()) {
84+
Util.LANGCHAIN4J_LOGGER.debug("Not generating embedding for chunk in URI {}; could not find text to use for generating an embedding.",
8585
chunk.getDocumentUri());
8686
}
8787
}
@@ -92,7 +92,7 @@ private void addEmbeddingsToChunks(List<Chunk> chunks) {
9292
logResponse(response, textSegments);
9393

9494
if (response.content() == null) {
95-
Util.EMBEDDER_LOGGER.warn("Sent {} chunks; no embeddings were returned; finish reason: {}",
95+
Util.LANGCHAIN4J_LOGGER.warn("Sent {} chunks; no embeddings were returned; finish reason: {}",
9696
textSegments.size(), response.finishReason());
9797
} else {
9898
List<Embedding> embeddings = response.content();
@@ -109,22 +109,22 @@ private List<TextSegment> makeTextSegments(List<Chunk> chunks) {
109109
}
110110

111111
private void logResponse(Response<List<Embedding>> response, List<TextSegment> textSegments) {
112-
if (Util.EMBEDDER_LOGGER.isInfoEnabled()) {
112+
if (Util.LANGCHAIN4J_LOGGER.isInfoEnabled()) {
113113
// Not every embedding model provides token usage.
114114
if (response.tokenUsage() != null) {
115-
Util.EMBEDDER_LOGGER.info("Sent {} chunks; token usage: {}", textSegments.size(), response.tokenUsage());
115+
Util.LANGCHAIN4J_LOGGER.info("Sent {} chunks; token usage: {}", textSegments.size(), response.tokenUsage());
116116
} else {
117-
Util.EMBEDDER_LOGGER.info("Sent {} chunks", textSegments.size());
117+
Util.LANGCHAIN4J_LOGGER.info("Sent {} chunks", textSegments.size());
118118
}
119119

120-
if (Util.EMBEDDER_LOGGER.isDebugEnabled()) {
120+
if (Util.LANGCHAIN4J_LOGGER.isDebugEnabled()) {
121121
long totalRequests = requestCount.incrementAndGet();
122122
if (response.tokenUsage() != null) {
123-
Util.EMBEDDER_LOGGER.debug("Requests: {}; tokens: {}", totalRequests,
123+
Util.LANGCHAIN4J_LOGGER.debug("Requests: {}; tokens: {}", totalRequests,
124124
tokenCount.addAndGet(response.tokenUsage().inputTokenCount())
125125
);
126126
} else {
127-
Util.EMBEDDER_LOGGER.debug("Requests: {}", totalRequests);
127+
Util.LANGCHAIN4J_LOGGER.debug("Requests: {}", totalRequests);
128128
}
129129
}
130130
}

src/main/java/com/marklogic/langchain4j/embedding/JsonChunkSelector.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
import com.marklogic.client.document.DocumentWriteOperation;
1111
import com.marklogic.client.impl.DocumentWriteOperationImpl;
1212
import com.marklogic.client.io.JacksonHandle;
13-
import com.marklogic.langchain4j.JsonUtil;
13+
import com.marklogic.langchain4j.Util;
1414

1515
import java.util.ArrayList;
1616
import java.util.List;
@@ -60,7 +60,7 @@ private JsonChunkSelector(String chunksPointerExpression, String textPointer, St
6060

6161
@Override
6262
public DocumentAndChunks selectChunks(DocumentWriteOperation sourceDocument) {
63-
JsonNode doc = JsonUtil.getJsonFromHandle(sourceDocument.getContent());
63+
JsonNode doc = Util.getJsonFromHandle(sourceDocument.getContent());
6464

6565
JsonNode chunksNode = doc.at(chunksPointer);
6666
if (chunksNode == null || (!(chunksNode instanceof ArrayNode) && !(chunksNode instanceof ObjectNode))) {

src/main/java/com/marklogic/langchain4j/splitter/DOMTextSelector.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@
44
package com.marklogic.langchain4j.splitter;
55

66
import com.marklogic.client.document.DocumentWriteOperation;
7+
import com.marklogic.langchain4j.MarkLogicLangchainException;
8+
import com.marklogic.langchain4j.Util;
79
import com.marklogic.langchain4j.dom.DOMHelper;
8-
import com.marklogic.spark.ConnectorException;
9-
import com.marklogic.spark.Util;
1010
import org.w3c.dom.Document;
1111
import org.w3c.dom.NodeList;
1212

@@ -33,15 +33,15 @@ public String selectTextToSplit(DocumentWriteOperation sourceDocument) {
3333
try {
3434
doc = domHelper.extractDocument(sourceDocument);
3535
} catch (Exception ex) {
36-
Util.MAIN_LOGGER.warn("Unable to select text to split in document: {}; cause: {}", sourceDocument.getUri(), ex.getMessage());
36+
Util.LANGCHAIN4J_LOGGER.warn("Unable to select text to split in document: {}; cause: {}", sourceDocument.getUri(), ex.getMessage());
3737
return null;
3838
}
3939

4040
NodeList items;
4141
try {
4242
items = (NodeList) this.textExpression.evaluate(doc, XPathConstants.NODESET);
4343
} catch (XPathExpressionException e) {
44-
throw new ConnectorException(String.format(
44+
throw new MarkLogicLangchainException(String.format(
4545
"Unable to evaluate XPath expression for selecting text to split: %s; cause: %s", textExpression, e.getMessage()), e);
4646
}
4747

0 commit comments

Comments
 (0)