Skip to content

Commit 3f5c5bd

Browse files
committed
Refactor: Moving split/embed code into langchain4j package
No functional changes. The next step will be to remove all "com.marklogic.spark" imports from the "com.marklogic.langchain4j" package.
1 parent b0f70ea commit 3f5c5bd

38 files changed

+138
-139
lines changed

src/main/java/com/marklogic/spark/writer/JsonUtil.java renamed to src/main/java/com/marklogic/langchain4j/JsonUtil.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/*
22
* Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
33
*/
4-
package com.marklogic.spark.writer;
4+
package com.marklogic.langchain4j;
55

66
import com.fasterxml.jackson.core.JsonProcessingException;
77
import com.fasterxml.jackson.databind.JsonNode;

src/main/java/com/marklogic/spark/writer/dom/DOMHelper.java renamed to src/main/java/com/marklogic/langchain4j/dom/DOMHelper.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/*
22
* Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
33
*/
4-
package com.marklogic.spark.writer.dom;
4+
package com.marklogic.langchain4j.dom;
55

66
import com.marklogic.client.document.DocumentWriteOperation;
77
import com.marklogic.client.impl.HandleAccessor;

src/main/java/com/marklogic/spark/writer/dom/XPathNamespaceContext.java renamed to src/main/java/com/marklogic/langchain4j/dom/XPathNamespaceContext.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/*
22
* Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
33
*/
4-
package com.marklogic.spark.writer.dom;
4+
package com.marklogic.langchain4j.dom;
55

66
import com.marklogic.spark.Options;
77

src/main/java/com/marklogic/spark/writer/embedding/Chunk.java renamed to src/main/java/com/marklogic/langchain4j/embedding/Chunk.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/*
22
* Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
33
*/
4-
package com.marklogic.spark.writer.embedding;
4+
package com.marklogic.langchain4j.embedding;
55

66
import dev.langchain4j.data.embedding.Embedding;
77

src/main/java/com/marklogic/spark/writer/embedding/ChunkSelector.java renamed to src/main/java/com/marklogic/langchain4j/embedding/ChunkSelector.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/*
22
* Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
33
*/
4-
package com.marklogic.spark.writer.embedding;
4+
package com.marklogic.langchain4j.embedding;
55

66
import com.marklogic.client.document.DocumentWriteOperation;
77

src/main/java/com/marklogic/spark/writer/embedding/DOMChunk.java renamed to src/main/java/com/marklogic/langchain4j/embedding/DOMChunk.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/*
22
* Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
33
*/
4-
package com.marklogic.spark.writer.embedding;
4+
package com.marklogic.langchain4j.embedding;
55

66
import com.marklogic.spark.ConnectorException;
77
import dev.langchain4j.data.embedding.Embedding;

src/main/java/com/marklogic/spark/writer/embedding/DOMChunkSelector.java renamed to src/main/java/com/marklogic/langchain4j/embedding/DOMChunkSelector.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
/*
22
* Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
33
*/
4-
package com.marklogic.spark.writer.embedding;
4+
package com.marklogic.langchain4j.embedding;
55

66
import com.marklogic.client.document.DocumentWriteOperation;
77
import com.marklogic.client.impl.DocumentWriteOperationImpl;
88
import com.marklogic.client.io.DOMHandle;
9+
import com.marklogic.langchain4j.dom.DOMHelper;
910
import com.marklogic.spark.ConnectorException;
10-
import com.marklogic.spark.writer.dom.DOMHelper;
1111
import org.w3c.dom.Document;
1212
import org.w3c.dom.Element;
1313
import org.w3c.dom.Node;
@@ -20,14 +20,14 @@
2020
import java.util.ArrayList;
2121
import java.util.List;
2222

23-
class DOMChunkSelector implements ChunkSelector {
23+
public class DOMChunkSelector implements ChunkSelector {
2424

2525
private final XPathFactory xpathFactory;
2626
private final XPathExpression chunksExpression;
2727
private final XmlChunkConfig xmlChunkConfig;
2828
private final DOMHelper domHelper;
2929

30-
DOMChunkSelector(String chunksExpression, XmlChunkConfig xmlChunkConfig) {
30+
public DOMChunkSelector(String chunksExpression, XmlChunkConfig xmlChunkConfig) {
3131
this.xpathFactory = XPathFactory.newInstance();
3232
this.xmlChunkConfig = xmlChunkConfig;
3333
this.domHelper = new DOMHelper(xmlChunkConfig.getNamespaceContext());

src/main/java/com/marklogic/spark/writer/embedding/DocumentAndChunks.java renamed to src/main/java/com/marklogic/langchain4j/embedding/DocumentAndChunks.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/*
22
* Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
33
*/
4-
package com.marklogic.spark.writer.embedding;
4+
package com.marklogic.langchain4j.embedding;
55

66
import com.marklogic.client.document.DocumentWriteOperation;
77
import com.marklogic.client.io.marker.AbstractWriteHandle;

src/main/java/com/marklogic/spark/writer/embedding/EmbedderDocumentProcessor.java renamed to src/main/java/com/marklogic/langchain4j/embedding/EmbeddingAdder.java

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,35 @@
11
/*
22
* Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
33
*/
4-
package com.marklogic.spark.writer.embedding;
4+
package com.marklogic.langchain4j.embedding;
55

66
import com.marklogic.client.document.DocumentWriteOperation;
7+
import com.marklogic.langchain4j.splitter.DocumentTextSplitter;
78
import com.marklogic.spark.Util;
8-
import com.marklogic.spark.writer.DocumentProcessor;
9-
import com.marklogic.spark.writer.splitter.SplitterDocumentProcessor;
109

1110
import java.util.ArrayList;
1211
import java.util.Iterator;
1312
import java.util.List;
13+
import java.util.function.Function;
1414
import java.util.function.Supplier;
1515
import java.util.stream.Stream;
1616

1717
/**
1818
* Supports a use case where a document already has chunks in it, which must be selected via a {@code ChunkSelector}.
1919
* The {@code EmbeddingModel} is then used to generate and add an embedding to each chunk in a given document.
2020
*/
21-
class EmbedderDocumentProcessor implements DocumentProcessor, Supplier<Iterator<DocumentWriteOperation>> {
21+
public class EmbeddingAdder implements Function<DocumentWriteOperation, Iterator<DocumentWriteOperation>>, Supplier<Iterator<DocumentWriteOperation>> {
2222

2323
private final ChunkSelector chunkSelector;
2424
private final EmbeddingGenerator embeddingGenerator;
25-
private final SplitterDocumentProcessor splitterDocumentProcessor;
25+
private final DocumentTextSplitter documentTextSplitter;
2626

2727
private List<DocumentWriteOperation> pendingSourceDocuments = new ArrayList<>();
2828

29-
EmbedderDocumentProcessor(ChunkSelector chunkSelector, EmbeddingGenerator embeddingGenerator, SplitterDocumentProcessor splitterDocumentProcessor) {
29+
public EmbeddingAdder(ChunkSelector chunkSelector, EmbeddingGenerator embeddingGenerator, DocumentTextSplitter documentTextSplitter) {
3030
this.chunkSelector = chunkSelector;
3131
this.embeddingGenerator = embeddingGenerator;
32-
this.splitterDocumentProcessor = splitterDocumentProcessor;
32+
this.documentTextSplitter = documentTextSplitter;
3333
}
3434

3535
/**
@@ -40,7 +40,7 @@ class EmbedderDocumentProcessor implements DocumentProcessor, Supplier<Iterator<
4040
*/
4141
@Override
4242
public Iterator<DocumentWriteOperation> apply(DocumentWriteOperation sourceDocument) {
43-
if (splitterDocumentProcessor != null) {
43+
if (documentTextSplitter != null) {
4444
return splitAndAddEmbeddings(sourceDocument);
4545
}
4646

@@ -67,7 +67,7 @@ public Iterator<DocumentWriteOperation> get() {
6767
}
6868

6969
private Iterator<DocumentWriteOperation> splitAndAddEmbeddings(DocumentWriteOperation sourceDocument) {
70-
Iterator<DocumentWriteOperation> splitDocuments = splitterDocumentProcessor.apply(sourceDocument);
70+
Iterator<DocumentWriteOperation> splitDocuments = documentTextSplitter.apply(sourceDocument);
7171

7272
// Track the list of documents to return. A document won't be returned immediately if it has chunks but the
7373
// embedding generator doesn't receive enough chunks to meet its batch size threshold.

src/main/java/com/marklogic/spark/writer/embedding/EmbeddingGenerator.java renamed to src/main/java/com/marklogic/langchain4j/embedding/EmbeddingGenerator.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/*
22
* Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
33
*/
4-
package com.marklogic.spark.writer.embedding;
4+
package com.marklogic.langchain4j.embedding;
55

66
import com.marklogic.spark.Util;
77
import dev.langchain4j.data.document.Metadata;
@@ -16,7 +16,7 @@
1616
import java.util.concurrent.atomic.AtomicLong;
1717
import java.util.stream.Collectors;
1818

19-
class EmbeddingGenerator {
19+
public class EmbeddingGenerator {
2020

2121
// We don't have any use for metadata, so just need a single instance for constructing text segments.
2222
private static final Metadata TEXT_SEGMENT_METADATA = new Metadata();
@@ -30,11 +30,11 @@ class EmbeddingGenerator {
3030

3131
private List<Chunk> pendingChunks = new ArrayList<>();
3232

33-
EmbeddingGenerator(EmbeddingModel embeddingModel) {
33+
public EmbeddingGenerator(EmbeddingModel embeddingModel) {
3434
this(embeddingModel, 1);
3535
}
3636

37-
EmbeddingGenerator(EmbeddingModel embeddingModel, int batchSize) {
37+
public EmbeddingGenerator(EmbeddingModel embeddingModel, int batchSize) {
3838
this.embeddingModel = embeddingModel;
3939
this.batchSize = batchSize;
4040
}

0 commit comments

Comments
 (0)