MLE-17715 Can now configure embedder batch size

rjrudin · rjrudin · commit b564a8c42bc4 · 2024-11-06T15:23:11.000-05:00
Going to test this out next on some larger datasets to see what kinds of errors we'll need to handle from the embedding model.
diff --git a/src/main/java/com/marklogic/spark/Options.java b/src/main/java/com/marklogic/spark/Options.java
@@ -250,6 +250,13 @@ public abstract class Options {
      */
     public static final String WRITE_EMBEDDER_EMBEDDING_NAMESPACE = "spark.marklogic.write.embedder.embedding.namespace";
 
+    /**
+     * Defines the number of chunks to send to the embedding model in a single call. Defaults to 1.
+     *
+     * @since 2.5.0
+     */
+    public static final String WRITE_EMBEDDER_BATCH_SIZE = "spark.marklogic.write.embedder.batchSize";
+
     private Options() {
     }
 }
diff --git a/src/main/java/com/marklogic/spark/writer/embedding/EmbedderDocumentProcessor.java b/src/main/java/com/marklogic/spark/writer/embedding/EmbedderDocumentProcessor.java
@@ -5,7 +5,6 @@
 
 import com.marklogic.client.document.DocumentWriteOperation;
 import com.marklogic.spark.writer.DocumentProcessor;
-import dev.langchain4j.model.embedding.EmbeddingModel;
 
 import java.util.Iterator;
 import java.util.stream.Stream;
@@ -19,9 +18,9 @@ class EmbedderDocumentProcessor implements DocumentProcessor {
     private final ChunkSelector chunkSelector;
     private final EmbeddingGenerator embeddingGenerator;
 
-    EmbedderDocumentProcessor(ChunkSelector chunkSelector, EmbeddingModel embeddingModel) {
+    EmbedderDocumentProcessor(ChunkSelector chunkSelector, EmbeddingGenerator embeddingGenerator) {
         this.chunkSelector = chunkSelector;
-        this.embeddingGenerator = new EmbeddingGenerator(embeddingModel);
+        this.embeddingGenerator = embeddingGenerator;
     }
 
     @Override
diff --git a/src/main/java/com/marklogic/spark/writer/embedding/EmbedderDocumentProcessorFactory.java b/src/main/java/com/marklogic/spark/writer/embedding/EmbedderDocumentProcessorFactory.java
@@ -6,8 +6,8 @@
 import com.marklogic.spark.ConnectorException;
 import com.marklogic.spark.ContextSupport;
 import com.marklogic.spark.Options;
-import com.marklogic.spark.writer.dom.XPathNamespaceContext;
 import com.marklogic.spark.writer.DocumentProcessor;
+import com.marklogic.spark.writer.dom.XPathNamespaceContext;
 import dev.langchain4j.model.embedding.EmbeddingModel;
 
 import java.util.HashMap;
@@ -21,11 +21,29 @@ public static Optional<DocumentProcessor> makeEmbedder(ContextSupport context) {
-        Optional<EmbeddingModel> embeddingModel = makeEmbeddingModel(context);
-        if (embeddingModel.isPresent()) {
+        // makeEmbeddingGenerator already calls makeEmbeddingModel, so build the generator once and branch on it
+        // rather than calling makeEmbeddingModel here as well just to check for presence.
+        EmbeddingGenerator embeddingGenerator = makeEmbeddingGenerator(context);
+        if (embeddingGenerator != null) {
             ChunkSelector chunkSelector = makeChunkSelector(context);
-            return Optional.of(new EmbedderDocumentProcessor(chunkSelector, embeddingModel.get()));
+            return Optional.of(new EmbedderDocumentProcessor(chunkSelector, embeddingGenerator));
         }
         return Optional.empty();
     }
 
+    /**
+     * Builds the generator that adds embeddings to chunks, using the configured embedding model and the batch size
+     * given by {@link Options#WRITE_EMBEDDER_BATCH_SIZE}.
+     *
+     * @param context provides access to the user-defined options.
+     * @return a new generator, or {@code null} if no embedding model is configured; callers must handle null.
+     */
+    public static EmbeddingGenerator makeEmbeddingGenerator(ContextSupport context) {
+        Optional<EmbeddingModel> embeddingModel = makeEmbeddingModel(context);
+        if (embeddingModel.isPresent()) {
+            int batchSize = context.getIntOption(Options.WRITE_EMBEDDER_BATCH_SIZE, 1, 1);
+            return new EmbeddingGenerator(embeddingModel.get(), batchSize);
+        }
+        return null;
+    }
+
     /**
      * If the user is also splitting the documents, then we'll know the location of the chunks based on the default
      * chunks data structure produced by the splitter. If the user is instead processing documents that already have
@@ -74,7 +92,7 @@ private static ChunkSelector makeXmlChunkSelector(ContextSupport context) {
         );
     }
 
-    public static Optional<EmbeddingModel> makeEmbeddingModel(ContextSupport context) {
+    private static Optional<EmbeddingModel> makeEmbeddingModel(ContextSupport context) {
         if (!context.hasOption(Options.WRITE_EMBEDDER_MODEL_FUNCTION_CLASS_NAME)) {
             return Optional.empty();
         }
diff --git a/src/main/java/com/marklogic/spark/writer/embedding/EmbeddingGenerator.java b/src/main/java/com/marklogic/spark/writer/embedding/EmbeddingGenerator.java
@@ -4,36 +4,78 @@
 package com.marklogic.spark.writer.embedding;
 
 import com.marklogic.spark.Util;
+import dev.langchain4j.data.document.Metadata;
 import dev.langchain4j.data.embedding.Embedding;
+import dev.langchain4j.data.segment.TextSegment;
 import dev.langchain4j.model.embedding.EmbeddingModel;
 import dev.langchain4j.model.output.Response;
 
+import java.util.ArrayList;
+import java.util.Iterator;
 import java.util.List;
+import java.util.stream.Collectors;
 
 /**
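- * Knows how to generate and add embeddings for each chunk. Will soon support a batch size so that more than one
- * chunk can be sent to an embedding model in a single call.
+ * Knows how to generate and add embeddings for each chunk. Chunks are sent to the embedding model in batches of
+ * the configured batch size, so that more than one chunk can be sent in a single call.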
  */
 public class EmbeddingGenerator {
 
-    private EmbeddingModel embeddingModel;
+    // We don't have any use for metadata, so just need a single instance for constructing text segments.
+    private static final Metadata TEXT_SEGMENT_METADATA = new Metadata();
+
+    private final EmbeddingModel embeddingModel;
+    private final int batchSize;
 
     public EmbeddingGenerator(EmbeddingModel embeddingModel) {
+        this(embeddingModel, 1);
+    }
+
+    public EmbeddingGenerator(EmbeddingModel embeddingModel, int batchSize) {
         this.embeddingModel = embeddingModel;
+        this.batchSize = batchSize;
     }
 
     public void addEmbeddings(List<Chunk> chunks) {
-        if (chunks != null) {
-            chunks.forEach(chunk -> {
-                String text = chunk.getEmbeddingText();
-                if (text != null && text.trim().length() > 0) {
-                    Response<Embedding> response = embeddingModel.embed(text);
-                    chunk.addEmbedding(response.content());
-                } else if (Util.MAIN_LOGGER.isDebugEnabled()) {
-                    Util.MAIN_LOGGER.debug("Not generating embedding for chunk in URI {}; could not find text to use for generating an embedding.",
-                        chunk.getDocumentUri());
+        if (chunks == null || chunks.isEmpty()) {
+            return;
+        }
+
+        Iterator<Chunk> chunkIterator = chunks.iterator();
+        List<Chunk> batch = new ArrayList<>();
+        while (chunkIterator.hasNext()) {
+            Chunk chunk = chunkIterator.next();
+            String text = chunk.getEmbeddingText();
+            if (text != null && text.trim().length() > 0) {
+                batch.add(chunk);
+                if (batch.size() >= this.batchSize) {
+                    addEmbeddingsToChunks(batch);
+                    batch = new ArrayList<>();
                 }
-            });
+            } else if (Util.MAIN_LOGGER.isDebugEnabled()) {
+                Util.MAIN_LOGGER.debug("Not generating embedding for chunk in URI {}; could not find text to use for generating an embedding.",
+                    chunk.getDocumentUri());
+            }
+        }
+
+        if (!batch.isEmpty()) {
+            addEmbeddingsToChunks(batch);
+        }
+    }
+
+    private void addEmbeddingsToChunks(List<Chunk> chunks) {
+        List<TextSegment> textSegments = chunks.stream()
+            .map(chunk -> new TextSegment(chunk.getEmbeddingText(), TEXT_SEGMENT_METADATA))
+            .collect(Collectors.toList());
+
+        Response<List<Embedding>> response = embeddingModel.embedAll(textSegments);
+        if (Util.MAIN_LOGGER.isDebugEnabled()) {
+            Util.MAIN_LOGGER.debug("Sent {} chunks; token usage: {}", textSegments.size(), response.tokenUsage());
+        }
+
+        List<Embedding> embeddings = response.content();
+        for (int i = 0; i < embeddings.size(); i++) {
+            chunks.get(i).addEmbedding(embeddings.get(i));
         }
     }
 }
diff --git a/src/main/java/com/marklogic/spark/writer/splitter/SplitterDocumentProcessorFactory.java b/src/main/java/com/marklogic/spark/writer/splitter/SplitterDocumentProcessorFactory.java
@@ -10,9 +10,7 @@
 import com.marklogic.spark.writer.DocumentProcessor;
 import com.marklogic.spark.writer.dom.XPathNamespaceContext;
 import com.marklogic.spark.writer.embedding.EmbedderDocumentProcessorFactory;
-import com.marklogic.spark.writer.embedding.EmbeddingGenerator;
 import dev.langchain4j.data.document.DocumentSplitter;
-import dev.langchain4j.model.embedding.EmbeddingModel;
 
 import java.util.Arrays;
 import java.util.Optional;
@@ -85,12 +83,6 @@ private static ChunkAssembler makeChunkAssembler(ContextSupport context) {
             metadata.getPermissions().addFromDelimitedString(value);
         }
 
-        EmbeddingGenerator embeddingGenerator = null;
-        Optional<EmbeddingModel> embeddingModel = EmbedderDocumentProcessorFactory.makeEmbeddingModel(context);
-        if (embeddingModel.isPresent()) {
-            embeddingGenerator = new EmbeddingGenerator(embeddingModel.get());
-        }
-
         return new DefaultChunkAssembler(new ChunkConfig.Builder()
             .withMetadata(metadata)
             .withMaxChunks(context.getIntOption(Options.WRITE_SPLITTER_SIDECAR_MAX_CHUNKS, 0, 0))
@@ -100,7 +92,7 @@ private static ChunkAssembler makeChunkAssembler(ContextSupport context) {
             .withUriSuffix(context.getStringOption(Options.WRITE_SPLITTER_SIDECAR_URI_SUFFIX))
             .withXmlNamespace(context.getStringOption(Options.WRITE_SPLITTER_SIDECAR_XML_NAMESPACE))
             .build(),
-            embeddingGenerator
+            EmbedderDocumentProcessorFactory.makeEmbeddingGenerator(context)
         );
     }
 
diff --git a/src/main/resources/marklogic-spark-messages.properties b/src/main/resources/marklogic-spark-messages.properties
@@ -20,3 +20,5 @@ spark.marklogic.write.splitter.maxChunkSize=
 spark.marklogic.write.splitter.maxOverlapSize=
 spark.marklogic.write.embedder.chunks.jsonPointer=
 spark.marklogic.write.embedder.chunks.xpath=
+spark.marklogic.write.embedder.batchSize=
+
diff --git a/src/test/java/com/marklogic/spark/writer/embedding/AddEmbeddingsToJsonTest.java b/src/test/java/com/marklogic/spark/writer/embedding/AddEmbeddingsToJsonTest.java
@@ -206,6 +206,45 @@ void chunksIsAnObjectInsteadOfAnArray() {
         assertEquals(JsonNodeType.ARRAY, doc.get("embedding").getNodeType());
     }
 
+    @Test
+    void testBatchSize() {
+        TestEmbeddingModel.batchCounter = 0;
+        
+        readDocument("/marklogic-docs/java-client-intro.json")
+            .write().format(CONNECTOR_IDENTIFIER)
+            .option(Options.CLIENT_URI, makeClientUri())
+            .option(Options.WRITE_SPLITTER_JSON_POINTERS, "/text")
+            .option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
+            .option(Options.WRITE_URI_TEMPLATE, "/split-test.json")
+            .option(Options.WRITE_SPLITTER_MAX_CHUNK_SIZE, 500)
+            .option(Options.WRITE_EMBEDDER_MODEL_FUNCTION_CLASS_NAME, "com.marklogic.spark.writer.embedding.TestEmbeddingModel")
+            .option(Options.WRITE_EMBEDDER_BATCH_SIZE, 2)
+            .mode(SaveMode.Append)
+            .save();
+
+        JsonNode doc = readJsonDocument("/split-test.json");
+        assertEquals(4, doc.get("chunks").size());
+
+        assertEquals(2, TestEmbeddingModel.batchCounter, "Expecting 2 batches to be sent to the test " +
+            "embedding model, given the batch size of 2 and 4 chunks being created.");
+    }
+
+    @Test
+    void invalidBatchSize() {
+        DataFrameWriter writer = readDocument("/marklogic-docs/java-client-intro.json")
+            .write().format(CONNECTOR_IDENTIFIER)
+            .option(Options.CLIENT_URI, makeClientUri())
+            .option(Options.WRITE_SPLITTER_JSON_POINTERS, "/text")
+            .option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
+            .option(Options.WRITE_URI_TEMPLATE, "/split-test.json")
+            .option(Options.WRITE_EMBEDDER_MODEL_FUNCTION_CLASS_NAME, "com.marklogic.spark.writer.embedding.TestEmbeddingModel")
+            .option(Options.WRITE_EMBEDDER_BATCH_SIZE, "abc")
+            .mode(SaveMode.Append);
+
+        ConnectorException ex = assertThrowsConnectorException(() -> writer.save());
+        assertEquals("The value of 'spark.marklogic.write.embedder.batchSize' must be numeric.", ex.getMessage());
+    }
+
     private Dataset<Row> readDocument(String uri) {
         return newSparkSession().read().format(CONNECTOR_IDENTIFIER)
             .option(Options.CLIENT_URI, makeClientUri())
diff --git a/src/test/java/com/marklogic/spark/writer/embedding/EmbedderTest.java b/src/test/java/com/marklogic/spark/writer/embedding/EmbedderTest.java
@@ -62,7 +62,7 @@ void customizedPaths() {
                 .withTextPointer("/wrapper/custom-text")
                 .withEmbeddingArrayName("custom-embedding")
                 .build(),
-            new AllMiniLmL6V2EmbeddingModel()
+            new EmbeddingGenerator(new AllMiniLmL6V2EmbeddingModel())
         );
 
         DocumentWriteOperation output = embedder.apply(new DocumentWriteOperationImpl("a.json", null, new JacksonHandle(doc))).next();
diff --git a/src/test/java/com/marklogic/spark/writer/embedding/EmbeddingGeneratorTest.java b/src/test/java/com/marklogic/spark/writer/embedding/EmbeddingGeneratorTest.java
@@ -0,0 +1,38 @@
+/*
+ * Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
+ */
+package com.marklogic.spark.writer.embedding;
+
+import org.junit.jupiter.api.Test;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+/**
+ * Verifies that {@link EmbeddingGenerator} groups chunks into batches of the configured size before calling the
+ * embedding model.
+ */
+class EmbeddingGeneratorTest {
+
+    @Test
+    void test() {
+        // The counter is static, so reset it in case another test has already incremented it.
+        TestEmbeddingModel.batchCounter = 0;
+        TestEmbeddingModel embeddingModel = new TestEmbeddingModel();
+
+        EmbeddingGenerator generator = new EmbeddingGenerator(embeddingModel, 2);
+
+        List<Chunk> chunks = new ArrayList<>();
+        for (int i = 0; i < 5; i++) {
+            chunks.add(new TestEmbeddingModel.TestChunk("text" + i));
+        }
+
+        generator.addEmbeddings(chunks);
+
+        // 5 chunks with a batch size of 2 means two full batches plus a final partial batch of 1.
+        assertEquals(3, TestEmbeddingModel.batchCounter, "3 batches should have been sent given the batch size of 2.");
+    }
+
+}
diff --git a/src/test/java/com/marklogic/spark/writer/embedding/TestEmbeddingModel.java b/src/test/java/com/marklogic/spark/writer/embedding/TestEmbeddingModel.java
@@ -0,0 +1,72 @@
+/*
+ * Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
+ */
+package com.marklogic.spark.writer.embedding;
+
+import dev.langchain4j.data.embedding.Embedding;
+import dev.langchain4j.data.segment.TextSegment;
+import dev.langchain4j.model.embedding.EmbeddingModel;
+import dev.langchain4j.model.output.Response;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.function.Function;
+
+/**
+ * Used for testing the embedder batch size feature. Implements both the embedding model and the function that
+ * returns it, so tests can pass this class name as the embedder model function option.
+ */
+class TestEmbeddingModel implements EmbeddingModel, Function<Map<String, String>, EmbeddingModel> {
+
+    /**
+     * Number of calls made to {@link #embedAll(List)}. Static so that tests can read it via the class name; tests
+     * are expected to reset it to zero before exercising the model.
+     */
+    static int batchCounter;
+
+    @Override
+    public EmbeddingModel apply(Map<String, String> options) {
+        return this;
+    }
+
+    /**
+     * Records that a batch was received and returns one fixed-value embedding per text segment.
+     */
+    @Override
+    public Response<List<Embedding>> embedAll(List<TextSegment> textSegments) {
+        batchCounter++;
+        // Return an embedding for every segment: EmbeddingGenerator pairs embeddings with chunks by index, so a
+        // single-element response would only add an embedding to the first chunk of each batch.
+        Embedding[] embeddings = new Embedding[textSegments.size()];
+        Arrays.setAll(embeddings, i -> new Embedding(new float[]{1}));
+        return Response.from(Arrays.asList(embeddings));
+    }
+
+    /**
+     * Minimal chunk that only carries the text to embed.
+     */
+    static class TestChunk implements Chunk {
+
+        private final String text;
+
+        TestChunk(String text) {
+            this.text = text;
+        }
+
+        @Override
+        public String getDocumentUri() {
+            return "/doesnt/matter.json";
+        }
+
+        @Override
+        public String getEmbeddingText() {
+            return text;
+        }
+
+        @Override
+        public void addEmbedding(Embedding embedding) {
+            // Don't need to do this for the purposes of our test.
+        }
+    }
+}

Original file line number	Diff line number	Diff line change
`@@ -250,6 +250,13 @@ public abstract class Options {`
`250`	`250`	`*/`
`251`	`251`	`public static final String WRITE_EMBEDDER_EMBEDDING_NAMESPACE = "spark.marklogic.write.embedder.embedding.namespace";`
`252`	`252`
	`253`	`+ /**`
	`254`	`+ * Defines the number of chunks to send to the embedding model in a single call. Defaults to 1.`
	`255`	`+ *`
	`256`	`+ * @since 2.5.0`
	`257`	`+ */`
	`258`	`+ public static final String WRITE_EMBEDDER_BATCH_SIZE = "spark.marklogic.write.embedder.batchSize";`
	`259`	`+`
`253`	`260`	`private Options() {`
`254`	`261`	`}`
`255`	`262`	`}`