
Commit 9763a07

Merge pull request #354 from marklogic/feature/document-batch-size-new

MLE-17715 Embedder batch size can now handle chunks across documents

2 parents: 9f647bc + a7747c0

22 files changed, +378 -133 lines

Jenkinsfile

Lines changed: 1 addition & 0 deletions

@@ -117,6 +117,7 @@ pipeline{
         sh label:'mlcleanup', script: '''#!/bin/bash
             cd marklogic-spark-connector
             docker-compose down -v || true
+            sudo /usr/local/sbin/mladmin delete $WORKSPACE/marklogic-spark-connector/docker/caddy/
             sudo /usr/local/sbin/mladmin delete $WORKSPACE/marklogic-spark-connector/docker/marklogic/logs/
         '''
     }

docker-compose.yaml

Lines changed: 2 additions & 2 deletions

@@ -7,8 +7,8 @@ services:
   caddy-load-balancer:
     image: caddy:2-alpine
     volumes:
-      - ./caddy/data:/data
-      - ./caddy/config/Caddyfile:/etc/caddy/Caddyfile
+      # Not mapping the Caddy data directory, as that causes issues for Jenkins.
+      - ./docker/caddy/config/Caddyfile:/etc/caddy/Caddyfile
     ports:
       # Expand this range as needed. See Caddyfile for which ports are used for reverse proxies.
       - "8115:8115"

File renamed without changes.

src/main/java/com/marklogic/spark/Util.java

Lines changed: 6 additions & 0 deletions

@@ -18,6 +18,12 @@ public interface Util {
      */
     Logger MAIN_LOGGER = LoggerFactory.getLogger("com.marklogic.spark");

+    /**
+     * Intended for log messages pertaining to the embedder feature. Uses a separate logger so that it can be enabled
+     * at the info/debug level without enabling any other log messages.
+     */
+    Logger EMBEDDER_LOGGER = LoggerFactory.getLogger("com.marklogic.spark.embedder");
+
     static boolean hasOption(Map<String, String> properties, String... options) {
         return Stream.of(options)
             .anyMatch(option -> properties.get(option) != null && properties.get(option).trim().length() > 0);
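
Because EMBEDDER_LOGGER has its own logger name, its level can be raised without turning on the rest of the connector's logging. A minimal sketch of doing that programmatically, assuming Logback is the SLF4J binding in use (an assumption; any SLF4J backend's own configuration file would work just as well):

import ch.qos.logback.classic.Level;
import ch.qos.logback.classic.Logger;
import org.slf4j.LoggerFactory;

public class EnableEmbedderLogging {
    public static void main(String[] args) {
        // Raise only the embedder logger to DEBUG; "com.marklogic.spark" keeps its configured level.
        Logger embedderLogger = (Logger) LoggerFactory.getLogger("com.marklogic.spark.embedder");
        embedderLogger.setLevel(Level.DEBUG);
    }
}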

src/main/java/com/marklogic/spark/writer/DocumentProcessorFactory.java

Lines changed: 10 additions & 5 deletions

@@ -5,19 +5,24 @@

 import com.marklogic.spark.ContextSupport;
 import com.marklogic.spark.writer.embedding.EmbedderDocumentProcessorFactory;
+import com.marklogic.spark.writer.splitter.SplitterDocumentProcessor;
 import com.marklogic.spark.writer.splitter.SplitterDocumentProcessorFactory;

 import java.util.Optional;

 abstract class DocumentProcessorFactory {

     static DocumentProcessor buildDocumentProcessor(ContextSupport context) {
-        Optional<DocumentProcessor> splitter = SplitterDocumentProcessorFactory.makeSplitter(context);
-        if (splitter.isPresent()) {
-            return splitter.get();
+        Optional<SplitterDocumentProcessor> splitter = SplitterDocumentProcessorFactory.makeSplitter(context);
+
+        Optional<DocumentProcessor> embedder = EmbedderDocumentProcessorFactory.makeEmbedder(
+            context, splitter.isPresent() ? splitter.get() : null
+        );
+
+        if (embedder.isPresent()) {
+            return embedder.get();
         }
-        Optional<DocumentProcessor> embedder = EmbedderDocumentProcessorFactory.makeEmbedder(context);
-        return embedder.isPresent() ? embedder.get() : null;
+        return splitter.isPresent() ? splitter.get() : null;
     }

     private DocumentProcessorFactory() {
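
The reordered logic gives the embedder precedence because it may wrap the splitter; only when no embedder is configured does the splitter run alone. A compact illustration of that precedence with hypothetical stand-ins (UnaryOperator<String> in place of the connector's processors):

import java.util.Optional;
import java.util.function.UnaryOperator;

public class ProcessorPrecedenceSketch {

    public static void main(String[] args) {
        // A splitter may or may not be configured; the embedder, when configured,
        // wraps whatever splitter exists so that splitting happens first.
        Optional<UnaryOperator<String>> splitter = Optional.of(s -> s + " -> split");
        Optional<UnaryOperator<String>> embedder = makeEmbedder(splitter.orElse(null));

        // Prefer the embedder (which already contains the splitter), else fall back to the splitter.
        UnaryOperator<String> processor = embedder.or(() -> splitter).orElse(null);
        System.out.println(processor.apply("doc")); // prints: doc -> split -> embed
    }

    static Optional<UnaryOperator<String>> makeEmbedder(UnaryOperator<String> splitter) {
        UnaryOperator<String> split = splitter != null ? splitter : s -> s;
        return Optional.of(s -> split.apply(s) + " -> embed");
    }
}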

src/main/java/com/marklogic/spark/writer/WriteBatcherDataWriter.java

Lines changed: 15 additions & 0 deletions

@@ -36,6 +36,7 @@
 import java.util.Set;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicReference;
+import java.util.function.Supplier;

 /**
  * Uses the Java Client's WriteBatcher to handle writing rows as documents to MarkLogic.
@@ -108,6 +109,8 @@ public WriterCommitMessage commit() {
         // in a document. Those are retrieved here.
         buildAndWriteDocuments(rowConverter.getRemainingDocumentInputs());

+        flushDocumentProcessor();
+
         this.writeBatcher.flushAndWait();

         throwWriteFailureIfExists();
@@ -179,6 +182,18 @@ private Set<String> getGraphNames() {
             null;
     }

+    /**
+     * A document processor can implement Supplier so that it can batch up documents to be written and then return
+     * any pending documents during the commit operation. This allows for the embedder processor to batch calls to the
+     * embedding model.
+     */
+    private void flushDocumentProcessor() {
+        if (this.documentProcessor instanceof Supplier) {
+            Iterator<DocumentWriteOperation> remainingDocuments = ((Supplier<Iterator<DocumentWriteOperation>>) this.documentProcessor).get();
+            remainingDocuments.forEachRemaining(this::writeDocument);
+        }
+    }
+
     private void addBatchListeners(WriteBatcher writeBatcher) {
         writeBatcher.onBatchSuccess(batch -> this.successItemCount.getAndAdd(batch.getItems().length));
         if (writeContext.isAbortOnFailure()) {
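
The Supplier check in flushDocumentProcessor is the entire flush contract between the writer and a batching processor. Below is a self-contained sketch of that contract using hypothetical stand-in types (Doc, BatchingProcessor), not the connector's classes:

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.function.Function;
import java.util.function.Supplier;

public class FlushContractSketch {

    // Stand-in for DocumentWriteOperation.
    record Doc(String uri) {}

    // A processor that batches documents: apply() may hold documents back; get() releases the rest.
    static class BatchingProcessor implements Function<Doc, Iterator<Doc>>, Supplier<Iterator<Doc>> {
        private final List<Doc> pending = new ArrayList<>();
        private final int batchSize;

        BatchingProcessor(int batchSize) {
            this.batchSize = batchSize;
        }

        @Override
        public Iterator<Doc> apply(Doc doc) {
            pending.add(doc);
            if (pending.size() >= batchSize) {
                List<Doc> batch = new ArrayList<>(pending);
                pending.clear();
                return batch.iterator();
            }
            return List.<Doc>of().iterator();
        }

        @Override
        public Iterator<Doc> get() {
            // Called at commit time; releases documents that never filled a batch.
            List<Doc> rest = new ArrayList<>(pending);
            pending.clear();
            return rest.iterator();
        }
    }

    public static void main(String[] args) {
        BatchingProcessor processor = new BatchingProcessor(2);
        for (String uri : new String[]{"/a.json", "/b.json", "/c.json"}) {
            processor.apply(new Doc(uri)).forEachRemaining(d -> System.out.println("write " + d.uri()));
        }
        // Commit: flush whatever the processor is still holding, mirroring flushDocumentProcessor().
        processor.get().forEachRemaining(d -> System.out.println("write at commit " + d.uri()));
    }
}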

src/main/java/com/marklogic/spark/writer/embedding/ChunkSelector.java

Lines changed: 0 additions & 20 deletions

@@ -5,8 +5,6 @@

 import com.marklogic.client.document.DocumentWriteOperation;

-import java.util.List;
-
 /**
  * Abstracts how chunks are selected from a JSON or XML document.
  */
@@ -19,22 +17,4 @@ public interface ChunkSelector {
      */
     DocumentAndChunks selectChunks(DocumentWriteOperation sourceDocument);

-    class DocumentAndChunks {
-
-        private final DocumentWriteOperation documentToWrite;
-        private final List<Chunk> chunks;
-
-        DocumentAndChunks(DocumentWriteOperation documentToWrite, List<Chunk> chunks) {
-            this.documentToWrite = documentToWrite;
-            this.chunks = chunks;
-        }
-
-        public DocumentWriteOperation getDocumentToWrite() {
-            return documentToWrite;
-        }
-
-        public List<Chunk> getChunks() {
-            return chunks;
-        }
-    }
 }

src/main/java/com/marklogic/spark/writer/embedding/DOMChunkSelector.java

Lines changed: 5 additions & 3 deletions

@@ -20,18 +20,20 @@
 import java.util.ArrayList;
 import java.util.List;

-public class DOMChunkSelector implements ChunkSelector {
+class DOMChunkSelector implements ChunkSelector {

     private final XPathFactory xpathFactory;
     private final XPathExpression chunksExpression;
     private final XmlChunkConfig xmlChunkConfig;
     private final DOMHelper domHelper;

-    public DOMChunkSelector(String chunksExpression, XmlChunkConfig xmlChunkConfig) {
+    DOMChunkSelector(String chunksExpression, XmlChunkConfig xmlChunkConfig) {
         this.xpathFactory = XPathFactory.newInstance();
         this.xmlChunkConfig = xmlChunkConfig;
         this.domHelper = new DOMHelper(xmlChunkConfig.getNamespaceContext());
-        this.chunksExpression = domHelper.compileXPath(chunksExpression, "selecting chunks");
+
+        String chunksXPath = chunksExpression != null ? chunksExpression : "/node()/chunks";
+        this.chunksExpression = domHelper.compileXPath(chunksXPath, "selecting chunks");
     }

     @Override
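
The constructor now falls back to the default XPath /node()/chunks when no expression is configured. A standalone check of what that default matches, using only JDK APIs; the sample document shape here is assumed for illustration:

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactory;
import java.io.ByteArrayInputStream;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;

public class DefaultChunksXPathCheck {
    public static void main(String[] args) throws Exception {
        String xml = "<root><chunks><chunk><text>hello</text></chunk>"
            + "<chunk><text>world</text></chunk></chunks></root>";
        Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder()
            .parse(new ByteArrayInputStream(xml.getBytes()));

        // Same default expression the constructor falls back to when chunksExpression is null.
        XPathExpression expr = XPathFactory.newInstance().newXPath().compile("/node()/chunks");
        NodeList chunksElements = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
        System.out.println("Matched " + chunksElements.getLength() + " chunks element(s)");
    }
}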
src/main/java/com/marklogic/spark/writer/embedding/DocumentAndChunks.java

Lines changed: 70 additions & 0 deletions

@@ -0,0 +1,70 @@
+/*
+ * Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
+ */
+package com.marklogic.spark.writer.embedding;
+
+import com.marklogic.client.document.DocumentWriteOperation;
+import com.marklogic.client.io.marker.AbstractWriteHandle;
+import com.marklogic.client.io.marker.DocumentMetadataWriteHandle;
+import org.jetbrains.annotations.NotNull;
+
+import java.util.List;
+
+/**
+ * Encapsulates a document to be written to MarkLogic along with an optional list of chunks that have been extracted
+ * from it. Capturing the list of chunks is useful when a user wishes to use both the splitter and embedder. In that
+ * scenario, the embedder can reuse the list of chunks produced by the splitter without having to find the chunks
+ * itself.
+ */
+public class DocumentAndChunks implements DocumentWriteOperation {
+
+    private final DocumentWriteOperation documentToWrite;
+    private final List<Chunk> chunks;
+
+    public DocumentAndChunks(DocumentWriteOperation documentToWrite, List<Chunk> chunks) {
+        this.documentToWrite = documentToWrite;
+        this.chunks = chunks;
+    }
+
+    public DocumentWriteOperation getDocumentToWrite() {
+        return documentToWrite;
+    }
+
+    public List<Chunk> getChunks() {
+        return chunks;
+    }
+
+    public boolean hasChunks() {
+        return chunks != null && !chunks.isEmpty();
+    }
+
+    @Override
+    public OperationType getOperationType() {
+        return OperationType.DOCUMENT_WRITE;
+    }
+
+    @Override
+    public String getUri() {
+        return documentToWrite.getUri();
+    }
+
+    @Override
+    public DocumentMetadataWriteHandle getMetadata() {
+        return documentToWrite.getMetadata();
+    }
+
+    @Override
+    public AbstractWriteHandle getContent() {
+        return documentToWrite.getContent();
+    }
+
+    @Override
+    public String getTemporalDocumentURI() {
+        return documentToWrite.getTemporalDocumentURI();
+    }
+
+    @Override
+    public int compareTo(@NotNull DocumentWriteOperation o) {
+        return documentToWrite.compareTo(o);
+    }
+}
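
Because DocumentAndChunks delegates every DocumentWriteOperation method to the wrapped operation, downstream code can treat it as an ordinary write operation and recover the chunks with an instanceof check. A small illustration of that decorator pattern with hypothetical stand-in types, not the Java Client interfaces:

import java.util.List;

public class DecoratorCheckSketch {

    // Hypothetical stand-ins for DocumentWriteOperation and DocumentAndChunks.
    interface WriteOp { String uri(); }

    record PlainDoc(String uri) implements WriteOp {}

    record DocWithChunks(WriteOp delegate, List<String> chunks) implements WriteOp {
        public String uri() { return delegate.uri(); } // delegate like DocumentAndChunks does
        boolean hasChunks() { return chunks != null && !chunks.isEmpty(); }
    }

    public static void main(String[] args) {
        List<WriteOp> ops = List.of(
            new PlainDoc("/plain.json"),
            new DocWithChunks(new PlainDoc("/split.json"), List.of("chunk one", "chunk two")));
        for (WriteOp op : ops) {
            // Mirrors the instanceof check the embedder applies to documents coming out of the splitter.
            if (op instanceof DocWithChunks doc && doc.hasChunks()) {
                System.out.println(op.uri() + " has " + doc.chunks().size() + " chunk(s) to embed");
            } else {
                System.out.println(op.uri() + " has no chunks; write as-is");
            }
        }
    }
}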

src/main/java/com/marklogic/spark/writer/embedding/EmbedderDocumentProcessor.java

Lines changed: 86 additions & 6 deletions

@@ -4,31 +4,111 @@
 package com.marklogic.spark.writer.embedding;

 import com.marklogic.client.document.DocumentWriteOperation;
+import com.marklogic.spark.Util;
 import com.marklogic.spark.writer.DocumentProcessor;
+import com.marklogic.spark.writer.splitter.SplitterDocumentProcessor;

+import java.util.ArrayList;
 import java.util.Iterator;
+import java.util.List;
+import java.util.function.Supplier;
 import java.util.stream.Stream;

 /**
  * Supports a use case where a document already has chunks in it, which must be selected via a {@code ChunkSelector}.
  * The {@code EmbeddingModel} is then used to generate and add an embedding to each chunk in a given document.
  */
-class EmbedderDocumentProcessor implements DocumentProcessor {
+class EmbedderDocumentProcessor implements DocumentProcessor, Supplier<Iterator<DocumentWriteOperation>> {

     private final ChunkSelector chunkSelector;
     private final EmbeddingGenerator embeddingGenerator;
+    private final SplitterDocumentProcessor splitterDocumentProcessor;

-    EmbedderDocumentProcessor(ChunkSelector chunkSelector, EmbeddingGenerator embeddingGenerator) {
+    private List<DocumentWriteOperation> pendingSourceDocuments = new ArrayList<>();
+
+    EmbedderDocumentProcessor(ChunkSelector chunkSelector, EmbeddingGenerator embeddingGenerator, SplitterDocumentProcessor splitterDocumentProcessor) {
         this.chunkSelector = chunkSelector;
         this.embeddingGenerator = embeddingGenerator;
+        this.splitterDocumentProcessor = splitterDocumentProcessor;
     }

+    /**
+     * Splits the source document first when a splitter is present; otherwise, selects existing chunks from the
+     * document. Either way, documents are held as pending until the embedding generator reports that it has
+     * generated embeddings for their chunks.
+     */
     @Override
     public Iterator<DocumentWriteOperation> apply(DocumentWriteOperation sourceDocument) {
-        ChunkSelector.DocumentAndChunks documentAndChunks = chunkSelector.selectChunks(sourceDocument);
-        if (documentAndChunks.getChunks() != null && !documentAndChunks.getChunks().isEmpty()) {
-            embeddingGenerator.addEmbeddings(documentAndChunks.getChunks());
+        if (splitterDocumentProcessor != null) {
+            return splitAndAddEmbeddings(sourceDocument);
+        }
+
+        DocumentAndChunks documentAndChunks = chunkSelector.selectChunks(sourceDocument);
+        return documentAndChunks.hasChunks() ?
+            addEmbeddingsToExistingChunks(documentAndChunks) :
+            // If no chunks are found, embeddings can't be added, so just return the source document.
+            Stream.of(documentAndChunks.getDocumentToWrite()).iterator();
+    }
+
+    @Override
+    public Iterator<DocumentWriteOperation> get() {
+        // Return any pending source documents - i.e. those with chunks that didn't add up to the embedding generator's
+        // batch size, and thus embeddings haven't been added.
+        if (pendingSourceDocuments != null && !pendingSourceDocuments.isEmpty()) {
+            if (Util.EMBEDDER_LOGGER.isInfoEnabled()) {
+                Util.EMBEDDER_LOGGER.info("Pending source document count: {}; generating embeddings for each document.",
+                    pendingSourceDocuments.size());
+            }
+            embeddingGenerator.generateEmbeddingsForPendingChunks();
+            return pendingSourceDocuments.iterator();
+        }
+        return Stream.<DocumentWriteOperation>empty().iterator();
+    }
+
+    private Iterator<DocumentWriteOperation> splitAndAddEmbeddings(DocumentWriteOperation sourceDocument) {
+        Iterator<DocumentWriteOperation> splitDocuments = splitterDocumentProcessor.apply(sourceDocument);
+
+        // Track the list of documents to return. A document won't be returned immediately if it has chunks but the
+        // embedding generator doesn't receive enough chunks to meet its batch size threshold.
+        List<DocumentWriteOperation> documentsToReturn = new ArrayList<>();
+
+        splitDocuments.forEachRemaining(splitDoc -> {
+            boolean hasChunks = splitDoc instanceof DocumentAndChunks && ((DocumentAndChunks) splitDoc).hasChunks();
+            if (hasChunks) {
+                DocumentAndChunks documentAndChunks = (DocumentAndChunks) splitDoc;
+                pendingSourceDocuments.add(documentAndChunks);
+                boolean embeddingsWereGenerated = embeddingGenerator.addEmbeddings(documentAndChunks);
+                // If the embedding generator received enough chunks to exceed its batch size, then all the pending
+                // documents can be added to the list of documents to return, as we know those documents will have had
+                // embeddings added to them.
+                if (embeddingsWereGenerated) {
+                    documentsToReturn.addAll(pendingSourceDocuments);
+                    pendingSourceDocuments.clear();
+                }
+            } else {
+                // If the document doesn't have any chunks, it can be returned immediately.
+                documentsToReturn.add(splitDoc);
+            }
+        });
+
+        return documentsToReturn.iterator();
+    }
+
+    /**
+     * For existing chunks - add the document to the pending list, then add embeddings. If embeddings were generated,
+     * return an iterator over all the pending documents, which now have embeddings.
+     */
+    private Iterator<DocumentWriteOperation> addEmbeddingsToExistingChunks(DocumentAndChunks documentAndChunks) {
+        pendingSourceDocuments.add(documentAndChunks);
+        boolean embeddingsWereGenerated = embeddingGenerator.addEmbeddings(documentAndChunks);
+        if (embeddingsWereGenerated) {
+            List<DocumentWriteOperation> documentsWithEmbeddings = new ArrayList<>(pendingSourceDocuments);
+            pendingSourceDocuments.clear();
+            return documentsWithEmbeddings.iterator();
+        } else {
+            return Stream.<DocumentWriteOperation>empty().iterator();
         }
-        return Stream.of(documentAndChunks.getDocumentToWrite()).iterator();
     }
 }
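
The cross-document behavior named in the commit message is easiest to see with concrete numbers. A self-contained sketch with toy stand-ins (Doc, ToyEmbeddingGenerator) that are not the connector's real classes, assuming a batch size of three chunks:

import java.util.ArrayList;
import java.util.List;

public class CrossDocumentBatchingSketch {

    record Doc(String uri, int chunkCount) {}

    // Toy stand-in for EmbeddingGenerator: buffers chunks and reports true once a batch is flushed.
    static class ToyEmbeddingGenerator {
        private final int batchSize;
        private int bufferedChunks;

        ToyEmbeddingGenerator(int batchSize) {
            this.batchSize = batchSize;
        }

        boolean addEmbeddings(Doc doc) {
            bufferedChunks += doc.chunkCount();
            if (bufferedChunks >= batchSize) {
                bufferedChunks = 0; // pretend the embedding model was called here
                return true;
            }
            return false;
        }
    }

    public static void main(String[] args) {
        ToyEmbeddingGenerator generator = new ToyEmbeddingGenerator(3);
        List<Doc> pending = new ArrayList<>();

        // Each document has a single chunk, so no document fills the batch on its own.
        for (Doc doc : List.of(new Doc("/a.json", 1), new Doc("/b.json", 1), new Doc("/c.json", 1))) {
            pending.add(doc);
            if (generator.addEmbeddings(doc)) {
                // Batch filled across documents: every pending doc now has embeddings and can be written.
                System.out.println("Writing " + pending);
                pending.clear();
            }
        }
        // Anything still pending when the Spark task commits is flushed via the processor's get().
        System.out.println("Still pending at commit: " + pending);
    }
}

With single-chunk documents, nothing is written until the third document fills the batch; documents still pending at commit time are released through get(), which is exactly what WriteBatcherDataWriter.flushDocumentProcessor invokes above.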
