Not adding invalid embeddings

rjrudin · rjrudin · commit 278232470065 · 2024-12-03T09:30:59.000-05:00
For now, "invalid" means a vector that contains only zeroes, which Ollama seems to return intermittently. MarkLogic can't do anything useful with such a vector - in fact, it will throw an error - so we're not adding it to the chunk.
diff --git a/marklogic-langchain4j/src/main/java/com/marklogic/langchain4j/embedding/EmbeddingGenerator.java b/marklogic-langchain4j/src/main/java/com/marklogic/langchain4j/embedding/EmbeddingGenerator.java
@@ -97,11 +97,34 @@ private void addEmbeddingsToChunks(List<Chunk> chunks) {
         } else {
             List<Embedding> embeddings = response.content();
             for (int i = 0; i < embeddings.size(); i++) {
-                chunks.get(i).addEmbedding(embeddings.get(i));
+                addEmbeddingToChunk(chunks.get(i), embeddings.get(i));
             }
         }
     }
 
+    private void addEmbeddingToChunk(Chunk chunk, Embedding embedding) {
+        if (vectorIsAllZeroes(embedding)) {
+            if (Util.LANGCHAIN4J_LOGGER.isDebugEnabled()) {
+                Util.LANGCHAIN4J_LOGGER.debug("Not adding embedding to chunk as it only contains zeroes; source document URI: {}; text: {}",
+                    chunk.getDocumentUri(), chunk.getEmbeddingText());
+            } else {
+                Util.LANGCHAIN4J_LOGGER.warn("Not adding embedding to chunk as it only contains zeroes; source document URI: {}",
+                    chunk.getDocumentUri());
+            }
+        } else {
+            chunk.addEmbedding(embedding);
+        }
+    }
+
+    private boolean vectorIsAllZeroes(Embedding embedding) {
+        for (float f : embedding.vector()) {
+            if (f != 0.0f) {
+                return false;
+            }
+        }
+        return true;
+    }
+
     private List<TextSegment> makeTextSegments(List<Chunk> chunks) {
         return chunks.stream()
             .map(chunk -> new TextSegment(chunk.getEmbeddingText(), TEXT_SEGMENT_METADATA))
diff --git a/marklogic-spark-connector/src/test/java/com/marklogic/spark/writer/embedding/AddEmbeddingsToJsonTest.java b/marklogic-spark-connector/src/test/java/com/marklogic/spark/writer/embedding/AddEmbeddingsToJsonTest.java
@@ -103,6 +103,42 @@ void addEmbeddingsToExistingSplits() {
         verifyEachChunkIsReturnedByAVectorQuery();
     }
 
+    @ExtendWith(RequiresMarkLogic12.class)
+    @Test
+    void vectorHasAllZeroes() {
+        readDocument("/marklogic-docs/java-client-intro.json")
+            .repartition(1)
+            .write().format(CONNECTOR_IDENTIFIER)
+            .option(Options.CLIENT_URI, makeClientUri())
+            .option(Options.WRITE_SPLITTER_JSON_POINTERS, "/text")
+            .option(Options.WRITE_SPLITTER_SIDECAR_MAX_CHUNKS, 10)
+            .option(Options.WRITE_SPLITTER_SIDECAR_COLLECTIONS, "json-vector-chunks")
+            .option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
+            .option(Options.WRITE_URI_TEMPLATE, "/split-test.json")
+            .option(Options.WRITE_EMBEDDER_MODEL_FUNCTION_CLASS_NAME, "com.marklogic.spark.writer.embedding.TestEmbeddingModel")
+            .option(Options.WRITE_EMBEDDER_MODEL_FUNCTION_OPTION_PREFIX + "returnZeroesOnFirstCall", "true")
+            .mode(SaveMode.Append)
+            .save();
+
+        JsonNode doc = readJsonDocument("/split-test.json-chunks-1.json");
+        JsonNode firstChunk = doc.get("chunks").get(0);
+        assertFalse(firstChunk.has("embedding"), "The first chunk is given an array of all zeroes by the test " +
+            "embedding model. Flux should recognize this and not add the `embedding` field, as doing so will cause " +
+            "issues with the Optic vector library - specifically, a VEC-MAGNITUDEZERO error at least when using " +
+            "vec.cosineSimilarity and then sorting on the values. A future version of MarkLogic 12 may improve this " +
+            "by allowing for an array of zeroes to be rejected.");
+
+        JsonNode secondChunk = doc.get("chunks").get(1);
+        assertTrue(secondChunk.has("embedding"), "The test embedding model should generate a valid embedding for " +
+            "the second chunk, which means it can be queried next using Optic.");
+
+        RowManager rowManager = getDatabaseClient().newRowManager();
+        RowSet<RowRecord> rows = rowManager.resultRows(rowManager.newPlanBuilder().fromView("example", "json_chunks"));
+        assertEquals(1, rows.stream().count(), "The TDE has nullable=false for the embedding column, as a null " +
+            "vector will cause issues when querying on vectors. And since invalidValues=ignore, the first chunk " +
+            "won't be returned; only the second chunk will be.");
+    }
+
     @Test
     void passOptionsToEmbeddingModelFunction() {
         DataFrameWriter writer = readDocument("/marklogic-docs/java-client-intro.json")
diff --git a/marklogic-spark-connector/src/test/java/com/marklogic/spark/writer/embedding/TestEmbeddingModel.java b/marklogic-spark-connector/src/test/java/com/marklogic/spark/writer/embedding/TestEmbeddingModel.java
@@ -7,6 +7,7 @@
 import dev.langchain4j.data.embedding.Embedding;
 import dev.langchain4j.data.segment.TextSegment;
 import dev.langchain4j.model.embedding.EmbeddingModel;
+import dev.langchain4j.model.embedding.onnx.allminilml6v2.AllMiniLmL6V2EmbeddingModel;
 import dev.langchain4j.model.output.Response;
 
 import java.util.Arrays;
@@ -27,8 +28,13 @@ public static void reset() {
         chunkCounter = 0;
     }
 
+    private static AllMiniLmL6V2EmbeddingModel realEmbeddingModel = new AllMiniLmL6V2EmbeddingModel();
+
+    private boolean returnZeroesOnFirstCall;
+
     @Override
     public EmbeddingModel apply(Map<String, String> options) {
+        returnZeroesOnFirstCall = "true".equals(options.get("returnZeroesOnFirstCall"));
         return this;
     }
 
@@ -41,7 +47,11 @@ public int dimension() {
     public Response<List<Embedding>> embedAll(List<TextSegment> textSegments) {
         batchCounter++;
         chunkCounter += textSegments.size();
-        return Response.from(Arrays.asList(new Embedding(new float[]{1})));
+        if (returnZeroesOnFirstCall) {
+            returnZeroesOnFirstCall = false;
+            return Response.from(Arrays.asList(new Embedding(new float[384])));
+        }
+        return realEmbeddingModel.embedAll(textSegments);
     }
 
     public static class TestChunk implements Chunk {
diff --git a/marklogic-spark-langchain4j/src/main/java/com/marklogic/spark/langchain4j/EmbeddingAdderFactory.java b/marklogic-spark-langchain4j/src/main/java/com/marklogic/spark/langchain4j/EmbeddingAdderFactory.java
@@ -21,7 +21,7 @@ public abstract class EmbeddingAdderFactory {
     public static Optional<EmbeddingAdder> makeEmbedder(Context context, DocumentTextSplitter splitter) {
         Optional<EmbeddingModel> embeddingModel = makeEmbeddingModel(context);
         if (embeddingModel.isPresent()) {
-            EmbeddingGenerator embeddingGenerator = makeEmbeddingGenerator(context);
+            EmbeddingGenerator embeddingGenerator = makeEmbeddingGenerator(context, embeddingModel.get());
             if (splitter != null) {
                 return Optional.of(new EmbeddingAdder(splitter, embeddingGenerator));
             }
@@ -31,17 +31,12 @@ public static Optional<EmbeddingAdder> makeEmbedder(Context context, DocumentTex
         return Optional.empty();
     }
 
-    public static EmbeddingGenerator makeEmbeddingGenerator(Context context) {
-        Optional<EmbeddingModel> embeddingModel = makeEmbeddingModel(context);
-        if (embeddingModel.isPresent()) {
-            int batchSize = context.getIntOption(Options.WRITE_EMBEDDER_BATCH_SIZE, 1, 1);
-            EmbeddingModel model = embeddingModel.get();
-            if (Util.MAIN_LOGGER.isInfoEnabled()) {
-                Util.MAIN_LOGGER.info("Using embedding model with dimension: {}", model.dimension());
-            }
-            return new EmbeddingGenerator(model, batchSize);
+    private static EmbeddingGenerator makeEmbeddingGenerator(Context context, EmbeddingModel model) {
+        int batchSize = context.getIntOption(Options.WRITE_EMBEDDER_BATCH_SIZE, 1, 1);
+        if (Util.MAIN_LOGGER.isInfoEnabled()) {
+            Util.MAIN_LOGGER.info("Using embedding model with dimension: {}", model.dimension());
         }
-        return null;
+        return new EmbeddingGenerator(model, batchSize);
     }
 
     /**
diff --git a/test-app/src/main/ml-schemas-12/tde/json-vector-chunks.json b/test-app/src/main/ml-schemas-12/tde/json-vector-chunks.json
@@ -19,8 +19,8 @@
             "scalarType": "vector",
             "val": "vec:vector(embedding)",
             "dimension": "384",
-            "invalidValues": "reject",
-            "nullable": true
+            "invalidValues": "ignore",
+            "nullable": false
           }
         ]
       }

Original file line number	Diff line number	Diff line change
`@@ -19,8 +19,8 @@`
`19`	`19`	`"scalarType": "vector",`
`20`	`20`	`"val": "vec:vector(embedding)",`
`21`	`21`	`"dimension": "384",`
`22`		`- "invalidValues": "reject",`
`23`		`- "nullable": true`
	`22`	`+ "invalidValues": "ignore",`
	`23`	`+ "nullable": false`
`24`	`24`	`}`
`25`	`25`	`]`
`26`	`26`	`}`