Commit c1bf751

Merge pull request #44 from marklogic/feature/joinDoc-test
Added docs and a test for how to join in a document
2 parents 37763a9 + 32b00fc

4 files changed: +64 -1 lines changed

docs/reading.md

Lines changed: 24 additions & 0 deletions

@@ -37,6 +37,30 @@ df = spark.read.format("com.marklogic.spark") \
     .load()
 ```
 
+## Accessing documents
+
+While the connector requires that an Optic query use `op.fromView` as its accessor function, documents can still be
+retrieved via the [Optic functions for joining documents](https://docs.marklogic.com/guide/app-dev/OpticAPI#id_78437).
+
+For example, the following query will find all matching rows and then retrieve the documents and URIs associated with
+those rows:
+
+```
+query = "const joinCol = op.fragmentIdCol('id'); \
+    op.fromView('example', 'employee', '', joinCol) \
+        .joinDoc('doc', joinCol) \
+        .select('doc')"
+
+df = spark.read.format("com.marklogic.spark") \
+    .option("spark.marklogic.client.uri", "pyspark-example-user:password@localhost:8020") \
+    .option("spark.marklogic.read.opticDsl", query) \
+    .load()
+```
+
+Calling `df.show()` will then show the URI and JSON contents of the document associated with each row. The Python
+[from_json](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.from_json.html)
+function can then be used to parse the contents of each `doc` column into a JSON object as needed.
+
 ## Pushing down operations
 
 The Spark connector framework supports pushing down multiple operations to the connector data source. This can
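
To make the `from_json` note above concrete, here is a minimal PySpark sketch that parses the JSON string in the `doc` column produced by the query in this diff. The `doc_schema` field names are hypothetical; adjust them to match the shape of the actual employee documents.

```python
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StringType, StructField, StructType

# Hypothetical document shape; replace these fields with the ones your
# employee documents actually contain.
doc_schema = StructType([
    StructField("GivenName", StringType()),
    StructField("Surname", StringType()),
])

# Replace the JSON string in the 'doc' column with a parsed struct, then
# query the struct's fields directly.
parsed_df = df.withColumn("doc", from_json(col("doc"), doc_schema))
parsed_df.select("doc.GivenName", "doc.Surname").show()
```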

docs/writing.md

Lines changed: 1 addition & 1 deletion

@@ -51,7 +51,7 @@ above options for setting a prefix and suffix will be ignored, as the template c
 For example, consider a Spark DataFrame with, among other columns, columns named `organization` and `employee_id`.
 The following template would construct URIs based on both columns:
 
-    .options("spark.marklogic.write.uriTemplate", "/example/{organization}/{employee_id}.json")
+    .option("spark.marklogic.write.uriTemplate", "/example/{organization}/{employee_id}.json")
 
 Both columns should have values in each row in the DataFrame. If the connector encounters a row that does not have a
 value for any column in the URI template, an error will be thrown.
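
For illustration, here is a hedged PySpark sketch of a full write that uses the corrected `.option` call; the sample rows, `spark` session, and connection string are assumptions for the example, not part of this commit.

```python
# Hypothetical DataFrame containing the two columns named by the URI template.
df = spark.createDataFrame(
    [("engineering", "123"), ("sales", "456")],
    ["organization", "employee_id"],
)

df.write.format("com.marklogic.spark") \
    .option("spark.marklogic.client.uri", "pyspark-example-user:password@localhost:8020") \
    .option("spark.marklogic.write.uriTemplate", "/example/{organization}/{employee_id}.json") \
    .mode("append") \
    .save()
```

Under this template, the first row above would be written to the URI /example/engineering/123.json.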

src/main/java/com/marklogic/spark/reader/SchemaInferrer.java

Lines changed: 1 addition & 0 deletions

@@ -43,6 +43,7 @@ public abstract class SchemaInferrer {
         put("point", DataTypes.StringType);
         put("boolean", DataTypes.BooleanType);
         put("none", DataTypes.StringType); // See DBQ-296, this is intentional for some column types.
+        put("value", DataTypes.StringType); // In MarkLogic 10, "value" is returned for a column containing a JSON object.
         put("integer", DataTypes.IntegerType);
         put("unsignedInt", DataTypes.IntegerType);
         put("iri", DataTypes.StringType);
src/test/java/com/marklogic/spark/reader/ReadWithJoinDocTest.java

Lines changed: 38 additions & 0 deletions

@@ -0,0 +1,38 @@
+package com.marklogic.spark.reader;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.marklogic.spark.AbstractIntegrationTest;
+import com.marklogic.spark.Options;
+import org.apache.spark.sql.Row;
+import org.junit.jupiter.api.Test;
+
+import java.util.List;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+public class ReadWithJoinDocTest extends AbstractIntegrationTest {
+
+    @Test
+    void jsonDocuments() throws Exception {
+        List<Row> rows = newDefaultReader()
+            .option(Options.READ_OPTIC_DSL,
+                "const idCol = op.fragmentIdCol('id'); " +
+                    "op.fromView('sparkTest', 'allTypes', '', idCol)" +
+                    ".where(op.sqlCondition('intValue = 1'))" +
+                    ".joinDoc('doc', idCol)" +
+                    ".select('doc')")
+            .option(Options.READ_NUM_PARTITIONS, 1)
+            .option(Options.READ_BATCH_SIZE, 0)
+            .load()
+            .collectAsList();
+
+        assertEquals(1, rows.size());
+
+        Row row = rows.get(0);
+        JsonNode doc = new ObjectMapper().readTree(row.getString(0));
+        assertEquals(1, doc.get("allTypes").get(0).get("intValue").asInt(),
+            "Verifying that the doc was correctly returned as a string in the Spark row, and could then be read via " +
+                "Jackson into a JsonNode");
+    }
+}
