@@ -25,6 +25,7 @@ df = spark.read.format("com.marklogic.spark") \
.option("spark.marklogic.client.uri", "spark-example-user:password@localhost:8003") \
.option("spark.marklogic.read.opticQuery", "op.fromView('example', 'employee')") \
.load()
+ df.show()
```

As demonstrated above, `format`, `spark.marklogic.client.uri` (or the other `spark.marklogic.client` options
@@ -45,6 +46,7 @@ df = spark.read.format("com.marklogic.spark") \
.option("spark.marklogic.client.uri", "spark-example-user:password@localhost:8003") \
.option("spark.marklogic.read.opticQuery", query) \
.load()
+ df.show()
```

The `where` clause in the example above can include any of the query features supported by MarkLogic, such as
@@ -88,6 +90,7 @@ df = spark.read.format("com.marklogic.spark") \
.option("spark.marklogic.client.uri", "spark-example-user:password@localhost:8003") \
.option("spark.marklogic.read.opticQuery", "op.fromView('example', 'employee')") \
.load()
+ df.show()
```

### Accessing documents
@@ -289,6 +292,7 @@ df = spark.read.format("com.marklogic.spark") \
.option("spark.marklogic.client.uri", "spark-example-user:password@localhost:8003") \
.option("spark.marklogic.read.javascript", "cts.uris(null, null, cts.collectionQuery('employee'))") \
.load()
+ df.show()
```

Or code can be [written in XQuery](https://docs.marklogic.com/guide/getting-started/XQueryTutorial) by configuring the
@@ -299,6 +303,7 @@ df = spark.read.format("com.marklogic.spark") \
.option("spark.marklogic.client.uri", "spark-example-user:password@localhost:8003") \
.option("spark.marklogic.read.xquery", "cts:uris((), (), cts:collection-query('employee'))") \
.load()
+ df.show()
```

You can also invoke a JavaScript or XQuery module in your application's modules database via the
@@ -309,6 +314,7 @@ df = spark.read.format("com.marklogic.spark") \
.option("spark.marklogic.client.uri", "spark-example-user:password@localhost:8003") \
.option("spark.marklogic.read.invoke", "/read.sjs") \
.load()
+ df.show()
```

### Custom code schemas
@@ -330,6 +336,7 @@ df = spark.read.format("com.marklogic.spark") \
.option("spark.marklogic.read.invoke", "/read-custom-schema.sjs") \
.schema(StructType([StructField("id", IntegerType()), StructField("name", StringType())])) \
.load()
+ df.show()
```

### Custom external variables
@@ -348,46 +355,77 @@ df = spark.read.format("com.marklogic.spark") \
.option("spark.marklogic.read.vars.var2", "Marketing") \
.option("spark.marklogic.read.javascript", "var var1, var2; cts.uris(null, null, cts.wordQuery([var1, var2]))") \
.load()
+ df.show()
```

- ### Streaming support
+ ### Defining partitions for custom code

- Spark's support for [streaming reads](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html)
- from MarkLogic can be useful when your custom code for reading data may take a long time to execute. Or, based on the
- nature of your custom code, running the query incrementally to produce smaller batches may be a better fit for your
- use case.
+ By default, the connector will send a single request to MarkLogic to execute custom code for reading rows. If your
+ custom code returns a large amount of data and is at risk of timing out, or if you seek better performance by breaking
+ your query into many smaller queries, you can use one of the following options to define partitions for your custom code:

- (TODO This needs to be rewritten, will do so in a follow up PR.)
+ - `spark.marklogic.read.partitions.invoke`
+ - `spark.marklogic.read.partitions.javascript`
+ - `spark.marklogic.read.partitions.xquery`

- To stream results from your custom code, the connector must know how batches can be constructed based on the results of
- your custom code. Because the connector does not know anything about your code, the connector needs to run an
- additional set of custom code that you implement to provide a sequence of partitions to the connector. The
- connector will then run your custom once for each of your partitions, with the partition being passed as
- an external variable to your custom code.
+ If one of the above options is defined, the connector will execute the code associated with the option and expect a
+ sequence of values to be returned. You can return any values you want to define partitions; the connector does not care
+ what the values represent. The connector will then execute your custom code - defined by `spark.marklogic.read.invoke`,
+ `spark.marklogic.read.javascript`, or `spark.marklogic.read.xquery` - once for each partition value. The partition value
+ will be defined in an external variable named `PARTITION`. Note as well that any external variables you define via the
+ `spark.marklogic.read.vars` prefix will also be sent to the code for returning partitions.

- The code to run for providing a sequence of partitions must be defined via one of the following options:
+ The following example shows a common use case for using MarkLogic forest IDs as partitions:

- - `spark.marklogic.read.partitions.invoke` - a JavaScript or XQuery module path to invoke.
- - `spark.marklogic.read.partitions.javascript` - a JavaScript program to evaluate.
- - `spark.marklogic.read.partitions.xquery` - an XQuery program to evaluate.
+ ```
+ df = spark.read.format("com.marklogic.spark") \
+ .option("spark.marklogic.client.uri", "spark-example-user:password@localhost:8003") \
+ .option("spark.marklogic.read.partitions.javascript", "xdmp.databaseForests(xdmp.database())") \
+ .option("spark.marklogic.read.javascript", "cts.uris(null, null, cts.collectionQuery('employee'), 0, [PARTITION])") \
+ .load()
+ df.show()
+ ```

- Note that any variables you define via the `spark.marklogic.reads.vars` prefix will also be sent to the above code,
- in addition to the code you define for reading rows.
+ In the example application used by this documentation, the "spark-example-content" database has 3 forests. Thus, the
+ partitions code above will return a sequence of 3 forest IDs. The connector will then invoke the custom
+ JavaScript code 3 times, once for each forest ID, with the `PARTITION` variable populated with a forest ID.

- You are free to return any sequence of partitions. For each one, the connector will invoke your regular custom
- code with an external variable named `PARTITION` of type `String`. You are then free to use this value to return
- a set of results associated with the partition.
+ For the above scenario, it is common to run these queries
+ [at the same point in time](https://docs.marklogic.com/guide/app-dev/point_in_time). Because you are free to return
+ any partition values you wish, one technique for this scenario would be to construct partitions containing both a
+ forest ID and a server timestamp:

- The following examples illustrates how the forest IDs for the `spark-example-content` database can be used as batch
- identifiers. The custom code for returning URIs is then constrained to the value of `PARTITION` which will be a forest
- ID. Spark will invoke the custom code once for each partition, with the returned batch of rows being immediately
- sent to the writer, which in this example are then printed to the console:
+ ```
+ const forestIds = xdmp.databaseForests(xdmp.database())
+ const timestamp = xdmp.requestTimestamp()
+ Sequence.from(forestIds.toArray().map(forestId => forestId + ":" + timestamp))
+ ```
+
+ In the custom code for returning rows, you can then obtain both a forest ID and a server timestamp from the partition
+ value and use them to ensure each of your queries runs at the same point in time.
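+
+ For example, the custom code for returning rows could be structured along the following lines. This is only a sketch,
+ not part of the example application - it assumes the partition values have the `forestId:timestamp` form produced by
+ the partitions code above, and it uses `xdmp.invokeFunction` with a `timestamp` option as one way to evaluate the
+ query at the captured server timestamp:
+
+ ```
+ // PARTITION is the external variable populated by the connector with one partition value.
+ const parts = String(PARTITION).split(":");
+ const forestId = parts[0];
+ const timestamp = parts[1];
+ // Evaluate the URIs query against the single forest, at the server timestamp captured by the partitions code.
+ xdmp.invokeFunction(
+   () => cts.uris(null, null, cts.collectionQuery('employee'), 0, [forestId]),
+   {timestamp: xs.unsignedLong(timestamp)}
+ );
+ ```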
+
+ ### Streaming support
+
+ Just like for reading rows with Optic, the connector supports
+ [streaming reads](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html)
+ from MarkLogic via micro-batches. The connector configuration does not change; instead, different Spark APIs are used
+ to read a stream of data from MarkLogic. This can be useful when you wish to obtain a batch of results from
+ MarkLogic and immediately send them to a Spark writer.
+
+ When streaming results from your custom code, you will need to set one of the options described above - either
+ `spark.marklogic.read.partitions.invoke`, `spark.marklogic.read.partitions.javascript`, or
+ `spark.marklogic.read.partitions.xquery` - for defining partitions.
+
+ The following example shows how the same connector configuration can be used for defining partitions and the custom
+ code for returning rows, just with different Spark APIs. In this example, Spark will invoke the custom code once
+ for each partition, with the returned batch of rows being immediately streamed to the writer, which prints the
+ batch of rows to the console:

```
stream = spark.readStream \
.format("com.marklogic.spark") \
.option("spark.marklogic.client.uri", "spark-example-user:password@localhost:8003") \
- .option("spark.marklogic.read.partitions.javascript", "xdmp.databaseForests(xdmp.database('spark-example-content'))") \
+ .option("spark.marklogic.read.partitions.javascript", "xdmp.databaseForests(xdmp.database())") \
.option("spark.marklogic.read.javascript", "cts.uris(null, null, cts.collectionQuery('employee'), null, [PARTITION]);") \
.load() \
.writeStream \
@@ -397,21 +435,6 @@ stream.processAllAvailable()
stream.stop()
```

- For a streaming use case, you may wish to ensure that every query runs
- [at the same point in time](https://docs.marklogic.com/guide/app-dev/point_in_time). Because you are free to return
- any partitions you wish, one technique for accomplishing this would be to construct partitions
- containing both a forest ID and a server timestamp:
-
- ```
- const forestIds = xdmp.databaseForests(xdmp.database('spark-example-content'))
- const timestamp = xdmp.requestTimestamp()
- Sequence.from(forestIds.toArray().map(forestId => forestId + ":" + timestamp))
- ```
-
- In your custom code, you would then parse out the forest ID and server timestamp from each partition and use
- them accordingly in your queries. The MarkLogic documentation in the link above can provide more details and examples
- on how to perform point-in-time queries with server timestamps.
-

### Tuning performance

A key difference with reading via custom code is that unless you are using Spark streaming, a single call will be made