Skip to content

Commit 3d90acd

Browse files
authored
Merge pull request #162 from BillFarber/develop
MLE-12296 Added an example of nested aggregation.
2 parents 5353099 + b065d30 commit 3d90acd

File tree

3 files changed

+139
-0
lines changed

3 files changed

+139
-0
lines changed

examples/entity-aggregation/README.md

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,3 +115,65 @@ Note as well that this example is not intended to be authoritative. Please see
115115
on writing Spark programs. You may also find
116116
[this reference on Spark aggregate functions](https://sparkbyexamples.com/spark/spark-sql-aggregate-functions/) helpful.
117117

118+
## Importing customers, rentals and payments from Postgres to MarkLogic
119+
120+
This project also contains an example of nested joins being aggregated. In this example, the query includes a join with
121+
rentals (as above), but with the addition that payments are joined with rentals, producing a nested join in the query.
122+
Then Spark aggregation functions are used to perform nested aggregations, resulting in customer documents with JSON
123+
similar to the following snippet.
124+
125+
```
126+
{
127+
"customer_id": 182,
128+
"last_name": "Lane",
129+
"Rentals": [
130+
{
131+
"rental_id": 1542,
132+
"payments": [
133+
{
134+
"payment_id": 19199,
135+
"amount": 3.99
136+
}
137+
]
138+
},
139+
...
140+
{
141+
"rental_id": 4591,
142+
"payments": [
143+
{
144+
"payment_id": 19518,
145+
"amount": 1.99
146+
},
147+
{
148+
"payment_id": 25162,
149+
"amount": 1.99
150+
},
151+
{
152+
"payment_id": 29163,
153+
"amount": 0.99
154+
},
155+
{
156+
"payment_id": 31069,
157+
"amount": 3.99
158+
},
159+
{
160+
"payment_id": 31834,
161+
"amount": 3.99
162+
}
163+
]
164+
},
165+
...
166+
}
167+
```
168+
169+
To try this out, there is another Gradle command similar to the first:
170+
171+
./gradlew importCustomersWithRentalsAndPayments
172+
173+
You can then use [MarkLogic's qconsole application](https://docs.marklogic.com/guide/qconsole/intro) to view the
174+
customer documents written to the Documents database. In this example, 10 customer rows are queried from the Postgres
175+
"customers" table, so you will see 10 customer documents in the Documents database.
176+
177+
The JSON snippet above is from the document for customer 182. One of that customer's rentals, 4591, has multiple
178+
payments. If you examine the document for that customer (/customerWithDoubleNesting/182.json), you will be able to
179+
verify the nested aggregation.

examples/entity-aggregation/build.gradle

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,8 @@ task importCustomers(type: JavaExec) {
1616
classpath = sourceSets.main.runtimeClasspath
1717
mainClass = 'org.example.ImportCustomers'
1818
}
19+
20+
// Runs the nested-aggregation example: customers with rentals, each rental
// carrying its own list of payments, written to MarkLogic.
task importCustomersWithRentalsAndPayments(type: JavaExec) {
    mainClass = 'org.example.ImportCustomersWithRentalsAndPayments'
    classpath = sourceSets.main.runtimeClasspath
}
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
package org.example;
2+
3+
import org.apache.commons.lang3.StringUtils;
4+
import org.apache.spark.sql.SaveMode;
5+
import org.apache.spark.sql.SparkSession;
6+
import org.apache.spark.sql.functions;
7+
8+
import java.util.HashMap;
9+
import java.util.Map;
10+
import java.util.Properties;
11+
12+
public class ImportCustomersWithRentalsAndPayments {
13+
14+
public static void main(String[] args) {
15+
// The MarkLogic admin password is assumed to be "admin" per the docker-compose.yml file. This is purely for
16+
// demonstrational purposes and should never be used in a real application.
17+
final String markLogicAdminPassword = "admin";
18+
19+
// Create a vanilla local Spark session.
20+
SparkSession session = SparkSession.builder()
21+
.master("local[*]")
22+
.getOrCreate();
23+
24+
Map<String, String> jdbcOptions = new HashMap<String, String>() {{
25+
put("driver", "org.postgresql.Driver");
26+
put("url", "jdbc:postgresql://localhost/dvdrental");
27+
put("user", "postgres");
28+
put("password", "postgres");
29+
}};
30+
31+
String query =
32+
"select c.customer_id, c.last_name, r.rental_id, r.rental_date, p.payment_id, p.amount " +
33+
"from customer c " +
34+
"inner join rental r on c.customer_id = r.customer_id " +
35+
"inner join payment p on r.rental_id = p.rental_id " +
36+
"where (c.customer_id >= 180 and c.customer_id < 190) ";
37+
38+
session
39+
.read()
40+
// Use Spark's built-in JDBC support to read rows from Postgres.
41+
.format("jdbc").options(jdbcOptions)
42+
.option("query", query)
43+
.load()
44+
45+
.groupBy("rental_id")
46+
.agg(
47+
functions.first("customer_id").alias("customer_id"),
48+
functions.first("last_name").alias("last_name"),
49+
functions.collect_list(functions.struct("payment_id","amount")).alias("payments")
50+
)
51+
.groupBy("customer_id")
52+
.agg(
53+
functions.first("last_name").alias("last_name"),
54+
functions.collect_list(functions.struct("rental_id","payments")).alias("Rentals")
55+
)
56+
57+
58+
// The remaining calls use the MarkLogic Spark connector to write customer rows, with nested rentals and
59+
// sub-nested payments, to the Documents database in MarkLogic.
60+
.write()
61+
.format("com.marklogic.spark")
62+
.option("spark.marklogic.client.host", "localhost")
63+
.option("spark.marklogic.client.port", "8000")
64+
.option("spark.marklogic.client.username", "admin")
65+
.option("spark.marklogic.client.password", markLogicAdminPassword)
66+
.option("spark.marklogic.write.uriTemplate", "/customerWithDoubleNesting/{customer_id}.json")
67+
.option("spark.marklogic.write.collections", "CustomerRentalPayments")
68+
.option("spark.marklogic.write.permissions", "rest-reader,read,rest-writer,update")
69+
.mode(SaveMode.Append)
70+
.save();
71+
}
72+
}

0 commit comments

Comments
 (0)