
Commit 8812a1f

Merge pull request #160 from marklogic/feature/logging-enhancement
MLE-12294 Improving logging for sake of ETL tool
2 parents 8379ebe + 3bdc828 commit 8812a1f

15 files changed: 81 additions & 44 deletions

src/main/java/com/marklogic/spark/DefaultSource.java

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@
  */
 public class DefaultSource implements TableProvider, DataSourceRegister {
 
-    private static final Logger logger = LoggerFactory.getLogger(DefaultSource.class);
+    private static final Logger logger = LoggerFactory.getLogger("com.marklogic.spark");
 
     @Override
     public String shortName() {
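
Because the logger is now registered under the name "com.marklogic.spark" rather than the class, a downstream application or ETL tool only needs to configure that one logger name to control the connector's output. A minimal sketch of what that could look like, assuming a Log4j 2 backend (which recent Spark 3.x distributions use); the helper class and chosen level below are illustrative, not part of this commit:

import org.apache.logging.log4j.Level;
import org.apache.logging.log4j.core.config.Configurator;

public class ConnectorLoggingSetup {

    // Hypothetical helper: adjusts every connector message by configuring the
    // single "com.marklogic.spark" logger introduced in this commit.
    public static void setConnectorLogLevel(Level level) {
        Configurator.setLevel("com.marklogic.spark", level);
    }

    public static void main(String[] args) {
        // Only show warnings and errors from the connector, regardless of which class emits them.
        setConnectorLogLevel(Level.WARN);
    }
}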

src/main/java/com/marklogic/spark/Util.java

Lines changed: 8 additions & 0 deletions
@@ -16,6 +16,8 @@
 package com.marklogic.spark;
 
 import org.apache.spark.sql.catalyst.json.JSONOptions;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import scala.collection.immutable.HashMap;
 
 import java.util.ArrayList;
@@ -25,6 +27,12 @@
 
 public interface Util {
 
+    /**
+     * Intended for all non-debug logging where the class name doesn't matter and only adds complexity to the log
+     * messages.
+     */
+    Logger MAIN_LOGGER = LoggerFactory.getLogger("com.marklogic.spark");
+
     JSONOptions DEFAULT_JSON_OPTIONS = new JSONOptions(
         new HashMap<>(),
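
The shared MAIN_LOGGER constant gives the rest of the connector one place to send user-facing messages, while per-class loggers remain available for debug-level detail; the remaining diffs in this commit follow that split. A minimal sketch of the resulting convention (the class and method below are illustrative, not from the repo):

import com.marklogic.spark.Util;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class ExampleReader {

    // Per-class logger: debug detail where the originating class is useful context.
    private static final Logger logger = LoggerFactory.getLogger(ExampleReader.class);

    void finishBatch(int docCount, long durationMs) {
        // Progress messages meant for end users go to the shared "com.marklogic.spark" logger.
        Util.MAIN_LOGGER.info("Read {} documents in {}ms", docCount, durationMs);

        // Internal detail stays on the class-specific logger at debug level.
        if (logger.isDebugEnabled()) {
            logger.debug("Finished batch in reader {}", this);
        }
    }
}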

src/main/java/com/marklogic/spark/reader/customcode/CustomCodeMicroBatchStream.java

Lines changed: 2 additions & 1 deletion
@@ -1,5 +1,6 @@
 package com.marklogic.spark.reader.customcode;
 
+import com.marklogic.spark.Util;
 import org.apache.spark.sql.connector.read.InputPartition;
 import org.apache.spark.sql.connector.read.PartitionReaderFactory;
 import org.apache.spark.sql.connector.read.streaming.MicroBatchStream;
@@ -74,6 +75,6 @@ public void commit(Offset end) {
 
     @Override
     public void stop() {
-        logger.info("Stopping");
+        Util.MAIN_LOGGER.info("Stopping");
     }
 }

src/main/java/com/marklogic/spark/reader/document/ForestReader.java

Lines changed: 3 additions & 2 deletions
@@ -11,6 +11,7 @@
 import com.marklogic.client.query.QueryDefinition;
 import com.marklogic.client.query.SearchQueryDefinition;
 import com.marklogic.client.query.StructuredQueryBuilder;
+import com.marklogic.spark.Util;
 import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
 import org.apache.spark.sql.catalyst.util.ArrayBasedMapData;
@@ -89,9 +90,9 @@ public boolean next() {
         List<String> uris = getNextBatchOfUris();
         if (uris.isEmpty()) {
             // TBD on whether this should be info/debug.
-            if (logger.isInfoEnabled()) {
+            if (Util.MAIN_LOGGER.isInfoEnabled()) {
                 long duration = System.currentTimeMillis() - startTime;
-                logger.info("Read {} documents from partition {} in {}ms", docCount, forestPartition, duration);
+                Util.MAIN_LOGGER.info("Read {} documents from partition {} in {}ms", docCount, forestPartition, duration);
             }
             return false;
         }

src/main/java/com/marklogic/spark/reader/optic/OpticMicroBatchStream.java

Lines changed: 2 additions & 1 deletion
@@ -15,6 +15,7 @@
  */
 package com.marklogic.spark.reader.optic;
 
+import com.marklogic.spark.Util;
 import org.apache.spark.sql.connector.read.InputPartition;
 import org.apache.spark.sql.connector.read.PartitionReaderFactory;
 import org.apache.spark.sql.connector.read.streaming.MicroBatchStream;
@@ -98,6 +99,6 @@ public void commit(Offset end) {
 
     @Override
     public void stop() {
-        logger.info("Stopping");
+        Util.MAIN_LOGGER.info("Stopping");
     }
 }

src/main/java/com/marklogic/spark/reader/optic/OpticPartitionReaderFactory.java

Lines changed: 3 additions & 1 deletion
@@ -35,7 +35,9 @@ class OpticPartitionReaderFactory implements PartitionReaderFactory {
 
     @Override
     public PartitionReader<InternalRow> createReader(InputPartition partition) {
-        logger.info("Creating reader for partition: {}", partition);
+        if (logger.isDebugEnabled()) {
+            logger.debug("Creating reader for partition: {}", partition);
+        }
         return new OpticPartitionReader(this.readContext, (PlanAnalysis.Partition) partition);
     }
 }

src/main/java/com/marklogic/spark/reader/optic/OpticScanBuilder.java

Lines changed: 14 additions & 13 deletions
@@ -16,6 +16,7 @@
 package com.marklogic.spark.reader.optic;
 
 import com.marklogic.spark.Options;
+import com.marklogic.spark.Util;
 import com.marklogic.spark.reader.filter.FilterFactory;
 import com.marklogic.spark.reader.filter.OpticFilter;
 import org.apache.spark.sql.connector.expressions.SortOrder;
@@ -80,8 +81,8 @@ public Filter[] pushFilters(Filter[] filters) {
         for (Filter filter : filters) {
             OpticFilter opticFilter = FilterFactory.toPlanFilter(filter);
             if (opticFilter != null) {
-                if (logger.isInfoEnabled()) {
-                    logger.info("Pushing down filter: {}", filter);
+                if (logger.isDebugEnabled()) {
+                    logger.debug("Pushing down filter: {}", filter);
                 }
                 opticFilters.add(opticFilter);
                 this.pushedFilters.add(filter);
@@ -110,8 +111,8 @@ public boolean pushLimit(int limit) {
         if (readContext.planAnalysisFoundNoRows()) {
             return false;
         }
-        if (logger.isInfoEnabled()) {
-            logger.info("Pushing down limit: {}", limit);
+        if (logger.isDebugEnabled()) {
+            logger.debug("Pushing down limit: {}", limit);
         }
         readContext.pushDownLimit(limit);
         return true;
@@ -125,8 +126,8 @@ public boolean pushTopN(SortOrder[] orders, int limit) {
         // This will be invoked when the user calls both orderBy and limit in their Spark program. If the user only
         // calls limit, then only pushLimit is called and this will not be called. If the user only calls orderBy and
         // not limit, then neither this nor pushLimit will be called.
-        if (logger.isInfoEnabled()) {
-            logger.info("Pushing down topN: {}; limit: {}", Arrays.asList(orders), limit);
+        if (logger.isDebugEnabled()) {
+            logger.debug("Pushing down topN: {}; limit: {}", Arrays.asList(orders), limit);
         }
         readContext.pushDownTopN(orders, limit);
         return true;
@@ -156,16 +157,16 @@ public boolean supportCompletePushDown(Aggregation aggregation) {
         }
 
         if (hasUnsupportedAggregateFunction(aggregation)) {
-            if (logger.isInfoEnabled()) {
-                logger.info("Aggregation contains one or more unsupported functions, " +
+            if (Util.MAIN_LOGGER.isInfoEnabled()) {
+                Util.MAIN_LOGGER.info("Aggregation contains one or more unsupported functions, " +
                     "so not pushing aggregation to MarkLogic: {}", describeAggregation(aggregation));
             }
             return false;
         }
 
         if (readContext.getBucketCount() > 1) {
-            if (logger.isInfoEnabled()) {
-                logger.info("Multiple requests will be made to MarkLogic; aggregation will be applied by Spark as well: {}",
+            if (Util.MAIN_LOGGER.isInfoEnabled()) {
+                Util.MAIN_LOGGER.info("Multiple requests will be made to MarkLogic; aggregation will be applied by Spark as well: {}",
                     describeAggregation(aggregation));
             }
             return false;
@@ -183,12 +184,12 @@ public boolean pushAggregation(Aggregation aggregation) {
         }
 
         if (pushDownAggregatesIsDisabled()) {
-            logger.info("Push down of aggregates is disabled; Spark will handle all aggregations.");
+            Util.MAIN_LOGGER.info("Push down of aggregates is disabled; Spark will handle all aggregations.");
             return false;
         }
 
-        if (logger.isInfoEnabled()) {
-            logger.info("Pushing down aggregation: {}", describeAggregation(aggregation));
+        if (logger.isDebugEnabled()) {
+            logger.debug("Pushing down aggregation: {}", describeAggregation(aggregation));
         }
         readContext.pushDownAggregation(aggregation);
         return true;
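
Note that these hunks keep the isDebugEnabled()/isInfoEnabled() guards around the parameterized log calls. With SLF4J the message is only formatted when the level is enabled, but arguments such as describeAggregation(aggregation) or Arrays.asList(orders) are still evaluated at the call site, so the guard skips that work entirely when the level is off. A small self-contained illustration of the difference (names are illustrative, not from the repo):

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class GuardedLoggingExample {

    private static final Logger logger = LoggerFactory.getLogger(GuardedLoggingExample.class);

    // Stand-in for something like describeAggregation(aggregation): cheap here,
    // but potentially expensive to build in real code.
    private static String describe() {
        return String.join(", ", "count(*)", "sum(price)");
    }

    public static void main(String[] args) {
        // Without a guard, describe() runs even when debug logging is disabled.
        logger.debug("Pushing down aggregation: {}", describe());

        // With the guard, the argument is only built when it will actually be logged.
        if (logger.isDebugEnabled()) {
            logger.debug("Pushing down aggregation: {}", describe());
        }
    }
}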

src/main/java/com/marklogic/spark/reader/optic/PlanUtil.java

Lines changed: 3 additions & 1 deletion
@@ -99,7 +99,9 @@ static ObjectNode buildGroupByAggregation(List<String> columnNames, Aggregation
                     aggregateArgs.addObject().put("values", "distinct");
                 }
             } else {
-                logger.info("Unsupported aggregate function, will not be pushed to Optic: {}", func);
+                if (logger.isDebugEnabled()) {
+                    logger.debug("Unsupported aggregate function, will not be pushed to Optic: {}", func);
+                }
             }
         }
     });

src/main/java/com/marklogic/spark/reader/optic/ReadContext.java

Lines changed: 8 additions & 5 deletions
@@ -29,6 +29,7 @@
 import com.marklogic.spark.ConnectorException;
 import com.marklogic.spark.ContextSupport;
 import com.marklogic.spark.Options;
+import com.marklogic.spark.Util;
 import com.marklogic.spark.reader.filter.OpticFilter;
 import org.apache.spark.sql.connector.expressions.Expression;
 import org.apache.spark.sql.connector.expressions.SortOrder;
@@ -92,8 +93,8 @@ public ReadContext(Map<String, String> properties, StructType schema, int defaul
         }
 
         if (this.planAnalysis != null) {
-            if (logger.isInfoEnabled()) {
-                logger.info("Partition count: {}; number of requests that will be made to MarkLogic: {}",
+            if (Util.MAIN_LOGGER.isInfoEnabled()) {
+                Util.MAIN_LOGGER.info("Partition count: {}; number of requests that will be made to MarkLogic: {}",
                     this.planAnalysis.getPartitions().size(), this.planAnalysis.getAllBuckets().size());
             }
             // Calling this to establish a server timestamp.
@@ -108,7 +109,7 @@ public ReadContext(Map<String, String> properties, StructType schema, int defaul
     private void handlePlanAnalysisError(String query, FailedRequestException ex) {
         final String indicatorOfNoRowsExisting = "$tableId as xs:string -- Invalid coercion: () as xs:string";
         if (ex.getMessage().contains(indicatorOfNoRowsExisting)) {
-            logger.info("No rows were found, so will not create any partitions.");
+            Util.MAIN_LOGGER.info("No rows were found, so will not create any partitions.");
         } else {
             throw new ConnectorException(String.format("Unable to run Optic DSL query %s; cause: %s", query, ex.getMessage()), ex);
         }
@@ -195,12 +196,14 @@ void pushDownAggregation(Aggregation aggregation) {
                 StructField field = findColumnInSchema(sum.column(), PlanUtil.expressionToColumnName(sum.column()));
                 newSchema = newSchema.add(func.toString(), field.dataType());
             } else {
-                logger.info("Unsupported aggregate function: {}", func);
+                if (logger.isDebugEnabled()) {
+                    logger.debug("Unsupported aggregate function: {}", func);
+                }
             }
         }
 
         if (!getProperties().containsKey(Options.READ_BATCH_SIZE)) {
-            logger.info("Batch size was not overridden, so modifying each partition to make a single request to improve " +
+            Util.MAIN_LOGGER.info("Batch size was not overridden, so modifying each partition to make a single request to improve " +
                 "performance of pushed down aggregation.");
             List<PlanAnalysis.Partition> mergedPartitions = planAnalysis.getPartitions().stream()
                 .map(p -> p.mergeBuckets())

src/main/java/com/marklogic/spark/reader/optic/SchemaInferrer.java

Lines changed: 2 additions & 4 deletions
@@ -19,18 +19,16 @@
 import com.fasterxml.jackson.databind.JsonNode;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.marklogic.spark.ConnectorException;
+import com.marklogic.spark.Util;
 import org.apache.spark.sql.types.DataType;
 import org.apache.spark.sql.types.DataTypes;
 import org.apache.spark.sql.types.StructType;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
 import java.util.HashMap;
 import java.util.Map;
 
 public abstract class SchemaInferrer {
 
-    private static final Logger logger = LoggerFactory.getLogger(SchemaInferrer.class);
     private static final ObjectMapper objectMapper = new ObjectMapper();
 
     // "Column info types" = the possible set of types returned by the columnInfo call to /v1/rows. Note that this is
@@ -105,7 +103,7 @@ private static DataType determineSparkType(JsonNode column) {
         if (COLUMN_INFO_TYPES_TO_SPARK_TYPES.containsKey(type)) {
             return COLUMN_INFO_TYPES_TO_SPARK_TYPES.get(type);
         }
-        logger.warn("Unrecognized column type: {}; will map to Spark StringType", column);
+        Util.MAIN_LOGGER.warn("Unrecognized column type: {}; will map to Spark StringType", column);
         return DataTypes.StringType;
     }
 
