@@ -7,7 +7,10 @@ package org.apache.spark.sql

 import java.util.Locale

+import com.amazonaws.services.glue.model.{AccessDeniedException, AWSGlueException}
 import com.amazonaws.services.s3.model.AmazonS3Exception
+import com.fasterxml.jackson.databind.ObjectMapper
+import com.fasterxml.jackson.module.scala.DefaultScalaModule
 import org.apache.commons.text.StringEscapeUtils.unescapeJava
 import org.opensearch.flint.core.IRestHighLevelClient
 import org.opensearch.flint.core.metrics.MetricConstants
@@ -17,12 +20,16 @@ import play.api.libs.json._
 import org.apache.spark.{SparkConf, SparkException}
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.catalyst.parser.ParseException
+import org.apache.spark.sql.flint.config.FlintSparkConf
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.util._

 trait FlintJobExecutor {
   this: Logging =>

+  val mapper = new ObjectMapper()
+  mapper.registerModule(DefaultScalaModule)
+
   var currentTimeProvider: TimeProvider = new RealTimeProvider()
   var threadPoolFactory: ThreadPoolFactory = new DefaultThreadPoolFactory()
   var envinromentProvider: EnvironmentProvider = new RealEnvironment()
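
The two added `mapper` lines give the trait a single shared Jackson ObjectMapper that later serializes error details to JSON. A minimal standalone sketch of what registering DefaultScalaModule enables (requires the jackson-module-scala dependency; the Map contents are illustrative):

    import com.fasterxml.jackson.databind.ObjectMapper
    import com.fasterxml.jackson.module.scala.DefaultScalaModule

    val mapper = new ObjectMapper()
    mapper.registerModule(DefaultScalaModule) // teaches Jackson to handle Scala Map, Option, etc.
    mapper.writeValueAsString(Map("Message" -> "boom", "StatusCode" -> "403"))
    // => {"Message":"boom","StatusCode":"403"}
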
@@ -64,6 +71,9 @@ trait FlintJobExecutor {
         "sessionId": {
           "type": "keyword"
         },
+        "jobType": {
+          "type": "keyword"
+        },
         "updateTime": {
           "type": "date",
           "format": "strict_date_time||epoch_millis"
@@ -188,6 +198,7 @@ trait FlintJobExecutor {
         StructField("queryId", StringType, nullable = true),
         StructField("queryText", StringType, nullable = true),
         StructField("sessionId", StringType, nullable = true),
+        StructField("jobType", StringType, nullable = true),
         // number is not nullable
         StructField("updateTime", LongType, nullable = false),
         StructField("queryRunTime", LongType, nullable = true)))
@@ -216,6 +227,7 @@ trait FlintJobExecutor {
         queryId,
         query,
         sessionId,
+        spark.conf.get(FlintSparkConf.JOB_TYPE.key),
         endTime,
         endTime - startTime))
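
The new `jobType` column is read from the Spark conf at result-build time instead of being threaded through as a parameter; the same lookup feeds the second result-row builder further below. A hedged usage sketch (the literal key string behind `FlintSparkConf.JOB_TYPE` is defined in that config class, not in this diff, and "batch" is an illustrative value):

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.flint.config.FlintSparkConf

    val spark = SparkSession.builder().master("local[1]").getOrCreate()
    spark.conf.set(FlintSparkConf.JOB_TYPE.key, "batch")
    val jobType = spark.conf.get(FlintSparkConf.JOB_TYPE.key) // "batch"
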
@@ -246,6 +258,7 @@ trait FlintJobExecutor {
         StructField("queryId", StringType, nullable = true),
         StructField("queryText", StringType, nullable = true),
         StructField("sessionId", StringType, nullable = true),
+        StructField("jobType", StringType, nullable = true),
         // number is not nullable
         StructField("updateTime", LongType, nullable = false),
         StructField("queryRunTime", LongType, nullable = true)))
@@ -265,6 +278,7 @@ trait FlintJobExecutor {
         queryId,
         query,
         sessionId,
+        spark.conf.get(FlintSparkConf.JOB_TYPE.key),
         endTime,
         endTime - startTime))
@@ -328,7 +342,7 @@ trait FlintJobExecutor {
     val inputJson = Json.parse(input)
     val mappingJson = Json.parse(mapping)

-    compareJson(inputJson, mappingJson)
+    compareJson(inputJson, mappingJson) || compareJson(mappingJson, inputJson)
   }

   def checkAndCreateIndex(osClient: OSClient, resultIndex: String): Either[String, Unit] = {
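
Making the mapping check symmetric lets validation pass when either side is a superset of the other, e.g. an existing result index created before `jobType` was added to the template. A hedged illustration, assuming `compareJson(a, b)` returns true when `a` contains everything in `b` (its implementation is unchanged and not shown in this diff):

    val existing = Json.parse("""{"sessionId": {"type": "keyword"}}""")
    val expected = Json.parse("""{"sessionId": {"type": "keyword"}, "jobType": {"type": "keyword"}}""")
    compareJson(existing, expected)                                    // false: existing lacks jobType
    compareJson(existing, expected) || compareJson(expected, existing) // true: expected covers existing
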
@@ -409,68 +423,58 @@ trait FlintJobExecutor {
   private def handleQueryException(
       e: Exception,
       message: String,
-      spark: SparkSession,
-      dataSource: String,
-      query: String,
-      queryId: String,
-      sessionId: String): String = {
-    val error = s"$message: ${e.getMessage}"
-    logError(error, e)
-    error
+      errorSource: Option[String] = None,
+      statusCode: Option[Int] = None): String = {
+
+    val errorDetails = Map("Message" -> s"$message: ${e.getMessage}") ++
+      errorSource.map("ErrorSource" -> _) ++
+      statusCode.map(code => "StatusCode" -> code.toString)
+
+    val errorJson = mapper.writeValueAsString(errorDetails)
+    logError(errorJson, e)
+    errorJson
   }

   def getRootCause(e: Throwable): Throwable = {
     if (e.getCause == null) e
     else getRootCause(e.getCause)
   }

-  def processQueryException(
-      ex: Exception,
-      spark: SparkSession,
-      dataSource: String,
-      query: String,
-      queryId: String,
-      sessionId: String): String = {
+  /**
+   * Converts a query exception into an error string, which is then persisted to the query
+   * result metadata.
+   */
+  def processQueryException(ex: Exception): String = {
     getRootCause(ex) match {
       case r: ParseException =>
-        handleQueryException(r, "Syntax error", spark, dataSource, query, queryId, sessionId)
+        handleQueryException(r, "Syntax error")
       case r: AmazonS3Exception =>
         incrementCounter(MetricConstants.S3_ERR_CNT_METRIC)
         handleQueryException(
           r,
           "Fail to read data from S3. Cause",
-          spark,
-          dataSource,
-          query,
-          queryId,
-          sessionId)
-      case r: AnalysisException =>
+          Some(r.getServiceName),
+          Some(r.getStatusCode))
+      case r: AWSGlueException =>
+        incrementCounter(MetricConstants.GLUE_ERR_CNT_METRIC)
+        // Redact the error message for access-denied failures in the AWS Glue service
+        r match {
+          case accessDenied: AccessDeniedException =>
+            accessDenied.setErrorMessage(
+              "Access denied in AWS Glue service. Please check permissions.")
+          case _ => // No additional action for other types of AWSGlueException
+        }
         handleQueryException(
           r,
-          "Fail to analyze query. Cause",
-          spark,
-          dataSource,
-          query,
-          queryId,
-          sessionId)
+          "Fail to read data from Glue. Cause",
+          Some(r.getServiceName),
+          Some(r.getStatusCode))
+      case r: AnalysisException =>
+        handleQueryException(r, "Fail to analyze query. Cause")
       case r: SparkException =>
-        handleQueryException(
-          r,
-          "Spark exception. Cause",
-          spark,
-          dataSource,
-          query,
-          queryId,
-          sessionId)
+        handleQueryException(r, "Spark exception. Cause")
       case r: Exception =>
-        handleQueryException(
-          r,
-          "Fail to run query, cause",
-          spark,
-          dataSource,
-          query,
-          queryId,
-          sessionId)
+        handleQueryException(r, "Fail to run query. Cause")
     }
   }
 }
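
After this rework, `handleQueryException` returns a JSON object rather than a bare message, so the error source and HTTP status code survive into the persisted query result. A hedged sketch of the S3 branch, called on a FlintJobExecutor instance (the AWS SDK composes the real `getMessage` text, so the Message value shown is approximate):

    val s3Error = new AmazonS3Exception("Access Denied")
    s3Error.setServiceName("Amazon S3") // setters inherited from AmazonServiceException
    s3Error.setStatusCode(403)
    processQueryException(s3Error)
    // ~> {"Message":"Fail to read data from S3. Cause: Access Denied ...","ErrorSource":"Amazon S3","StatusCode":"403"}
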