@@ -7,7 +7,10 @@ package org.apache.spark.sql
 
 import java.util.Locale
 
+import com.amazonaws.services.glue.model.{AccessDeniedException, AWSGlueException}
 import com.amazonaws.services.s3.model.AmazonS3Exception
+import com.fasterxml.jackson.databind.ObjectMapper
+import com.fasterxml.jackson.module.scala.DefaultScalaModule
 import org.apache.commons.text.StringEscapeUtils.unescapeJava
 import org.opensearch.flint.core.IRestHighLevelClient
 import org.opensearch.flint.core.metrics.MetricConstants
@@ -17,13 +20,17 @@ import play.api.libs.json._
 import org.apache.spark.{SparkConf, SparkException}
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.catalyst.parser.ParseException
+import org.apache.spark.sql.flint.config.FlintSparkConf
 import org.apache.spark.sql.flint.config.FlintSparkConf.REFRESH_POLICY
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.util._
 
 trait FlintJobExecutor {
   this: Logging =>
 
+  val mapper = new ObjectMapper()
+  mapper.registerModule(DefaultScalaModule)
+
   var currentTimeProvider: TimeProvider = new RealTimeProvider()
   var threadPoolFactory: ThreadPoolFactory = new DefaultThreadPoolFactory()
   var envinromentProvider: EnvironmentProvider = new RealEnvironment()
@@ -65,6 +72,9 @@ trait FlintJobExecutor {
         "sessionId": {
           "type": "keyword"
         },
+        "jobType": {
+          "type": "keyword"
+        },
         "updateTime": {
           "type": "date",
           "format": "strict_date_time||epoch_millis"
@@ -190,6 +200,7 @@ trait FlintJobExecutor {
         StructField("queryId", StringType, nullable = true),
         StructField("queryText", StringType, nullable = true),
         StructField("sessionId", StringType, nullable = true),
+        StructField("jobType", StringType, nullable = true),
         // number is not nullable
         StructField("updateTime", LongType, nullable = false),
         StructField("queryRunTime", LongType, nullable = true)))
@@ -218,6 +229,7 @@ trait FlintJobExecutor {
         queryId,
         query,
         sessionId,
+        spark.conf.get(FlintSparkConf.JOB_TYPE.key),
         endTime,
         endTime - startTime))
 
@@ -248,6 +260,7 @@ trait FlintJobExecutor {
         StructField("queryId", StringType, nullable = true),
         StructField("queryText", StringType, nullable = true),
         StructField("sessionId", StringType, nullable = true),
+        StructField("jobType", StringType, nullable = true),
         // number is not nullable
         StructField("updateTime", LongType, nullable = false),
         StructField("queryRunTime", LongType, nullable = true)))
@@ -267,6 +280,7 @@ trait FlintJobExecutor {
         queryId,
         query,
         sessionId,
+        spark.conf.get(FlintSparkConf.JOB_TYPE.key),
         endTime,
         endTime - startTime))
 
@@ -330,7 +344,7 @@ trait FlintJobExecutor {
     val inputJson = Json.parse(input)
     val mappingJson = Json.parse(mapping)
 
-    compareJson(inputJson, mappingJson)
+    compareJson(inputJson, mappingJson) || compareJson(mappingJson, inputJson)
   }
 
   def checkAndCreateIndex(osClient: OSClient, resultIndex: String): Either[String, Unit] = {
@@ -411,68 +425,58 @@ trait FlintJobExecutor {
   private def handleQueryException(
       e: Exception,
       message: String,
-      spark: SparkSession,
-      dataSource: String,
-      query: String,
-      queryId: String,
-      sessionId: String): String = {
-    val error = s"$message: ${e.getMessage}"
-    logError(error, e)
-    error
+      errorSource: Option[String] = None,
+      statusCode: Option[Int] = None): String = {
+
+    val errorDetails = Map("Message" -> s"$message: ${e.getMessage}") ++
+      errorSource.map("ErrorSource" -> _) ++
+      statusCode.map(code => "StatusCode" -> code.toString)
+
+    val errorJson = mapper.writeValueAsString(errorDetails)
+    logError(errorJson, e)
+    errorJson
   }
 
   def getRootCause(e: Throwable): Throwable = {
     if (e.getCause == null) e
     else getRootCause(e.getCause)
   }
 
-  def processQueryException(
-      ex: Exception,
-      spark: SparkSession,
-      dataSource: String,
-      query: String,
-      queryId: String,
-      sessionId: String): String = {
+  /**
+   * Converts a query exception into an error string, which is then persisted to the query
+   * result metadata.
+   */
+  def processQueryException(ex: Exception): String = {
     getRootCause(ex) match {
       case r: ParseException =>
-        handleQueryException(r, "Syntax error", spark, dataSource, query, queryId, sessionId)
+        handleQueryException(r, "Syntax error")
       case r: AmazonS3Exception =>
         incrementCounter(MetricConstants.S3_ERR_CNT_METRIC)
         handleQueryException(
           r,
           "Fail to read data from S3. Cause",
-          spark,
-          dataSource,
-          query,
-          queryId,
-          sessionId)
-      case r: AnalysisException =>
+          Some(r.getServiceName),
+          Some(r.getStatusCode))
+      case r: AWSGlueException =>
+        incrementCounter(MetricConstants.GLUE_ERR_CNT_METRIC)
+        // Redact Access denied in AWS Glue service
+        r match {
+          case accessDenied: AccessDeniedException =>
+            accessDenied.setErrorMessage(
+              "Access denied in AWS Glue service. Please check permissions.")
+          case _ => // No additional action for other types of AWSGlueException
+        }
         handleQueryException(
           r,
-          "Fail to analyze query. Cause",
-          spark,
-          dataSource,
-          query,
-          queryId,
-          sessionId)
+          "Fail to read data from Glue. Cause",
+          Some(r.getServiceName),
+          Some(r.getStatusCode))
+      case r: AnalysisException =>
+        handleQueryException(r, "Fail to analyze query. Cause")
       case r: SparkException =>
-        handleQueryException(
-          r,
-          "Spark exception. Cause",
-          spark,
-          dataSource,
-          query,
-          queryId,
-          sessionId)
+        handleQueryException(r, "Spark exception. Cause")
       case r: Exception =>
-        handleQueryException(
-          r,
-          "Fail to run query, cause",
-          spark,
-          dataSource,
-          query,
-          queryId,
-          sessionId)
+        handleQueryException(r, "Fail to run query. Cause")
     }
   }
 }
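
Note on the error-handling hunk: the new handleQueryException builds a JSON error string by appending the optional ErrorSource and StatusCode entries to a Map only when they are present, then serializing the Map with Jackson. The standalone sketch below reproduces that pattern outside the trait; the object and method names are hypothetical, and only ObjectMapper, DefaultScalaModule, and the Map ++ Option idiom come from the diff itself.

import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.scala.DefaultScalaModule

object ErrorJsonSketch extends App {
  // Same setup as the trait-level mapper added in this diff.
  val mapper = new ObjectMapper()
  mapper.registerModule(DefaultScalaModule)

  // Option ++ Map composition: an absent Option contributes no entry,
  // so the serialized JSON carries only the fields that were supplied.
  def buildErrorJson(
      message: String,
      errorSource: Option[String] = None,
      statusCode: Option[Int] = None): String = {
    val details = Map("Message" -> message) ++
      errorSource.map("ErrorSource" -> _) ++
      statusCode.map(code => "StatusCode" -> code.toString)
    mapper.writeValueAsString(details)
  }

  // Prints (field order aside):
  // {"Message":"Fail to read data from S3. Cause: access denied","ErrorSource":"Amazon S3","StatusCode":"403"}
  println(buildErrorJson(
    "Fail to read data from S3. Cause: access denied",
    errorSource = Some("Amazon S3"),
    statusCode = Some(403)))

  // Prints: {"Message":"Syntax error: mismatched input"}
  println(buildErrorJson("Syntax error: mismatched input"))
}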
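Note on the compareJson hunk: the check becomes symmetric because either side may be a subset of the other once jobType enters the result-index mapping: an index created before this change lacks the field, while a freshly created index has it. compareJson itself is defined elsewhere in this file; the sketch below uses a hypothetical isSubset stand-in over play-json values purely to illustrate why both directions are accepted.

import play.api.libs.json._

// Hypothetical stand-in for compareJson: true when every field of `left`
// also appears in `right` with a recursively matching value.
def isSubset(left: JsValue, right: JsValue): Boolean = (left, right) match {
  case (l: JsObject, r: JsObject) =>
    l.fields.forall { case (name, lv) =>
      (r \ name).toOption.exists(rv => isSubset(lv, rv))
    }
  case (l, r) => l == r
}

val existing = Json.parse("""{"properties":{"jobType":{"type":"keyword"}}}""")
val expected = Json.parse("""{"properties":{}}""")

// Accepting a subset match in either direction keeps both old indices
// (missing jobType) and new indices (carrying jobType) valid.
val compatible = isSubset(expected, existing) || isSubset(existing, expected)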