
Commit 6ecaa05

Enhance index monitor to terminate streaming job on consecutive errors (#346) (#347)
* Add error counter and terminate logic in index monitor
* Add new Spark conf for max error count and interval
* Add new Spark conf for initial delay too
* Update user manual

---------

(cherry picked from commit 9de4f28)

Signed-off-by: Chen Dai <daichen@amazon.com>
Signed-off-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
1 parent 422dae7 commit 6ecaa05

File tree

5 files changed: +141 -29 lines changed


docs/index.md

Lines changed: 3 additions & 0 deletions
@@ -519,6 +519,9 @@ In the index mapping, the `_meta` and `properties`field stores meta and schema i
 - `spark.flint.index.hybridscan.enabled`: default is false.
 - `spark.flint.index.checkpoint.mandatory`: default is true.
 - `spark.datasource.flint.socket_timeout_millis`: default value is 60000.
+- `spark.flint.monitor.initialDelaySeconds`: Initial delay in seconds before starting the monitoring task. Default value is 15.
+- `spark.flint.monitor.intervalSeconds`: Interval in seconds for scheduling the monitoring task. Default value is 60.
+- `spark.flint.monitor.maxErrorCount`: Maximum number of consecutive errors allowed before stopping the monitoring task. Default value is 5.
 
 #### Data Type Mapping

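The three monitor settings documented above are ordinary Spark configuration entries, so they can be supplied like any other conf when the session is created. A minimal sketch (the app name and the values shown are illustrative only, not part of this change):

import org.apache.spark.sql.SparkSession

// Hypothetical session setup; the values shown are the documented defaults.
val spark = SparkSession
  .builder()
  .appName("flint-monitor-example") // illustrative name
  .config("spark.flint.monitor.initialDelaySeconds", "15")
  .config("spark.flint.monitor.intervalSeconds", "60")
  .config("spark.flint.monitor.maxErrorCount", "5")
  .getOrCreate()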
flint-spark-integration/src/main/scala/org/apache/spark/sql/flint/config/FlintSparkConf.scala

Lines changed: 18 additions & 0 deletions
@@ -154,6 +154,18 @@ object FlintSparkConf {
     .doc("Checkpoint location for incremental refresh index will be mandatory if enabled")
     .createWithDefault("true")
 
+  val MONITOR_INITIAL_DELAY_SECONDS = FlintConfig("spark.flint.monitor.initialDelaySeconds")
+    .doc("Initial delay in seconds before starting the monitoring task")
+    .createWithDefault("15")
+
+  val MONITOR_INTERVAL_SECONDS = FlintConfig("spark.flint.monitor.intervalSeconds")
+    .doc("Interval in seconds for scheduling the monitoring task")
+    .createWithDefault("60")
+
+  val MONITOR_MAX_ERROR_COUNT = FlintConfig("spark.flint.monitor.maxErrorCount")
+    .doc("Maximum number of consecutive errors allowed in index monitor")
+    .createWithDefault("5")
+
   val SOCKET_TIMEOUT_MILLIS =
     FlintConfig(s"spark.datasource.flint.${FlintOptions.SOCKET_TIMEOUT_MILLIS}")
       .datasourceOption()
@@ -223,6 +235,12 @@ case class FlintSparkConf(properties: JMap[String, String]) extends Serializable
 
   def isCheckpointMandatory: Boolean = CHECKPOINT_MANDATORY.readFrom(reader).toBoolean
 
+  def monitorInitialDelaySeconds(): Int = MONITOR_INITIAL_DELAY_SECONDS.readFrom(reader).toInt
+
+  def monitorIntervalSeconds(): Int = MONITOR_INTERVAL_SECONDS.readFrom(reader).toInt
+
+  def monitorMaxErrorCount(): Int = MONITOR_MAX_ERROR_COUNT.readFrom(reader).toInt
+
   /**
    * spark.sql.session.timeZone
    */

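Each new getter calls `readFrom(reader)` at invocation time, so the returned value reflects the current Spark session conf rather than a snapshot taken when `FlintSparkConf()` was constructed (the conf suite change further below relies on this). A minimal usage sketch, assuming an active Spark session:

import org.apache.spark.sql.flint.config.FlintSparkConf

// Values resolve against whatever is currently set in the session conf.
val conf = FlintSparkConf()
val initialDelay = conf.monitorInitialDelaySeconds() // 15 unless overridden
val interval = conf.monitorIntervalSeconds()         // 60 unless overridden
val maxErrors = conf.monitorMaxErrorCount()          // 5 unless overridden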
flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSparkIndexMonitor.scala

Lines changed: 79 additions & 29 deletions
@@ -16,6 +16,7 @@ import org.opensearch.flint.core.metrics.{MetricConstants, MetricsUtil}
 
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.flint.config.FlintSparkConf
 import org.apache.spark.sql.flint.newDaemonThreadPoolScheduledExecutor
 
 /**
@@ -34,43 +35,32 @@ class FlintSparkIndexMonitor(
     dataSourceName: String)
     extends Logging {
 
+  /** Task execution initial delay in seconds */
+  private val INITIAL_DELAY_SECONDS = FlintSparkConf().monitorInitialDelaySeconds()
+
+  /** Task execution interval in seconds */
+  private val INTERVAL_SECONDS = FlintSparkConf().monitorIntervalSeconds()
+
+  /** Max error count allowed */
+  private val MAX_ERROR_COUNT = FlintSparkConf().monitorMaxErrorCount()
+
   /**
    * Start monitoring task on the given Flint index.
    *
    * @param indexName
    *   Flint index name
    */
   def startMonitor(indexName: String): Unit = {
-    val task = FlintSparkIndexMonitor.executor.scheduleWithFixedDelay(
-      () => {
-        logInfo(s"Scheduler trigger index monitor task for $indexName")
-        try {
-          if (isStreamingJobActive(indexName)) {
-            logInfo("Streaming job is still active")
-            flintClient
-              .startTransaction(indexName, dataSourceName)
-              .initialLog(latest => latest.state == REFRESHING)
-              .finalLog(latest => latest) // timestamp will update automatically
-              .commit(_ => {})
-          } else {
-            logError("Streaming job is not active. Cancelling monitor task")
-            flintClient
-              .startTransaction(indexName, dataSourceName)
-              .initialLog(_ => true)
-              .finalLog(latest => latest.copy(state = FAILED))
-              .commit(_ => {})
+    logInfo(s"""Starting index monitor for $indexName with configuration:
+         | - Initial delay: $INITIAL_DELAY_SECONDS seconds
+         | - Interval: $INTERVAL_SECONDS seconds
+         | - Max error count: $MAX_ERROR_COUNT
+         |""".stripMargin)
 
-            stopMonitor(indexName)
-            logInfo("Index monitor task is cancelled")
-          }
-        } catch {
-          case e: Throwable =>
-            logError("Failed to update index log entry", e)
-            MetricsUtil.incrementCounter(MetricConstants.STREAMING_HEARTBEAT_FAILED_METRIC)
-        }
-      },
-      15, // Delay to ensure final logging is complete first, otherwise version conflicts
-      60, // TODO: make interval configurable
+    val task = FlintSparkIndexMonitor.executor.scheduleWithFixedDelay(
+      new FlintSparkIndexMonitorTask(indexName),
+      INITIAL_DELAY_SECONDS, // Delay to ensure final logging is complete first, otherwise version conflicts
+      INTERVAL_SECONDS,
       TimeUnit.SECONDS)
 
     FlintSparkIndexMonitor.indexMonitorTracker.put(indexName, task)
@@ -92,8 +82,68 @@ class FlintSparkIndexMonitor(
     }
   }
 
+  /**
+   * Index monitor task that encapsulates the execution logic with number of consecutive error
+   * tracked.
+   *
+   * @param indexName
+   *   Flint index name
+   */
+  private class FlintSparkIndexMonitorTask(indexName: String) extends Runnable {
+
+    /** The number of consecutive error */
+    private var errorCnt = 0
+
+    override def run(): Unit = {
+      logInfo(s"Scheduler trigger index monitor task for $indexName")
+      try {
+        if (isStreamingJobActive(indexName)) {
+          logInfo("Streaming job is still active")
+          flintClient
+            .startTransaction(indexName, dataSourceName)
+            .initialLog(latest => latest.state == REFRESHING)
+            .finalLog(latest => latest) // timestamp will update automatically
+            .commit(_ => {})
+        } else {
+          logError("Streaming job is not active. Cancelling monitor task")
+          flintClient
+            .startTransaction(indexName, dataSourceName)
+            .initialLog(_ => true)
+            .finalLog(latest => latest.copy(state = FAILED))
+            .commit(_ => {})
+
+          stopMonitor(indexName)
+          logInfo("Index monitor task is cancelled")
+        }
+        errorCnt = 0 // Reset counter if no error
+      } catch {
+        case e: Throwable =>
+          errorCnt += 1
+          logError(s"Failed to update index log entry, consecutive errors: $errorCnt", e)
+          MetricsUtil.incrementCounter(MetricConstants.STREAMING_HEARTBEAT_FAILED_METRIC)
+
+          // Stop streaming job and its monitor if max retry limit reached
+          if (errorCnt >= MAX_ERROR_COUNT) {
+            logInfo(s"Terminating streaming job and index monitor for $indexName")
+            stopStreamingJob(indexName)
+            stopMonitor(indexName)
+            logInfo(s"Streaming job and index monitor terminated")
+          }
+      }
+    }
+  }
+
   private def isStreamingJobActive(indexName: String): Boolean =
     spark.streams.active.exists(_.name == indexName)
+
+  private def stopStreamingJob(indexName: String): Unit = {
+    val job = spark.streams.active.find(_.name == indexName)
+    if (job.isDefined) {
+      job.get.stop()
+    } else {
+      logWarning("Refreshing job not found")
+    }
+  }
 }
 
 object FlintSparkIndexMonitor extends Logging {

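The core of this change is the error bookkeeping in `FlintSparkIndexMonitorTask`: reset the counter after a successful heartbeat, increment it on any exception, and once it reaches `MAX_ERROR_COUNT` stop both the streaming job and the monitor itself. The following standalone sketch shows the same pattern detached from Flint's client and Spark streaming APIs; all names in it are hypothetical and the heartbeat is simulated to always fail:

import java.util.concurrent.{Executors, ScheduledFuture, TimeUnit}

object ConsecutiveErrorMonitorSketch {

  // Stand-in for the real heartbeat (index log update); always fails for demonstration.
  private def doHeartbeat(): Unit =
    throw new IllegalStateException("simulated heartbeat failure")

  def main(args: Array[String]): Unit = {
    val maxErrorCount = 3
    val executor = Executors.newSingleThreadScheduledExecutor()

    // Holder so the task can cancel its own schedule once the limit is reached.
    var schedule: ScheduledFuture[_] = null

    val task = new Runnable {
      private var errorCnt = 0

      override def run(): Unit = {
        try {
          doHeartbeat()
          errorCnt = 0 // Reset counter after a successful run
        } catch {
          case e: Throwable =>
            errorCnt += 1
            println(s"Heartbeat failed, consecutive errors: $errorCnt (${e.getMessage})")
            if (errorCnt >= maxErrorCount) {
              println("Max error count reached, terminating monitor")
              schedule.cancel(false) // Stop further executions of this task
              executor.shutdown()
            }
        }
      }
    }

    schedule = executor.scheduleWithFixedDelay(task, 1, 1, TimeUnit.SECONDS)
  }
}

In the actual monitor, the cancel step corresponds to stopMonitor(indexName) on the tracked scheduled task plus stopStreamingJob(indexName) on the active streaming query.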
flint-spark-integration/src/test/scala/org/apache/spark/sql/flint/config/FlintSparkConfSuite.scala

Lines changed: 19 additions & 0 deletions
@@ -13,6 +13,7 @@ import org.opensearch.flint.core.http.FlintRetryOptions._
 import org.scalatest.matchers.should.Matchers.convertToAnyShouldWrapper
 
 import org.apache.spark.FlintSuite
+import org.apache.spark.sql.flint.config.FlintSparkConf.{MONITOR_INITIAL_DELAY_SECONDS, MONITOR_INTERVAL_SECONDS, MONITOR_MAX_ERROR_COUNT}
 
 class FlintSparkConfSuite extends FlintSuite {
   test("test spark conf") {
@@ -84,6 +85,24 @@ class FlintSparkConfSuite extends FlintSuite {
     overrideConf.flintOptions().getBatchBytes shouldBe 4 * 1024 * 1024
   }
 
+  test("test index monitor options") {
+    val defaultConf = FlintSparkConf()
+    defaultConf.monitorInitialDelaySeconds() shouldBe 15
+    defaultConf.monitorIntervalSeconds() shouldBe 60
+    defaultConf.monitorMaxErrorCount() shouldBe 5
+
+    withSparkConf(MONITOR_MAX_ERROR_COUNT.key, MONITOR_INTERVAL_SECONDS.key) {
+      setFlintSparkConf(MONITOR_INITIAL_DELAY_SECONDS, 5)
+      setFlintSparkConf(MONITOR_INTERVAL_SECONDS, 30)
+      setFlintSparkConf(MONITOR_MAX_ERROR_COUNT, 10)
+
+      val overrideConf = FlintSparkConf()
+      defaultConf.monitorInitialDelaySeconds() shouldBe 5
+      overrideConf.monitorIntervalSeconds() shouldBe 30
+      overrideConf.monitorMaxErrorCount() shouldBe 10
+    }
+  }
+
   /**
    * Delete index `indexNames` after calling `f`.
    */

integ-test/src/test/scala/org/opensearch/flint/spark/FlintSparkIndexMonitorITSuite.scala

Lines changed: 22 additions & 0 deletions
@@ -19,6 +19,7 @@ import org.opensearch.flint.OpenSearchTransactionSuite
 import org.opensearch.flint.spark.skipping.FlintSparkSkippingIndex.getSkippingIndexName
 import org.scalatest.matchers.should.Matchers
 
+import org.apache.spark.sql.flint.config.FlintSparkConf.MONITOR_MAX_ERROR_COUNT
 import org.apache.spark.sql.flint.newDaemonThreadPoolScheduledExecutor
 
 class FlintSparkIndexMonitorITSuite extends OpenSearchTransactionSuite with Matchers {
@@ -40,6 +41,9 @@ class FlintSparkIndexMonitorITSuite extends OpenSearchTransactionSuite with Matc
       realExecutor.scheduleWithFixedDelay(invocation.getArgument(0), 5, 1, TimeUnit.SECONDS)
     }).when(FlintSparkIndexMonitor.executor)
       .scheduleWithFixedDelay(any[Runnable], any[Long], any[Long], any[TimeUnit])
+
+    // Set max error count higher to avoid impact on transient error test case
+    setFlintSparkConf(MONITOR_MAX_ERROR_COUNT, 10)
   }
 
   override def beforeEach(): Unit = {
@@ -128,6 +132,24 @@ class FlintSparkIndexMonitorITSuite extends OpenSearchTransactionSuite with Matc
     }
   }
 
+  test("monitor task and streaming job should terminate if exception occurred consistently") {
+    val task = FlintSparkIndexMonitor.indexMonitorTracker(testFlintIndex)
+
+    // Block write on metadata log index
+    setWriteBlockOnMetadataLogIndex(true)
+    waitForMonitorTaskRun()
+
+    // Both monitor task and streaming job should stop after 10 times
+    10 times { (_, _) =>
+      {
+        // assert nothing. just wait enough times of task execution
+      }
+    }
+
+    task.isCancelled shouldBe true
+    spark.streams.active.exists(_.name == testFlintIndex) shouldBe false
+  }
+
   private def getLatestTimestamp: (Long, Long) = {
     val latest = latestLogEntry(testLatestId)
     (latest("jobStartTime").asInstanceOf[Long], latest("lastUpdateTime").asInstanceOf[Long])
