Skip to content

Commit 68bcce8

Browse files
authored
Reduce Session Document Retrieval in FintREPL to Enhance Latency Metrics (#179)
This PR reduces the frequency of 'getSessionDoc' calls in two places of FlintREPL, addressing the correlation between request count and query latency metrics. 1. **Heartbeat Update Optimization**: - Previously, before each heartbeat update, the session document was fetched to obtain the sequence number and primary term for concurrency control. - This PR removes that get-session-doc call and instead directly upserts the last update time and state. 2. **Session Document Retrieval before Statement Processing**: - Previously, in scenarios where a query takes 10 minutes, the 'getSessionDoc' call is limited to once per 10 minutes. However, in idle states with no running queries, the call is made as often as every 100 milliseconds. - This PR reduces the frequency of 'getSessionDoc' calls by ensuring the call is made at least 1 minute after the previous call. **Testing**: - Verified consistent 1-minute intervals for heartbeat updates. - Confirmed the 'getSessionDoc' call executes every 1 minute prior to picking up the next statement. Signed-off-by: Kaituo Li <kaituo@amazon.com>
1 parent 3965b3e commit 68bcce8

File tree

3 files changed

+33
-41
lines changed

3 files changed

+33
-41
lines changed

spark-sql-application/src/main/scala/org/apache/spark/sql/CommandState.scala

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,5 @@ case class CommandState(
1414
recordedVerificationResult: VerificationResult,
1515
flintReader: FlintReader,
1616
futureMappingCheck: Future[Either[String, Unit]],
17-
executionContext: ExecutionContextExecutor)
17+
executionContext: ExecutionContextExecutor,
18+
recordedLastCanPickCheckTime: Long)

spark-sql-application/src/main/scala/org/apache/spark/sql/FlintREPL.scala

Lines changed: 30 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,18 @@
66
package org.apache.spark.sql
77

88
import java.net.ConnectException
9-
import java.time.Instant
10-
import java.util.Map
119
import java.util.concurrent.{ScheduledExecutorService, ScheduledFuture}
1210

1311
import scala.concurrent.{ExecutionContext, ExecutionContextExecutor, Future, TimeoutException}
1412
import scala.concurrent.duration.{Duration, MINUTES, _}
1513
import scala.util.{Failure, Success, Try}
1614
import scala.util.control.NonFatal
1715

16+
import org.json4s.native.Serialization
1817
import org.opensearch.action.get.GetResponse
1918
import org.opensearch.common.Strings
2019
import org.opensearch.flint.app.{FlintCommand, FlintInstance}
20+
import org.opensearch.flint.app.FlintInstance.formats
2121
import org.opensearch.flint.core.storage.{FlintReader, OpenSearchUpdater}
2222

2323
import org.apache.spark.SparkConf
@@ -47,6 +47,7 @@ object FlintREPL extends Logging with FlintJobExecutor {
4747
private val DEFAULT_QUERY_EXECUTION_TIMEOUT = Duration(30, MINUTES)
4848
private val DEFAULT_QUERY_WAIT_TIMEOUT_MILLIS = 10 * 60 * 1000
4949
val INITIAL_DELAY_MILLIS = 3000L
50+
val EARLY_TERMIANTION_CHECK_FREQUENCY = 60000L
5051

5152
def update(flintCommand: FlintCommand, updater: OpenSearchUpdater): Unit = {
5253
updater.update(flintCommand.statementId, FlintCommand.serialize(flintCommand))
@@ -292,10 +293,11 @@ object FlintREPL extends Logging with FlintJobExecutor {
292293
var lastActivityTime = currentTimeProvider.currentEpochMillis()
293294
var verificationResult: VerificationResult = NotVerified
294295
var canPickUpNextStatement = true
296+
var lastCanPickCheckTime = 0L
295297
while (currentTimeProvider
296298
.currentEpochMillis() - lastActivityTime <= commandContext.inactivityLimitMillis && canPickUpNextStatement) {
297299
logInfo(
298-
s"""read from ${commandContext.sessionIndex}, sessionId: $commandContext.sessionId""")
300+
s"""read from ${commandContext.sessionIndex}, sessionId: ${commandContext.sessionId}""")
299301
val flintReader: FlintReader =
300302
createQueryReader(
301303
commandContext.osClient,
@@ -309,18 +311,21 @@ object FlintREPL extends Logging with FlintJobExecutor {
309311
verificationResult,
310312
flintReader,
311313
futureMappingCheck,
312-
executionContext)
313-
val result: (Long, VerificationResult, Boolean) =
314+
executionContext,
315+
lastCanPickCheckTime)
316+
val result: (Long, VerificationResult, Boolean, Long) =
314317
processCommands(commandContext, commandState)
315318

316319
val (
317320
updatedLastActivityTime,
318321
updatedVerificationResult,
319-
updatedCanPickUpNextStatement) = result
322+
updatedCanPickUpNextStatement,
323+
updatedLastCanPickCheckTime) = result
320324

321325
lastActivityTime = updatedLastActivityTime
322326
verificationResult = updatedVerificationResult
323327
canPickUpNextStatement = updatedCanPickUpNextStatement
328+
lastCanPickCheckTime = updatedLastCanPickCheckTime
324329
} finally {
325330
flintReader.close()
326331
}
@@ -481,18 +486,27 @@ object FlintREPL extends Logging with FlintJobExecutor {
481486

482487
private def processCommands(
483488
context: CommandContext,
484-
state: CommandState): (Long, VerificationResult, Boolean) = {
489+
state: CommandState): (Long, VerificationResult, Boolean, Long) = {
485490
import context._
486491
import state._
487492

488493
var lastActivityTime = recordedLastActivityTime
489494
var verificationResult = recordedVerificationResult
490495
var canProceed = true
491496
var canPickNextStatementResult = true // Add this line to keep track of canPickNextStatement
497+
var lastCanPickCheckTime = recordedLastCanPickCheckTime
492498

493499
while (canProceed) {
494-
if (!canPickNextStatement(sessionId, jobId, osClient, sessionIndex)) {
495-
canPickNextStatementResult = false
500+
val currentTime = currentTimeProvider.currentEpochMillis()
501+
502+
// Only call canPickNextStatement if EARLY_TERMIANTION_CHECK_FREQUENCY milliseconds have passed
503+
if (currentTime - lastCanPickCheckTime > EARLY_TERMIANTION_CHECK_FREQUENCY) {
504+
canPickNextStatementResult =
505+
canPickNextStatement(sessionId, jobId, osClient, sessionIndex)
506+
lastCanPickCheckTime = currentTime
507+
}
508+
509+
if (!canPickNextStatementResult) {
496510
canProceed = false
497511
} else if (!flintReader.hasNext) {
498512
canProceed = false
@@ -524,7 +538,7 @@ object FlintREPL extends Logging with FlintJobExecutor {
524538
}
525539

526540
// return tuple indicating if still active and mapping verification result
527-
(lastActivityTime, verificationResult, canPickNextStatementResult)
541+
(lastActivityTime, verificationResult, canPickNextStatementResult, lastCanPickCheckTime)
528542
}
529543

530544
/**
@@ -888,20 +902,12 @@ object FlintREPL extends Logging with FlintJobExecutor {
888902
return // Exit the run method if the thread is interrupted
889903
}
890904

891-
val getResponse = osClient.getDoc(sessionIndex, sessionId)
892-
if (getResponse.isExists()) {
893-
val source = getResponse.getSourceAsMap
894-
val flintInstance = FlintInstance.deserializeFromMap(source)
895-
flintInstance.state = "running"
896-
flintSessionUpdater.updateIf(
897-
sessionId,
898-
FlintInstance.serializeWithoutJobId(
899-
flintInstance,
900-
currentTimeProvider.currentEpochMillis()),
901-
getResponse.getSeqNo,
902-
getResponse.getPrimaryTerm)
903-
}
904-
// do nothing if the session doc does not exist
905+
flintSessionUpdater.upsert(
906+
sessionId,
907+
Serialization.write(
908+
Map(
909+
"lastUpdateTime" -> currentTimeProvider.currentEpochMillis(),
910+
"state" -> "running")))
905911
} catch {
906912
case ie: InterruptedException =>
907913
// Preserve the interrupt status

spark-sql-application/src/test/scala/org/apache/spark/sql/FlintREPLTest.scala

Lines changed: 1 addition & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -47,20 +47,6 @@ class FlintREPLTest
4747
val getResponse = mock[GetResponse]
4848
val scheduledFutureRaw = mock[ScheduledFuture[_]]
4949

50-
// Mock behaviors
51-
when(osClient.getDoc(*, *)).thenReturn(getResponse)
52-
when(getResponse.isExists()).thenReturn(true)
53-
when(getResponse.getSourceAsMap).thenReturn(
54-
Map[String, Object](
55-
"applicationId" -> "app1",
56-
"jobId" -> "job1",
57-
"sessionId" -> "session1",
58-
"lastUpdateTime" -> java.lang.Long.valueOf(12345L),
59-
"error" -> "someError",
60-
"state" -> "running",
61-
"jobStartTime" -> java.lang.Long.valueOf(0L)).asJava)
62-
when(getResponse.getSeqNo).thenReturn(0L)
63-
when(getResponse.getPrimaryTerm).thenReturn(0L)
6450
// when scheduled task is scheduled, execute the runnable immediately only once and become no-op afterwards.
6551
when(
6652
threadPool.scheduleAtFixedRate(
@@ -85,8 +71,7 @@ class FlintREPLTest
8571
0)
8672

8773
// Verifications
88-
verify(osClient, atLeastOnce()).getDoc("sessionIndex", "session1")
89-
verify(flintSessionUpdater, atLeastOnce()).updateIf(eqTo("session1"), *, eqTo(0L), eqTo(0L))
74+
verify(flintSessionUpdater, atLeastOnce()).upsert(eqTo("session1"), *)
9075
}
9176

9277
test("createShutdownHook add shutdown hook and update FlintInstance if conditions are met") {

0 commit comments

Comments
 (0)