@@ -25,7 +25,7 @@ use quickwit_common::pubsub::{EventBroker, EventSubscriber};
25
25
use quickwit_common:: { rate_limited_error, rate_limited_warn} ;
26
26
use quickwit_proto:: control_plane:: {
27
27
ControlPlaneService , ControlPlaneServiceClient , GetOrCreateOpenShardsRequest ,
28
- GetOrCreateOpenShardsSubrequest ,
28
+ GetOrCreateOpenShardsSubrequest , GetOrCreateOpenShardsSuccess ,
29
29
} ;
30
30
use quickwit_proto:: indexing:: ShardPositionsUpdate ;
31
31
use quickwit_proto:: ingest:: ingester:: {
@@ -45,13 +45,14 @@ use tracing::{error, info};
45
45
46
46
use super :: broadcast:: LocalShardsUpdate ;
47
47
use super :: debouncing:: {
48
- DebouncedGetOrCreateOpenShardsRequest , GetOrCreateOpenShardsRequestDebouncer ,
48
+ DebouncedGetOrCreateOpenShardsRequest , GetOrCreateOpenShardsErrorDebounced ,
49
+ GetOrCreateOpenShardsRequestDebouncer , Rendezvous ,
49
50
} ;
50
51
use super :: ingester:: PERSIST_REQUEST_TIMEOUT ;
51
52
use super :: metrics:: IngestResultMetrics ;
52
53
use super :: routing_table:: { NextOpenShardError , RoutingTable } ;
53
- use super :: workbench:: IngestWorkbench ;
54
- use super :: { pending_subrequests , IngesterPool } ;
54
+ use super :: workbench:: { pending_subrequests_for_attempt , IngestWorkbench } ;
55
+ use super :: IngesterPool ;
55
56
use crate :: { get_ingest_router_buffer_size, LeaderId } ;
56
57
57
58
/// Duration after which ingest requests time out with [`IngestV2Error::Timeout`].
@@ -155,7 +156,7 @@ impl IngestRouter {
155
156
}
156
157
157
158
/// Inspects the shard table for each subrequest and returns the appropriate
158
- /// [`GetOrCreateOpenShardsRequest`] request if open shards do not exist for all the them .
159
+ /// [`GetOrCreateOpenShardsRequest`] request for those without open shards .
159
160
async fn make_get_or_create_open_shard_request (
160
161
& self ,
161
162
workbench : & mut IngestWorkbench ,
@@ -169,7 +170,9 @@ impl IngestRouter {
169
170
170
171
let mut state_guard = self . state . lock ( ) . await ;
171
172
172
- for subrequest in pending_subrequests ( & workbench. subworkbenches ) {
173
+ for subrequest in
174
+ pending_subrequests_for_attempt ( & workbench. subworkbenches , workbench. num_attempts )
175
+ {
173
176
if !state_guard. routing_table . has_open_shards (
174
177
& subrequest. index_id ,
175
178
& subrequest. source_id ,
@@ -192,7 +195,7 @@ impl IngestRouter {
192
195
debounced_request. push_subrequest ( subrequest, permit) ;
193
196
}
194
197
Err ( barrier) => {
195
- debounced_request. push_barrier ( barrier) ;
198
+ debounced_request. push_barrier ( subrequest . subrequest_id , barrier) ;
196
199
}
197
200
}
198
201
}
@@ -219,28 +222,65 @@ impl IngestRouter {
219
222
workbench : & mut IngestWorkbench ,
220
223
debounced_request : DebouncedGetOrCreateOpenShardsRequest ,
221
224
) {
222
- let ( request_opt, rendezvous) = debounced_request. take ( ) ;
225
+ let ( request_opt, mut rendezvous) = debounced_request. take ( ) ;
226
+
227
+ let Some ( request) = request_opt else {
228
+ return ;
229
+ } ;
230
+
231
+ let successes = self
232
+ . try_get_or_create_open_shards ( request, workbench, & mut rendezvous)
233
+ . await ;
234
+
235
+ self . populate_routing_table ( successes) . await ;
223
236
224
- if let Some ( request) = request_opt {
225
- self . populate_routing_table ( workbench, request) . await ;
237
+ let rendezvouz_errors = rendezvous. wait ( ) . await ;
238
+ for ( subrequest_id, error) in rendezvouz_errors. into_iter ( ) {
239
+ match error {
240
+ GetOrCreateOpenShardsErrorDebounced :: ControlPlaneError ( control_plane_error) => {
241
+ workbench. record_get_or_create_open_shards_error (
242
+ subrequest_id,
243
+ & control_plane_error,
244
+ ) ;
245
+ }
246
+ GetOrCreateOpenShardsErrorDebounced :: Failure ( failure) => {
247
+ workbench. record_get_or_create_open_shards_failure ( failure) ;
248
+ }
249
+ }
226
250
}
227
- rendezvous. wait ( ) . await ;
228
251
}
229
252
230
- /// Issues a [`GetOrCreateOpenShardsRequest`] request to the control plane and populates the
231
- /// shard table according to the response received.
232
- async fn populate_routing_table (
253
+ /// Issues a [`GetOrCreateOpenShardsRequest`] request, returning the
254
+ /// successful shard creations if any and recording the failures to the
255
+ /// provided workbench and rendezvous
256
+ async fn try_get_or_create_open_shards (
233
257
& self ,
234
- workbench : & mut IngestWorkbench ,
235
258
request : GetOrCreateOpenShardsRequest ,
236
- ) {
259
+ workbench : & mut IngestWorkbench ,
260
+ rendezvous : & mut Rendezvous ,
261
+ ) -> Vec < GetOrCreateOpenShardsSuccess > {
237
262
if request. subrequests . is_empty ( ) {
238
- return ;
263
+ return Vec :: new ( ) ;
239
264
}
240
- let response_result = self . control_plane . get_or_create_open_shards ( request) . await ;
265
+ let response_result = self
266
+ . control_plane
267
+ . get_or_create_open_shards ( request. clone ( ) )
268
+ . await ;
241
269
let response = match response_result {
242
270
Ok ( response) => response,
243
271
Err ( control_plane_error) => {
272
+ for subrequest in & request. subrequests {
273
+ rendezvous. write_error (
274
+ subrequest. subrequest_id ,
275
+ GetOrCreateOpenShardsErrorDebounced :: ControlPlaneError (
276
+ control_plane_error. clone ( ) ,
277
+ ) ,
278
+ ) ;
279
+ workbench. record_get_or_create_open_shards_error (
280
+ subrequest. subrequest_id ,
281
+ & control_plane_error,
282
+ ) ;
283
+ }
244
284
if workbench. is_last_attempt ( ) {
245
285
rate_limited_error ! (
246
286
limit_per_min = 10 ,
@@ -252,23 +292,34 @@ impl IngestRouter {
252
292
"failed to get open shards from control plane: {control_plane_error}"
253
293
) ;
254
294
} ;
255
- return ;
295
+ return Vec :: new ( ) ;
256
296
}
257
297
} ;
258
- let mut state_guard = self . state . lock ( ) . await ;
259
298
260
- for success in response. successes {
299
+ for failure in response. failures {
300
+ rendezvous. write_error (
301
+ failure. subrequest_id ,
302
+ GetOrCreateOpenShardsErrorDebounced :: Failure ( failure. clone ( ) ) ,
303
+ ) ;
304
+ workbench. record_get_or_create_open_shards_failure ( failure) ;
305
+ }
306
+
307
+ response. successes
308
+ }
309
+
310
+ async fn populate_routing_table ( & self , successes : Vec < GetOrCreateOpenShardsSuccess > ) {
311
+ if successes. is_empty ( ) {
312
+ return ;
313
+ }
314
+
315
+ let mut state_guard = self . state . lock ( ) . await ;
316
+ for success in successes {
261
317
state_guard. routing_table . replace_shards (
262
318
success. index_uid ( ) . clone ( ) ,
263
319
success. source_id ,
264
320
success. open_shards ,
265
321
) ;
266
322
}
267
- drop ( state_guard) ;
268
-
269
- for failure in response. failures {
270
- workbench. record_get_or_create_open_shards_failure ( failure) ;
271
- }
272
323
}
273
324
274
325
async fn process_persist_results (
@@ -375,7 +426,9 @@ impl IngestRouter {
375
426
let rate_limited_shards: & HashSet < ShardId > = & workbench. rate_limited_shards ;
376
427
let state_guard = self . state . lock ( ) . await ;
377
428
378
- for subrequest in pending_subrequests ( & workbench. subworkbenches ) {
429
+ for subrequest in
430
+ pending_subrequests_for_attempt ( & workbench. subworkbenches , workbench. num_attempts )
431
+ {
379
432
let next_open_shard_res_opt = state_guard
380
433
. routing_table
381
434
. find_entry ( & subrequest. index_id , & subrequest. source_id )
@@ -547,6 +600,7 @@ fn update_ingest_metrics(ingest_result: &IngestV2Result<IngestResponseV2>, num_s
547
600
ingest_results_metrics. router_load_shedding . inc ( )
548
601
}
549
602
IngestFailureReason :: LoadShedding => ingest_results_metrics. load_shedding . inc ( ) ,
603
+ IngestFailureReason :: Unavailable => ingest_results_metrics. unavailable . inc ( ) ,
550
604
}
551
605
}
552
606
}
@@ -1026,9 +1080,15 @@ mod tests {
1026
1080
closed_shards : Vec :: new ( ) ,
1027
1081
unavailable_leaders : Vec :: new ( ) ,
1028
1082
} ;
1029
- router
1030
- . populate_routing_table ( & mut workbench, get_or_create_open_shards_request)
1083
+ let successes = router
1084
+ . try_get_or_create_open_shards (
1085
+ get_or_create_open_shards_request,
1086
+ & mut workbench,
1087
+ & mut Rendezvous :: default ( ) ,
1088
+ )
1031
1089
. await ;
1090
+ assert_eq ! ( successes. len( ) , 2 ) ;
1091
+ router. populate_routing_table ( successes) . await ;
1032
1092
1033
1093
let state_guard = router. state . lock ( ) . await ;
1034
1094
let routing_table = & state_guard. routing_table ;
@@ -1430,7 +1490,7 @@ mod tests {
1430
1490
let subworkbench = workbench. subworkbenches . get ( & 1 ) . unwrap ( ) ;
1431
1491
assert ! ( matches!(
1432
1492
subworkbench. last_failure_opt,
1433
- Some ( SubworkbenchFailure :: Unavailable )
1493
+ Some ( SubworkbenchFailure :: IngesterUnavailable )
1434
1494
) ) ;
1435
1495
}
1436
1496
0 commit comments