@@ -30,7 +30,6 @@
 #include <vector>
 
 #include "backend_config.h"
-#include "backend_model_instance.h"
 #include "dynamic_batch_scheduler.h"
 #include "filesystem.h"
 #include "model_config_utils.h"
@@ -165,7 +164,8 @@ TritonModel::Create(
   // Create and initialize the model.
   std::unique_ptr<TritonModel> local_model(new TritonModel(
       server, localized_model_dir, backend, min_compute_capability, version,
-      model_config, auto_complete_config));
+      model_config, auto_complete_config, backend_cmdline_config_map,
+      host_policy_map));
 
   TritonModel* raw_local_model = local_model.get();
 
@@ -197,17 +197,7 @@ TritonModel::Create(
   // Initialize the model for Triton core usage
   RETURN_IF_ERROR(local_model->Init(is_config_provided));
 
-  bool device_blocking = false;
-  if (local_model->backend_->ExecutionPolicy() ==
-      TRITONBACKEND_EXECUTION_DEVICE_BLOCKING) {
-    if (model_config.has_sequence_batching()) {
-      LOG_INFO << "Overriding execution policy to "
-                  "\"TRITONBACKEND_EXECUTION_BLOCKING\" for sequence model \""
-               << model_config.name() << "\"";
-    } else {
-      device_blocking = true;
-    }
-  }
+  RETURN_IF_ERROR(local_model->GetExecutionPolicy(model_config));
 
   // Initalize the custom batching library for the model, if provided.
   if (model_config.has_sequence_batching()) {
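
For context, the policy consumed by the new GetExecutionPolicy() call is whatever the backend declared when it was loaded. A minimal sketch of that declaration, assuming the standard TRITONBACKEND_Initialize entry point and TRITONBACKEND_BackendSetExecutionPolicy from tritonbackend.h (this backend code is illustrative and not part of the change):

#include "triton/core/tritonbackend.h"

extern "C" TRITONSERVER_Error*
TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend)
{
  // Request one backend thread per device instead of one per instance. For
  // sequence models the core overrides this back to blocking execution, as
  // GetExecutionPolicy() logs.
  return TRITONBACKEND_BackendSetExecutionPolicy(
      backend, TRITONBACKEND_EXECUTION_DEVICE_BLOCKING);
}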
@@ -250,17 +240,71 @@ TritonModel::Create(
     }
   }
 
-  // Create and initialize the model instances for this model.
-  RETURN_IF_ERROR(TritonModelInstance::CreateInstances(
+  // Create or update the model instances for this model.
+  RETURN_IF_ERROR(TritonModelInstance::SetInstances(
       raw_local_model, backend_cmdline_config_map, host_policy_map,
-      model_config, device_blocking));
+      model_config));
+  RETURN_IF_ERROR(local_model->CommitInstances());
 
   RETURN_IF_ERROR(local_model->SetConfiguredScheduler());
 
   *model = std::move(local_model);
   return Status::Success;
 }
 
+Status
+TritonModel::UpdateInstanceGroup(
+    const inference::ModelConfig& new_model_config,
+    std::unique_lock<std::mutex>* caller_lock)
+{
+  // Generate normalized model config with new instance group.
+  inference::ModelConfig model_config = config_;
+  model_config.clear_instance_group();
+  model_config.mutable_instance_group()->Add(
+      new_model_config.instance_group().begin(),
+      new_model_config.instance_group().end());
+  RETURN_IF_ERROR(NormalizeInstanceGroup(
+      min_compute_capability_, backend_->BackendAttributes().preferred_groups_,
+      &model_config));
+  RETURN_IF_ERROR(ValidateInstanceGroup(model_config, min_compute_capability_));
+
+  // Update the instances to the new config.
+  caller_lock->unlock();  // allow inference while creating instances
+  Status status = TritonModelInstance::SetInstances(
+      this, backend_cmdline_config_map_, host_policy_map_, model_config);
+  caller_lock->lock();
+  if (!status.IsOk()) {
+    return status;
+  }
+
+  // At this point, the new model config is ready but not yet written into this
+  // object. The 'caller_lock' is held, so 'model_lifecycle' will pause any new
+  // inference request. It is safe to move forward and commit the change.
+  RETURN_IF_ERROR(SetModelConfig(model_config));
+  RETURN_IF_ERROR(CommitInstances());
+  RETURN_IF_ERROR(SetConfiguredScheduler());
+
+  return Status::Success;
+}
+
+Status
+TritonModel::GetExecutionPolicy(const inference::ModelConfig& model_config)
+{
+  // Set 'device_blocking_'
+  device_blocking_ = false;
+  if (backend_->ExecutionPolicy() == TRITONBACKEND_EXECUTION_DEVICE_BLOCKING) {
+    if (model_config.has_sequence_batching()) {
+      LOG_INFO << "Overriding execution policy to "
+                  "\"TRITONBACKEND_EXECUTION_BLOCKING\" for sequence model \""
+               << model_config.name() << "\"";
+    } else {
+      device_blocking_ = true;
+    }
+  }
+
+  return Status::Success;
+}
+
 Status
 TritonModel::ResolveBackendConfigs(
     const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
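
A minimal sketch of how a caller might drive the new UpdateInstanceGroup() method; the wrapper function and mutex name below are hypothetical, and only the locking contract (lock held on entry, released by the method while instances are built, re-held for the commit) comes from the code above:

#include <mutex>

Status
ApplyInstanceGroupUpdate(
    TritonModel* model, const inference::ModelConfig& new_config,
    std::mutex& model_mu)
{
  // Hold the lock so no new inference request is admitted while the update
  // starts; UpdateInstanceGroup() unlocks it only while the new instances are
  // being created and re-locks it before swapping in the config, instances
  // and scheduler.
  std::unique_lock<std::mutex> lock(model_mu);
  return model->UpdateInstanceGroup(new_config, &lock);
}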
@@ -323,21 +367,78 @@ TritonModel::SetBackendConfigDefaults(
   return Status::Success;
 }
 
+std::shared_ptr<TritonModelInstance>
+TritonModel::FindInstance(const TritonModelInstance::Signature& signature) const
+{
+  // The search can be improved by introducing some gradient into comparing
+  // signatures. One solution could be to use hash key. [FIXME: DLIS-4822]
+  for (auto* instances : {&instances_, &passive_instances_}) {
+    for (auto& instance : (*instances)) {
+      if (instance->GetSignature() == signature) {
+        return instance;
+      }
+    }
+  }
+  return std::shared_ptr<TritonModelInstance>();
+}
+
 Status
-TritonModel::AddInstance(
-    std::unique_ptr<TritonModelInstance>&& instance, const bool passive)
+TritonModel::RegisterInstance(
+    std::shared_ptr<TritonModelInstance>&& instance, const bool passive)
 {
+  instance->GetSignature().DisableMatching();
+
   if (passive) {
-    passive_instance_group_map_[instance->GroupName()].emplace_back(
-        std::move(instance));
+    bg_passive_instances_.emplace_back(std::move(instance));
   } else {
-    instance_group_map_[instance->GroupName()].emplace_back(
-        std::move(instance));
+    bg_instances_.emplace_back(std::move(instance));
   }
 
   return Status::Success;
 }
 
+Status
+TritonModel::CommitInstances()
+{
+  instances_.swap(bg_instances_);
+  passive_instances_.swap(bg_passive_instances_);
+  bg_instances_.clear();
+  bg_passive_instances_.clear();
+
+  for (auto* instances : {&instances_, &passive_instances_}) {
+    for (auto& instance : (*instances)) {
+      instance->GetSignature().EnableMatching();
+    }
+  }
+
+  return Status::Success;
+}
+
+std::vector<std::shared_ptr<TritonModelInstance>>
+TritonModel::GetInstancesByDevice(int32_t device_id) const
+{
+  std::vector<std::shared_ptr<TritonModelInstance>> result;
+  // Do not match passive instances, as they do not have a backend thread.
+  // Do not match foreground instances, as backend threads cannot be updated.
+  for (auto& instance : bg_instances_) {
+    if (instance->DeviceId() == device_id) {
+      result.push_back(instance);
+    }
+  }
+  return result;
+}
+
+Status
+TritonModel::SetSchedulerMutable(std::unique_ptr<Scheduler> scheduler)
+{
+  if (scheduler_ != nullptr) {
+    LOG_VERBOSE(1) << "Replacing scheduler for model '" + config_.name() + "'";
+  }
+  scheduler_ = std::move(scheduler);
+
+  return Status::Success;
+}
+
 Status
 TritonModel::UpdateModelConfig(
     const uint32_t config_version, TRITONSERVER_Message* updated_config_message)
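
The RegisterInstance()/CommitInstances() pair above implements a background/foreground double-buffering scheme: new instances are staged while the current set keeps serving, and a single swap publishes them. A self-contained sketch of that pattern with illustrative types (not the Triton ones):

#include <iostream>
#include <memory>
#include <string>
#include <vector>

struct Instance {
  std::string name;
};

class Model {
 public:
  // Stage a new instance in the background list; foreground keeps serving.
  void Register(std::shared_ptr<Instance> instance, bool passive)
  {
    (passive ? bg_passive_ : bg_).push_back(std::move(instance));
  }

  // Publish the staged set atomically from the model's point of view.
  void Commit()
  {
    fg_.swap(bg_);
    fg_passive_.swap(bg_passive_);
    bg_.clear();
    bg_passive_.clear();
  }

  const std::vector<std::shared_ptr<Instance>>& Active() const { return fg_; }

 private:
  std::vector<std::shared_ptr<Instance>> fg_, fg_passive_, bg_, bg_passive_;
};

int main()
{
  Model model;
  model.Register(std::make_shared<Instance>(Instance{"gpu_0"}), false);
  model.Register(std::make_shared<Instance>(Instance{"gpu_1"}), false);
  model.Commit();  // the two new instances are now the active set
  std::cout << model.Active().size() << " active instances\n";
  return 0;
}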
@@ -443,7 +544,7 @@ TritonModel::SetConfiguredScheduler()
         0 /* max_queue_delay_microseconds */, &scheduler));
   }
 
-  return SetScheduler(std::move(scheduler));
+  return SetSchedulerMutable(std::move(scheduler));
 }
 
 Status
@@ -499,40 +600,20 @@ TritonModel::SetBatchingStrategy(const std::string& batch_libpath)
   return Status::Success;
 }
 
-Status
-TritonModel::Initialize()
-{
-  for (const auto& pair : instance_group_map_) {
-    for (const auto& instance : pair.second) {
-      RETURN_IF_ERROR(instance->Initialize());
-    }
-  }
-
-  return Status::Success;
-}
-
-Status
-TritonModel::WarmUp()
-{
-  for (const auto& pair : instance_group_map_) {
-    for (const auto& instance : pair.second) {
-      RETURN_IF_ERROR(instance->WarmUp());
-    }
-  }
-
-  return Status::Success;
-}
-
 TritonModel::TritonModel(
     InferenceServer* server,
     const std::shared_ptr<LocalizedPath>& localized_model_dir,
     const std::shared_ptr<TritonBackend>& backend,
     const double min_compute_capability, const int64_t version,
-    const inference::ModelConfig& config, const bool auto_complete_config)
+    const inference::ModelConfig& config, const bool auto_complete_config,
+    const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
+    const triton::common::HostPolicyCmdlineConfigMap& host_policy_map)
     : Model(
           min_compute_capability, localized_model_dir->Path(), version, config),
       server_(server), min_compute_capability_(min_compute_capability),
       auto_complete_config_(auto_complete_config),
+      backend_cmdline_config_map_(backend_cmdline_config_map),
+      host_policy_map_(host_policy_map), device_blocking_(false),
      localized_model_dir_(localized_model_dir), backend_(backend),
      state_(nullptr)
 {
@@ -556,8 +637,10 @@ TritonModel::~TritonModel()
 
   // Explicitly delete/finalize all model instances before finalizing
   // the model itself.
-  instance_group_map_.clear();
-  passive_instance_group_map_.clear();
+  instances_.clear();
+  passive_instances_.clear();
+  bg_instances_.clear();
+  bg_passive_instances_.clear();
 
   // Unregister itself from the rate limiter. Note this should happen
   // after all instances are destructed. Destrucing instances ensures
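
For reference, the instance lists cleared here are presumably declared in backend_model.h along these lines; this is reconstructed from the usage in this file, not copied from the header:

// Foreground instances currently serving requests.
std::vector<std::shared_ptr<TritonModelInstance>> instances_;
std::vector<std::shared_ptr<TritonModelInstance>> passive_instances_;
// Background ("staging") instances filled by RegisterInstance() and
// published by CommitInstances().
std::vector<std::shared_ptr<TritonModelInstance>> bg_instances_;
std::vector<std::shared_ptr<TritonModelInstance>> bg_passive_instances_;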