
Commit bb7fc09

Add model instance update (#189)
* Apply backend_model_instance change
* Apply model change
* Apply rate_limiter change
* Apply backend_model change
* Apply model_lifecycle change and refactor
* Separate last modified time for model config and model files
* Add structure to support model config update
* More refactoring on model_lifecycle
* Add model config update
* Remove Init and WarmUp from backend_model
* Add instance unregister on rate limiter
* Add format
* Fix rate limiter unregister model instance
* Fix model config related segfault
* Add remove model instance into rate limiter
* Index model instance by pointer in rate limiter
* Remove index from model instance
* Track device id in backend thread
* Simplify model architecture
* Remove need for device to thread map
* Add instance group setting comparison
* Enable model instance re-use
* Stop re-using instances with shared backend thread
* Fix macro syntax issue
* Improve model_config_utils comments
* Update copyright
* Improve model config util naming
* Fix outdated comment
* Refactor setting device blocking
* Use predefined name for config.pbtxt
* Decouple registration from instance creation
* Add more doc on Signature
* Add comments to backend_model.h functions
* Use function instead of macro for ShareBackendThread
* Reorder create instance error return and register instance
* Apply PR #194
* Rename SetExecutionPolicy to GetExecutionPolicy
* Add links to JIRA
* Disable instance update for sequence models
1 parent: ecbe023

13 files changed: +841 -428 lines

src/backend_model.cc

Lines changed: 133 additions & 50 deletions
@@ -30,7 +30,6 @@
 #include <vector>
 
 #include "backend_config.h"
-#include "backend_model_instance.h"
 #include "dynamic_batch_scheduler.h"
 #include "filesystem.h"
 #include "model_config_utils.h"
@@ -165,7 +164,8 @@ TritonModel::Create(
   // Create and initialize the model.
   std::unique_ptr<TritonModel> local_model(new TritonModel(
       server, localized_model_dir, backend, min_compute_capability, version,
-      model_config, auto_complete_config));
+      model_config, auto_complete_config, backend_cmdline_config_map,
+      host_policy_map));
 
   TritonModel* raw_local_model = local_model.get();
 
@@ -197,17 +197,7 @@ TritonModel::Create(
   // Initialize the model for Triton core usage
   RETURN_IF_ERROR(local_model->Init(is_config_provided));
 
-  bool device_blocking = false;
-  if (local_model->backend_->ExecutionPolicy() ==
-      TRITONBACKEND_EXECUTION_DEVICE_BLOCKING) {
-    if (model_config.has_sequence_batching()) {
-      LOG_INFO << "Overriding execution policy to "
-                  "\"TRITONBACKEND_EXECUTION_BLOCKING\" for sequence model \""
-               << model_config.name() << "\"";
-    } else {
-      device_blocking = true;
-    }
-  }
+  RETURN_IF_ERROR(local_model->GetExecutionPolicy(model_config));
 
   // Initalize the custom batching library for the model, if provided.
   if (model_config.has_sequence_batching()) {
@@ -250,17 +240,71 @@ TritonModel::Create(
     }
   }
 
-  // Create and initialize the model instances for this model.
-  RETURN_IF_ERROR(TritonModelInstance::CreateInstances(
+  // Create or update the model instances for this model.
+  RETURN_IF_ERROR(TritonModelInstance::SetInstances(
       raw_local_model, backend_cmdline_config_map, host_policy_map,
-      model_config, device_blocking));
+      model_config));
+  RETURN_IF_ERROR(local_model->CommitInstances());
 
   RETURN_IF_ERROR(local_model->SetConfiguredScheduler());
 
   *model = std::move(local_model);
   return Status::Success;
 }
 
+Status
+TritonModel::UpdateInstanceGroup(
+    const inference::ModelConfig& new_model_config,
+    std::unique_lock<std::mutex>* caller_lock)
+{
+  // Generate normalized model config with new instance group.
+  inference::ModelConfig model_config = config_;
+  model_config.clear_instance_group();
+  model_config.mutable_instance_group()->Add(
+      new_model_config.instance_group().begin(),
+      new_model_config.instance_group().end());
+  RETURN_IF_ERROR(NormalizeInstanceGroup(
+      min_compute_capability_, backend_->BackendAttributes().preferred_groups_,
+      &model_config));
+  RETURN_IF_ERROR(ValidateInstanceGroup(model_config, min_compute_capability_));
+
+  // Update the instances to the new config.
+  caller_lock->unlock();  // allow inference while creating instances
+  Status status = TritonModelInstance::SetInstances(
+      this, backend_cmdline_config_map_, host_policy_map_, model_config);
+  caller_lock->lock();
+  if (!status.IsOk()) {
+    return status;
+  }
+
+  // At this point, the new model config is ready but not yet written into this
+  // object. The 'caller_lock' is held, so 'model_lifecycle' will pause any new
+  // inference request. It is safe to move forward and commit the change.
+  RETURN_IF_ERROR(SetModelConfig(model_config));
+  RETURN_IF_ERROR(CommitInstances());
+  RETURN_IF_ERROR(SetConfiguredScheduler());
+
+  return Status::Success;
+}
+
+Status
+TritonModel::GetExecutionPolicy(const inference::ModelConfig& model_config)
+{
+  // Set 'device_blocking_'
+  device_blocking_ = false;
+  if (backend_->ExecutionPolicy() == TRITONBACKEND_EXECUTION_DEVICE_BLOCKING) {
+    if (model_config.has_sequence_batching()) {
+      LOG_INFO << "Overriding execution policy to "
+                  "\"TRITONBACKEND_EXECUTION_BLOCKING\" for sequence model \""
+               << model_config.name() << "\"";
+    } else {
+      device_blocking_ = true;
+    }
+  }
+
+  return Status::Success;
+}
+
 Status
 TritonModel::ResolveBackendConfigs(
     const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
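
A note on the locking in UpdateInstanceGroup above: the caller's lock is released for the duration of instance creation, so inference keeps running on the existing instances while the new ones are built, and the lock is re-acquired before the new config and instances are committed. The sketch below illustrates the same hand-off pattern in isolation; create_new_instances() and commit() are hypothetical stand-ins for TritonModelInstance::SetInstances and TritonModel::CommitInstances, and the bool return stands in for Status.

#include <mutex>

bool create_new_instances();  // hypothetical: slow, may load model weights
bool commit();                // hypothetical: swap background instances in

bool
UpdateWithLockHandOff(std::unique_lock<std::mutex>* caller_lock)
{
  // Drop the lock: in-flight and new requests keep running on the current
  // (foreground) instances while the replacement instances are built.
  caller_lock->unlock();
  const bool ok = create_new_instances();
  caller_lock->lock();
  if (!ok) {
    return false;  // on failure the foreground instances were never touched
  }
  // Lock held again: the caller (model_lifecycle) pauses new requests, so
  // swapping the background instances into the foreground is safe here.
  return commit();
}

The failure path matters: nothing is committed until after the re-lock, so a failed update leaves the model serving exactly as before.
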
@@ -323,21 +367,78 @@ TritonModel::SetBackendConfigDefaults(
   return Status::Success;
 }
 
+std::shared_ptr<TritonModelInstance>
+TritonModel::FindInstance(const TritonModelInstance::Signature& signature) const
+{
+  // The search can be improved by introducing some gradient into comparing
+  // signatures. One solution could be to use hash key. [FIXME: DLIS-4822]
+  for (auto* instances : {&instances_, &passive_instances_}) {
+    for (auto& instance : (*instances)) {
+      if (instance->GetSignature() == signature) {
+        return instance;
+      }
+    }
+  }
+  return std::shared_ptr<TritonModelInstance>();
+}
+
 Status
-TritonModel::AddInstance(
-    std::unique_ptr<TritonModelInstance>&& instance, const bool passive)
+TritonModel::RegisterInstance(
+    std::shared_ptr<TritonModelInstance>&& instance, const bool passive)
 {
+  instance->GetSignature().DisableMatching();
+
   if (passive) {
-    passive_instance_group_map_[instance->GroupName()].emplace_back(
-        std::move(instance));
+    bg_passive_instances_.emplace_back(std::move(instance));
   } else {
-    instance_group_map_[instance->GroupName()].emplace_back(
-        std::move(instance));
+    bg_instances_.emplace_back(std::move(instance));
   }
 
   return Status::Success;
 }
 
+Status
+TritonModel::CommitInstances()
+{
+  instances_.swap(bg_instances_);
+  passive_instances_.swap(bg_passive_instances_);
+  bg_instances_.clear();
+  bg_passive_instances_.clear();
+
+  for (auto* instances : {&instances_, &passive_instances_}) {
+    for (auto& instance : (*instances)) {
+      instance->GetSignature().EnableMatching();
+    }
+  }
+
+  return Status::Success;
+}
+
+std::vector<std::shared_ptr<TritonModelInstance>>
+TritonModel::GetInstancesByDevice(int32_t device_id) const
+{
+  std::vector<std::shared_ptr<TritonModelInstance>> result;
+  // Do not match passive instances, as they do not have a backend thread.
+  // Do not match foreground instances, as backend threads cannot be updated.
+  for (auto& instance : bg_instances_) {
+    if (instance->DeviceId() == device_id) {
+      result.push_back(instance);
+    }
+  }
+  return result;
+}
+
+Status
+TritonModel::SetSchedulerMutable(std::unique_ptr<Scheduler> scheduler)
+{
+  if (scheduler_ != nullptr) {
+    LOG_VERBOSE(1) << "Replacing scheduler for model '" + config_.name() + "'";
+  }
+  scheduler_ = std::move(scheduler);
+
+  return Status::Success;
+}
+
 Status
 TritonModel::UpdateModelConfig(
     const uint32_t config_version, TRITONSERVER_Message* updated_config_message)
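
RegisterInstance, CommitInstances, and FindInstance above form a double-buffered instance registry: new and re-used instances accumulate in the background bg_* vectors with signature matching disabled (so one existing instance cannot be claimed by two new instance-group entries), and a single swap promotes them to the foreground. The following self-contained sketch shows the idea with a simplified Instance type standing in for TritonModelInstance; all names here are illustrative, not Triton API.

#include <memory>
#include <string>
#include <utility>
#include <vector>

// Simplified stand-in for TritonModelInstance and its Signature.
struct Instance {
  std::string signature;   // instance-group settings that make it reusable
  bool matchable = false;  // mirrors Signature::EnableMatching/DisableMatching
};

class InstanceRegistry {
 public:
  // Re-use a foreground instance whose settings are unchanged, if any.
  std::shared_ptr<Instance> Find(const std::string& sig) const
  {
    for (const auto& i : instances_) {
      if (i->matchable && i->signature == sig) {
        return i;
      }
    }
    return nullptr;
  }

  // Stage an instance in the background set. Matching is disabled so the
  // same instance cannot be claimed twice during one update.
  void Register(std::shared_ptr<Instance> i)
  {
    i->matchable = false;
    bg_instances_.emplace_back(std::move(i));
  }

  // Promote the background set. Old instances that were not re-used are
  // released here; re-used ones survive because both sets shared them.
  void Commit()
  {
    instances_.swap(bg_instances_);
    bg_instances_.clear();
    for (auto& i : instances_) {
      i->matchable = true;
    }
  }

 private:
  std::vector<std::shared_ptr<Instance>> instances_;     // serving traffic
  std::vector<std::shared_ptr<Instance>> bg_instances_;  // being prepared
};

Because re-used instances are shared pointers held by both sets between Register and Commit, the swap never destroys an instance that is still serving requests.
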
@@ -443,7 +544,7 @@ TritonModel::SetConfiguredScheduler()
         0 /* max_queue_delay_microseconds */, &scheduler));
   }
 
-  return SetScheduler(std::move(scheduler));
+  return SetSchedulerMutable(std::move(scheduler));
 }
 
 Status
@@ -499,40 +600,20 @@ TritonModel::SetBatchingStrategy(const std::string& batch_libpath)
   return Status::Success;
 }
 
-Status
-TritonModel::Initialize()
-{
-  for (const auto& pair : instance_group_map_) {
-    for (const auto& instance : pair.second) {
-      RETURN_IF_ERROR(instance->Initialize());
-    }
-  }
-
-  return Status::Success;
-}
-
-Status
-TritonModel::WarmUp()
-{
-  for (const auto& pair : instance_group_map_) {
-    for (const auto& instance : pair.second) {
-      RETURN_IF_ERROR(instance->WarmUp());
-    }
-  }
-
-  return Status::Success;
-}
-
 TritonModel::TritonModel(
     InferenceServer* server,
     const std::shared_ptr<LocalizedPath>& localized_model_dir,
     const std::shared_ptr<TritonBackend>& backend,
     const double min_compute_capability, const int64_t version,
-    const inference::ModelConfig& config, const bool auto_complete_config)
+    const inference::ModelConfig& config, const bool auto_complete_config,
+    const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
+    const triton::common::HostPolicyCmdlineConfigMap& host_policy_map)
     : Model(
           min_compute_capability, localized_model_dir->Path(), version, config),
       server_(server), min_compute_capability_(min_compute_capability),
       auto_complete_config_(auto_complete_config),
+      backend_cmdline_config_map_(backend_cmdline_config_map),
+      host_policy_map_(host_policy_map), device_blocking_(false),
       localized_model_dir_(localized_model_dir), backend_(backend),
       state_(nullptr)
 {
@@ -556,8 +637,10 @@ TritonModel::~TritonModel()
 
   // Explicitly delete/finalize all model instances before finalizing
   // the model itself.
-  instance_group_map_.clear();
-  passive_instance_group_map_.clear();
+  instances_.clear();
+  passive_instances_.clear();
+  bg_instances_.clear();
+  bg_passive_instances_.clear();
 
   // Unregister itself from the rate limiter. Note this should happen
   // after all instances are destructed. Destrucing instances ensures