Skip to content

Commit 9468f5f

Browse files
authored
Server stop will wait until all loading models complete before unloading all models (#324)
* Unload all models must wait until all models complete loading or unloading
* Use LoadUnloadModels() in UnloadAllModels()
* Erase removed nodes only after affected nodes are completed
* Respect server shutdown timeout when waiting for load
1 parent 34d2650 commit 9468f5f

File tree

3 files changed

+24
-14
lines changed

3 files changed

+24
-14
lines changed

src/model_repository_manager/model_repository_manager.cc

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -957,16 +957,21 @@ ModelRepositoryManager::PollModels(
957957
Status
958958
ModelRepositoryManager::UnloadAllModels()
959959
{
960-
Status status;
961-
for (const auto& name_info : infos_) {
962-
Status unload_status = model_life_cycle_->AsyncUnload(name_info.first);
963-
if (!unload_status.IsOk()) {
964-
status = Status(
965-
unload_status.ErrorCode(),
966-
"Failed to gracefully unload models: " + unload_status.Message());
960+
// Find all the models to be unloaded.
961+
std::unordered_map<std::string, std::vector<const InferenceParameter*>>
962+
models;
963+
{
964+
std::lock_guard<std::mutex> lock(mu_);
965+
for (const auto& pair : infos_) {
966+
// Only the model name is needed.
967+
models[pair.first.name_];
967968
}
968969
}
969-
return Status::Success;
970+
// Unload all models found.
971+
bool polled;
972+
return LoadUnloadModels(
973+
models, ActionType::UNLOAD, true /* unload_dependents */, &polled,
974+
nullptr /* no_parallel_conflict */);
970975
}
971976

972977
Status
@@ -1821,12 +1826,14 @@ ModelRepositoryManager::DependencyGraph::RemoveNodes(
18211826
}
18221827

18231828
all_removed_nodes.emplace(model_id);
1824-
// Exclude removed node from affected nodes to skip some evaluations.
1825-
all_affected_nodes.erase(model_id);
18261829
}
18271830

18281831
curr_removal.swap(next_removal);
18291832
}
1833+
// Exclude removed nodes from affected nodes.
1834+
for (const auto& removed_node : all_removed_nodes) {
1835+
all_affected_nodes.erase(removed_node);
1836+
}
18301837
return {std::move(all_affected_nodes), std::move(all_removed_nodes)};
18311838
}
18321839

src/model_repository_manager/model_repository_manager.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -421,8 +421,11 @@ class ModelRepositoryManager {
421421
std::string, std::vector<const InferenceParameter*>>& models,
422422
const ActionType type, const bool unload_dependents);
423423

424-
/// Unload all models. This function should be called before shutting down
425-
/// the model repository manager.
424+
/// Unload all models that are tracked by the model repository manager. If a
425+
/// model is loading or unloading when this function is called, or a model
426+
/// failed to unload, an error is returned. New models may be loaded while
427+
/// this function is unloading models. This function should be called before
428+
/// shutting down the model repository manager.
426429
/// \return error status.
427430
Status UnloadAllModels();
428431

src/server.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -330,11 +330,11 @@ InferenceServer::Stop(const bool force)
330330
}
331331

332332
if (inflight_status.size() == 0) {
333-
unloading_model = true;
334333
status = model_repository_manager_->UnloadAllModels();
335334
if (!status.IsOk()) {
336-
LOG_ERROR << status.Message();
335+
LOG_WARNING << status.Message();
337336
} else {
337+
unloading_model = true;
338338
LOG_INFO << "All models are stopped, unloading models";
339339
continue;
340340
}

0 commit comments

Comments
 (0)