Skip to content

Commit 8d26f1b

Browse files
authored
CMS: ignore tenant and cluster unavailable nodes limits in force availability mode (#20217)
1 parent 6138330 commit 8d26f1b

File tree

5 files changed

+78
-56
lines changed

5 files changed

+78
-56
lines changed

ydb/core/cms/cms.cpp

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -943,14 +943,6 @@ bool TCms::TryToLockVDisk(const TActionOptions& opts,
943943
}
944944
break;
945945
case MODE_FORCE_RESTART:
946-
if (counters->GroupAlreadyHasLockedDisks() && !counters->GroupHasMoreThanOneDiskPerNode() && opts.PartialPermissionAllowed) {
947-
TabletCounters->Cumulative()[COUNTER_PARTIAL_PERMISSIONS_OPTIMIZED].Increment(1);
948-
error.Code = TStatus::DISALLOW_TEMP;
949-
error.Reason = "You cannot get two or more disks from the same group at the same time"
950-
" in partial permissions allowed mode";
951-
error.Deadline = defaultDeadline;
952-
return false;
953-
}
954946
// Any number of down disks is OK for this mode.
955947
break;
956948
default:

ydb/core/cms/cms_maintenance_api_ut.cpp

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,55 @@ Y_UNIT_TEST_SUITE(TMaintenanceApiTest) {
268268
env.CheckRequest("user", "user-r-1", false, NKikimrCms::TStatus::TStatus::WRONG_REQUEST);
269269
}
270270
}
271+
272+
Y_UNIT_TEST(ForceAvailabilityMode) {
273+
TCmsTestEnv env(8);
274+
275+
NKikimrCms::TCmsConfig config;
276+
config.MutableClusterLimits()->SetDisabledNodesLimit(1);
277+
config.MutableClusterLimits()->SetDisabledNodesRatioLimit(30);
278+
config.MutableTenantLimits()->SetDisabledNodesLimit(1);
279+
config.MutableTenantLimits()->SetDisabledNodesRatioLimit(30);
280+
env.SetCmsConfig(config);
281+
282+
auto response = env.CheckMaintenanceTaskCreate(
283+
"task-1",
284+
Ydb::StatusIds::SUCCESS,
285+
Ydb::Maintenance::AVAILABILITY_MODE_FORCE,
286+
MakeActionGroup(
287+
MakeLockAction(env.GetNodeId(0), TDuration::Minutes(10))
288+
),
289+
MakeActionGroup(
290+
MakeLockAction(env.GetNodeId(1), TDuration::Minutes(10))
291+
),
292+
MakeActionGroup(
293+
MakeLockAction(env.GetNodeId(2), TDuration::Minutes(10))
294+
),
295+
MakeActionGroup(
296+
MakeLockAction(env.GetNodeId(3), TDuration::Minutes(10))
297+
),
298+
MakeActionGroup(
299+
MakeLockAction(env.GetNodeId(4), TDuration::Minutes(10))
300+
),
301+
MakeActionGroup(
302+
MakeLockAction(env.GetNodeId(5), TDuration::Minutes(10))
303+
),
304+
MakeActionGroup(
305+
MakeLockAction(env.GetNodeId(6), TDuration::Minutes(10))
306+
),
307+
MakeActionGroup(
308+
MakeLockAction(env.GetNodeId(7), TDuration::Minutes(10))
309+
)
310+
);
311+
312+
// Everything is allowed to perform maintenance regardless of the failure model
313+
UNIT_ASSERT_VALUES_EQUAL(response.action_group_states().size(), 8);
314+
for (size_t i = 0; i < 8; ++i) {
315+
UNIT_ASSERT_VALUES_EQUAL(response.action_group_states(i).action_states().size(), 1);
316+
const auto& a = response.action_group_states(i).action_states(0);
317+
UNIT_ASSERT_VALUES_EQUAL(a.status(), ActionState::ACTION_STATUS_PERFORMED);
318+
}
319+
}
271320
}
272321

273322
} // namespace NKikimr::NCmsTest

ydb/core/cms/cms_tenants_ut.cpp

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -441,14 +441,14 @@ Y_UNIT_TEST_SUITE(TCmsTenatsTest) {
441441
auto res1 = env.ExtractPermissions
442442
(env.CheckPermissionRequest("user", false, false, false, true, TStatus::ALLOW,
443443
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(0), 60000000)));
444-
// Limit should work for any mode because we are restarting one node already.
444+
// Limit should work for any mode except force because we are restarting one node already.
445445
env.CheckPermissionRequest("user", false, false, false, true, TStatus::DISALLOW_TEMP,
446446
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(1), 60000000));
447447
env.CheckPermissionRequest("user", false, false, false, true,
448448
MODE_KEEP_AVAILABLE, TStatus::DISALLOW_TEMP,
449449
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(1), 60000000));
450-
env.CheckPermissionRequest("user", false, false, false, true,
451-
MODE_FORCE_RESTART, TStatus::DISALLOW_TEMP,
450+
env.CheckPermissionRequest("user", false, /* dry */ true, false, true,
451+
MODE_FORCE_RESTART, TStatus::ALLOW,
452452
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(1), 60000000));
453453

454454
env.CheckDonePermission("user", res1.second[0]);
@@ -505,27 +505,24 @@ Y_UNIT_TEST_SUITE(TCmsTenatsTest) {
505505
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(1), 60000000),
506506
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(2), 60000000)));
507507
UNIT_ASSERT_VALUES_EQUAL(res1.second.size(), 1);
508-
// Limit should work for any mode because we are restarting one node already.
508+
// Limit should work for any mode except force because we are restarting one node already.
509509
env.CheckRequest("user", res1.first, false, TStatus::DISALLOW_TEMP, 0);
510510
env.CheckRequest("user", res1.first, false, MODE_KEEP_AVAILABLE, TStatus::DISALLOW_TEMP, 0);
511-
env.CheckRequest("user", res1.first, false, MODE_FORCE_RESTART, TStatus::DISALLOW_TEMP, 0);
511+
env.CheckRequest("user", res1.first, /* dry */ true, MODE_FORCE_RESTART, TStatus::ALLOW, 2);
512512

513513
env.CheckDonePermission("user", res1.second[0]);
514514

515-
auto res2 = env.ExtractPermissions
516-
(env.CheckRequest("user", res1.first, false, MODE_FORCE_RESTART, TStatus::ALLOW_PARTIAL, 1));
517-
518-
env.CheckDonePermission("user", res2.second[0]);
515+
env.CheckRequest("user", res1.first, /* dry */ true, MODE_FORCE_RESTART, TStatus::ALLOW, 2);
519516

520517
// Now shutdown one node and try various modes again. Only MODE_FORCE_RESTART
521-
// should allow to restart another node.
518+
// should allow to restart nodes.
522519
{
523520
TGuard<TMutex> guard(TFakeNodeWhiteboardService::Mutex);
524521
TFakeNodeWhiteboardService::Info[env.GetNodeId(0)].Connected = false;
525522
}
526523
env.CheckRequest("user", res1.first, false, TStatus::DISALLOW_TEMP, 0);
527524
env.CheckRequest("user", res1.first, false, MODE_KEEP_AVAILABLE, TStatus::DISALLOW_TEMP, 0);
528-
env.CheckRequest("user", res1.first, false, MODE_FORCE_RESTART, TStatus::ALLOW, 1);
525+
env.CheckRequest("user", res1.first, false, MODE_FORCE_RESTART, TStatus::ALLOW, 2);
529526
}
530527

531528
Y_UNIT_TEST(TestTenantLimitForceRestartModeScheduled) {

ydb/core/cms/cms_ut.cpp

Lines changed: 18 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1167,39 +1167,31 @@ Y_UNIT_TEST_SUITE(TCmsTest) {
11671167
TestAvailabilityMode(MODE_FORCE_RESTART, true);
11681168
}
11691169

1170-
void TestAvailabilityModeScheduled(EAvailabilityMode mode, bool disconnectNodes)
1170+
void TestKeepAvailabileModeScheduled(bool disconnectNodes)
11711171
{
1172-
Y_ABORT_UNLESS(mode == MODE_KEEP_AVAILABLE
1173-
|| mode == MODE_FORCE_RESTART);
1174-
11751172
TCmsTestEnv env(8);
11761173
env.AdvanceCurrentTime(TDuration::Minutes(3));
11771174

11781175
auto res1 = env.ExtractPermissions
11791176
(env.CheckPermissionRequest("user", true, false, true,
1180-
true, mode, TStatus::ALLOW_PARTIAL,
1177+
true, MODE_KEEP_AVAILABLE, TStatus::ALLOW_PARTIAL,
11811178
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(0), 60000000),
11821179
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(1), 60000000),
11831180
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(2), 60000000)));
11841181
if (disconnectNodes) {
11851182
TFakeNodeWhiteboardService::Info[env.GetNodeId(0)].Connected = false;
11861183
}
11871184

1188-
env.CheckRequest("user", res1.first, false, mode, TStatus::ALLOW_PARTIAL, 1);
1185+
env.CheckRequest("user", res1.first, false, MODE_KEEP_AVAILABLE, TStatus::ALLOW_PARTIAL, 1);
11891186
if (disconnectNodes) {
11901187
TFakeNodeWhiteboardService::Info[env.GetNodeId(1)].Connected = false;
11911188
}
11921189

1193-
env.CheckRequest("user", res1.first, false, mode,
1194-
mode == MODE_KEEP_AVAILABLE ? TStatus::DISALLOW_TEMP : TStatus::ALLOW,
1195-
mode == MODE_KEEP_AVAILABLE ? 0 : 1);
1196-
if (mode != MODE_KEEP_AVAILABLE) {
1197-
return;
1198-
}
1190+
env.CheckRequest("user", res1.first, false, MODE_KEEP_AVAILABLE, TStatus::DISALLOW_TEMP, 0);
11991191

12001192
env.CheckDonePermission("user", res1.second[0]);
12011193

1202-
env.CheckRequest("user", res1.first, false, mode,
1194+
env.CheckRequest("user", res1.first, false, MODE_KEEP_AVAILABLE,
12031195
disconnectNodes ? TStatus::DISALLOW_TEMP : TStatus::ALLOW,
12041196
disconnectNodes ? 0 : 1);
12051197
if (!disconnectNodes) {
@@ -1208,27 +1200,17 @@ Y_UNIT_TEST_SUITE(TCmsTest) {
12081200

12091201
TFakeNodeWhiteboardService::Info[env.GetNodeId(0)].Connected = true;
12101202

1211-
env.CheckRequest("user", res1.first, false, mode, TStatus::ALLOW, 1);
1203+
env.CheckRequest("user", res1.first, false, MODE_KEEP_AVAILABLE, TStatus::ALLOW, 1);
12121204
}
12131205

12141206
Y_UNIT_TEST(TestKeepAvailableModeScheduled)
12151207
{
1216-
TestAvailabilityModeScheduled(MODE_KEEP_AVAILABLE, false);
1217-
}
1218-
1219-
Y_UNIT_TEST(TestForceRestartModeScheduled)
1220-
{
1221-
TestAvailabilityModeScheduled(MODE_FORCE_RESTART, false);
1208+
TestKeepAvailabileModeScheduled(false);
12221209
}
12231210

12241211
Y_UNIT_TEST(TestKeepAvailableModeScheduledDisconnects)
12251212
{
1226-
TestAvailabilityModeScheduled(MODE_KEEP_AVAILABLE, true);
1227-
}
1228-
1229-
Y_UNIT_TEST(TestForceRestartModeScheduledDisconnects)
1230-
{
1231-
TestAvailabilityModeScheduled(MODE_FORCE_RESTART, true);
1213+
TestKeepAvailabileModeScheduled(true);
12321214
}
12331215

12341216
Y_UNIT_TEST(TestOutdatedState)
@@ -1615,10 +1597,12 @@ Y_UNIT_TEST_SUITE(TCmsTest) {
16151597
env.CheckPermissionRequest("user", true, true, false, true, MODE_KEEP_AVAILABLE, TStatus::ALLOW_PARTIAL,
16161598
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(2), 60000000, "storage"),
16171599
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(3), 60000000, "storage"));
1618-
env.CheckPermissionRequest("user", true, true, false, true, MODE_FORCE_RESTART, TStatus::ALLOW_PARTIAL,
1600+
env.CheckPermissionRequest("user", true, true, false, true, MODE_MAX_AVAILABILITY, TStatus::ALLOW_PARTIAL,
16191601
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(2), 60000000, "storage"),
16201602
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(3), 60000000, "storage"));
1621-
env.CheckPermissionRequest("user", true, true, false, true, MODE_MAX_AVAILABILITY, TStatus::ALLOW_PARTIAL,
1603+
1604+
// But it is possible for FORCE RESTART mode
1605+
env.CheckPermissionRequest("user", true, true, false, true, MODE_FORCE_RESTART, TStatus::ALLOW,
16221606
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(2), 60000000, "storage"),
16231607
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(3), 60000000, "storage"));
16241608

@@ -1655,14 +1639,16 @@ Y_UNIT_TEST_SUITE(TCmsTest) {
16551639
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(4), 60000000, "storage"),
16561640
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(4), 60000000, "storage"),
16571641
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(5), 60000000, "storage"));
1658-
env.CheckPermissionRequest("user", true, true, false, true, MODE_FORCE_RESTART, TStatus::ALLOW_PARTIAL,
1642+
env.CheckPermissionRequest("user", true, true, false, true, MODE_MAX_AVAILABILITY, TStatus::ALLOW_PARTIAL,
1643+
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(2), 60000000, "storage"),
1644+
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(3), 60000000, "storage"));
1645+
1646+
// But it is possible for FORCE RESTART mode
1647+
env.CheckPermissionRequest("user", true, true, false, true, MODE_FORCE_RESTART, TStatus::ALLOW,
16591648
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(3), 60000000, "storage"),
16601649
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(4), 60000000, "storage"),
16611650
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(5), 60000000, "storage"),
16621651
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(6), 60000000, "storage"));
1663-
env.CheckPermissionRequest("user", true, true, false, true, MODE_MAX_AVAILABILITY, TStatus::ALLOW_PARTIAL,
1664-
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(2), 60000000, "storage"),
1665-
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(3), 60000000, "storage"));
16661652

16671653
// It's ok to get two permissions for one group if PartialPermissionAllowed is set to false
16681654
env.CheckPermissionRequest("user", false, true, false, true, MODE_KEEP_AVAILABLE, TStatus::ALLOW,

ydb/core/cms/node_checkers.cpp

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -91,8 +91,6 @@ bool TNodesLimitsCounterBase::TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabili
9191
Y_ABORT_UNLESS(NodeToState.contains(nodeId));
9292
auto nodeState = NodeToState.at(nodeId);
9393

94-
bool isForceRestart = mode == NKikimrCms::MODE_FORCE_RESTART;
95-
9694
NCH_LOG_D("Checking Node: "
9795
<< nodeId << ", with state: " << nodeState
9896
<< ", with limit: " << DisabledNodesLimit
@@ -114,12 +112,12 @@ bool TNodesLimitsCounterBase::TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabili
114112
return true;
115113
}
116114

117-
// Always allow at least one node
118-
if (LockedNodesCount + DownNodesCount == 0) {
115+
if (mode == NKikimrCms::MODE_FORCE_RESTART) {
119116
return true;
120117
}
121118

122-
if (isForceRestart && !LockedNodesCount) {
119+
// Always allow at least one node
120+
if (LockedNodesCount + DownNodesCount == 0) {
123121
return true;
124122
}
125123

0 commit comments

Comments
 (0)