Skip to content

Commit 7967afe

Browse files
authored
25-1: CMS: ignore tenant and cluster unavailable nodes limits in force availability mode (#20283)
2 parents bacf8ac + 6fb9351 commit 7967afe

File tree

5 files changed

+78
-56
lines changed

5 files changed

+78
-56
lines changed

ydb/core/cms/cms.cpp

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -936,14 +936,6 @@ bool TCms::TryToLockVDisk(const TActionOptions& opts,
936936
}
937937
break;
938938
case MODE_FORCE_RESTART:
939-
if (counters->GroupAlreadyHasLockedDisks() && !counters->GroupHasMoreThanOneDiskPerNode() && opts.PartialPermissionAllowed) {
940-
TabletCounters->Cumulative()[COUNTER_PARTIAL_PERMISSIONS_OPTIMIZED].Increment(1);
941-
error.Code = TStatus::DISALLOW_TEMP;
942-
error.Reason = "You cannot get two or more disks from the same group at the same time"
943-
" in partial permissions allowed mode";
944-
error.Deadline = defaultDeadline;
945-
return false;
946-
}
947939
// Any number of down disks is OK for this mode.
948940
break;
949941
default:

ydb/core/cms/cms_maintenance_api_ut.cpp

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,55 @@ Y_UNIT_TEST_SUITE(TMaintenanceApiTest) {
116116
const auto &a = response.action_group_states(0).action_states(0);
117117
UNIT_ASSERT_VALUES_EQUAL(a.status(), ActionState::ACTION_STATUS_PERFORMED);
118118
}
119+
120+
Y_UNIT_TEST(ForceAvailabilityMode) {
121+
TCmsTestEnv env(8);
122+
123+
NKikimrCms::TCmsConfig config;
124+
config.MutableClusterLimits()->SetDisabledNodesLimit(1);
125+
config.MutableClusterLimits()->SetDisabledNodesRatioLimit(30);
126+
config.MutableTenantLimits()->SetDisabledNodesLimit(1);
127+
config.MutableTenantLimits()->SetDisabledNodesRatioLimit(30);
128+
env.SetCmsConfig(config);
129+
130+
auto response = env.CheckMaintenanceTaskCreate(
131+
"task-1",
132+
Ydb::StatusIds::SUCCESS,
133+
Ydb::Maintenance::AVAILABILITY_MODE_FORCE,
134+
MakeActionGroup(
135+
MakeLockAction(env.GetNodeId(0), TDuration::Minutes(10))
136+
),
137+
MakeActionGroup(
138+
MakeLockAction(env.GetNodeId(1), TDuration::Minutes(10))
139+
),
140+
MakeActionGroup(
141+
MakeLockAction(env.GetNodeId(2), TDuration::Minutes(10))
142+
),
143+
MakeActionGroup(
144+
MakeLockAction(env.GetNodeId(3), TDuration::Minutes(10))
145+
),
146+
MakeActionGroup(
147+
MakeLockAction(env.GetNodeId(4), TDuration::Minutes(10))
148+
),
149+
MakeActionGroup(
150+
MakeLockAction(env.GetNodeId(5), TDuration::Minutes(10))
151+
),
152+
MakeActionGroup(
153+
MakeLockAction(env.GetNodeId(6), TDuration::Minutes(10))
154+
),
155+
MakeActionGroup(
156+
MakeLockAction(env.GetNodeId(7), TDuration::Minutes(10))
157+
)
158+
);
159+
160+
// Everything is allowed to perform maintenance regardless of the failure model
161+
UNIT_ASSERT_VALUES_EQUAL(response.action_group_states().size(), 8);
162+
for (size_t i = 0; i < 8; ++i) {
163+
UNIT_ASSERT_VALUES_EQUAL(response.action_group_states(i).action_states().size(), 1);
164+
const auto& a = response.action_group_states(i).action_states(0);
165+
UNIT_ASSERT_VALUES_EQUAL(a.status(), ActionState::ACTION_STATUS_PERFORMED);
166+
}
167+
}
119168
}
120169

121170
} // namespace NKikimr::NCmsTest

ydb/core/cms/cms_tenants_ut.cpp

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -441,14 +441,14 @@ Y_UNIT_TEST_SUITE(TCmsTenatsTest) {
441441
auto res1 = env.ExtractPermissions
442442
(env.CheckPermissionRequest("user", false, false, false, true, TStatus::ALLOW,
443443
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(0), 60000000)));
444-
// Limit should work for any mode because we are restarting one node already.
444+
// Limit should work for any mode except force because we are restarting one node already.
445445
env.CheckPermissionRequest("user", false, false, false, true, TStatus::DISALLOW_TEMP,
446446
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(1), 60000000));
447447
env.CheckPermissionRequest("user", false, false, false, true,
448448
MODE_KEEP_AVAILABLE, TStatus::DISALLOW_TEMP,
449449
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(1), 60000000));
450-
env.CheckPermissionRequest("user", false, false, false, true,
451-
MODE_FORCE_RESTART, TStatus::DISALLOW_TEMP,
450+
env.CheckPermissionRequest("user", false, /* dry */ true, false, true,
451+
MODE_FORCE_RESTART, TStatus::ALLOW,
452452
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(1), 60000000));
453453

454454
env.CheckDonePermission("user", res1.second[0]);
@@ -505,27 +505,24 @@ Y_UNIT_TEST_SUITE(TCmsTenatsTest) {
505505
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(1), 60000000),
506506
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(2), 60000000)));
507507
UNIT_ASSERT_VALUES_EQUAL(res1.second.size(), 1);
508-
// Limit should work for any mode because we are restarting one node already.
508+
// Limit should work for any mode except force because we are restarting one node already.
509509
env.CheckRequest("user", res1.first, false, TStatus::DISALLOW_TEMP, 0);
510510
env.CheckRequest("user", res1.first, false, MODE_KEEP_AVAILABLE, TStatus::DISALLOW_TEMP, 0);
511-
env.CheckRequest("user", res1.first, false, MODE_FORCE_RESTART, TStatus::DISALLOW_TEMP, 0);
511+
env.CheckRequest("user", res1.first, /* dry */ true, MODE_FORCE_RESTART, TStatus::ALLOW, 2);
512512

513513
env.CheckDonePermission("user", res1.second[0]);
514514

515-
auto res2 = env.ExtractPermissions
516-
(env.CheckRequest("user", res1.first, false, MODE_FORCE_RESTART, TStatus::ALLOW_PARTIAL, 1));
517-
518-
env.CheckDonePermission("user", res2.second[0]);
515+
env.CheckRequest("user", res1.first, /* dry */ true, MODE_FORCE_RESTART, TStatus::ALLOW, 2);
519516

520517
// Now shutdown one node and try various modes again. Only MODE_FORCE_RESTART
521-
// should allow to restart another node.
518+
// should allow to restart nodes.
522519
{
523520
TGuard<TMutex> guard(TFakeNodeWhiteboardService::Mutex);
524521
TFakeNodeWhiteboardService::Info[env.GetNodeId(0)].Connected = false;
525522
}
526523
env.CheckRequest("user", res1.first, false, TStatus::DISALLOW_TEMP, 0);
527524
env.CheckRequest("user", res1.first, false, MODE_KEEP_AVAILABLE, TStatus::DISALLOW_TEMP, 0);
528-
env.CheckRequest("user", res1.first, false, MODE_FORCE_RESTART, TStatus::ALLOW, 1);
525+
env.CheckRequest("user", res1.first, false, MODE_FORCE_RESTART, TStatus::ALLOW, 2);
529526
}
530527

531528
Y_UNIT_TEST(TestTenantLimitForceRestartModeScheduled) {

ydb/core/cms/cms_ut.cpp

Lines changed: 18 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1050,39 +1050,31 @@ Y_UNIT_TEST_SUITE(TCmsTest) {
10501050
TestAvailabilityMode(MODE_FORCE_RESTART, true);
10511051
}
10521052

1053-
void TestAvailabilityModeScheduled(EAvailabilityMode mode, bool disconnectNodes)
1053+
void TestKeepAvailabileModeScheduled(bool disconnectNodes)
10541054
{
1055-
Y_ABORT_UNLESS(mode == MODE_KEEP_AVAILABLE
1056-
|| mode == MODE_FORCE_RESTART);
1057-
10581055
TCmsTestEnv env(8);
10591056
env.AdvanceCurrentTime(TDuration::Minutes(3));
10601057

10611058
auto res1 = env.ExtractPermissions
10621059
(env.CheckPermissionRequest("user", true, false, true,
1063-
true, mode, TStatus::ALLOW_PARTIAL,
1060+
true, MODE_KEEP_AVAILABLE, TStatus::ALLOW_PARTIAL,
10641061
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(0), 60000000),
10651062
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(1), 60000000),
10661063
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(2), 60000000)));
10671064
if (disconnectNodes) {
10681065
TFakeNodeWhiteboardService::Info[env.GetNodeId(0)].Connected = false;
10691066
}
10701067

1071-
env.CheckRequest("user", res1.first, false, mode, TStatus::ALLOW_PARTIAL, 1);
1068+
env.CheckRequest("user", res1.first, false, MODE_KEEP_AVAILABLE, TStatus::ALLOW_PARTIAL, 1);
10721069
if (disconnectNodes) {
10731070
TFakeNodeWhiteboardService::Info[env.GetNodeId(1)].Connected = false;
10741071
}
10751072

1076-
env.CheckRequest("user", res1.first, false, mode,
1077-
mode == MODE_KEEP_AVAILABLE ? TStatus::DISALLOW_TEMP : TStatus::ALLOW,
1078-
mode == MODE_KEEP_AVAILABLE ? 0 : 1);
1079-
if (mode != MODE_KEEP_AVAILABLE) {
1080-
return;
1081-
}
1073+
env.CheckRequest("user", res1.first, false, MODE_KEEP_AVAILABLE, TStatus::DISALLOW_TEMP, 0);
10821074

10831075
env.CheckDonePermission("user", res1.second[0]);
10841076

1085-
env.CheckRequest("user", res1.first, false, mode,
1077+
env.CheckRequest("user", res1.first, false, MODE_KEEP_AVAILABLE,
10861078
disconnectNodes ? TStatus::DISALLOW_TEMP : TStatus::ALLOW,
10871079
disconnectNodes ? 0 : 1);
10881080
if (!disconnectNodes) {
@@ -1091,27 +1083,17 @@ Y_UNIT_TEST_SUITE(TCmsTest) {
10911083

10921084
TFakeNodeWhiteboardService::Info[env.GetNodeId(0)].Connected = true;
10931085

1094-
env.CheckRequest("user", res1.first, false, mode, TStatus::ALLOW, 1);
1086+
env.CheckRequest("user", res1.first, false, MODE_KEEP_AVAILABLE, TStatus::ALLOW, 1);
10951087
}
10961088

10971089
Y_UNIT_TEST(TestKeepAvailableModeScheduled)
10981090
{
1099-
TestAvailabilityModeScheduled(MODE_KEEP_AVAILABLE, false);
1100-
}
1101-
1102-
Y_UNIT_TEST(TestForceRestartModeScheduled)
1103-
{
1104-
TestAvailabilityModeScheduled(MODE_FORCE_RESTART, false);
1091+
TestKeepAvailabileModeScheduled(false);
11051092
}
11061093

11071094
Y_UNIT_TEST(TestKeepAvailableModeScheduledDisconnects)
11081095
{
1109-
TestAvailabilityModeScheduled(MODE_KEEP_AVAILABLE, true);
1110-
}
1111-
1112-
Y_UNIT_TEST(TestForceRestartModeScheduledDisconnects)
1113-
{
1114-
TestAvailabilityModeScheduled(MODE_FORCE_RESTART, true);
1096+
TestKeepAvailabileModeScheduled(true);
11151097
}
11161098

11171099
Y_UNIT_TEST(TestOutdatedState)
@@ -1498,10 +1480,12 @@ Y_UNIT_TEST_SUITE(TCmsTest) {
14981480
env.CheckPermissionRequest("user", true, true, false, true, MODE_KEEP_AVAILABLE, TStatus::ALLOW_PARTIAL,
14991481
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(2), 60000000, "storage"),
15001482
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(3), 60000000, "storage"));
1501-
env.CheckPermissionRequest("user", true, true, false, true, MODE_FORCE_RESTART, TStatus::ALLOW_PARTIAL,
1483+
env.CheckPermissionRequest("user", true, true, false, true, MODE_MAX_AVAILABILITY, TStatus::ALLOW_PARTIAL,
15021484
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(2), 60000000, "storage"),
15031485
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(3), 60000000, "storage"));
1504-
env.CheckPermissionRequest("user", true, true, false, true, MODE_MAX_AVAILABILITY, TStatus::ALLOW_PARTIAL,
1486+
1487+
// But it is possible for FORCE RESTART mode
1488+
env.CheckPermissionRequest("user", true, true, false, true, MODE_FORCE_RESTART, TStatus::ALLOW,
15051489
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(2), 60000000, "storage"),
15061490
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(3), 60000000, "storage"));
15071491

@@ -1538,14 +1522,16 @@ Y_UNIT_TEST_SUITE(TCmsTest) {
15381522
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(4), 60000000, "storage"),
15391523
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(4), 60000000, "storage"),
15401524
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(5), 60000000, "storage"));
1541-
env.CheckPermissionRequest("user", true, true, false, true, MODE_FORCE_RESTART, TStatus::ALLOW_PARTIAL,
1525+
env.CheckPermissionRequest("user", true, true, false, true, MODE_MAX_AVAILABILITY, TStatus::ALLOW_PARTIAL,
1526+
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(2), 60000000, "storage"),
1527+
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(3), 60000000, "storage"));
1528+
1529+
// But it is possible for FORCE RESTART mode
1530+
env.CheckPermissionRequest("user", true, true, false, true, MODE_FORCE_RESTART, TStatus::ALLOW,
15421531
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(3), 60000000, "storage"),
15431532
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(4), 60000000, "storage"),
15441533
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(5), 60000000, "storage"),
15451534
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(6), 60000000, "storage"));
1546-
env.CheckPermissionRequest("user", true, true, false, true, MODE_MAX_AVAILABILITY, TStatus::ALLOW_PARTIAL,
1547-
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(2), 60000000, "storage"),
1548-
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(3), 60000000, "storage"));
15491535

15501536
// It's ok to get two permissions for one group if PartialPermissionAllowed is set to false
15511537
env.CheckPermissionRequest("user", false, true, false, true, MODE_KEEP_AVAILABLE, TStatus::ALLOW,

ydb/core/cms/node_checkers.cpp

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -91,8 +91,6 @@ bool TNodesLimitsCounterBase::TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabili
9191
Y_ABORT_UNLESS(NodeToState.contains(nodeId));
9292
auto nodeState = NodeToState.at(nodeId);
9393

94-
bool isForceRestart = mode == NKikimrCms::MODE_FORCE_RESTART;
95-
9694
NCH_LOG_D("Checking Node: "
9795
<< nodeId << ", with state: " << nodeState
9896
<< ", with limit: " << DisabledNodesLimit
@@ -114,12 +112,12 @@ bool TNodesLimitsCounterBase::TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabili
114112
return true;
115113
}
116114

117-
// Always allow at least one node
118-
if (LockedNodesCount + DownNodesCount == 0) {
115+
if (mode == NKikimrCms::MODE_FORCE_RESTART) {
119116
return true;
120117
}
121118

122-
if (isForceRestart && !LockedNodesCount) {
119+
// Always allow at least one node
120+
if (LockedNodesCount + DownNodesCount == 0) {
123121
return true;
124122
}
125123

0 commit comments

Comments
 (0)