Skip to content

Commit d59377c

Browse files
authored
Add evict vdisks for a rack (#9740)
1 parent 7ee2eb1 commit d59377c

File tree

6 files changed

+120
-6
lines changed

6 files changed

+120
-6
lines changed

ydb/core/cms/cms_ut.cpp

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1808,6 +1808,51 @@ Y_UNIT_TEST_SUITE(TCmsTest) {
18081808
env.CheckRejectRequest("user", request3.GetRequestId());
18091809
}
18101810

1811+
// Requests VDisk eviction for both nodes that share rack 1, checks that the
// requests are temporarily disallowed and that FAULTY drive-status updates are
// sent to BSC, then simulates the VDisks being moved away and checks that both
// requests become allowed.
Y_UNIT_TEST(AllVDisksEvictionInRack)
{
    // Every two consecutive nodes are placed into the same rack:
    //   Node = [0, 1, 2, 3, 4, 5, 6, 7]
    //   Rack = [1, 1, 2, 2, 3, 3, 4, 4]
    auto rackByNode = [](ui32 nodeId) {
        NActorsInterconnect::TNodeLocation location;
        location.SetRack(ToString(nodeId / 2 + 1));
        return TNodeLocation(location);
    };

    auto opts = TTestEnvOpts(8)
        .WithSentinel()
        .WithNodeLocationCallback(rackByNode);
    TCmsTestEnv env(opts);
    env.SetLogPriority(NKikimrServices::CMS, NLog::PRI_DEBUG);

    // Asks to restart the "storage" service on the given node with VDisk
    // eviction enabled. The expected answer is DISALLOW_TEMP: the request
    // has to wait until the VDisks are moved.
    auto requestEviction = [&env](ui32 nodeIndex) {
        return env.CheckPermissionRequest(
            MakePermissionRequest(
                TRequestOptions("user").WithEvictVDisks(),
                MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(nodeIndex), 600000000, "storage")
            ),
            TStatus::DISALLOW_TEMP
        );
    };

    // Evict all VDisks from rack 1 (both of its nodes).
    auto request1 = requestEviction(0);
    auto request2 = requestEviction(1);

    // FAULTY status updates must be sent to BSC for both nodes.
    env.CheckBSCUpdateRequests({ env.GetNodeId(0), env.GetNodeId(1) }, NKikimrBlobStorage::FAULTY);

    // Pretend that the VDisks have left the node: flag it as moved and drop
    // the VDisk state entries stored for it in the fake whiteboard.
    auto markVDisksMoved = [](ui32 nodeId) {
        auto& nodeInfo = TFakeNodeWhiteboardService::Info[nodeId];
        nodeInfo.VDisksMoved = true;
        nodeInfo.VDiskStateInfo.clear();
    };
    markVDisksMoved(env.GetNodeId(0));
    markVDisksMoved(env.GetNodeId(1));
    env.RegenerateBSConfig(TFakeNodeWhiteboardService::Config.MutableResponse()->MutableStatus(0)->MutableBaseConfig(), opts);

    // With the VDisks gone both requests must now be allowed.
    auto permission1 = env.CheckRequest("user", request1.GetRequestId(), false, TStatus::ALLOW, 1);
    auto permission2 = env.CheckRequest("user", request2.GetRequestId(), false, TStatus::ALLOW, 1);
    env.CheckDonePermission("user", permission1.GetPermissions(0).GetId());
    env.CheckDonePermission("user", permission2.GetPermissions(0).GetId());
}
1855+
18111856
Y_UNIT_TEST(EmergencyDuringRollingRestart)
18121857
{
18131858
TCmsTestEnv env(8);

ydb/core/cms/cms_ut_common.cpp

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,17 @@ const bool ENABLE_DETAILED_CMS_LOG = true;
2929
const bool ENABLE_DETAILED_CMS_LOG = false;
3030
#endif
3131

32+
#define COMMA ,
33+
Y_DECLARE_OUT_SPEC(, std::map<NKikimrBlobStorage::EDriveStatus COMMA std::set<ui32>>, o, value) {
34+
std::vector<TString> pairs;
35+
for (const auto& [status, nodes] : value) {
36+
pairs.push_back(
37+
TStringBuilder() << status << "=" << '[' << JoinSeq(',', nodes) << ']'
38+
);
39+
}
40+
o << '[' << JoinSeq(',', pairs) << ']';
41+
};
42+
3243
namespace NKikimr {
3344
namespace NCmsTest {
3445

@@ -391,7 +402,7 @@ static NKikimrConfig::TBootstrap GenerateBootstrapConfig(TTestActorRuntime &runt
391402
return res;
392403
}
393404

394-
static void SetupServices(TTestActorRuntime &runtime, const TTestEnvOpts &options) {
405+
static void SetupServices(TTestBasicRuntime &runtime, const TTestEnvOpts &options) {
395406
const ui32 domainsNum = 1;
396407
const ui32 disksInDomain = 1;
397408

@@ -503,6 +514,7 @@ static void SetupServices(TTestActorRuntime &runtime, const TTestEnvOpts &option
503514
),
504515
0);
505516

517+
runtime.LocationCallback = options.NodeLocationCallback;
506518
runtime.Initialize(app.Unwrap());
507519
auto dnsConfig = new TDynamicNameserviceConfig();
508520
dnsConfig->MaxStaticNodeId = 1000;
@@ -868,6 +880,39 @@ TCmsTestEnv::CheckRequest(const TString &user,
868880
return rec;
869881
}
870882

883+
void TCmsTestEnv::CheckBSCUpdateRequests(std::set<ui32> expectedNodes,
884+
NKikimrBlobStorage::EDriveStatus expectedStatus)
885+
{
886+
using TBSCRequests = std::map<NKikimrBlobStorage::EDriveStatus, std::set<ui32>>;
887+
888+
TBSCRequests expectedRequests = { {expectedStatus, expectedNodes} };
889+
TBSCRequests actualRequests;
890+
891+
TDispatchOptions options;
892+
options.FinalEvents.emplace_back([&](IEventHandle& ev) {
893+
if (ev.GetTypeRewrite() == TEvBlobStorage::TEvControllerConfigRequest::EventType) {
894+
const auto& request = ev.Get<TEvBlobStorage::TEvControllerConfigRequest>()->Record;
895+
bool foundUpdateDriveCommand = false;
896+
for (const auto& command : request.GetRequest().GetCommand()) {
897+
if (command.HasUpdateDriveStatus()) {
898+
foundUpdateDriveCommand = true;
899+
const auto& update = command.GetUpdateDriveStatus();
900+
actualRequests[update.GetStatus()].insert(update.GetHostKey().GetNodeId());
901+
}
902+
}
903+
return foundUpdateDriveCommand;
904+
}
905+
return false;
906+
});
907+
DispatchEvents(options, TDuration::Minutes(1));
908+
909+
UNIT_ASSERT_C(
910+
actualRequests == expectedRequests,
911+
TStringBuilder() << "Sentinel sent wrong update requests to BSC: "
912+
<< "expected# " << expectedRequests
913+
<< ", actual# " << actualRequests
914+
);
915+
}
871916

872917
void TCmsTestEnv::CheckWalleStoreTaskIsFailed(NCms::TEvCms::TEvStoreWalleTask* req)
873918
{

ydb/core/cms/cms_ut_common.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,9 @@ struct TTestEnvOpts {
9292
bool EnableCMSRequestPriorities;
9393
bool EnableSingleCompositeActionGroup;
9494

95+
using TNodeLocationCallback = std::function<TNodeLocation(ui32)>;
96+
TNodeLocationCallback NodeLocationCallback;
97+
9598
TTestEnvOpts() = default;
9699

97100
TTestEnvOpts(ui32 nodeCount,
@@ -126,6 +129,12 @@ struct TTestEnvOpts {
126129
EnableCMSRequestPriorities = false;
127130
return *this;
128131
}
132+
133+
// Stores the callback that produces a TNodeLocation for a ui32 node argument
// and returns *this so that option setters can be chained.
TTestEnvOpts& WithNodeLocationCallback(TNodeLocationCallback nodeLocationCallback) {
134+
NodeLocationCallback = nodeLocationCallback;
135+
return *this;
136+
}
137+
129138
};
130139

131140
class TCmsTestEnv : public TTestBasicRuntime {
@@ -323,6 +332,8 @@ class TCmsTestEnv : public TTestBasicRuntime {
323332
return CheckRequest(user, id, dry, NKikimrCms::MODE_MAX_AVAILABILITY, res, count);
324333
}
325334

335+
void CheckBSCUpdateRequests(std::set<ui32> expectedNodes, NKikimrBlobStorage::EDriveStatus expectedStatus);
336+
326337
void CheckWalleStoreTaskIsFailed(NCms::TEvCms::TEvStoreWalleTask *req);
327338

328339
template <typename... Ts>

ydb/core/cms/pdisk_status.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
#pragma once
2+
3+
#include <ydb/core/protos/blobstorage_config.pb.h>
4+
5+
namespace NKikimr::NCms {
6+
7+
// PDisk status type used within NKikimr::NCms; an alias for the BlobStorage
// drive status enum.
using EPDiskStatus = NKikimrBlobStorage::EDriveStatus;
8+
9+
} // namespace NKikimr::NCms

ydb/core/cms/sentinel.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,10 @@ void TPDiskStatusComputer::SetForcedStatus(EPDiskStatus status) {
125125
ForcedStatus = status;
126126
}
127127

128+
// Returns true while a forced status is set, i.e. after SetForcedStatus() and
// until ResetForcedStatus() clears it.
bool TPDiskStatusComputer::HasForcedStatus() const {
129+
return ForcedStatus.Defined();
130+
}
131+
128132
void TPDiskStatusComputer::ResetForcedStatus() {
129133
ForcedStatus.Clear();
130134
}
@@ -196,6 +200,7 @@ void TPDiskStatus::DisallowChanging() {
196200

197201
// Constructs the PDisk info: forwards all arguments to the TPDiskStatus base,
// sets ActualStatus to the initial status, and calls Touch().
TPDiskInfo::TPDiskInfo(EPDiskStatus initialStatus, const ui32& defaultStateLimit, const TLimitsMap& stateLimits)
198202
: TPDiskStatus(initialStatus, defaultStateLimit, stateLimits)
203+
// Start with the actual status equal to the initial one.
, ActualStatus(initialStatus)
199204
{
200205
Touch();
201206
}
@@ -898,7 +903,7 @@ class TSentinel: public TActorBootstrapped<TSentinel> {
898903

899904
all.AddPDisk(id);
900905
if (info.IsChanged()) {
901-
if (info.IsNewStatusGood()) {
906+
if (info.IsNewStatusGood() || info.HasForcedStatus()) {
902907
alwaysAllowed.insert(id);
903908
} else {
904909
changed.AddPDisk(id);

ydb/core/cms/sentinel_impl.h

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,14 @@
33
#include "defs.h"
44
#include "pdiskid.h"
55
#include "pdisk_state.h"
6-
7-
#include <ydb/core/protos/blobstorage_config.pb.h>
6+
#include "pdisk_status.h"
87

98
#include <util/generic/hash.h>
109
#include <util/generic/hash_set.h>
1110
#include <util/generic/map.h>
1211

1312
namespace NKikimr::NCms::NSentinel {
1413

15-
using EPDiskStatus = NKikimrBlobStorage::EDriveStatus;
1614
using TLimitsMap = TMap<EPDiskState, ui32>;
1715

1816
class TPDiskStatusComputer {
@@ -29,6 +27,7 @@ class TPDiskStatusComputer {
2927
void Reset();
3028

3129
void SetForcedStatus(EPDiskStatus status);
30+
bool HasForcedStatus() const;
3231
void ResetForcedStatus();
3332

3433
private:
@@ -84,7 +83,7 @@ struct TPDiskInfo
8483
using EIgnoreReason = NKikimrCms::TPDiskInfo::EIgnoreReason;
8584

8685
EPDiskStatus ActualStatus = EPDiskStatus::ACTIVE;
87-
EPDiskStatus PrevStatus = EPDiskStatus::ACTIVE;
86+
EPDiskStatus PrevStatus = EPDiskStatus::UNKNOWN;
8887
TInstant LastStatusChange;
8988
bool StatusChangeFailed = false;
9089
// means that this pdisk status change last time was the reason of whole request failure

0 commit comments

Comments
 (0)