From 8f4abb07e1eb827f87324f8182e8659587fae835 Mon Sep 17 00:00:00 2001 From: Sergey Belyakov Date: Tue, 25 Feb 2025 15:50:47 +0000 Subject: [PATCH 1/4] Always update info about groups with invalid layout, add more UT --- .../ut_blobstorage/sanitize_groups.cpp | 49 ++++++++++++------- ydb/core/mind/bscontroller/impl.h | 1 + ydb/core/mind/bscontroller/self_heal.cpp | 20 ++++---- ydb/core/protos/counters_bs_controller.proto | 1 + 4 files changed, 45 insertions(+), 26 deletions(-) diff --git a/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp b/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp index 26c40dd7d125..7746f26d54ba 100644 --- a/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp +++ b/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp @@ -25,8 +25,8 @@ Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) { } } - void CreateEnv(std::unique_ptr& env, std::vector& locations) { - TBlobStorageGroupType groupType = TBlobStorageGroupType::ErasureMirror3dc; + void CreateEnv(std::unique_ptr& env, std::vector& locations, + TBlobStorageGroupType groupType) { const ui32 numNodes = locations.size(); env.reset(new TEnvironmentSetup({ @@ -37,27 +37,30 @@ Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) { const ui32 disksPerNode = 1; const ui32 slotsPerDisk = 3; + + // Assure that sanitizer doesn't send request to initially allocated groups + env->Runtime->FilterFunction = CatchSanitizeRequests; env->CreateBoxAndPool(disksPerNode, numNodes * disksPerNode * slotsPerDisk / 9); } - Y_UNIT_TEST(Test3dc) { + NActorsInterconnect::TNodeLocation LocationGenerator(ui32 dc, ui32 rack, ui32 unit) { + NActorsInterconnect::TNodeLocation proto; + proto.SetDataCenter(ToString(dc)); + proto.SetRack(ToString(rack)); + proto.SetUnit(ToString(unit)); + return proto; + } + + void Test(TBlobStorageGroupType groupType, ui32 dcs, ui32 racks) { std::vector locations; - TLocationGenerator locationGenerator = [](ui32 dc, ui32 rack, ui32 unit) { - NActorsInterconnect::TNodeLocation proto; - proto.SetDataCenter(ToString(dc)); - proto.SetRack(ToString(rack)); - proto.SetUnit(ToString(unit)); - return proto; - }; - MakeLocations(locations, 3, 5, 1, locationGenerator); + MakeLocations(locations, dcs, racks, 1, LocationGenerator); std::unique_ptr env; - CreateEnv(env, locations); - TBlobStorageGroupType groupType = TBlobStorageGroupType::ErasureMirror3dc; - TGroupGeometryInfo geom = CreateGroupGeometry(groupType); + CreateEnv(env, locations, groupType); + env->Sim(TDuration::Minutes(3)); - env->Runtime->FilterFunction = CatchSanitizeRequests; + TGroupGeometryInfo geom = CreateGroupGeometry(groupType); TString error; auto cfg = env->FetchBaseConfig(); @@ -86,6 +89,18 @@ Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) { UNIT_ASSERT_C(CheckBaseConfigLayout(geom, cfg, true, error), error); } + Y_UNIT_TEST(Test3dc) { + Test(TBlobStorageGroupType::ErasureMirror3dc, 3, 5); + } + + Y_UNIT_TEST(TestBlock4Plus2) { + Test(TBlobStorageGroupType::Erasure4Plus2Block, 1, 12); + } + + Y_UNIT_TEST(TestMirror3of4) { + Test(TBlobStorageGroupType::ErasureMirror3of4, 1, 12); + } + TString PrintGroups(TBlobStorageGroupType groupType, const NKikimrBlobStorage::TBaseConfig& cfg, std::vector locations) { TGroupGeometryInfo geom = CreateGroupGeometry(groupType); @@ -137,6 +152,7 @@ Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) { } void TestMultipleRealmsOccupation(bool allowMultipleRealmsOccupation) { + TBlobStorageGroupType groupType = TBlobStorageGroupType::ErasureMirror3dc; std::vector locations; TLocationGenerator locationGenerator = [](ui32 dc, ui32 rack, ui32 unit) { NActorsInterconnect::TNodeLocation proto; @@ -152,9 +168,8 @@ Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) { }; MakeLocations(locations, 4, 5, 1, locationGenerator); std::unique_ptr env; - CreateEnv(env, locations); + CreateEnv(env, locations, groupType); - TBlobStorageGroupType groupType = TBlobStorageGroupType::ErasureMirror3dc; TGroupGeometryInfo geom = CreateGroupGeometry(groupType); env->Runtime->FilterFunction = CatchSanitizeRequests; diff --git a/ydb/core/mind/bscontroller/impl.h b/ydb/core/mind/bscontroller/impl.h index 20d38c32ad59..32b82bff302d 100644 --- a/ydb/core/mind/bscontroller/impl.h +++ b/ydb/core/mind/bscontroller/impl.h @@ -1512,6 +1512,7 @@ class TBlobStorageController : public TActor, public TTa private: TString InstanceId; std::shared_ptr SelfHealUnreassignableGroups = std::make_shared(); + std::shared_ptr GroupLayoutSanitizerInvalidGroups = std::make_shared(); TMaybe MigrationId; TVSlots VSlots; // ordering is important TPDisks PDisks; // ordering is important diff --git a/ydb/core/mind/bscontroller/self_heal.cpp b/ydb/core/mind/bscontroller/self_heal.cpp index 7618d1a93e03..de5282bce921 100644 --- a/ydb/core/mind/bscontroller/self_heal.cpp +++ b/ydb/core/mind/bscontroller/self_heal.cpp @@ -284,6 +284,7 @@ namespace NKikimr::NBsController { bool DonorMode; THostRecordMap HostRecords; std::shared_ptr EnableSelfHealWithDegraded; + std::shared_ptr GroupsWithInvalidLayoutCounter; using TTopologyDescr = std::tuple; THashMap> Topologies; @@ -296,7 +297,8 @@ namespace NKikimr::NBsController { public: TSelfHealActor(ui64 tabletId, std::shared_ptr unreassignableGroups, THostRecordMap hostRecords, bool groupLayoutSanitizerEnabled, bool allowMultipleRealmsOccupation, bool donorMode, - std::shared_ptr enableSelfHealWithDegraded) + std::shared_ptr enableSelfHealWithDegraded, + std::shared_ptr groupsWithInvalidLayoutCounter) : TabletId(tabletId) , UnreassignableGroups(std::move(unreassignableGroups)) , GroupLayoutSanitizerEnabled(groupLayoutSanitizerEnabled) @@ -304,6 +306,7 @@ namespace NKikimr::NBsController { , DonorMode(donorMode) , HostRecords(std::move(hostRecords)) , EnableSelfHealWithDegraded(std::move(enableSelfHealWithDegraded)) + , GroupsWithInvalidLayoutCounter(std::move(groupsWithInvalidLayoutCounter)) {} void Bootstrap(const TActorId& parentId) { @@ -318,17 +321,16 @@ namespace NKikimr::NBsController { void Handle(TEvControllerUpdateSelfHealInfo::TPtr& ev) { if (const auto& setting = ev->Get()->GroupLayoutSanitizerEnabled) { - bool previousSetting = std::exchange(GroupLayoutSanitizerEnabled, *setting); - if (!previousSetting && GroupLayoutSanitizerEnabled) { - UpdateLayoutInformationForAllGroups(); - } + std::exchange(GroupLayoutSanitizerEnabled, *setting); } + if (const auto& setting = ev->Get()->AllowMultipleRealmsOccupation) { bool previousSetting = std::exchange(AllowMultipleRealmsOccupation, *setting); if (previousSetting != AllowMultipleRealmsOccupation) { UpdateLayoutInformationForAllGroups(); } } + if (const auto& setting = ev->Get()->DonorMode) { DonorMode = *setting; } @@ -345,9 +347,7 @@ namespace NKikimr::NBsController { g.Content = std::move(*data); - if (GroupLayoutSanitizerEnabled) { - UpdateGroupLayoutInformation(g); - } + UpdateGroupLayoutInformation(g); ui32 numFailRealms = 0; ui32 numFailDomainsPerFailRealm = 0; @@ -500,6 +500,7 @@ namespace NKikimr::NBsController { } } + GroupsWithInvalidLayoutCounter->store(GroupsWithInvalidLayout.Size()); UnreassignableGroups->store(counter); } @@ -899,7 +900,7 @@ namespace NKikimr::NBsController { IActor *TBlobStorageController::CreateSelfHealActor() { Y_ABORT_UNLESS(HostRecords); return new TSelfHealActor(TabletID(), SelfHealUnreassignableGroups, HostRecords, GroupLayoutSanitizerEnabled, - AllowMultipleRealmsOccupation, DonorMode, EnableSelfHealWithDegraded); + AllowMultipleRealmsOccupation, DonorMode, EnableSelfHealWithDegraded, GroupLayoutSanitizerInvalidGroups); } void TBlobStorageController::InitializeSelfHealState() { @@ -1159,6 +1160,7 @@ namespace NKikimr::NBsController { ); TabletCounters->Simple()[NBlobStorageController::COUNTER_SELF_HEAL_UNREASSIGNABLE_GROUPS] = SelfHealUnreassignableGroups->load(); + TabletCounters->Simple()[NBlobStorageController::COUNTER_GROUP_LAYOUT_SANITIZER_INVALID_GROUPS] = GroupLayoutSanitizerInvalidGroups->load(); Schedule(TDuration::Seconds(15), new TEvPrivate::TEvUpdateSelfHealCounters); } diff --git a/ydb/core/protos/counters_bs_controller.proto b/ydb/core/protos/counters_bs_controller.proto index 5d6cdae97e58..82642fa4249a 100644 --- a/ydb/core/protos/counters_bs_controller.proto +++ b/ydb/core/protos/counters_bs_controller.proto @@ -28,6 +28,7 @@ enum ESimpleCounters { COUNTER_DISK_SCRUB_CUR_DISKS = 18 [(CounterOpts) = {Name: "CurrentlyScrubbedDisks"}]; COUNTER_DISK_SCRUB_CUR_GROUPS = 19 [(CounterOpts) = {Name: "CurrentlyScrubbedGroups"}]; COUNTER_SELF_HEAL_UNREASSIGNABLE_GROUPS = 20 [(CounterOpts) = {Name: "SelfHealUnreassignableGroups"}]; + COUNTER_GROUP_LAYOUT_SANITIZER_INVALID_GROUPS = 21 [(CounterOpts) = {Name: "GroupLayoutSanitizerInvlaidGroups"}]; } enum ECumulativeCounters { From 7868b2af70461d30dcee7f07df3b5b283547563f Mon Sep 17 00:00:00 2001 From: Sergey Belyakov Date: Tue, 25 Feb 2025 16:17:22 +0000 Subject: [PATCH 2/4] Improve UT --- ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp b/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp index 7746f26d54ba..caf9aa703b2d 100644 --- a/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp +++ b/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp @@ -38,9 +38,9 @@ Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) { const ui32 disksPerNode = 1; const ui32 slotsPerDisk = 3; - // Assure that sanitizer doesn't send request to initially allocated groups env->Runtime->FilterFunction = CatchSanitizeRequests; env->CreateBoxAndPool(disksPerNode, numNodes * disksPerNode * slotsPerDisk / 9); + env->Runtime->FilterFunction = {}; } NActorsInterconnect::TNodeLocation LocationGenerator(ui32 dc, ui32 rack, ui32 unit) { @@ -58,7 +58,13 @@ Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) { std::unique_ptr env; CreateEnv(env, locations, groupType); + + + // Assure that sanitizer doesn't send request to initially allocated groups + env->UpdateSettings(true, false, true); + env->Runtime->FilterFunction = CatchSanitizeRequests; env->Sim(TDuration::Minutes(3)); + env->UpdateSettings(false, false, false); TGroupGeometryInfo geom = CreateGroupGeometry(groupType); From 3eee5628a2926aab9b45cf7062e2e71e8dfb15be Mon Sep 17 00:00:00 2001 From: Sergey Belyakov Date: Wed, 26 Feb 2025 10:59:09 +0000 Subject: [PATCH 3/4] Fix UT --- .../ut_blobstorage/sanitize_groups.cpp | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp b/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp index caf9aa703b2d..34e660ca028b 100644 --- a/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp +++ b/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp @@ -51,18 +51,18 @@ Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) { return proto; } - void Test(TBlobStorageGroupType groupType, ui32 dcs, ui32 racks) { + void Test(TBlobStorageGroupType groupType, ui32 dcs, ui32 racks, ui32 units) { std::vector locations; - MakeLocations(locations, dcs, racks, 1, LocationGenerator); + MakeLocations(locations, dcs, racks, units, LocationGenerator); std::unique_ptr env; CreateEnv(env, locations, groupType); // Assure that sanitizer doesn't send request to initially allocated groups - env->UpdateSettings(true, false, true); env->Runtime->FilterFunction = CatchSanitizeRequests; + env->UpdateSettings(true, false, true); env->Sim(TDuration::Minutes(3)); env->UpdateSettings(false, false, false); @@ -71,14 +71,15 @@ Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) { TString error; auto cfg = env->FetchBaseConfig(); UNIT_ASSERT_C(CheckBaseConfigLayout(geom, cfg, true, error), error); - env->Cleanup(); // Shuffle node locayion, assure that layout error occured - std::random_shuffle(locations.begin(), locations.end()); - env->Initialize(); - env->Sim(TDuration::Seconds(100)); - cfg = env->FetchBaseConfig(); - CheckBaseConfigLayout(geom, cfg, true, error); + do { + env->Cleanup(); + std::random_shuffle(locations.begin(), locations.end()); + env->Initialize(); + env->Sim(TDuration::Seconds(100)); + cfg = env->FetchBaseConfig(); + } while (CheckBaseConfigLayout(geom, cfg, true, error)); Cerr << error << Endl; // Sanitize groups @@ -96,15 +97,15 @@ Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) { } Y_UNIT_TEST(Test3dc) { - Test(TBlobStorageGroupType::ErasureMirror3dc, 3, 5); + Test(TBlobStorageGroupType::ErasureMirror3dc, 3, 5, 1); } Y_UNIT_TEST(TestBlock4Plus2) { - Test(TBlobStorageGroupType::Erasure4Plus2Block, 1, 12); + Test(TBlobStorageGroupType::Erasure4Plus2Block, 1, 10, 2); } Y_UNIT_TEST(TestMirror3of4) { - Test(TBlobStorageGroupType::ErasureMirror3of4, 1, 12); + Test(TBlobStorageGroupType::ErasureMirror3of4, 1, 10, 2); } TString PrintGroups(TBlobStorageGroupType groupType, const NKikimrBlobStorage::TBaseConfig& cfg, From f2c7f99bad19f5f0d1021163375292b0265416b4 Mon Sep 17 00:00:00 2001 From: Sergey Belyakov Date: Wed, 26 Feb 2025 11:46:55 +0000 Subject: [PATCH 4/4] Remove unused layout checker function --- .../bscontroller/group_layout_checker.cpp | 44 ------------------- .../mind/bscontroller/group_layout_checker.h | 2 - 2 files changed, 46 deletions(-) diff --git a/ydb/core/mind/bscontroller/group_layout_checker.cpp b/ydb/core/mind/bscontroller/group_layout_checker.cpp index 8ab76e3e4f50..31e822eb4fa1 100644 --- a/ydb/core/mind/bscontroller/group_layout_checker.cpp +++ b/ydb/core/mind/bscontroller/group_layout_checker.cpp @@ -1,47 +1,3 @@ #include "group_layout_checker.h" -#include "group_geometry_info.h" - -namespace NKikimr::NBsController { - - TLayoutCheckResult CheckGroupLayout(const TGroupGeometryInfo& geom, const THashMap>& layout) { - using namespace NLayoutChecker; - - if (layout.empty()) { - return {}; - } - - TBlobStorageGroupInfo::TTopology topology(geom.GetType(), geom.GetNumFailRealms(), geom.GetNumFailDomainsPerFailRealm(), - geom.GetNumVDisksPerFailDomain(), true); - TGroupLayout group(topology); - TDomainMapper mapper; - THashMap map; - for (const auto& [vdiskId, p] : layout) { - const auto& [location, pdiskId] = p; - TPDiskLayoutPosition pos(mapper, location, pdiskId, geom); - group.AddDisk(pos, topology.GetOrderNumber(vdiskId)); - map.emplace(vdiskId, pos); - } - - std::vector> scoreboard; - for (const auto& [vdiskId, pos] : map) { - scoreboard.emplace_back(group.GetCandidateScore(pos, topology.GetOrderNumber(vdiskId)), vdiskId); - } - - auto comp1 = [](const auto& x, const auto& y) { return x.second < y.second; }; - std::sort(scoreboard.begin(), scoreboard.end(), comp1); - - auto comp = [](const auto& x, const auto& y) { return x.first.BetterThan(y.first); }; - std::sort(scoreboard.begin(), scoreboard.end(), comp); - TLayoutCheckResult res; - const auto reference = scoreboard.back().first; - if (!reference.SameAs({})) { // not perfectly correct layout - for (; !scoreboard.empty() && !scoreboard.back().first.BetterThan(reference); scoreboard.pop_back()) { - res.Candidates.push_back(scoreboard.back().second); - } - } - return res; - } - -} // NKikimr::NBsController Y_DECLARE_OUT_SPEC(, NKikimr::NBsController::NLayoutChecker::TEntityId, stream, value) { value.Output(stream); } diff --git a/ydb/core/mind/bscontroller/group_layout_checker.h b/ydb/core/mind/bscontroller/group_layout_checker.h index e2e2e66246f0..6af8e16678b9 100644 --- a/ydb/core/mind/bscontroller/group_layout_checker.h +++ b/ydb/core/mind/bscontroller/group_layout_checker.h @@ -245,6 +245,4 @@ namespace NKikimr::NBsController { } }; - TLayoutCheckResult CheckGroupLayout(const TGroupGeometryInfo& geom, const THashMap>& layout); - } // NKikimr::NBsController