Skip to content

Commit 6e5868b

Browse files
authored
add ability to stop a tenant via hive developer ui (#14315)
1 parent 9253f6a commit 6e5868b

File tree

12 files changed

+287
-20
lines changed

12 files changed

+287
-20
lines changed

ydb/core/mind/hive/domain_info.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ struct TDomainInfo {
4242
TString Path;
4343
TTabletId HiveId = 0;
4444
TMaybeServerlessComputeResourcesMode ServerlessComputeResourcesMode;
45+
bool Stopped = false;
4546

4647
ui64 TabletsTotal = 0;
4748
ui64 TabletsAlive = 0;

ydb/core/mind/hive/hive_impl.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1196,6 +1196,11 @@ THive::TBestNodeResult THive::FindBestNode(const TTabletInfo& tablet, TNodeId su
11961196
BLOG_D("[FBN] Finding best node for tablet " << tablet.ToString());
11971197
BLOG_TRACE("[FBN] Tablet " << tablet.ToString() << " family " << tablet.FamilyString());
11981198

1199+
const TDomainInfo* domain = FindDomain(tablet.NodeFilter.ObjectDomain);
1200+
if (domain && domain->Stopped) {
1201+
return TNoNodeFound();
1202+
}
1203+
11991204
if (tablet.PreferredNodeId != 0) {
12001205
TNodeInfo* node = FindNode(tablet.PreferredNodeId);
12011206
if (node != nullptr) {
@@ -2891,6 +2896,20 @@ void THive::BlockStorageForDelete(TTabletId tabletId, TSideEffects& sideEffects)
28912896
}
28922897
}
28932898

2899+
void THive::ProcessPendingStopTablet() {
2900+
if (!StopTenantTabletsQueue.empty()) {
2901+
Execute(CreateStopTabletByTenant(StopTenantTabletsQueue.front()));
2902+
StopTenantTabletsQueue.pop();
2903+
}
2904+
}
2905+
2906+
void THive::ProcessPendingResumeTablet() {
2907+
if (!ResumeTenantTabletsQueue.empty()) {
2908+
Execute(CreateResumeTabletByTenant(ResumeTenantTabletsQueue.front()));
2909+
ResumeTenantTabletsQueue.pop();
2910+
}
2911+
}
2912+
28942913
THive::THive(TTabletStorageInfo *info, const TActorId &tablet)
28952914
: TActor(&TThis::StateInit)
28962915
, TTabletExecutedFlat(info, tablet, new NMiniKQL::TMiniKQLFactory)

ydb/core/mind/hive/hive_impl.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,7 @@ class THive : public TActor<THive>, public TTabletExecutedFlat, public THiveShar
241241
friend class TTxMonEvent_TabletAvailability;
242242
friend class TLoggedMonTransaction;
243243
friend class TTxProcessUpdateFollowers;
244+
friend class TTxMonEvent_StopDomain;
244245

245246
friend class TDeleteTabletActor;
246247

@@ -279,7 +280,9 @@ class THive : public TActor<THive>, public TTabletExecutedFlat, public THiveShar
279280
ITransaction* CreateCheckTablets();
280281
ITransaction* CreateSyncTablets(const TActorId &local, NKikimrLocal::TEvSyncTablets& rec);
281282
ITransaction* CreateStopTablet(TTabletId tabletId, const TActorId& actorToNotify);
283+
ITransaction* CreateStopTabletByTenant(TTabletId tabletId);
282284
ITransaction* CreateResumeTablet(TTabletId tabletId, const TActorId& actorToNotify);
285+
ITransaction* CreateResumeTabletByTenant(TTabletId tabletId);
283286
ITransaction* CreateStartTablet(TFullTabletId tabletId, const TActorId& local, ui64 cookie, bool external = false);
284287
ITransaction* CreateUpdateTabletMetrics(TEvHive::TEvTabletMetrics::TPtr& ev);
285288
ITransaction* CreateReassignGroups(TTabletId tabletId, const TActorId& actorToNotify, const std::bitset<MAX_TABLET_CHANNELS>& channelProfileNewGroup);
@@ -424,6 +427,8 @@ class THive : public TActor<THive>, public TTabletExecutedFlat, public THiveShar
424427
std::queue<TActorId> NodePingQueue;
425428
std::unordered_set<TNodeId> NodePingsInProgress;
426429
TFollowerUpdates PendingFollowerUpdates;
430+
std::queue<TTabletId> StopTenantTabletsQueue;
431+
std::queue<TTabletId> ResumeTenantTabletsQueue;
427432

428433
struct TPendingCreateTablet {
429434
NKikimrHive::TEvCreateTablet CreateTablet;
@@ -714,6 +719,8 @@ TTabletInfo* FindTabletEvenInDeleting(TTabletId tabletId, TFollowerId followerId
714719
void UpdateObjectCount(const TLeaderTabletInfo& tablet, const TNodeInfo& node, i64 diff);
715720
ui64 GetObjectImbalance(TFullObjectId object);
716721
void BlockStorageForDelete(TTabletId tabletId, TSideEffects& sideEffects);
722+
void ProcessPendingStopTablet();
723+
void ProcessPendingResumeTablet();
717724

718725
ui32 GetEventPriority(IEventHandle* ev);
719726
void PushProcessIncomingEvent();

ydb/core/mind/hive/hive_schema.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ struct Schema : NIceDb::Schema {
9494
struct AllowedDataCenterIds : Column<122, NScheme::NTypeIds::String> { using Type = TVector<TString>; };
9595

9696
struct BalancerPolicy : Column<123, NScheme::NTypeIds::Uint64> { using Type = NKikimrHive::EBalancerPolicy; static constexpr NKikimrHive::EBalancerPolicy Default = NKikimrHive::EBalancerPolicy::POLICY_BALANCE; };
97+
struct StoppedByTenant : Column<124, NScheme::NTypeIds::Bool> {};
9798

9899
using TKey = TableKey<ID>;
99100
using TColumns = TableColumns<
@@ -121,7 +122,8 @@ struct Schema : NIceDb::Schema {
121122
Statistics,
122123
DataCentersPreference,
123124
AllowedDataCenterIds,
124-
BalancerPolicy
125+
BalancerPolicy,
126+
StoppedByTenant
125127
>;
126128
};
127129

@@ -274,10 +276,11 @@ struct Schema : NIceDb::Schema {
274276
struct HiveId : Column<5, NScheme::NTypeIds::Uint64> {};
275277
struct ServerlessComputeResourcesMode : Column<6, NScheme::NTypeIds::Uint32> { using Type = NKikimrSubDomains::EServerlessComputeResourcesMode; };
276278
struct ScaleRecommenderPolicies : Column<7, NScheme::NTypeIds::String> { using Type = NKikimrHive::TScaleRecommenderPolicies; };
279+
struct Stopped : Column<8, NScheme::NTypeIds::Bool> {};
277280

278281
using TKey = TableKey<SchemeshardId, PathId>;
279282
using TColumns = TableColumns<SchemeshardId, PathId, Path, Primary, HiveId, ServerlessComputeResourcesMode,
280-
ScaleRecommenderPolicies>;
283+
ScaleRecommenderPolicies, Stopped>;
281284
};
282285

283286
struct BlockedOwner : Table<18> {

ydb/core/mind/hive/hive_ut.cpp

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6990,6 +6990,96 @@ Y_UNIT_TEST_SUITE(THiveTest) {
69906990
MakeSureTabletIsUp(runtime, tabletId, 0);
69916991
}
69926992
}
6993+
6994+
// Verifies the "StopDomain" monitoring page: stopping object domain {1, 4}
// brings down only the tablet that belongs to it, and resuming the domain
// boots that tablet again. A second tablet in domain {1, 3} must stay up
// throughout.
Y_UNIT_TEST(TestStopTenant) {
    TTestBasicRuntime runtime(2, false);
    Setup(runtime, true);

    const ui64 hiveTablet = MakeDefaultHiveID();
    const ui64 testerTablet = MakeTabletID(false, 1);
    const TActorId hiveActor = CreateTestBootstrapper(runtime, CreateTestTabletInfo(hiveTablet, TTabletTypes::Hive), &CreateDefaultHive);
    runtime.EnableScheduleForActor(hiveActor);
    MakeSureTabletIsUp(runtime, hiveTablet, 0); // root hive good
    TActorId sender = runtime.AllocateEdgeActor(0);

    // Creates a Dummy tablet allowed in the root domain and attributed to
    // object domain {1, objectDomainPathId}; returns the new tablet id.
    auto createDummyTablet = [&](ui64 ownerIdx, ui64 objectDomainPathId) -> ui64 {
        THolder<TEvHive::TEvCreateTablet> request = MakeHolder<TEvHive::TEvCreateTablet>(testerTablet, ownerIdx, TTabletTypes::Dummy, BINDED_CHANNELS);
        auto* allowedDomain = request->Record.AddAllowedDomains();
        allowedDomain->SetSchemeShard(TTestTxConfig::SchemeShard);
        allowedDomain->SetPathId(1);
        auto* objectDomain = request->Record.MutableObjectDomain();
        objectDomain->SetSchemeShard(1);
        objectDomain->SetPathId(objectDomainPathId);
        return SendCreateTestTablet(runtime, hiveTablet, testerTablet, std::move(request), 0, true);
    };

    // Sends a GET to the hive's StopDomain page for domain {1, 4}.
    // The stop request carries no "stop" parameter; the resume request adds "stop=0".
    auto sendStopDomainRequest = [&](bool resume) {
        NActorsProto::TRemoteHttpInfo pb;
        pb.SetMethod(HTTP_METHOD_GET);
        pb.SetPath("/app");
        auto addQueryParam = [&pb](const TString& key, const TString& value) {
            auto* param = pb.AddQueryParams();
            param->SetKey(key);
            param->SetValue(value);
        };
        addQueryParam("TabletID", TStringBuilder() << hiveTablet);
        addQueryParam("page", "StopDomain");
        addQueryParam("ss", "1");
        addQueryParam("path", "4");
        if (resume) {
            addQueryParam("stop", "0");
        }
        runtime.SendToPipe(hiveTablet, sender, new NMon::TEvRemoteHttpInfo(std::move(pb)), 0, GetPipeConfigWithRetries());
    };

    // Runs the event loop until an event of the given type has been dispatched.
    auto waitForEvent = [&](ui32 eventType) {
        TDispatchOptions options;
        options.FinalEvents.emplace_back(eventType);
        runtime.DispatchEvents(options);
    };

    const ui64 tablet1 = createDummyTablet(1, 3);
    const ui64 tablet2 = createDummyTablet(2, 4);

    MakeSureTabletIsUp(runtime, tablet1, 0);
    MakeSureTabletIsUp(runtime, tablet2, 0);

    // Stop domain {1, 4}: only tablet2 may go down.
    sendStopDomainRequest(false);
    waitForEvent(TEvLocal::EvStopTablet);

    MakeSureTabletIsUp(runtime, tablet1, 0);
    MakeSureTabletIsDown(runtime, tablet2, 0);

    // Resume domain {1, 4}: tablet2 must be booted again.
    sendStopDomainRequest(true);
    waitForEvent(TEvLocal::EvBootTablet);

    MakeSureTabletIsUp(runtime, tablet1, 0);
    MakeSureTabletIsUp(runtime, tablet2, 0);
}
69937083
}
69947084

69957085
Y_UNIT_TEST_SUITE(THeavyPerfTest) {

ydb/core/mind/hive/leader_tablet_info.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ struct TLeaderTabletInfo : TTabletInfo {
8484
TActorId LockedToActor;
8585
TDuration LockedReconnectTimeout;
8686
ui64 PendingUnlockSeqNo;
87+
bool StoppedByTenant = false;
8788

8889
bool SeizedByChild = false; // transient state for migration - need to delete it later
8990
bool NeedToReleaseFromParent = false; // transient state for migration - need to delete it later

ydb/core/mind/hive/monitoring.cpp

Lines changed: 81 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -485,6 +485,7 @@ class TTxMonEvent_MemStateDomains : public TTransactionBase<THive> {
485485
out << "<th>TabletsAliveInTenantDomain</th>";
486486
out << "<th>TabletsAliveInOtherDomains</th>";
487487
out << "<th>TabletsTotal</th>";
488+
out << "<th></th>";
488489
out << "</tr>";
489490
out << "</thead>";
490491
out << "<tbody>";
@@ -520,6 +521,11 @@ class TTxMonEvent_MemStateDomains : public TTransactionBase<THive> {
520521
out << "<td>-</td>";
521522
}
522523
out << "<td>" << domainInfo.TabletsTotal << "</td>";
524+
if (domainInfo.Stopped) {
525+
out << "<td><a href=app?TabletID=" << Self->HiveId << "&page=StopDomain&ss=" << domainKey.first << "&path=" << domainKey.second << "&stop=0>Resume</a></td>";
526+
} else {
527+
out << "<td><a href=app?TabletID=" << Self->HiveId << "&page=StopDomain&ss=" << domainKey.first << "&path=" << domainKey.second << "&stop=1>Stop</a></td>";
528+
}
523529
out << "</tr>";
524530
}
525531
out << "</tbody>";
@@ -3257,7 +3263,7 @@ class TStopTabletWaitActor : public TActor<TStopTabletWaitActor>, public ISubAct
32573263
}
32583264
};
32593265

3260-
class TTxMonEvent_StopTablet : public TTransactionBase<THive> {
3266+
class TTxMonEvent_StopTablet : public TTransactionBase<THive>, TLoggedMonTransaction {
32613267
public:
32623268
TAutoPtr<NMon::TEvRemoteHttpInfo> Event;
32633269
const TActorId Source;
@@ -3266,6 +3272,7 @@ class TTxMonEvent_StopTablet : public TTransactionBase<THive> {
32663272

32673273
TTxMonEvent_StopTablet(const TActorId& source, NMon::TEvRemoteHttpInfo::TPtr& ev, TSelf* hive)
32683274
: TBase(hive)
3275+
, TLoggedMonTransaction(ev, hive)
32693276
, Event(ev->Release())
32703277
, Source(source)
32713278
{
@@ -3275,7 +3282,7 @@ class TTxMonEvent_StopTablet : public TTransactionBase<THive> {
32753282

32763283
TTxType GetTxType() const override { return NHive::TXTYPE_MON_STOP_TABLET; }
32773284

3278-
bool Execute(TTransactionContext&, const TActorContext& ctx) override {
3285+
bool Execute(TTransactionContext& txc, const TActorContext& ctx) override {
32793286
TLeaderTabletInfo* tablet = Self->FindTablet(TabletId);
32803287
if (tablet != nullptr) {
32813288
TActorId waitActorId;
@@ -3286,6 +3293,11 @@ class TTxMonEvent_StopTablet : public TTransactionBase<THive> {
32863293
Self->SubActors.emplace_back(waitActor);
32873294
}
32883295
Self->Execute(Self->CreateStopTablet(TabletId, waitActorId));
3296+
NIceDb::TNiceDb db(txc.DB);
3297+
NJson::TJsonValue jsonOperation;
3298+
jsonOperation["Tablet"] = TabletId;
3299+
jsonOperation["Stop"] = true;
3300+
WriteOperation(db, jsonOperation);
32893301
if (!Wait) {
32903302
ctx.Send(Source, new NMon::TEvRemoteJsonInfoRes("{}"));
32913303
}
@@ -3298,6 +3310,62 @@ class TTxMonEvent_StopTablet : public TTransactionBase<THive> {
32983310
void Complete(const TActorContext&) override {}
32993311
};
33003312

3313+
// Monitoring transaction behind the "StopDomain" page.
// Query parameters:
//   ss, path - identify the subdomain (both default to 0 when absent or unparsable);
//   stop     - stop (default) or resume the domain.
// When the domain is known, the Stopped flag is persisted and mirrored in memory,
// every tablet whose object domain matches is queued for stop/resume, the operation
// is logged, and a JSON status is sent to the requester. Otherwise a JSON error is sent.
class TTxMonEvent_StopDomain : public TTransactionBase<THive>, TLoggedMonTransaction {
public:
    THolder<NMon::TEvRemoteHttpInfo> Event;
    const TActorId Source;
    TSubDomainKey DomainId;
    bool Stop = true;

    TTxMonEvent_StopDomain(const TActorId& source, NMon::TEvRemoteHttpInfo::TPtr& ev, TSelf* hive)
        : TBase(hive)
        , TLoggedMonTransaction(ev, hive)
        , Event(ev->Release())
        , Source(source)
    {
        const auto& cgi = Event->Cgi();
        DomainId = {FromStringWithDefault<ui64>(cgi.Get("ss"), 0), FromStringWithDefault<ui64>(cgi.Get("path"), 0)};
        Stop = FromStringWithDefault(cgi.Get("stop"), Stop);
    }

    TTxType GetTxType() const override { return NHive::TXTYPE_MON_STOP_TABLET; }

    bool Execute(TTransactionContext& txc, const TActorContext& ctx) override {
        TDomainInfo* domain = Self->FindDomain(DomainId);
        if (domain == nullptr) {
            ctx.Send(Source, new NMon::TEvRemoteJsonInfoRes(TStringBuilder() << "{\"error\":\"Domain not found\"}"));
            return true;
        }

        // Persist the flag, then mirror it in the in-memory domain record.
        NIceDb::TNiceDb db(txc.DB);
        db.Table<Schema::SubDomain>().Key(DomainId).Update<Schema::SubDomain::Stopped>(Stop);
        domain->Stopped = Stop;

        // All tablets of this domain go to the queue matching the requested action.
        auto& pendingQueue = Stop ? Self->StopTenantTabletsQueue : Self->ResumeTenantTabletsQueue;
        for (const auto& [id, info] : Self->Tablets) {
            if (info.NodeFilter.ObjectDomain != DomainId) {
                continue;
            }
            pendingQueue.push(id);
        }

        NJson::TJsonValue jsonOperation;
        jsonOperation["SubDomain"] = TStringBuilder() << DomainId;
        jsonOperation["Stop"] = Stop;
        WriteOperation(db, jsonOperation);
        ctx.Send(Source, new NMon::TEvRemoteJsonInfoRes("{\"status\":\"OK\"}"));
        return true;
    }

    // Kicks off processing of the queue that corresponds to the requested action.
    void Complete(const TActorContext&) override {
        if (!Stop) {
            Self->ProcessPendingResumeTablet();
            return;
        }
        Self->ProcessPendingStopTablet();
    }
};
3368+
33013369
class TResumeTabletWaitActor : public TActor<TResumeTabletWaitActor>, public ISubActor {
33023370
public:
33033371
TActorId Source;
@@ -3340,7 +3408,7 @@ class TResumeTabletWaitActor : public TActor<TResumeTabletWaitActor>, public ISu
33403408
};
33413409

33423410

3343-
class TTxMonEvent_ResumeTablet : public TTransactionBase<THive> {
3411+
class TTxMonEvent_ResumeTablet : public TTransactionBase<THive>, TLoggedMonTransaction {
33443412
public:
33453413
TAutoPtr<NMon::TEvRemoteHttpInfo> Event;
33463414
const TActorId Source;
@@ -3349,6 +3417,7 @@ class TTxMonEvent_ResumeTablet : public TTransactionBase<THive> {
33493417

33503418
TTxMonEvent_ResumeTablet(const TActorId& source, NMon::TEvRemoteHttpInfo::TPtr& ev, TSelf* hive)
33513419
: TBase(hive)
3420+
, TLoggedMonTransaction(ev, hive)
33523421
, Event(ev->Release())
33533422
, Source(source)
33543423
{
@@ -3358,7 +3427,7 @@ class TTxMonEvent_ResumeTablet : public TTransactionBase<THive> {
33583427

33593428
TTxType GetTxType() const override { return NHive::TXTYPE_MON_STOP_TABLET; }
33603429

3361-
bool Execute(TTransactionContext&, const TActorContext& ctx) override {
3430+
bool Execute(TTransactionContext& txc, const TActorContext& ctx) override {
33623431
TLeaderTabletInfo* tablet = Self->FindTablet(TabletId);
33633432
if (tablet != nullptr) {
33643433
TActorId waitActorId;
@@ -3369,6 +3438,11 @@ class TTxMonEvent_ResumeTablet : public TTransactionBase<THive> {
33693438
Self->SubActors.emplace_back(waitActor);
33703439
}
33713440
Self->Execute(Self->CreateResumeTablet(TabletId, waitActorId));
3441+
NIceDb::TNiceDb db(txc.DB);
3442+
NJson::TJsonValue jsonOperation;
3443+
jsonOperation["Tablet"] = TabletId;
3444+
jsonOperation["Stop"] = false;
3445+
WriteOperation(db, jsonOperation);
33723446
if (!Wait) {
33733447
ctx.Send(Source, new NMon::TEvRemoteJsonInfoRes("{}"));
33743448
}
@@ -4432,6 +4506,9 @@ void THive::CreateEvMonitoring(NMon::TEvRemoteHttpInfo::TPtr& ev, const TActorCo
44324506
if (page == "StopTablet") {
44334507
return Execute(new TTxMonEvent_StopTablet(ev->Sender, ev, this), ctx);
44344508
}
4509+
if (page == "StopDomain") {
4510+
return Execute(new TTxMonEvent_StopDomain(ev->Sender, ev, this), ctx);
4511+
}
44354512
if (page == "ResumeTablet") {
44364513
return Execute(new TTxMonEvent_ResumeTablet(ev->Sender, ev, this), ctx);
44374514
}

0 commit comments

Comments
 (0)