Skip to content

Commit ddcd276

Browse files
jepett0 and CyberROFL
authored
Support State Storage reconfiguration via Ring Groups in Scheme Board Subscribers (#20018)
Co-authored-by: Ilnaz Nizametdinov <i.nizametdinov@gmail.com>
1 parent a411961 commit ddcd276

File tree

7 files changed

+161
-76
lines changed

7 files changed

+161
-76
lines changed

ydb/core/blobstorage/nodewarden/distconf_invoke_state_storage.cpp

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -27,18 +27,14 @@ namespace NKikimr::NStorage {
2727
(StateStorageConfig, cmd));
2828

2929
NKikimrBlobStorage::TStorageConfig config = *Self->StorageConfig;
30-
if (cmd.HasSchemeBoardConfig()) {
31-
FinishWithError(TResult::ERROR, TStringBuilder() << "SchemeBoard are not supported");
32-
return;
33-
}
3430
if (!cmd.HasStateStorageConfig() && !cmd.HasStateStorageBoardConfig() && !cmd.HasSchemeBoardConfig()) {
3531
FinishWithError(TResult::ERROR, TStringBuilder() << "New configuration is not defined");
36-
return;
32+
return;
3733
}
3834
auto process = [&](const char *name, auto buildInfo, auto hasFunc, auto func, auto configHasFunc, auto configMutableFunc) {
3935
if (!(cmd.*hasFunc)()) {
4036
return true;
41-
}
37+
}
4238
if (!(config.*configHasFunc)()) {
4339
FinishWithError(TResult::ERROR, TStringBuilder() << name << " configuration is not filled in");
4440
return false;
@@ -60,7 +56,7 @@ namespace NKikimr::NStorage {
6056
for (auto& rg : newSSConfig.GetRingGroups()) {
6157
if (rg.RingSize() && rg.NodeSize()) {
6258
FinishWithError(TResult::ERROR, TStringBuilder() << name << " Ring and Node are defined, use the one of them");
63-
return false;
59+
return false;
6460
}
6561
const size_t numItems = Max(rg.RingSize(), rg.NodeSize());
6662
if (!rg.HasNToSelect() || numItems < 1 || rg.GetNToSelect() < 1 || rg.GetNToSelect() > numItems) {
@@ -70,11 +66,11 @@ namespace NKikimr::NStorage {
7066
for (auto &ring : rg.GetRing()) {
7167
if (ring.RingSize() > 0) {
7268
FinishWithError(TResult::ERROR, TStringBuilder() << name << " too deep nested ring declaration");
73-
return false;
69+
return false;
7470
}
7571
if(ring.HasRingGroupActorIdOffset()) {
7672
FinishWithError(TResult::ERROR, TStringBuilder() << name << " RingGroupActorIdOffset should be used in ring group level, not ring");
77-
return false;
73+
return false;
7874
}
7975
if (ring.NodeSize() < 1) {
8076
FinishWithError(TResult::ERROR, TStringBuilder() << name << " empty ring");
@@ -100,7 +96,7 @@ namespace NKikimr::NStorage {
10096
}
10197

10298
Y_ABORT_UNLESS(newSSInfo->RingGroups.size() > 0 && oldSSInfo->RingGroups.size() > 0);
103-
99+
104100
for (auto& newGroup : newSSInfo->RingGroups) {
105101
if (newGroup.WriteOnly) {
106102
continue;
@@ -113,7 +109,7 @@ namespace NKikimr::NStorage {
113109
}
114110
}
115111
if (!found) {
116-
FinishWithError(TResult::ERROR, TStringBuilder() <<
112+
FinishWithError(TResult::ERROR, TStringBuilder() <<
117113
"New introduced ring group should be WriteOnly old:" << oldSSInfo->ToString() <<" new: " << newSSInfo->ToString());
118114
return false;
119115
}
@@ -130,7 +126,7 @@ namespace NKikimr::NStorage {
130126
}
131127
}
132128
if (!found) {
133-
FinishWithError(TResult::ERROR, TStringBuilder() <<
129+
FinishWithError(TResult::ERROR, TStringBuilder() <<
134130
"Can not delete not WriteOnly ring group. Make it WriteOnly before deletion old:" << oldSSInfo->ToString() <<" new: " << newSSInfo->ToString());
135131
return false;
136132
}
@@ -148,7 +144,7 @@ namespace NKikimr::NStorage {
148144
}
149145
return true;
150146
};
151-
147+
152148
#define PROCESS(NAME) \
153149
if (!process(#NAME, &NKikimr::Build##NAME##Info, \
154150
&NKikimrBlobStorage::TStateStorageConfig::Has##NAME##Config, \

ydb/core/tx/scheme_board/subscriber.cpp

Lines changed: 138 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include <util/generic/string.h>
2525
#include <util/generic/utility.h>
2626
#include <util/string/cast.h>
27+
#include <util/generic/xrange.h>
2728

2829
namespace NKikimr {
2930

@@ -137,7 +138,7 @@ namespace {
137138
NKikimrSchemeBoard::TEvNotify Notify;
138139
TPathId SubdomainPathId;
139140
TSet<ui64> PathAbandonedTenantsSchemeShards;
140-
TMaybe<NKikimrScheme::TEvDescribeSchemeResult> DescribeSchemeResult;
141+
TMaybe<NKikimrScheme::TEvDescribeSchemeResult> DescribeSchemeResult = Nothing();
141142

142143
static TNotifyResponse FromNotify(NKikimrSchemeBoard::TEvNotify&& record) {
143144
// PathSubdomainPathId's absence is a marker that input message was sent
@@ -342,7 +343,6 @@ namespace {
342343
struct TEvPrivate {
343344
enum EEv {
344345
EvReplicaMissing = EventSpaceBegin(TKikimrEvents::ES_PRIVATE),
345-
EvSwitchReplica,
346346

347347
EvEnd,
348348
};
@@ -616,11 +616,6 @@ class TSubscriberProxy: public TMonitorableActor<TDerived> {
616616
};
617617
}
618618

619-
void HandleSwitchReplica(STATEFN_SIG) {
620-
Replica = ev->Sender;
621-
TActivationContext::Send(new IEventHandle(TEvents::TSystem::Poison, 0, ReplicaSubscriber, this->SelfId(), nullptr, 0));
622-
}
623-
624619
public:
625620
static constexpr NKikimrServices::TActivity::EType ActorActivityType() {
626621
return NKikimrServices::TActivity::SCHEME_BOARD_SUBSCRIBER_PROXY_ACTOR;
@@ -666,8 +661,6 @@ class TSubscriberProxy: public TMonitorableActor<TDerived> {
666661
hFunc(TEvents::TEvGone, Handle);
667662
hFunc(TEvPrivate::TEvReplicaMissing, Handle);
668663
cFunc(TEvents::TEvPoisonPill::EventType, PassAway);
669-
670-
fFunc(TEvPrivate::EvSwitchReplica, HandleSwitchReplica);
671664
}
672665
}
673666

@@ -679,8 +672,6 @@ class TSubscriberProxy: public TMonitorableActor<TDerived> {
679672

680673
CFunc(TEvents::TEvWakeup::EventType, Bootstrap);
681674
cFunc(TEvents::TEvPoisonPill::EventType, PassAway);
682-
683-
fFunc(TEvPrivate::EvSwitchReplica, HandleSwitchReplica);
684675
}
685676
}
686677

@@ -778,7 +769,24 @@ class TSubscriber: public TMonitorableActor<TDerived> {
778769
}
779770

780771
bool IsMajorityReached() const {
781-
return InitialResponses.size() > (Proxies.size() / 2);
772+
TVector<ui32> responsesByGroup(ProxyGroups.size());
773+
for (const auto& [proxy, _] : InitialResponses) {
774+
if (const auto* groupIdx = ProxyToGroupMap.FindPtr(proxy)) {
775+
responsesByGroup[*groupIdx]++;
776+
} else {
777+
SBS_LOG_N("Previously received response sender is currently unknown"
778+
<< ": sender# " << proxy);
779+
}
780+
}
781+
for (size_t groupIdx : xrange(ProxyGroups.size())) {
782+
if (ProxyGroups[groupIdx].WriteOnly) {
783+
continue;
784+
}
785+
if (responsesByGroup[groupIdx] <= ProxyGroups[groupIdx].Proxies.size() / 2) {
786+
return false;
787+
}
788+
}
789+
return true;
782790
}
783791

784792
void EnqueueSyncRequest(NInternalEvents::TEvSyncRequest::TPtr& ev) {
@@ -794,9 +802,11 @@ class TSubscriber: public TMonitorableActor<TDerived> {
794802
DelayedSyncRequest = 0;
795803

796804
Y_ABORT_UNLESS(PendingSync.empty());
797-
for (const auto& [proxy, replica] : Proxies) {
798-
this->Send(proxy, new NInternalEvents::TEvSyncVersionRequest(Path), 0, CurrentSyncRequest);
799-
PendingSync.emplace(proxy);
805+
for (const auto& proxyGroup : ProxyGroups) {
806+
for (const auto& [proxy, _] : proxyGroup.Proxies) {
807+
this->Send(proxy, new NInternalEvents::TEvSyncVersionRequest(Path), 0, CurrentSyncRequest);
808+
PendingSync.emplace(proxy);
809+
}
800810
}
801811

802812
return true;
@@ -812,6 +822,12 @@ class TSubscriber: public TMonitorableActor<TDerived> {
812822
return;
813823
}
814824

825+
if (!ProxyToGroupMap.contains(ev->Sender)) {
826+
SBS_LOG_E("Unknown " << ev->Get()->ToString()
827+
<< ": sender# " << ev->Sender);
828+
return;
829+
}
830+
815831
// TEvNotify message is consumed here, can't be used after this point
816832
TNotifyResponse notifyResponse = TNotifyResponse::FromNotify(std::move(*ev->Get()->MutableRecord()));
817833
TNotifyResponse* selectedNotify = &notifyResponse;
@@ -886,6 +902,11 @@ class TSubscriber: public TMonitorableActor<TDerived> {
886902
Y_ABORT_UNLESS(MaybeRunVersionSync());
887903
}
888904

905+
// Tells whether version sync for one ring group may stop waiting for replies.
//
// successes     - number of non-partial sync replies received so far.
// failures      - number of partial sync replies received so far.
// expectedTotal - number of proxies in the ring group.
//
// Returns true as soon as either outcome holds a strict majority
// (more than expectedTotal / 2), or when every proxy has replied.
static bool IsSyncFinished(ui32 successes, ui32 failures, ui32 expectedTotal) {
    // The threshold is half of the group, not the whole group: comparing against
    // expectedTotal itself could never succeed before all replies arrive, which
    // would disable the early majority exit entirely.
    const auto half = expectedTotal / 2;
    return successes > half || failures > half || successes + failures >= expectedTotal;
}
909+
889910
void Handle(NInternalEvents::TEvSyncVersionResponse::TPtr& ev) {
890911
SBS_LOG_D("Handle " << ev->Get()->ToString()
891912
<< ": sender# " << ev->Sender
@@ -907,48 +928,70 @@ class TSubscriber: public TMonitorableActor<TDerived> {
907928
return;
908929
}
909930

931+
if (!ProxyToGroupMap.contains(ev->Sender)) {
932+
SBS_LOG_E("Sync sender is unknown"
933+
<< ": sender# " << ev->Sender
934+
<< ", cookie# " << ev->Cookie);
935+
return;
936+
}
937+
910938
PendingSync.erase(it);
911939
Y_ABORT_UNLESS(!ReceivedSync.contains(ev->Sender));
912940
ReceivedSync[ev->Sender] = ev->Get()->Record.GetPartial();
913941

914-
ui32 successes = 0;
915-
ui32 failures = 0;
916-
for (const auto& [_, partial] : ReceivedSync) {
942+
TVector<ui32> successesByGroup(ProxyGroups.size());
943+
TVector<ui32> failuresByGroup(ProxyGroups.size());
944+
for (const auto& [proxy, partial] : ReceivedSync) {
945+
const auto* groupIdx = ProxyToGroupMap.FindPtr(proxy);
946+
if (!groupIdx) {
947+
SBS_LOG_N("Previously received sync sender is currently unknown"
948+
<< ": sender# " << proxy);
949+
continue;
950+
}
917951
if (!partial) {
918-
++successes;
952+
++successesByGroup[*groupIdx];
919953
} else {
920-
++failures;
954+
++failuresByGroup[*groupIdx];
921955
}
922956
}
923-
924-
const ui32 size = Proxies.size();
925-
const ui32 half = size / 2;
926-
if (successes <= half && failures <= half && (successes + failures) < size) {
927-
SBS_LOG_D("Sync is in progress"
957+
bool syncIsComplete = true;
958+
for (size_t groupIdx : xrange(ProxyGroups.size())) {
959+
if (ProxyGroups[groupIdx].WriteOnly) {
960+
continue;
961+
}
962+
const ui32 size = ProxyGroups[groupIdx].Proxies.size();
963+
const ui32 half = size / 2;
964+
if (!IsSyncFinished(successesByGroup[groupIdx], failuresByGroup[groupIdx], size)) {
965+
SBS_LOG_D("Sync is in progress"
966+
<< ": cookie# " << ev->Cookie
967+
<< ", ring group# " << groupIdx
968+
<< ", size# " << size
969+
<< ", half# " << half
970+
<< ", successes# " << successesByGroup[groupIdx]
971+
<< ", failures# " << failuresByGroup[groupIdx]);
972+
return;
973+
}
974+
syncIsComplete &= successesByGroup[groupIdx] > half;
975+
const auto finalMessage = TStringBuilder() << "Sync is done in the ring group"
928976
<< ": cookie# " << ev->Cookie
977+
<< ", ring group# " << groupIdx
929978
<< ", size# " << size
930979
<< ", half# " << half
931-
<< ", successes# " << successes
932-
<< ", faulires# " << failures);
933-
return;
980+
<< ", successes# " << successesByGroup[groupIdx]
981+
<< ", failures# " << failuresByGroup[groupIdx]
982+
<< ", partial# " << !syncIsComplete;
983+
if (syncIsComplete) {
984+
SBS_LOG_D(finalMessage);
985+
} else {
986+
SBS_LOG_N(finalMessage);
987+
}
934988
}
935-
936-
const bool partial = !(successes > half);
937-
const TString done = TStringBuilder() << "Sync is done"
938-
<< ": cookie# " << ev->Cookie
939-
<< ", size# " << size
940-
<< ", half# " << half
941-
<< ", successes# " << successes
942-
<< ", faulires# " << failures
943-
<< ", partial# " << partial;
944-
945-
if (!partial) {
946-
SBS_LOG_D(done);
947-
} else {
948-
SBS_LOG_W(done);
989+
if (!syncIsComplete) {
990+
SBS_LOG_W("Sync is incomplete in one of the ring groups"
991+
<< ": cookie# " << ev->Cookie);
949992
}
950993

951-
this->Send(Owner, new NInternalEvents::TEvSyncResponse(Path, partial), 0, ev->Cookie);
994+
this->Send(Owner, new NInternalEvents::TEvSyncResponse(Path, !syncIsComplete), 0, ev->Cookie);
952995

953996
PendingSync.clear();
954997
ReceivedSync.clear();
@@ -959,28 +1002,50 @@ class TSubscriber: public TMonitorableActor<TDerived> {
9591002
void Handle(TEvStateStorage::TEvResolveReplicasList::TPtr& ev) {
9601003
SBS_LOG_D("Handle " << ev->Get()->ToString());
9611004

962-
const auto& replicas = ev->Get()->GetPlainReplicas();
1005+
const auto& replicaGroups = ev->Get()->ReplicaGroups;
9631006

964-
if (replicas.empty()) {
965-
Y_ABORT_UNLESS(Proxies.empty());
1007+
if (replicaGroups.empty()) {
1008+
Y_ABORT_UNLESS(ProxyGroups.empty());
9661009
SBS_LOG_E("Subscribe on unconfigured SchemeBoard");
9671010
this->Become(&TDerived::StateCalm);
9681011
return;
9691012
}
9701013

971-
Y_ABORT_UNLESS(Proxies.empty() || Proxies.size() == replicas.size());
1014+
for (const auto& group : ProxyGroups) {
1015+
for (const auto& [proxy, _] : group.Proxies) {
1016+
this->Send(proxy, new TEvents::TEvPoisonPill());
1017+
}
1018+
}
1019+
ProxyToGroupMap.clear();
1020+
ProxyGroups.clear();
1021+
States.clear();
1022+
InitialResponses.clear();
1023+
State.Clear();
1024+
PendingSync.clear();
1025+
ReceivedSync.clear();
9721026

973-
if (Proxies.empty()) {
974-
for (size_t i = 0; i < replicas.size(); ++i) {
975-
Proxies.emplace_back(this->RegisterWithSameMailbox(new TProxyDerived(this->SelfId(), i, replicas.size(),
976-
replicas[i], Path, DomainOwnerId)), replicas[i]);
1027+
for (size_t groupIdx = 0; groupIdx < replicaGroups.size(); ++groupIdx) {
1028+
const auto& replicaGroup = replicaGroups[groupIdx];
1029+
if (replicaGroup.WriteOnly) {
1030+
continue;
9771031
}
978-
} else {
979-
for (size_t i = 0; i < replicas.size(); ++i) {
980-
if (auto& [proxy, replica] = Proxies[i]; replica != replicas[i]) {
981-
TActivationContext::Send(new IEventHandle(TEvPrivate::EvSwitchReplica, 0, proxy, replicas[i], nullptr, 0));
982-
replica = replicas[i];
983-
}
1032+
auto& proxyGroup = ProxyGroups.emplace_back();
1033+
1034+
proxyGroup.Proxies.reserve(replicaGroup.Replicas.size());
1035+
for (size_t i = 0; i < replicaGroup.Replicas.size(); ++i) {
1036+
auto& proxy = proxyGroup.Proxies.emplace_back();
1037+
proxy.Replica = replicaGroup.Replicas[i];
1038+
proxy.Proxy = this->RegisterWithSameMailbox(
1039+
new TProxyDerived(
1040+
this->SelfId(),
1041+
i,
1042+
replicaGroup.Replicas.size(),
1043+
replicaGroup.Replicas[i],
1044+
Path,
1045+
DomainOwnerId
1046+
)
1047+
);
1048+
ProxyToGroupMap[proxy.Proxy] = ProxyGroups.size() - 1;
9841049
}
9851050
}
9861051

@@ -1040,10 +1105,11 @@ class TSubscriber: public TMonitorableActor<TDerived> {
10401105
}
10411106

10421107
void PassAway() override {
1043-
for (const auto& [proxy, replica] : Proxies) {
1044-
this->Send(proxy, new TEvents::TEvPoisonPill());
1108+
for (const auto& group : ProxyGroups) {
1109+
for (const auto& [proxy, _] : group.Proxies) {
1110+
this->Send(proxy, new TEvents::TEvPoisonPill());
1111+
}
10451112
}
1046-
10471113
TActivationContext::Send(new IEventHandle(TEvents::TSystem::Unsubscribe, 0, MakeStateStorageProxyID(),
10481114
this->SelfId(), nullptr, 0));
10491115

@@ -1128,7 +1194,18 @@ class TSubscriber: public TMonitorableActor<TDerived> {
11281194
const TPath Path;
11291195
const ui64 DomainOwnerId;
11301196

1131-
std::vector<std::tuple<TActorId, TActorId>> Proxies;
1197+
struct TProxyInfo {
1198+
TActorId Proxy;
1199+
TActorId Replica;
1200+
};
1201+
1202+
struct TProxyGroup {
1203+
bool WriteOnly;
1204+
TVector<TProxyInfo> Proxies;
1205+
};
1206+
1207+
THashMap<TActorId, ui32> ProxyToGroupMap;
1208+
TVector<TProxyGroup> ProxyGroups;
11321209
TMap<TActorId, TState> States;
11331210
TMap<TActorId, TNotifyResponse> InitialResponses;
11341211
TMaybe<TState> State;

0 commit comments

Comments
 (0)