Skip to content

Commit a86bd03

Browse files
committed
make nodes less critical (to make cluster less critical) (#19677)
1 parent 3653822 commit a86bd03

File tree

1 file changed

+32
-38
lines changed

1 file changed

+32
-38
lines changed

ydb/core/tablet/node_whiteboard.cpp

Lines changed: 32 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -767,72 +767,59 @@ class TNodeWhiteboardService : public TActorBootstrapped<TNodeWhiteboardService>
767767
}
768768
}
769769

770-
void UpdateSystemState(const TActorContext &ctx) {
770+
void UpdateSystemState() {
771771
NKikimrWhiteboard::EFlag eFlag = NKikimrWhiteboard::EFlag::Green;
772-
NKikimrWhiteboard::EFlag pDiskFlag = NKikimrWhiteboard::EFlag::Green;
773-
ui32 yellowFlags = 0;
772+
ui32 badDisks = 0;
774773
double maxDiskUsage = 0;
775774
for (const auto& pr : PDiskStateInfo) {
776-
if (!pr.second.HasState()) {
777-
pDiskFlag = std::max(pDiskFlag, NKikimrWhiteboard::EFlag::Yellow);
778-
++yellowFlags;
779-
} else {
775+
if (pr.second.HasState()) {
780776
switch (pr.second.GetState()) {
781777
case NKikimrBlobStorage::TPDiskState::InitialFormatReadError:
782778
case NKikimrBlobStorage::TPDiskState::InitialSysLogReadError:
783779
case NKikimrBlobStorage::TPDiskState::InitialSysLogParseError:
784780
case NKikimrBlobStorage::TPDiskState::InitialCommonLogReadError:
785781
case NKikimrBlobStorage::TPDiskState::InitialCommonLogParseError:
786782
case NKikimrBlobStorage::TPDiskState::CommonLoggerInitError:
787-
pDiskFlag = std::max(pDiskFlag, NKikimrWhiteboard::EFlag::Red);
788-
break;
789783
case NKikimrBlobStorage::TPDiskState::OpenFileError:
790-
pDiskFlag = std::max(pDiskFlag, NKikimrWhiteboard::EFlag::Yellow);
791-
++yellowFlags;
784+
eFlag = std::max(eFlag, NKikimrWhiteboard::EFlag::Yellow);
785+
++badDisks;
792786
break;
793787
default:
794788
break;
795789
}
796790
}
797791
if (pr.second.HasAvailableSize() && pr.second.GetTotalSize() != 0) {
798792
double avail = (double)pr.second.GetAvailableSize() / pr.second.GetTotalSize();
799-
if (avail <= 0.06) {
800-
pDiskFlag = std::max(pDiskFlag, NKikimrWhiteboard::EFlag::Red);
793+
if (avail <= 0.04) {
794+
eFlag = std::max(eFlag, NKikimrWhiteboard::EFlag::Orange);
801795
} else if (avail <= 0.08) {
802-
pDiskFlag = std::max(pDiskFlag, NKikimrWhiteboard::EFlag::Orange);
803-
} else if (avail <= 0.15) {
804-
pDiskFlag = std::max(pDiskFlag, NKikimrWhiteboard::EFlag::Yellow);
805-
++yellowFlags;
796+
eFlag = std::max(eFlag, NKikimrWhiteboard::EFlag::Yellow);
806797
}
807798
maxDiskUsage = std::max(maxDiskUsage, 1.0 - avail);
808799
}
809800
}
810801
if (PDiskStateInfo.size() > 0) {
811802
SystemStateInfo.SetMaxDiskUsage(maxDiskUsage);
812803
}
813-
if (pDiskFlag == NKikimrWhiteboard::EFlag::Yellow) {
814-
switch (yellowFlags) {
815-
case 1:
816-
break;
817-
case 2:
818-
pDiskFlag = NKikimrWhiteboard::EFlag::Orange;
819-
break;
820-
case 3:
821-
pDiskFlag = NKikimrWhiteboard::EFlag::Red;
822-
break;
823-
}
804+
if (eFlag == NKikimrWhiteboard::EFlag::Yellow && badDisks > 1) {
805+
eFlag = NKikimrWhiteboard::EFlag::Orange;
824806
}
825-
eFlag = std::max(eFlag, pDiskFlag);
826807
for (const auto& pr : VDiskStateInfo) {
827808
eFlag = std::max(eFlag, pr.second.GetDiskSpace()); // NOTE(review): this uncapped max already raises eFlag to the raw disk-space flag, so the Orange/Yellow capping checks that follow in this loop cannot change the result — confirm whether this line was meant to be removed
828-
eFlag = std::max(eFlag, pr.second.GetSatisfactionRank().GetFreshRank().GetFlag());
829-
eFlag = std::max(eFlag, pr.second.GetSatisfactionRank().GetLevelRank().GetFlag());
830-
}
831-
if (SystemStateInfo.HasMessageBusState()) {
832-
eFlag = std::max(eFlag, SystemStateInfo.GetMessageBusState());
809+
if (pr.second.GetDiskSpace() >= NKikimrWhiteboard::EFlag::Red) {
810+
eFlag = std::max(eFlag, NKikimrWhiteboard::EFlag::Orange);
811+
} else if (pr.second.GetDiskSpace() > NKikimrWhiteboard::EFlag::Green) {
812+
eFlag = std::max(eFlag, NKikimrWhiteboard::EFlag::Yellow);
813+
}
814+
if (pr.second.GetSatisfactionRank().GetFreshRank().GetFlag() > NKikimrWhiteboard::EFlag::Green) {
815+
eFlag = std::max(eFlag, NKikimrWhiteboard::EFlag::Yellow);
816+
}
817+
if (pr.second.GetSatisfactionRank().GetLevelRank().GetFlag() > NKikimrWhiteboard::EFlag::Green) {
818+
eFlag = std::max(eFlag, NKikimrWhiteboard::EFlag::Yellow);
819+
}
833820
}
834821
if (SystemStateInfo.HasGRpcState()) {
835-
eFlag = std::max(eFlag, SystemStateInfo.GetGRpcState());
822+
eFlag = std::max(eFlag, std::min(SystemStateInfo.GetGRpcState(), NKikimrWhiteboard::EFlag::Orange)); // cap the gRPC contribution at Orange; an inner std::max would instead force every node that reports a gRPC state (even Green) up to at least Orange
836823
}
837824
for (const auto& stats : SystemStateInfo.GetPoolStats()) {
838825
double usage = stats.GetUsage();
@@ -846,11 +833,18 @@ class TNodeWhiteboardService : public TActorBootstrapped<TNodeWhiteboardService>
846833
} else {
847834
flag = NKikimrWhiteboard::EFlag::Green;
848835
}
849-
eFlag = Max(eFlag, flag);
836+
if (stats.GetName() == "User") {
837+
flag = std::min(flag, NKikimrWhiteboard::EFlag::Orange);
838+
} else if (stats.GetName() == "IO") {
839+
flag = std::min(flag, NKikimrWhiteboard::EFlag::Yellow);
840+
} else if (stats.GetName() == "Batch") {
841+
flag = std::min(flag, NKikimrWhiteboard::EFlag::Green);
842+
}
843+
eFlag = std::max(eFlag, flag);
850844
}
851845
if (!SystemStateInfo.HasSystemState() || SystemStateInfo.GetSystemState() != eFlag) {
852846
SystemStateInfo.SetSystemState(eFlag);
853-
SystemStateInfo.SetChangeTime(ctx.Now().MilliSeconds());
847+
SystemStateInfo.SetChangeTime(TActivationContext::Now().MilliSeconds());
854848
}
855849
}
856850

@@ -1158,7 +1152,7 @@ class TNodeWhiteboardService : public TActorBootstrapped<TNodeWhiteboardService>
11581152
threadInfo->MutableStates()->emplace(state.first, state.second);
11591153
}
11601154
}
1161-
UpdateSystemState(ctx);
1155+
UpdateSystemState();
11621156
ctx.Schedule(UPDATE_PERIOD, new TEvPrivate::TEvUpdateRuntimeStats());
11631157
}
11641158

0 commit comments

Comments
 (0)