Skip to content

Commit a89530a

Browse files
committed
make nodes less critical (to make cluster less critical) (#19677)
1 parent 002721e commit a89530a

File tree

1 file changed

+32 -38 lines changed

1 file changed

+32 -38 lines changed

ydb/core/tablet/node_whiteboard.cpp

Lines changed: 32 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -772,72 +772,59 @@ class TNodeWhiteboardService : public TActorBootstrapped<TNodeWhiteboardService>
772772
}
773773
}
774774

775-
void UpdateSystemState(const TActorContext &ctx) {
775+
void UpdateSystemState() {
776776
NKikimrWhiteboard::EFlag eFlag = NKikimrWhiteboard::EFlag::Green;
777-
NKikimrWhiteboard::EFlag pDiskFlag = NKikimrWhiteboard::EFlag::Green;
778-
ui32 yellowFlags = 0;
777+
ui32 badDisks = 0;
779778
double maxDiskUsage = 0;
780779
for (const auto& pr : PDiskStateInfo) {
781-
if (!pr.second.HasState()) {
782-
pDiskFlag = std::max(pDiskFlag, NKikimrWhiteboard::EFlag::Yellow);
783-
++yellowFlags;
784-
} else {
780+
if (pr.second.HasState()) {
785781
switch (pr.second.GetState()) {
786782
case NKikimrBlobStorage::TPDiskState::InitialFormatReadError:
787783
case NKikimrBlobStorage::TPDiskState::InitialSysLogReadError:
788784
case NKikimrBlobStorage::TPDiskState::InitialSysLogParseError:
789785
case NKikimrBlobStorage::TPDiskState::InitialCommonLogReadError:
790786
case NKikimrBlobStorage::TPDiskState::InitialCommonLogParseError:
791787
case NKikimrBlobStorage::TPDiskState::CommonLoggerInitError:
792-
pDiskFlag = std::max(pDiskFlag, NKikimrWhiteboard::EFlag::Red);
793-
break;
794788
case NKikimrBlobStorage::TPDiskState::OpenFileError:
795-
pDiskFlag = std::max(pDiskFlag, NKikimrWhiteboard::EFlag::Yellow);
796-
++yellowFlags;
789+
eFlag = std::max(eFlag, NKikimrWhiteboard::EFlag::Yellow);
790+
++badDisks;
797791
break;
798792
default:
799793
break;
800794
}
801795
}
802796
if (pr.second.HasAvailableSize() && pr.second.GetTotalSize() != 0) {
803797
double avail = (double)pr.second.GetAvailableSize() / pr.second.GetTotalSize();
804-
if (avail <= 0.06) {
805-
pDiskFlag = std::max(pDiskFlag, NKikimrWhiteboard::EFlag::Red);
798+
if (avail <= 0.04) {
799+
eFlag = std::max(eFlag, NKikimrWhiteboard::EFlag::Orange);
806800
} else if (avail <= 0.08) {
807-
pDiskFlag = std::max(pDiskFlag, NKikimrWhiteboard::EFlag::Orange);
808-
} else if (avail <= 0.15) {
809-
pDiskFlag = std::max(pDiskFlag, NKikimrWhiteboard::EFlag::Yellow);
810-
++yellowFlags;
801+
eFlag = std::max(eFlag, NKikimrWhiteboard::EFlag::Yellow);
811802
}
812803
maxDiskUsage = std::max(maxDiskUsage, 1.0 - avail);
813804
}
814805
}
815806
if (PDiskStateInfo.size() > 0) {
816807
SystemStateInfo.SetMaxDiskUsage(maxDiskUsage);
817808
}
818-
if (pDiskFlag == NKikimrWhiteboard::EFlag::Yellow) {
819-
switch (yellowFlags) {
820-
case 1:
821-
break;
822-
case 2:
823-
pDiskFlag = NKikimrWhiteboard::EFlag::Orange;
824-
break;
825-
case 3:
826-
pDiskFlag = NKikimrWhiteboard::EFlag::Red;
827-
break;
828-
}
809+
if (eFlag == NKikimrWhiteboard::EFlag::Yellow && badDisks > 1) {
810+
eFlag = NKikimrWhiteboard::EFlag::Orange;
829811
}
830-
eFlag = std::max(eFlag, pDiskFlag);
831812
for (const auto& pr : VDiskStateInfo) {
832813
eFlag = std::max(eFlag, pr.second.GetDiskSpace());
833-
eFlag = std::max(eFlag, pr.second.GetSatisfactionRank().GetFreshRank().GetFlag());
834-
eFlag = std::max(eFlag, pr.second.GetSatisfactionRank().GetLevelRank().GetFlag());
835-
}
836-
if (SystemStateInfo.HasMessageBusState()) {
837-
eFlag = std::max(eFlag, SystemStateInfo.GetMessageBusState());
814+
if (pr.second.GetDiskSpace() >= NKikimrWhiteboard::EFlag::Red) {
815+
eFlag = std::max(eFlag, NKikimrWhiteboard::EFlag::Orange);
816+
} else if (pr.second.GetDiskSpace() > NKikimrWhiteboard::EFlag::Green) {
817+
eFlag = std::max(eFlag, NKikimrWhiteboard::EFlag::Yellow);
818+
}
819+
if (pr.second.GetSatisfactionRank().GetFreshRank().GetFlag() > NKikimrWhiteboard::EFlag::Green) {
820+
eFlag = std::max(eFlag, NKikimrWhiteboard::EFlag::Yellow);
821+
}
822+
if (pr.second.GetSatisfactionRank().GetLevelRank().GetFlag() > NKikimrWhiteboard::EFlag::Green) {
823+
eFlag = std::max(eFlag, NKikimrWhiteboard::EFlag::Yellow);
824+
}
838825
}
839826
if (SystemStateInfo.HasGRpcState()) {
840-
eFlag = std::max(eFlag, SystemStateInfo.GetGRpcState());
827+
eFlag = std::max(eFlag, std::max(SystemStateInfo.GetGRpcState(), NKikimrWhiteboard::EFlag::Orange));
841828
}
842829
for (const auto& stats : SystemStateInfo.GetPoolStats()) {
843830
double usage = stats.GetUsage();
@@ -851,11 +838,18 @@ class TNodeWhiteboardService : public TActorBootstrapped<TNodeWhiteboardService>
851838
} else {
852839
flag = NKikimrWhiteboard::EFlag::Green;
853840
}
854-
eFlag = Max(eFlag, flag);
841+
if (stats.GetName() == "User") {
842+
flag = std::min(flag, NKikimrWhiteboard::EFlag::Orange);
843+
} else if (stats.GetName() == "IO") {
844+
flag = std::min(flag, NKikimrWhiteboard::EFlag::Yellow);
845+
} else if (stats.GetName() == "Batch") {
846+
flag = std::min(flag, NKikimrWhiteboard::EFlag::Green);
847+
}
848+
eFlag = std::max(eFlag, flag);
855849
}
856850
if (!SystemStateInfo.HasSystemState() || SystemStateInfo.GetSystemState() != eFlag) {
857851
SystemStateInfo.SetSystemState(eFlag);
858-
SystemStateInfo.SetChangeTime(ctx.Now().MilliSeconds());
852+
SystemStateInfo.SetChangeTime(TActivationContext::Now().MilliSeconds());
859853
}
860854
}
861855

@@ -1163,7 +1157,7 @@ class TNodeWhiteboardService : public TActorBootstrapped<TNodeWhiteboardService>
11631157
threadInfo->MutableStates()->emplace(state.first, state.second);
11641158
}
11651159
}
1166-
UpdateSystemState(ctx);
1160+
UpdateSystemState();
11671161
ctx.Schedule(UPDATE_PERIOD, new TEvPrivate::TEvUpdateRuntimeStats());
11681162
}
11691163

0 commit comments

Comments
 (0)