Skip to content

Commit 06b19de

Browse files
committed
Merge branch 'stable-25-1' into TR-25-1-2
2 parents 7d6b8d4 + 3750f48 commit 06b19de

File tree

163 files changed

+3181
-1176
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

163 files changed

+3181
-1176
lines changed

.github/config/muted_ya.txt

Lines changed: 23 additions & 102 deletions
Large diffs are not rendered by default.

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,16 @@
33
### Functionality
44

55
* 17114:Improved audit logging for user management operations. The audit logs now include details about user modification actions such as password changes, user blocking, and unblocking, making it easier to troubleshoot login issues. [#17114](https://github.com/ydb-platform/ydb/pull/17114) ([flown4qqqq](https://github.com/flown4qqqq))
6+
* 18352:Added database audit logs in console's tablet. [#18352](https://github.com/ydb-platform/ydb/pull/18352) ([flown4qqqq](https://github.com/flown4qqqq))
7+
* 18298:Limited the creation of ReassignerActor to only one active instance to prevent [SelfHeal](https://ydb.tech/docs/ru/maintenance/manual/selfheal) from overloading BSC. [#18298](https://github.com/ydb-platform/ydb/pull/18298) ([Sergey Belyakov](https://github.com/serbel324))
8+
* 18294:Changed version format from Year.Major.Minor.Hotfix to Year.Major.Minor.Patch.Hotfix [#18294](https://github.com/ydb-platform/ydb/pull/18294) ([Sergey Belyakov](https://github.com/serbel324))
69

710
### Bug fixes
811

912
* 17313:Fixed CopyTable operation to allow copying tables with all column types present in the source table, regardless of feature flag settings. This resolves an issue where copying tables with certain decimal types would fail after version downgrades. [#17313](https://github.com/ydb-platform/ydb/pull/17313) ([azevaykin](https://github.com/azevaykin))
1013
* 17122:Fixed a rare issue that caused client applications to hang during commit operations. The problem occurred because the `TEvDeletePartition` message could arrive before the `TEvApproveWriteQuota` message. The batch did not send TEvConsumed and this blocked the queue of write quota requests. [#17122](https://github.com/ydb-platform/ydb/pull/17122) ([Alek5andr-Kotov](https://github.com/Alek5andr-Kotov))
14+
* 18362:Table auto partitioning: Fixed crash when selecting split key from access samples containing a mix of full key and key prefix operations (e.g. exact/range reads). [#18362](https://github.com/ydb-platform/ydb/pull/18362) ([ijon](https://github.com/ijon))
15+
* 18301:Optimized memory usage in transactions with a large number of participants by changing the storage and resending mechanism for TEvReadSet messages. [#18301](https://github.com/ydb-platform/ydb/pull/18301) ([Alek5andr-Kotov](https://github.com/Alek5andr-Kotov))
16+
* 18296:Fixed replication continuing to consume disk space when storage was low, which caused VDisks to become read-only. [#18296](https://github.com/ydb-platform/ydb/pull/18296) ([Sergey Belyakov](https://github.com/serbel324))
17+
* 18271:Fix replication bug #10650 [#18271](https://github.com/ydb-platform/ydb/pull/18271) ([Alexander Rutkovsky](https://github.com/alexvru))
18+
* 18231:Fix segfault that could happen while retrying Whiteboard requests. [#18231](https://github.com/ydb-platform/ydb/pull/18231) ([Andrei Rykov](https://github.com/StekPerepolnen))

ydb/core/client/server/msgbus_server_console.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,10 @@ class TConsoleRequestActor : public TMessageBusSecureRequest<TMessageBusServerRe
6060

6161
SendRequest(ctx);
6262
TBase::Become(&TConsoleRequestActor::MainState);
63+
64+
if (const auto timeout = TDuration::MilliSeconds(Request.GetTimeoutMs())) {
65+
ctx.Schedule(timeout, new TEvents::TEvWakeup());
66+
}
6367
}
6468

6569
void SendRequest(const TActorContext &ctx)
@@ -328,6 +332,10 @@ class TConsoleRequestActor : public TMessageBusSecureRequest<TMessageBusServerRe
328332
SendReplyAndDie(ctx);
329333
}
330334

335+
// Timeout handler: calls ReplyWithErrorAndDie with Ydb::StatusIds::TIMEOUT and
// the message "Console request timed out".
// MainState dispatches TEvents::TEvWakeup to this method; that wakeup is
// scheduled right after SendRequest, and only when Request.GetTimeoutMs()
// converts to a non-zero TDuration.
void HandleTimeout(const TActorContext &ctx) {
336+
ReplyWithErrorAndDie(Ydb::StatusIds::TIMEOUT, "Console request timed out", ctx);
337+
}
338+
331339
STFUNC(MainState) {
332340
switch (ev->GetTypeRewrite()) {
333341
CFunc(TEvents::TEvUndelivered::EventType, Undelivered);
@@ -347,6 +355,7 @@ class TConsoleRequestActor : public TMessageBusSecureRequest<TMessageBusServerRe
347355
HFunc(TEvConsole::TEvToggleConfigValidatorResponse, Handle);
348356
CFunc(TEvTabletPipe::EvClientDestroyed, Undelivered);
349357
HFunc(TEvTabletPipe::TEvClientConnected, Handle);
358+
SFunc(TEvents::TEvWakeup, HandleTimeout);
350359
default:
351360
Y_ABORT("TConsoleRequestActor::MainState unexpected event type: %" PRIx32 " event: %s",
352361
ev->GetTypeRewrite(),

ydb/core/health_check/health_check.cpp

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1417,10 +1417,13 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
14171417
FilterDomainKey[TSubDomainKey(domainInfo->DomainKey.OwnerId, domainInfo->DomainKey.LocalPathId)] = path;
14181418

14191419
TTabletId hiveId = domainInfo->Params.GetHive();
1420-
if (hiveId && NeedToAskHive(hiveId)) {
1420+
if (hiveId) {
14211421
DatabaseState[path].HiveId = hiveId;
1422-
AskHive(path, hiveId);
1422+
if (NeedToAskHive(hiveId)) {
1423+
AskHive(path, hiveId);
1424+
}
14231425
} else if (RootHiveId && NeedToAskHive(RootHiveId)) {
1426+
DatabaseState[DomainPath].HiveId = RootHiveId;
14241427
AskHive(DomainPath, RootHiveId);
14251428
}
14261429

@@ -1515,23 +1518,31 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
15151518

15161519
void AggregateHiveInfo() {
15171520
TNodeTabletState::TTabletStateSettings settings;
1518-
for (const auto& [hiveId, hiveResponse] : HiveInfo) {
1521+
for (auto& [dbPath, dbState] : DatabaseState) {
1522+
const auto& hiveResponse = HiveInfo[dbState.HiveId];
15191523
if (hiveResponse.IsOk()) {
15201524
settings.AliveBarrier = TInstant::MilliSeconds(hiveResponse->Record.GetResponseTimestamp()) - TDuration::Minutes(5);
15211525
settings.MaxRestartsPerPeriod = HealthCheckConfig.GetThresholds().GetTabletsRestartsOrange();
15221526
for (const NKikimrHive::TTabletInfo& hiveTablet : hiveResponse->Record.GetTablets()) {
15231527
TSubDomainKey tenantId = TSubDomainKey(hiveTablet.GetObjectDomain());
15241528
auto itDomain = FilterDomainKey.find(tenantId);
1529+
TDatabaseState* database = nullptr;
15251530
if (itDomain == FilterDomainKey.end()) {
1526-
continue;
1527-
}
1528-
auto itDatabase = DatabaseState.find(itDomain->second);
1529-
if (itDatabase == DatabaseState.end()) {
1530-
continue;
1531+
if (!FilterDatabase || FilterDatabase == dbPath) {
1532+
database = &dbState;
1533+
} else {
1534+
continue;
1535+
}
1536+
} else {
1537+
auto itDatabase = DatabaseState.find(itDomain->second);
1538+
if (itDatabase != DatabaseState.end()) {
1539+
database = &itDatabase->second;
1540+
} else {
1541+
continue;
1542+
}
15311543
}
1532-
TDatabaseState& database = itDatabase->second;
15331544
auto tabletId = std::make_pair(hiveTablet.GetTabletID(), hiveTablet.GetFollowerID());
1534-
database.MergedTabletState.emplace(tabletId, &hiveTablet);
1545+
database->MergedTabletState.emplace(tabletId, &hiveTablet);
15351546
TNodeId nodeId = hiveTablet.GetNodeID();
15361547
switch (hiveTablet.GetVolatileState()) {
15371548
case NKikimrHive::ETabletVolatileState::TABLET_VOLATILE_STATE_STARTING:
@@ -1541,7 +1552,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
15411552
nodeId = 0;
15421553
break;
15431554
}
1544-
database.MergedNodeTabletState[nodeId].AddTablet(hiveTablet, settings);
1555+
database->MergedNodeTabletState[nodeId].AddTablet(hiveTablet, settings);
15451556
}
15461557
}
15471558
}

ydb/core/health_check/health_check_ut.cpp

Lines changed: 117 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -446,6 +446,15 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
446446
UNIT_ASSERT_VALUES_EQUAL(issueVdiscCount, issueVdiscNumber);
447447
}
448448

449+
// Returns true when the self-check result's issue log contains at least one
// entry whose level() is 4 and whose type() is "TABLET"; returns false
// otherwise (including when the issue log is empty).
bool HasTabletIssue(const Ydb::Monitoring::SelfCheckResult& result) {
450+
for (const auto& issue_log : result.issue_log()) {
451+
if (issue_log.level() == 4 && issue_log.type() == "TABLET") {
452+
return true;
453+
}
454+
}
455+
return false;
456+
}
457+
449458
void ListingTest(int const groupNumber, int const vdiscPerGroupNumber, bool const isMergeRecords = false) {
450459
auto result = RequestHc(groupNumber, vdiscPerGroupNumber, isMergeRecords);
451460
CheckHcResult(result, groupNumber, vdiscPerGroupNumber, isMergeRecords);
@@ -865,6 +874,15 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
865874
}
866875
}
867876

877+
// Test helper: appends one extra tablet entry to the Hive info response held
// by *ev. The entry gets tablet id 1, an object domain built from
// SERVERLESS_DOMAIN_KEY (OwnerId -> SchemeShard, LocalPathId -> PathId) and
// RestartsPerPeriod set to 500.
// NOTE(review): 500 is presumably chosen to exceed the health check's restart
// threshold so that a TABLET issue is reported — confirm against the config.
void AddBadServerlessTablet(TEvHive::TEvResponseHiveInfo::TPtr* ev) {
878+
auto &record = (*ev)->Get()->Record;
879+
auto* tablet = record.MutableTablets()->Add();
880+
tablet->SetTabletID(1);
881+
tablet->MutableObjectDomain()->SetSchemeShard(SERVERLESS_DOMAIN_KEY.OwnerId);
882+
tablet->MutableObjectDomain()->SetPathId(SERVERLESS_DOMAIN_KEY.LocalPathId);
883+
tablet->SetRestartsPerPeriod(500);
884+
}
885+
868886
Y_UNIT_TEST(SpecificServerless) {
869887
TPortManager tp;
870888
ui16 port = tp.GetPort(2134);
@@ -1164,6 +1182,102 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
11641182
UNIT_ASSERT(!databaseFoundInResult);
11651183
}
11661184

1185+
Y_UNIT_TEST(ServerlessBadTablets) {
1186+
TPortManager tp;
1187+
ui16 port = tp.GetPort(2134);
1188+
ui16 grpcPort = tp.GetPort(2135);
1189+
auto settings = TServerSettings(port)
1190+
.SetNodeCount(1)
1191+
.SetDynamicNodeCount(1)
1192+
.SetUseRealThreads(false)
1193+
.SetDomainName("Root");
1194+
TServer server(settings);
1195+
server.EnableGRpc(grpcPort);
1196+
TClient client(settings);
1197+
TTestActorRuntime& runtime = *server.GetRuntime();
1198+
1199+
auto &dynamicNameserviceConfig = runtime.GetAppData().DynamicNameserviceConfig;
1200+
dynamicNameserviceConfig->MaxStaticNodeId = runtime.GetNodeId(server.StaticNodes() - 1);
1201+
dynamicNameserviceConfig->MinDynamicNodeId = runtime.GetNodeId(server.StaticNodes());
1202+
dynamicNameserviceConfig->MaxDynamicNodeId = runtime.GetNodeId(server.StaticNodes() + server.DynamicNodes() - 1);
1203+
1204+
ui32 sharedDynNodeId = runtime.GetNodeId(1);
1205+
1206+
bool firstConsoleResponse = true;
1207+
auto observerFunc = [&](TAutoPtr<IEventHandle>& ev) {
1208+
switch (ev->GetTypeRewrite()) {
1209+
case NConsole::TEvConsole::EvListTenantsResponse: {
1210+
auto *x = reinterpret_cast<NConsole::TEvConsole::TEvListTenantsResponse::TPtr*>(&ev);
1211+
AddPathsToListTenantsResponse(x, { "/Root/serverless", "/Root/shared" });
1212+
break;
1213+
}
1214+
case NConsole::TEvConsole::EvGetTenantStatusResponse: {
1215+
auto *x = reinterpret_cast<NConsole::TEvConsole::TEvGetTenantStatusResponse::TPtr*>(&ev);
1216+
if (!firstConsoleResponse) {
1217+
ChangeGetTenantStatusResponse(x, "/Root/serverless");
1218+
} else {
1219+
firstConsoleResponse = false;
1220+
ChangeGetTenantStatusResponse(x, "/Root/shared");
1221+
}
1222+
break;
1223+
}
1224+
case TEvTxProxySchemeCache::EvNavigateKeySetResult: {
1225+
auto *x = reinterpret_cast<TEvTxProxySchemeCache::TEvNavigateKeySetResult::TPtr*>(&ev);
1226+
ChangeNavigateKeyResultServerless(x, NKikimrSubDomains::EServerlessComputeResourcesModeShared, runtime);
1227+
break;
1228+
}
1229+
case TEvHive::EvResponseHiveNodeStats: {
1230+
auto *x = reinterpret_cast<TEvHive::TEvResponseHiveNodeStats::TPtr*>(&ev);
1231+
ChangeResponseHiveNodeStats(x, sharedDynNodeId);
1232+
break;
1233+
}
1234+
case TEvHive::EvResponseHiveInfo: {
1235+
auto *x = reinterpret_cast<TEvHive::TEvResponseHiveInfo::TPtr*>(&ev);
1236+
AddBadServerlessTablet(x);
1237+
break;
1238+
}
1239+
case TEvSchemeShard::EvDescribeSchemeResult: {
1240+
auto *x = reinterpret_cast<NSchemeShard::TEvSchemeShard::TEvDescribeSchemeResult::TPtr*>(&ev);
1241+
ChangeDescribeSchemeResultServerless(x);
1242+
break;
1243+
}
1244+
case TEvBlobStorage::EvControllerConfigResponse: {
1245+
auto *x = reinterpret_cast<TEvBlobStorage::TEvControllerConfigResponse::TPtr*>(&ev);
1246+
AddGroupVSlotInControllerConfigResponseWithStaticGroup(x, NKikimrBlobStorage::TGroupStatus::FULL, TVDisks(1));
1247+
break;
1248+
}
1249+
case NSysView::TEvSysView::EvGetVSlotsResponse: {
1250+
auto* x = reinterpret_cast<NSysView::TEvSysView::TEvGetVSlotsResponse::TPtr*>(&ev);
1251+
AddVSlotsToSysViewResponse(x, 1, TVDisks(1));
1252+
break;
1253+
}
1254+
case NSysView::TEvSysView::EvGetGroupsResponse: {
1255+
auto* x = reinterpret_cast<NSysView::TEvSysView::TEvGetGroupsResponse::TPtr*>(&ev);
1256+
AddGroupsToSysViewResponse(x);
1257+
break;
1258+
}
1259+
case NSysView::TEvSysView::EvGetStoragePoolsResponse: {
1260+
auto* x = reinterpret_cast<NSysView::TEvSysView::TEvGetStoragePoolsResponse::TPtr*>(&ev);
1261+
AddStoragePoolsToSysViewResponse(x);
1262+
break;
1263+
}
1264+
}
1265+
1266+
return TTestActorRuntime::EEventAction::PROCESS;
1267+
};
1268+
runtime.SetObserverFunc(observerFunc);
1269+
1270+
TActorId sender = runtime.AllocateEdgeActor();
1271+
TAutoPtr<IEventHandle> handle;
1272+
1273+
auto *request = new NHealthCheck::TEvSelfCheckRequest;
1274+
request->Request.set_return_verbose_status(true);
1275+
runtime.Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, request, 0));
1276+
const auto result = runtime.GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
1277+
Ctest << result.ShortDebugString();
1278+
UNIT_ASSERT(HasTabletIssue(result));
1279+
}
1280+
11671281
Y_UNIT_TEST(DontIgnoreServerlessWithExclusiveNodesWhenNotSpecific) {
11681282
TPortManager tp;
11691283
ui16 port = tp.GetPort(2134);
@@ -1859,15 +1973,6 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
18591973
ShardsQuotaTest(105, 0, 0, Ydb::Monitoring::StatusFlag::GREEN);
18601974
}
18611975

1862-
bool HasDeadTabletIssue(const Ydb::Monitoring::SelfCheckResult& result) {
1863-
for (const auto& issue_log : result.issue_log()) {
1864-
if (issue_log.level() == 4 && issue_log.type() == "TABLET") {
1865-
return true;
1866-
}
1867-
}
1868-
return false;
1869-
}
1870-
18711976
Y_UNIT_TEST(TestTabletIsDead) {
18721977
TPortManager tp;
18731978
ui16 port = tp.GetPort(2134);
@@ -1895,7 +2000,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
18952000
auto result = runtime->GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
18962001
Cerr << result.ShortDebugString();
18972002

1898-
UNIT_ASSERT(HasDeadTabletIssue(result));
2003+
UNIT_ASSERT(HasTabletIssue(result));
18992004
}
19002005

19012006
Y_UNIT_TEST(TestBootingTabletIsNotDead) {
@@ -1926,7 +2031,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
19262031
auto result = runtime->GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
19272032
Cerr << result.ShortDebugString();
19282033

1929-
UNIT_ASSERT(!HasDeadTabletIssue(result));
2034+
UNIT_ASSERT(!HasTabletIssue(result));
19302035
}
19312036

19322037
Y_UNIT_TEST(TestReBootingTabletIsDead) {
@@ -1960,7 +2065,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
19602065
auto result = runtime->GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
19612066
Cerr << result.ShortDebugString();
19622067

1963-
UNIT_ASSERT(HasDeadTabletIssue(result));
2068+
UNIT_ASSERT(HasTabletIssue(result));
19642069
}
19652070

19662071
void SendHealthCheckConfigUpdate(TTestActorRuntime &runtime, const TActorId& sender, const NKikimrConfig::THealthCheckConfig &cfg) {

ydb/core/kafka_proxy/kafka_connection.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ class TKafkaConnection: public TActorBootstrapped<TKafkaConnection>, public TNet
103103

104104
void Bootstrap() {
105105
Context->ConnectionId = SelfId();
106-
Context->RequireAuthentication = NKikimr::AppData()->EnforceUserTokenRequirement;
106+
Context->RequireAuthentication = NKikimr::AppData()->EnforceUserTokenRequirement || NKikimr::AppData()->PQConfig.GetRequireCredentialsInNewProtocol();
107107
// if no authentication required, then we can use local database as our target
108108
if (!Context->RequireAuthentication) {
109109
Context->DatabasePath = NKikimr::AppData()->TenantName;
@@ -478,7 +478,7 @@ class TKafkaConnection: public TActorBootstrapped<TKafkaConnection>, public TNet
478478
return;
479479
}
480480

481-
Context->RequireAuthentication = NKikimr::AppData()->EnforceUserTokenRequirement;
481+
Context->RequireAuthentication = NKikimr::AppData()->EnforceUserTokenRequirement || NKikimr::AppData()->PQConfig.GetRequireCredentialsInNewProtocol();
482482
Context->UserToken = event->UserToken;
483483
Context->DatabasePath = event->DatabasePath;
484484
Context->AuthenticationStep = authStep;

ydb/core/mind/hive/boot_queue.cpp

Lines changed: 41 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,20 +17,54 @@ void TBootQueue::AddToBootQueue(TBootQueueRecord record) {
1717
}
1818

1919
TBootQueue::TBootQueueRecord TBootQueue::PopFromBootQueue() {
20-
TBootQueueRecord record = BootQueue.top();
21-
BootQueue.pop();
20+
TQueue& currentQueue = GetCurrentQueue();
21+
TBootQueueRecord record = currentQueue.top();
22+
currentQueue.pop();
23+
if (ProcessWaitQueue) {
24+
NextFromWaitQueue = !NextFromWaitQueue;
25+
}
2226
return record;
2327
}
2428

2529
void TBootQueue::AddToWaitQueue(TBootQueueRecord record) {
26-
WaitQueue.emplace_back(record);
30+
WaitQueue.push(record);
31+
}
32+
33+
// Turns on processing of the wait queue by setting ProcessWaitQueue.
// While the flag is set, Empty() and Size() account for WaitQueue as well as
// BootQueue, and PopFromBootQueue() flips NextFromWaitQueue after each pop.
void TBootQueue::IncludeWaitQueue() {
34+
ProcessWaitQueue = true;
2735
}
2836

29-
void TBootQueue::MoveFromWaitQueueToBootQueue() {
30-
for (TBootQueueRecord record : WaitQueue) {
31-
AddToBootQueue(record);
37+
// Turns off processing of the wait queue by clearing ProcessWaitQueue.
// While the flag is clear, Empty() and Size() report on BootQueue only.
// Records already stored in WaitQueue are not touched by this call.
void TBootQueue::ExcludeWaitQueue() {
38+
ProcessWaitQueue = false;
39+
}
40+
41+
// Reports whether there is nothing to pop under the current mode:
// with ProcessWaitQueue set, both BootQueue and WaitQueue must be empty;
// otherwise only BootQueue is considered and WaitQueue contents are ignored.
bool TBootQueue::Empty() const {
42+
if (ProcessWaitQueue) {
43+
return BootQueue.empty() && WaitQueue.empty();
44+
} else {
45+
return BootQueue.empty();
46+
}
47+
}
48+
49+
// Returns the number of records visible under the current mode:
// BootQueue.size() + WaitQueue.size() when ProcessWaitQueue is set,
// BootQueue.size() alone otherwise. Mirrors the rule used by Empty().
size_t TBootQueue::Size() const {
50+
if (ProcessWaitQueue) {
51+
return BootQueue.size() + WaitQueue.size();
52+
} else {
53+
return BootQueue.size();
54+
}
55+
}
56+
57+
TBootQueue::TQueue& TBootQueue::GetCurrentQueue() {
58+
if (BootQueue.empty()) {
59+
return WaitQueue;
60+
}
61+
if (WaitQueue.empty()) {
62+
return BootQueue;
63+
}
64+
if (ProcessWaitQueue && NextFromWaitQueue) {
65+
return WaitQueue;
3266
}
33-
WaitQueue.clear();
67+
return BootQueue;
3468
}
3569

3670
}

0 commit comments

Comments
 (0)