Skip to content

Commit d45b64e

Browse files
committed
Fix not consistent generation counters for data erasure in SchemeShard tablet and BSC tablet (#19019)
1 parent e62bf24 commit d45b64e

File tree

5 files changed

+106
-16
lines changed

5 files changed

+106
-16
lines changed

ydb/core/tx/schemeshard/schemeshard__data_erasure_manager.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ class TDataErasureManager {
5454
virtual bool Remove(const TPathId& pathId) = 0;
5555
virtual bool Remove(const TShardIdx& shardIdx) = 0;
5656
virtual void HandleNewPartitioning(const std::vector<TShardIdx>& dataErasureShards, NIceDb::TNiceDb& db) = 0;
57+
virtual void SyncBscGeneration(NIceDb::TNiceDb& db, ui64 currentBscGeneration) = 0;
5758

5859
void Clear();
5960

@@ -128,6 +129,7 @@ using TQueue = NOperationQueue::TOperationQueueWithTimer<
128129
bool Remove(const TPathId& pathId) override;
129130
bool Remove(const TShardIdx& shardIdx) override;
130131
void HandleNewPartitioning(const std::vector<TShardIdx>& dataErasureShards, NIceDb::TNiceDb& db) override;
132+
void SyncBscGeneration(NIceDb::TNiceDb& db, ui64 currentBscGeneration) override;
131133

132134
private:
133135
static TQueue::TConfig ConvertConfig(const NKikimrConfig::TDataErasureConfig& config);
@@ -189,6 +191,7 @@ using TQueue = NOperationQueue::TOperationQueueWithTimer<
189191
bool Remove(const TPathId& pathId) override;
190192
bool Remove(const TShardIdx& shardIdx) override;
191193
void HandleNewPartitioning(const std::vector<TShardIdx>& dataErasureShards, NIceDb::TNiceDb& db) override;
194+
void SyncBscGeneration(NIceDb::TNiceDb& db, ui64 currentBscGeneration) override;
192195

193196
private:
194197
static TQueue::TConfig ConvertConfig(const NKikimrConfig::TDataErasureConfig& config);

ydb/core/tx/schemeshard/schemeshard__root_data_erasure_manager.cpp

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ void TRootDataErasureManager::Run(NIceDb::TNiceDb& db) {
131131
Status = EDataErasureStatus::IN_PROGRESS_BSC;
132132
}
133133
db.Table<Schema::DataErasureGenerations>().Key(Generation).Update<Schema::DataErasureGenerations::Status,
134-
Schema::DataErasureGenerations::StartTime>(Status, StartTime.MicroSeconds());
134+
Schema::DataErasureGenerations::StartTime>(Status, StartTime.MicroSeconds());
135135

136136
const auto ctx = SchemeShard->ActorContext();
137137
LOG_NOTICE_S(ctx, NKikimrServices::FLAT_TX_SCHEMESHARD,
@@ -481,6 +481,13 @@ void TRootDataErasureManager::HandleNewPartitioning(const std::vector<TShardIdx>
481481
LOG_WARN_S(ctx, NKikimrServices::FLAT_TX_SCHEMESHARD, "[RootDataErasureManager] [HandleNewPartitioning] Cannot execute in root schemeshard: " << SchemeShard->TabletID());
482482
}
483483

484+
void TRootDataErasureManager::SyncBscGeneration(NIceDb::TNiceDb& db, ui64 currentBscGeneration) {
485+
db.Table<Schema::DataErasureGenerations>().Key(GetGeneration()).Delete();
486+
SetGeneration(currentBscGeneration + 1);
487+
db.Table<Schema::DataErasureGenerations>().Key(GetGeneration()).Update<Schema::DataErasureGenerations::Status,
488+
Schema::DataErasureGenerations::StartTime>(GetStatus(), StartTime.MicroSeconds());
489+
}
490+
484491
void TRootDataErasureManager::UpdateMetrics() {
485492
SchemeShard->TabletCounters->Simple()[COUNTER_DATA_ERASURE_QUEUE_SIZE].Set(Queue->Size());
486493
SchemeShard->TabletCounters->Simple()[COUNTER_DATA_ERASURE_QUEUE_RUNNING].Set(Queue->RunningSize());
@@ -633,13 +640,15 @@ struct TSchemeShard::TTxCompleteDataErasureBSC : public TSchemeShard::TRwTxBase
633640

634641
const auto& record = Ev->Get()->Record;
635642
auto& manager = Self->DataErasureManager;
636-
if (record.GetCurrentGeneration() != manager->GetGeneration()) {
643+
NIceDb::TNiceDb db(txc.DB);
644+
if (ui64 currentBscGeneration = record.GetCurrentGeneration(); currentBscGeneration > manager->GetGeneration()) {
637645
LOG_DEBUG_S(ctx, NKikimrServices::FLAT_TX_SCHEMESHARD,
638-
"TTxCompleteDataErasureBSC Unknown generation#" << record.GetCurrentGeneration() << ", Expected gen# " << manager->GetGeneration() << " at schemestard: " << Self->TabletID());
646+
"TTxCompleteDataErasureBSC Unknown generation#" << currentBscGeneration << ", Expected gen# " << manager->GetGeneration() << " at schemestard: " << Self->TabletID());
647+
manager->SyncBscGeneration(db, currentBscGeneration);
648+
manager->SendRequestToBSC();
639649
return;
640650
}
641651

642-
NIceDb::TNiceDb db(txc.DB);
643652
if (record.GetCompleted()) {
644653
LOG_NOTICE_S(ctx, NKikimrServices::FLAT_TX_SCHEMESHARD, "TTxCompleteDataErasureBSC: Data shred in BSC is completed");
645654
manager->Complete();

ydb/core/tx/schemeshard/schemeshard__tenant_data_erasure_manager.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -445,6 +445,12 @@ void TTenantDataErasureManager::HandleNewPartitioning(const std::vector<TShardId
445445
<< ", Status# " << static_cast<ui32>(Status));
446446
}
447447

448+
void TTenantDataErasureManager::SyncBscGeneration(NIceDb::TNiceDb&, ui64) {
449+
auto ctx = SchemeShard->ActorContext();
450+
LOG_WARN_S(ctx, NKikimrServices::FLAT_TX_SCHEMESHARD,
451+
"[TenantDataErasureManager] [SyncBscGeneration] Cannot execute in tenant schemeshard: " << SchemeShard->TabletID());
452+
}
453+
448454
void TTenantDataErasureManager::UpdateMetrics() {
449455
SchemeShard->TabletCounters->Simple()[COUNTER_TENANT_DATA_ERASURE_QUEUE_SIZE].Set(Queue->Size());
450456
SchemeShard->TabletCounters->Simple()[COUNTER_TENANT_DATA_ERASURE_QUEUE_RUNNING].Set(Queue->RunningSize());

ydb/core/tx/schemeshard/ut_data_erasure/ut_data_erasure.cpp

Lines changed: 84 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ namespace {
7575
}
7676

7777
Y_UNIT_TEST_SUITE(TestDataErasure) {
78-
void SimpleDataErasureTest(const TSchemeObject& createSchemeObject) {
78+
void SimpleDataErasureTest(const TSchemeObject& createSchemeObject, ui64 currentBscGeneration = 0) {
7979
TTestBasicRuntime runtime;
8080
TTestEnv env(runtime);
8181

@@ -93,6 +93,11 @@ Y_UNIT_TEST_SUITE(TestDataErasure) {
9393
dataErasureConfig.SetBlobStorageControllerRequestIntervalSeconds(1);
9494

9595
auto sender = runtime.AllocateEdgeActor();
96+
// Change BSC counter value between data erasure iterations
97+
if (currentBscGeneration > 1) {
98+
auto request = MakeHolder<TEvBlobStorage::TEvControllerShredRequest>(currentBscGeneration);
99+
runtime.SendToPipe(MakeBSControllerID(), sender, request.Release(), 0, GetPipeConfigWithRetries());
100+
}
96101
RebootTablet(runtime, TTestTxConfig::SchemeShard, sender);
97102

98103
ui64 txId = 100;
@@ -101,7 +106,7 @@ Y_UNIT_TEST_SUITE(TestDataErasure) {
101106
CreateTestExtSubdomain(runtime, env, &txId, "Database2", createSchemeObject);
102107

103108
TDispatchOptions options;
104-
options.FinalEvents.push_back(TDispatchOptions::TFinalEventCondition(TEvBlobStorage::EvControllerShredResponse, 3));
109+
options.FinalEvents.push_back(TDispatchOptions::TFinalEventCondition(TEvBlobStorage::EvControllerShredResponse, (currentBscGeneration > 1 ? 4 : 3)));
105110
runtime.DispatchEvents(options);
106111

107112
auto request = MakeHolder<TEvSchemeShard::TEvDataErasureInfoRequest>();
@@ -110,22 +115,30 @@ Y_UNIT_TEST_SUITE(TestDataErasure) {
110115
TAutoPtr<IEventHandle> handle;
111116
auto response = runtime.GrabEdgeEventRethrow<TEvSchemeShard::TEvDataErasureInfoResponse>(handle);
112117

113-
UNIT_ASSERT_EQUAL_C(response->Record.GetGeneration(), 1, response->Record.GetGeneration());
118+
if (currentBscGeneration > 1) {
119+
UNIT_ASSERT_EQUAL_C(response->Record.GetGeneration(), currentBscGeneration + 1, response->Record.GetGeneration());
120+
} else {
121+
UNIT_ASSERT_EQUAL_C(response->Record.GetGeneration(), 1, response->Record.GetGeneration());
122+
}
114123
UNIT_ASSERT_EQUAL(response->Record.GetStatus(), NKikimrScheme::TEvDataErasureInfoResponse::COMPLETED);
115124
}
116125

117-
Y_UNIT_TEST(SimpleDataErasureTestForTables) {
126+
Y_UNIT_TEST(SimpleTestForTables) {
118127
SimpleDataErasureTest({.Table = true, .Topic = false});
119128
}
120129

121-
Y_UNIT_TEST(SimpleDataErasureTestForTopic) {
130+
Y_UNIT_TEST(SimpleTestForTopic) {
122131
SimpleDataErasureTest({.Table = false, .Topic = true});
123132
}
124133

125-
Y_UNIT_TEST(SimpleDataErasureTestForAllSupportedObjects) {
134+
Y_UNIT_TEST(SimpleTestForAllSupportedObjects) {
126135
SimpleDataErasureTest({.Table = true, .Topic = true});
127136
}
128137

138+
Y_UNIT_TEST(SchemeShardCounterDoesNotConsistWithBscCounter) {
139+
SimpleDataErasureTest({.Table = true, .Topic = false}, /*currentBscGeneration*/ 47);
140+
}
141+
129142
void DataErasureRun3Cycles(const TSchemeObject& createSchemeObject) {
130143
TTestBasicRuntime runtime;
131144
TTestEnv env(runtime);
@@ -165,15 +178,15 @@ Y_UNIT_TEST_SUITE(TestDataErasure) {
165178
UNIT_ASSERT_EQUAL(response->Record.GetStatus(), NKikimrScheme::TEvDataErasureInfoResponse::COMPLETED);
166179
}
167180

168-
Y_UNIT_TEST(DataErasureRun3CyclesForTables) {
181+
Y_UNIT_TEST(Run3CyclesForTables) {
169182
DataErasureRun3Cycles({.Table = true, .Topic = false});
170183
}
171184

172-
Y_UNIT_TEST(DataErasureRun3CyclesForTopics) {
185+
Y_UNIT_TEST(Run3CyclesForTopics) {
173186
DataErasureRun3Cycles({.Table = false, .Topic = true});
174187
}
175188

176-
Y_UNIT_TEST(DataErasureRun3CyclesForAllSupportedObjects) {
189+
Y_UNIT_TEST(Run3CyclesForAllSupportedObjects) {
177190
DataErasureRun3Cycles({.Table = true, .Topic = true});
178191
}
179192

@@ -221,7 +234,7 @@ Y_UNIT_TEST_SUITE(TestDataErasure) {
221234
UNIT_ASSERT_EQUAL(response->Record.GetStatus(), NKikimrScheme::TEvDataErasureInfoResponse::COMPLETED);
222235
}
223236

224-
Y_UNIT_TEST(DataErasureManualLaunch3Cycles) {
237+
Y_UNIT_TEST(ManualLaunch3Cycles) {
225238
TTestBasicRuntime runtime;
226239
TTestEnv env(runtime);
227240

@@ -272,6 +285,67 @@ Y_UNIT_TEST_SUITE(TestDataErasure) {
272285
RunDataErasure(3);
273286
}
274287

288+
Y_UNIT_TEST(ManualLaunch3CyclesWithNotConsistentCountersInSchemeShardAndBSC) {
289+
TTestBasicRuntime runtime;
290+
TTestEnv env(runtime);
291+
292+
runtime.SetLogPriority(NKikimrServices::TX_PROXY, NLog::PRI_DEBUG);
293+
runtime.SetLogPriority(NKikimrServices::FLAT_TX_SCHEMESHARD, NActors::NLog::PRI_TRACE);
294+
295+
auto info = CreateTestTabletInfo(MakeBSControllerID(), TTabletTypes::BSController);
296+
CreateTestBootstrapper(runtime, info, [](const TActorId &tablet, TTabletStorageInfo *info) -> IActor* {
297+
return new TFakeBSController(tablet, info);
298+
});
299+
300+
runtime.GetAppData().FeatureFlags.SetEnableDataErasure(true);
301+
auto& dataErasureConfig = runtime.GetAppData().DataErasureConfig;
302+
dataErasureConfig.SetDataErasureIntervalSeconds(0); // do not schedule
303+
dataErasureConfig.SetBlobStorageControllerRequestIntervalSeconds(1);
304+
305+
auto sender = runtime.AllocateEdgeActor();
306+
RebootTablet(runtime, TTestTxConfig::SchemeShard, sender);
307+
308+
ui64 txId = 100;
309+
310+
CreateTestExtSubdomain(runtime, env, &txId, "Database1");
311+
CreateTestExtSubdomain(runtime, env, &txId, "Database2");
312+
313+
auto RunDataErasure = [&runtime] (ui32 expectedGeneration, ui32 requiredCountShredResponses) {
314+
auto sender = runtime.AllocateEdgeActor();
315+
{
316+
auto request = MakeHolder<TEvSchemeShard::TEvDataErasureManualStartupRequest>();
317+
runtime.SendToPipe(TTestTxConfig::SchemeShard, sender, request.Release(), 0, GetPipeConfigWithRetries());
318+
}
319+
320+
TDispatchOptions options;
321+
options.FinalEvents.push_back(TDispatchOptions::TFinalEventCondition(TEvBlobStorage::EvControllerShredResponse, requiredCountShredResponses));
322+
runtime.DispatchEvents(options);
323+
324+
auto request = MakeHolder<TEvSchemeShard::TEvDataErasureInfoRequest>();
325+
runtime.SendToPipe(TTestTxConfig::SchemeShard, sender, request.Release(), 0, GetPipeConfigWithRetries());
326+
327+
TAutoPtr<IEventHandle> handle;
328+
auto response = runtime.GrabEdgeEventRethrow<TEvSchemeShard::TEvDataErasureInfoResponse>(handle);
329+
330+
UNIT_ASSERT_EQUAL_C(response->Record.GetGeneration(), expectedGeneration, response->Record.GetGeneration());
331+
UNIT_ASSERT_EQUAL(response->Record.GetStatus(), NKikimrScheme::TEvDataErasureInfoResponse::COMPLETED);
332+
};
333+
334+
RunDataErasure(1, 3);
335+
// Change BSC counter value between data erasure iterations
336+
{
337+
auto request = MakeHolder<TEvBlobStorage::TEvControllerShredRequest>(50);
338+
runtime.SendToPipe(MakeBSControllerID(), sender, request.Release(), 0, GetPipeConfigWithRetries());
339+
}
340+
RunDataErasure(51, 4);
341+
// Change BSC counter value between data erasure iterations
342+
{
343+
auto request = MakeHolder<TEvBlobStorage::TEvControllerShredRequest>(100);
344+
runtime.SendToPipe(MakeBSControllerID(), sender, request.Release(), 0, GetPipeConfigWithRetries());
345+
}
346+
RunDataErasure(101, 4);
347+
}
348+
275349
Y_UNIT_TEST(DataErasureWithCopyTable) {
276350
TTestBasicRuntime runtime;
277351
TVector<TIntrusivePtr<NFake::TProxyDS>> dsProxies {

ydb/core/tx/schemeshard/ut_data_erasure/ya.make

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,6 @@ FORK_SUBTESTS()
44

55
SPLIT_FACTOR(10)
66

7-
TIMEOUT(20)
8-
97
IF (SANITIZER_TYPE == "thread" OR WITH_VALGRIND)
108
SIZE(LARGE)
119
TAG(ya:fat)

0 commit comments

Comments
 (0)