Skip to content

Commit d9167c4

Browse files
authored
dstool cluster balance add --storage-pool, --max-donors-per-pdisk and --sort-by={slots/space_ratio} (#10406)
1 parent f1035af commit d9167c4

File tree

2 files changed

+83
-11
lines changed

2 files changed

+83
-11
lines changed

ydb/apps/dstool/lib/common.py

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -684,6 +684,15 @@ def build_pdisk_map(base_config):
684684
return pdisk_map
685685

686686

687+
def build_donors_per_pdisk_map(base_config):
    """Count donor slots per PDisk.

    Walks every entry of ``base_config.VSlot`` and, for each item of its
    ``Donors`` list, increments the counter stored under
    ``get_pdisk_id(donor.VSlotId)``.

    Returns a ``defaultdict(int)``, so looking up a PDisk that hosts no
    donors yields 0 instead of raising ``KeyError``.
    """
    donor_counts = defaultdict(int)
    # Flatten the VSlot -> Donors nesting into a single stream of donor slot ids.
    donor_slot_ids = (
        donor.VSlotId
        for vslot in base_config.VSlot
        for donor in vslot.Donors
    )
    for slot_id in donor_slot_ids:
        donor_counts[get_pdisk_id(slot_id)] += 1
    return donor_counts
694+
695+
687696
def build_pdisk_static_slots_map(base_config):
688697
pdisk_static_slots_map = {
689698
get_pdisk_id(pdisk): pdisk.NumStaticSlots

ydb/apps/dstool/lib/dstool_cmd_cluster_balance.py

Lines changed: 74 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -10,20 +10,53 @@
1010
def add_options(p):
    """Register this command's command-line options on the parser ``p``.

    The command-specific flags are added first, in the order listed below,
    followed by the shared options from ``common.add_basic_format_options``.
    """
    # Each entry is (flag, keyword arguments passed to p.add_argument).
    option_specs = (
        ('--max-replicating-pdisks', dict(type=int, help='Limit number of maximum replicating PDisks in the cluster')),
        ('--only-from-overpopulated-pdisks', dict(action='store_true', help='Move vdisks out only from pdisks with over expected slot count')),
        ('--sort-by', dict(choices=['slots', 'space_ratio'], default='slots', help='First to reassign disks with the most slots or with the highest space ratio')),
        ('--storage-pool', dict(type=str, help='Storage pool to balance')),
        ('--max-donors-per-pdisk', dict(type=int, default=0, help='Limit number of donors per pdisk')),
    )
    for flag, kwargs in option_specs:
        p.add_argument(flag, **kwargs)
    common.add_basic_format_options(p)
1417

1518

19+
def build_pdisk_statistics(base_config, pdisk_map, vsolts):
    """Collect per-PDisk size metrics together with candidate and donor slots.

    Args:
        base_config: config object whose ``VSlot`` entries each expose a
            ``Donors`` list; every donor has a ``VSlotId``.
        pdisk_map: mapping of pdisk id to a PDisk record exposing
            ``PDiskMetrics.AvailableSize`` and ``PDiskMetrics.TotalSize``.
        vsolts: candidate vslots to group by their PDisk (the parameter
            spelling is kept as-is for backward compatibility).

    Returns:
        dict keyed by pdisk id. Only PDisks with ``TotalSize > 0`` are
        present. Each value holds ``PDiskId``, ``AvailableSize``,
        ``TotalSize``, ``CandidateVSlots`` and ``DonorVSlots``.

    Slots located on a PDisk that was filtered out (``TotalSize == 0``) are
    skipped rather than raising ``KeyError``.
    """
    pdisks_statistics = {
        pdisk_id: {
            "PDiskId": pdisk_id,
            "AvailableSize": pdisk.PDiskMetrics.AvailableSize,
            "TotalSize": pdisk.PDiskMetrics.TotalSize,
            "CandidateVSlots": [],
            "DonorVSlots": [],
        }
        for pdisk_id, pdisk in pdisk_map.items()
        if pdisk.PDiskMetrics.TotalSize > 0  # pdisk works
    }

    for vslot in vsolts:
        stats = pdisks_statistics.get(common.get_pdisk_id(vslot.VSlotId))
        if stats is not None:  # the PDisk may have been filtered out above
            stats["CandidateVSlots"].append(vslot)

    for vslot in base_config.VSlot:
        for donor in vslot.Donors:
            stats = pdisks_statistics.get(common.get_pdisk_id(donor.VSlotId))
            if stats is not None:  # donors may sit on a PDisk without metrics
                stats["DonorVSlots"].append(donor)

    return pdisks_statistics
39+
40+
1641
def do(args):
1742
while True:
1843
common.flush_cache()
1944

2045
base_config = common.fetch_base_config()
46+
storage_pools = common.fetch_storage_pools()
2147
node_mon_map = common.fetch_node_mon_map({vslot.VSlotId.NodeId for vslot in base_config.VSlot})
2248
vslot_map = common.build_vslot_map(base_config)
2349
pdisk_map = common.build_pdisk_map(base_config)
2450
pdisk_usage = common.build_pdisk_usage_map(base_config, count_donors=False)
2551
pdisk_usage_w_donors = common.build_pdisk_usage_map(base_config, count_donors=True)
2652

53+
storage_pool_names_map = common.build_storage_pool_names_map(storage_pools)
54+
group_id_to_storage_pool_name_map = {
55+
group_id: storage_pool_names_map[(group.BoxId, group.StoragePoolId)]
56+
for group_id, group in common.build_group_map(base_config).items()
57+
if (group.BoxId, group.StoragePoolId) != (0, 0) # static group
58+
}
59+
2760
vdisks_groups_count_map = defaultdict(int)
2861
for group in base_config.Group:
2962
num = sum(vslot.Status == 'READY' for vslot in common.vslots_of_group(group, vslot_map)) - len(group.VSlotId)
@@ -82,12 +115,26 @@ def do(args):
82115

83116
candidate_vslots = []
84117
if healthy_vslots_from_overpopulated_pdisks:
85-
common.print_if_not_quiet(args, f'Found {len(healthy_vslots_from_overpopulated_pdisks)} vdisks from overpopulated pdisks', sys.stdout)
118+
common.print_if_not_quiet(args, f'Found {len(healthy_vslots_from_overpopulated_pdisks)} vdisks in healthy groups from overpopulated pdisks', sys.stdout)
86119
candidate_vslots = healthy_vslots_from_overpopulated_pdisks
87120
elif healthy_vslots and not args.only_from_overpopulated_pdisks:
88-
common.print_if_not_quiet(args, f'Found {len(healthy_vslots)} vdisks suitable for relocation', sys.stdout)
121+
common.print_if_not_quiet(args, f'Found {len(healthy_vslots)} vdisks in healthy groups', sys.stdout)
89122
candidate_vslots = healthy_vslots
90-
else: # candidate_vslots is empty
123+
124+
if args.storage_pool is not None:
125+
existing_storage_pools = set(group_id_to_storage_pool_name_map.values())
126+
if args.storage_pool not in existing_storage_pools:
127+
print(f"Storage pool {args.storage_pool} not found in existing storage pools: {existing_storage_pools}")
128+
sys.exit(1)
129+
candidate_vslots = [vslot for vslot in candidate_vslots if group_id_to_storage_pool_name_map[vslot.GroupId] == args.storage_pool]
130+
common.print_if_not_quiet(args, f'Found {len(candidate_vslots)} vdisks in {args.storage_pool} sotrage pool', sys.stdout)
131+
132+
if args.max_donors_per_pdisk > 0:
133+
donors_per_pdisk = common.build_donors_per_pdisk_map(base_config)
134+
candidate_vslots = [vslot for vslot in candidate_vslots if donors_per_pdisk[common.get_pdisk_id(vslot.VSlotId)] < args.max_donors_per_pdisk]
135+
common.print_if_not_quiet(args, f'Found {len(candidate_vslots)} vdisks with donors per pdisk < {args.max_donors_per_pdisk}', sys.stdout)
136+
137+
if len(candidate_vslots) == 0:
91138
common.print_if_not_quiet(args, 'No vdisks suitable for relocation found, waiting..', sys.stdout)
92139
time.sleep(10)
93140
continue
@@ -182,14 +229,30 @@ def add_reassign_cmd(request, vslot):
182229
return True
183230
# end of do_reassign()
184231

185-
vslots_by_pdisk_slot_usage = defaultdict(list)
186-
for vslot in candidate_vslots:
187-
pdisk_id = common.get_pdisk_id(vslot.VSlotId)
188-
pdisk_slot_usage = pdisk_usage[pdisk_id]
189-
vslots_by_pdisk_slot_usage[pdisk_slot_usage].append(vslot)
190-
191-
# check vslots from pdisks with the highest slot usage first
192-
for pdisk_slot_usage, vslots in sorted(vslots_by_pdisk_slot_usage.items(), reverse=True):
232+
vslots_ordered_groups_to_reassign = None
233+
if args.sort_by == 'slots':
234+
vslots_by_pdisk_slot_usage = defaultdict(list)
235+
for vslot in candidate_vslots:
236+
pdisk_id = common.get_pdisk_id(vslot.VSlotId)
237+
pdisk_slot_usage = pdisk_usage[pdisk_id]
238+
vslots_by_pdisk_slot_usage[pdisk_slot_usage].append(vslot)
239+
vslots_ordered_groups_to_reassign = [vslots for _, vslots in sorted(vslots_by_pdisk_slot_usage.items(), reverse=True)]
240+
elif args.sort_by == 'space_ratio':
241+
pdisks = {
242+
pdisk_id: {
243+
"FreeSpaceRatio": float(pdisk.PDiskMetrics.AvailableSize) / float(pdisk.PDiskMetrics.TotalSize),
244+
"CandidateVSlots": [],
245+
}
246+
for pdisk_id, pdisk in pdisk_map.items()
247+
if pdisk.PDiskMetrics.TotalSize > 0 # pdisk works
248+
}
249+
for vslot in candidate_vslots:
250+
pdisk_id = common.get_pdisk_id(vslot.VSlotId)
251+
pdisks[pdisk_id]["CandidateVSlots"].append(vslot)
252+
print({pdisk: (len(info["CandidateVSlots"]), info["FreeSpaceRatio"]) for pdisk, info in pdisks.items()})
253+
vslots_ordered_groups_to_reassign = [info["CandidateVSlots"] for _, info in sorted(list(pdisks.items()), key=lambda x: x[1]["FreeSpaceRatio"])]
254+
255+
for vslots in vslots_ordered_groups_to_reassign:
193256
random.shuffle(vslots)
194257
for vslot in vslots:
195258
if do_reassign(vslot, False):

0 commit comments

Comments
 (0)