Skip to content

Commit 58d14b3

Browse files
committed
feat(qrm): return errors instead of empty array in hints
1 parent 8453174 commit 58d14b3

File tree

4 files changed

+64
-33
lines changed

4 files changed

+64
-33
lines changed

pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_hint_handlers.go

Lines changed: 36 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ import (
4141
qosutil "github.com/kubewharf/katalyst-core/pkg/util/qos"
4242
)
4343

44+
var errNoAvailableCPUHints = fmt.Errorf("no available cpu hints")
45+
4446
type memBWHintUpdate struct {
4547
updatedPreferrence bool
4648
leftAllocatable int
@@ -73,7 +75,7 @@ func (p *DynamicPolicy) sharedCoresHintHandler(ctx context.Context,
7375
"podName": req.PodName,
7476
"containerName": req.ContainerName,
7577
})...)
76-
return nil, fmt.Errorf("no enough cpu resource")
78+
return nil, errNoAvailableCPUHints
7779
}
7880
}
7981

@@ -209,12 +211,6 @@ func (p *DynamicPolicy) calculateHints(reqInt int,
209211
}
210212
sort.Ints(numaNodes)
211213

212-
hints := map[string]*pluginapi.ListOfTopologyHints{
213-
string(v1.ResourceCPU): {
214-
Hints: []*pluginapi.TopologyHint{},
215-
},
216-
}
217-
218214
minNUMAsCountNeeded, _, err := util.GetNUMANodesCountToFitCPUReq(reqInt, p.machineInfo.CPUTopology)
219215
if err != nil {
220216
return nil, fmt.Errorf("GetNUMANodesCountToFitCPUReq failed with error: %v", err)
@@ -262,6 +258,7 @@ func (p *DynamicPolicy) calculateHints(reqInt int,
262258
}
263259

264260
preferredHintIndexes := []int{}
261+
var availableNumaHints []*pluginapi.TopologyHint
265262
machine.IterateBitMasks(numaNodes, numaBound, func(mask machine.BitMask) {
266263
maskCount := mask.Count()
267264
if maskCount < minNUMAsCountNeeded {
@@ -292,16 +289,25 @@ func (p *DynamicPolicy) calculateHints(reqInt int,
292289
}
293290

294291
preferred := maskCount == minNUMAsCountNeeded
295-
hints[string(v1.ResourceCPU)].Hints = append(hints[string(v1.ResourceCPU)].Hints, &pluginapi.TopologyHint{
292+
availableNumaHints = append(availableNumaHints, &pluginapi.TopologyHint{
296293
Nodes: machine.MaskToUInt64Array(mask),
297294
Preferred: preferred,
298295
})
299296

300297
if preferred {
301-
preferredHintIndexes = append(preferredHintIndexes, len(hints[string(v1.ResourceCPU)].Hints)-1)
298+
preferredHintIndexes = append(preferredHintIndexes, len(availableNumaHints)-1)
302299
}
303300
})
304301

302+
// NOTE: because grpc is inability to distinguish between an empty array and nil,
303+
// we return an error instead of an empty array.
304+
// we should resolve this issue if we need manage multi resource in one plugin.
305+
if len(availableNumaHints) == 0 {
306+
general.Warningf("calculateHints got no available cpu hints for pod: %s/%s, container: %s",
307+
req.PodNamespace, req.PodName, req.ContainerName)
308+
return nil, errNoAvailableCPUHints
309+
}
310+
305311
if numaBound > machine.MBWNUMAsPoint {
306312
numaAllocatedMemBW, err := getNUMAAllocatedMemBW(machineState, p.metaServer, p.getContainerRequestedCores)
307313

@@ -314,11 +320,17 @@ func (p *DynamicPolicy) calculateHints(reqInt int,
314320
general.Errorf("getNUMAAllocatedMemBW failed with error: %v", err)
315321
_ = p.emitter.StoreInt64(util.MetricNameGetNUMAAllocatedMemBWFailed, 1, metrics.MetricTypeNameRaw)
316322
} else {
317-
p.updatePreferredCPUHintsByMemBW(preferredHintIndexes, hints[string(v1.ResourceCPU)].Hints,
323+
p.updatePreferredCPUHintsByMemBW(preferredHintIndexes, availableNumaHints,
318324
reqInt, numaAllocatedMemBW, req, numaExclusive)
319325
}
320326
}
321327

328+
hints := map[string]*pluginapi.ListOfTopologyHints{
329+
string(v1.ResourceCPU): {
330+
Hints: availableNumaHints,
331+
},
332+
}
333+
322334
return hints, nil
323335
}
324336

@@ -633,7 +645,7 @@ func (p *DynamicPolicy) sharedCoresWithNUMABindingHintHandler(_ context.Context,
633645
general.Infof("pod: %s/%s, container: %s request inplace update resize and no enough resource in current NUMA, try to migrate it to new NUMA",
634646
req.PodNamespace, req.PodName, req.ContainerName)
635647
var calculateErr error
636-
hints, calculateErr = p.calculateHintsForNUMABindingSharedCores(reqInt, podEntries, machineState, req.Annotations)
648+
hints, calculateErr = p.calculateHintsForNUMABindingSharedCores(reqInt, podEntries, machineState, req)
637649
if calculateErr != nil {
638650
general.Errorf("pod: %s/%s, container: %s request inplace update resize and no enough resource in current NUMA, failed to migrate it to new NUMA",
639651
req.PodNamespace, req.PodName, req.ContainerName)
@@ -642,15 +654,15 @@ func (p *DynamicPolicy) sharedCoresWithNUMABindingHintHandler(_ context.Context,
642654
} else {
643655
general.Errorf("pod: %s/%s, container: %s request inplace update resize, but no enough resource for it in current NUMA",
644656
req.PodNamespace, req.PodName, req.ContainerName)
645-
return nil, fmt.Errorf("inplace update resize scale out failed with no enough resource")
657+
return nil, errNoAvailableCPUHints
646658
}
647659
} else {
648660
general.Infof("pod: %s/%s, container: %s request inplace update resize, there is enough resource for it in current NUMA",
649661
req.PodNamespace, req.PodName, req.ContainerName)
650662
}
651663
} else if hints == nil {
652664
var calculateErr error
653-
hints, calculateErr = p.calculateHintsForNUMABindingSharedCores(reqInt, podEntries, machineState, req.Annotations)
665+
hints, calculateErr = p.calculateHintsForNUMABindingSharedCores(reqInt, podEntries, machineState, req)
654666
if calculateErr != nil {
655667
return nil, fmt.Errorf("calculateHintsForNUMABindingSharedCores failed with error: %v", calculateErr)
656668
}
@@ -780,12 +792,13 @@ func (p *DynamicPolicy) filterNUMANodesByNonBindingSharedRequestedQuantity(nonBi
780792

781793
func (p *DynamicPolicy) calculateHintsForNUMABindingSharedCores(reqInt int, podEntries state.PodEntries,
782794
machineState state.NUMANodeMap,
783-
reqAnnotations map[string]string,
795+
req *pluginapi.ResourceRequest,
784796
) (map[string]*pluginapi.ListOfTopologyHints, error) {
785797
nonBindingNUMAsCPUQuantity := machineState.GetFilteredAvailableCPUSet(p.reservedCPUs, nil, state.CheckNUMABinding).Size()
786798
nonBindingNUMAs := machineState.GetFilteredNUMASet(state.CheckNUMABinding)
787799
nonBindingSharedRequestedQuantity := state.GetNonBindingSharedRequestedQuantityFromPodEntries(podEntries, nil, p.getContainerRequestedCores)
788800

801+
reqAnnotations := req.Annotations
789802
numaNodes := p.filterNUMANodesByNonBindingSharedRequestedQuantity(nonBindingSharedRequestedQuantity,
790803
nonBindingNUMAsCPUQuantity, nonBindingNUMAs, machineState,
791804
machineState.GetFilteredNUMASetWithAnnotations(state.CheckNUMABindingSharedCoresAntiAffinity, reqAnnotations).ToSliceInt())
@@ -826,6 +839,15 @@ func (p *DynamicPolicy) calculateHintsForNUMABindingSharedCores(reqInt int, podE
826839
p.populateHintsByPreferPolicy(numaNodes, cpuconsts.CPUNUMAHintPreferPolicySpreading, hints, machineState, reqInt)
827840
}
828841

842+
// NOTE: because grpc is inability to distinguish between an empty array and nil,
843+
// we return an error instead of an empty array.
844+
// we should resolve this issue if we need manage multi resource in one plugin.
845+
if len(hints[string(v1.ResourceCPU)].Hints) == 0 {
846+
general.Warningf("calculateHints got no available memory hints for snb pod: %s/%s, container: %s",
847+
req.PodNamespace, req.PodName, req.ContainerName)
848+
return nil, errNoAvailableCPUHints
849+
}
850+
829851
return hints, nil
830852
}
831853

pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_test.go

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5291,10 +5291,8 @@ func TestSNBAdmitWithSidecarReallocate(t *testing.T) {
52915291
}
52925292

52935293
// pod aggregated size is 8, the new container request is 4, 8 + 4 > 11 (share-NUMA0 size)
5294-
res, err = dynamicPolicy.GetTopologyHints(context.Background(), anotherReq)
5295-
as.Nil(err)
5296-
as.NotNil(res.ResourceHints[string(v1.ResourceCPU)])
5297-
as.Equal(0, len(res.ResourceHints[string(v1.ResourceCPU)].Hints))
5294+
_, err = dynamicPolicy.GetTopologyHints(context.Background(), anotherReq)
5295+
as.ErrorContains(err, errNoAvailableCPUHints.Error())
52985296

52995297
// reallocate sidecar
53005298
_, err = dynamicPolicy.Allocate(context.Background(), sidecarReq)

pkg/agent/qrm-plugins/cpu/dynamicpolicy/vpa_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -708,7 +708,7 @@ func TestNormalShareVPA(t *testing.T) {
708708
}
709709

710710
_, err = dynamicPolicy.GetTopologyHints(context.Background(), resizeReq)
711-
as.ErrorContains(err, "no enough")
711+
as.ErrorContains(err, errNoAvailableCPUHints.Error())
712712

713713
resizeReq1 := &pluginapi.ResourceRequest{
714714
PodUid: req.PodUid,

pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_hint_handlers.go

Lines changed: 25 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ import (
3333
qosutil "github.com/kubewharf/katalyst-core/pkg/util/qos"
3434
)
3535

36+
var errNoAvailableMemoryHints = fmt.Errorf("no available memory hints")
37+
3638
func (p *DynamicPolicy) sharedCoresHintHandler(ctx context.Context,
3739
req *pluginapi.ResourceRequest,
3840
) (*pluginapi.ResourceHintsResponse, error) {
@@ -59,7 +61,7 @@ func (p *DynamicPolicy) sharedCoresHintHandler(ctx context.Context,
5961
"podName": req.PodName,
6062
"containerName": req.ContainerName,
6163
})...)
62-
return nil, fmt.Errorf("no enough memory resource")
64+
return nil, errNoAvailableMemoryHints
6365
}
6466
}
6567

@@ -197,7 +199,7 @@ func (p *DynamicPolicy) numaBindingHintHandler(_ context.Context,
197199

198200
var calculateErr error
199201
// recalculate hints for the whole pod
200-
hints, calculateErr = p.calculateHints(uint64(podAggregatedRequest), resourcesMachineState, req.Annotations)
202+
hints, calculateErr = p.calculateHints(uint64(podAggregatedRequest), resourcesMachineState, req)
201203
if calculateErr != nil {
202204
general.Errorf("failed to calculate hints for pod: %s/%s, container: %s, error: %v",
203205
req.PodNamespace, req.PodName, req.ContainerName, calculateErr)
@@ -213,7 +215,7 @@ func (p *DynamicPolicy) numaBindingHintHandler(_ context.Context,
213215
// otherwise, calculate hint for container without allocated memory
214216
var calculateErr error
215217
// calculate hint for container without allocated memory
216-
hints, calculateErr = p.calculateHints(uint64(podAggregatedRequest), resourcesMachineState, req.Annotations)
218+
hints, calculateErr = p.calculateHints(uint64(podAggregatedRequest), resourcesMachineState, req)
217219
if calculateErr != nil {
218220
general.Errorf("failed to calculate hints for pod: %s/%s, container: %s, error: %v",
219221
req.PodNamespace, req.PodName, req.ContainerName, calculateErr)
@@ -256,8 +258,9 @@ func (p *DynamicPolicy) dedicatedCoresWithoutNUMABindingHintHandler(_ context.Co
256258

257259
// calculateHints is a helper function to calculate the topology hints
258260
// with the given container requests.
259-
func (p *DynamicPolicy) calculateHints(reqInt uint64, resourcesMachineState state.NUMANodeResourcesMap,
260-
reqAnnotations map[string]string,
261+
func (p *DynamicPolicy) calculateHints(reqInt uint64,
262+
resourcesMachineState state.NUMANodeResourcesMap,
263+
req *pluginapi.ResourceRequest,
261264
) (map[string]*pluginapi.ListOfTopologyHints, error) {
262265
machineState := resourcesMachineState[v1.ResourceMemory]
263266

@@ -271,12 +274,6 @@ func (p *DynamicPolicy) calculateHints(reqInt uint64, resourcesMachineState stat
271274
}
272275
sort.Ints(numaNodes)
273276

274-
hints := map[string]*pluginapi.ListOfTopologyHints{
275-
string(v1.ResourceMemory): {
276-
Hints: []*pluginapi.TopologyHint{},
277-
},
278-
}
279-
280277
bytesPerNUMA, err := machineState.BytesPerNUMA()
281278
if err != nil {
282279
return nil, fmt.Errorf("getBytesPerNUMAFromMachineState failed with error: %v", err)
@@ -286,7 +283,7 @@ func (p *DynamicPolicy) calculateHints(reqInt uint64, resourcesMachineState stat
286283
if err != nil {
287284
return nil, fmt.Errorf("GetNUMANodesCountToFitMemoryReq failed with error: %v", err)
288285
}
289-
286+
reqAnnotations := req.Annotations
290287
numaBinding := qosutil.AnnotationsIndicateNUMABinding(reqAnnotations)
291288
numaExclusive := qosutil.AnnotationsIndicateNUMAExclusive(reqAnnotations)
292289

@@ -328,6 +325,7 @@ func (p *DynamicPolicy) calculateHints(reqInt uint64, resourcesMachineState stat
328325
numaBound = minNUMAsCountNeeded + 1
329326
}
330327

328+
var availableNumaHints []*pluginapi.TopologyHint
331329
machine.IterateBitMasks(numaNodes, numaBound, func(mask machine.BitMask) {
332330
maskCount := mask.Count()
333331
if maskCount < minNUMAsCountNeeded {
@@ -357,13 +355,26 @@ func (p *DynamicPolicy) calculateHints(reqInt uint64, resourcesMachineState stat
357355
return
358356
}
359357

360-
hints[string(v1.ResourceMemory)].Hints = append(hints[string(v1.ResourceMemory)].Hints, &pluginapi.TopologyHint{
358+
availableNumaHints = append(availableNumaHints, &pluginapi.TopologyHint{
361359
Nodes: machine.MaskToUInt64Array(mask),
362360
Preferred: len(maskBits) == minNUMAsCountNeeded,
363361
})
364362
})
365363

366-
return hints, nil
364+
// NOTE: because grpc is inability to distinguish between an empty array and nil,
365+
// we return an error instead of an empty array.
366+
// we should resolve this issue if we need manage multi resource in one plugin.
367+
if len(availableNumaHints) == 0 {
368+
general.Warningf("calculateHints got no available memory hints for pod: %s/%s, container: %s",
369+
req.PodNamespace, req.PodName, req.ContainerName)
370+
return nil, errNoAvailableMemoryHints
371+
}
372+
373+
return map[string]*pluginapi.ListOfTopologyHints{
374+
string(v1.ResourceMemory): {
375+
Hints: availableNumaHints,
376+
},
377+
}, nil
367378
}
368379

369380
// regenerateHints regenerates hints for container that'd already been allocated memory,

0 commit comments

Comments
 (0)