Skip to content

Commit 97eeaa9

Browse files
authored
Merge pull request #796 from luomingmeng/dev/sysadvisor-support-revise-reclaimed-resource
feat(sysadvisor): support MinIgnoredReclaimedResourceForReport to avo…
2 parents cae1007 + 1f16f6c commit 97eeaa9

File tree

8 files changed

+726
-34
lines changed

8 files changed

+726
-34
lines changed

cmd/katalyst-agent/app/options/dynamic/adminqos/reclaimedresource/reclaimedresource_base.go

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,12 @@ import (
2929
)
3030

3131
type ReclaimedResourceOptions struct {
32-
EnableReclaim bool
33-
ReservedResourceForReport general.ResourceList
34-
MinReclaimedResourceForReport general.ResourceList
35-
ReservedResourceForAllocate general.ResourceList
36-
ReservedResourceForReclaimedCores general.ResourceList
32+
EnableReclaim bool
33+
ReservedResourceForReport general.ResourceList
34+
MinReclaimedResourceForReport general.ResourceList
35+
MinIgnoredReclaimedResourceForReport general.ResourceList
36+
ReservedResourceForAllocate general.ResourceList
37+
ReservedResourceForReclaimedCores general.ResourceList
3738

3839
*cpuheadroom.CPUHeadroomOptions
3940
*memoryheadroom.MemoryHeadroomOptions
@@ -50,6 +51,10 @@ func NewReclaimedResourceOptions() *ReclaimedResourceOptions {
5051
v1.ResourceCPU: resource.MustParse("4"),
5152
v1.ResourceMemory: resource.MustParse("5Gi"),
5253
},
54+
MinIgnoredReclaimedResourceForReport: map[v1.ResourceName]resource.Quantity{
55+
v1.ResourceCPU: resource.MustParse("0.1"),
56+
v1.ResourceMemory: resource.MustParse("100Mi"),
57+
},
5358
ReservedResourceForAllocate: map[v1.ResourceName]resource.Quantity{
5459
v1.ResourceCPU: resource.MustParse("4"),
5560
v1.ResourceMemory: resource.MustParse("5Gi"),
@@ -73,6 +78,8 @@ func (o *ReclaimedResourceOptions) AddFlags(fss *cliflag.NamedFlagSets) {
7378
"reserved reclaimed resource report to cnr")
7479
fs.Var(&o.MinReclaimedResourceForReport, "min-reclaimed-resource-for-report",
7580
"min reclaimed resource report to cnr")
81+
fs.Var(&o.MinIgnoredReclaimedResourceForReport, "min-ignored-reclaimed-resource-for-report",
82+
"min ignored reclaimed resource report to cnr")
7683
fs.Var(&o.ReservedResourceForAllocate, "reserved-resource-for-allocate",
7784
"reserved reclaimed resource actually not allocate to reclaimed resource")
7885
fs.Var(&o.ReservedResourceForReclaimedCores, "reserved-resource-for-reclaimed-cores",
@@ -88,6 +95,7 @@ func (o *ReclaimedResourceOptions) ApplyTo(c *reclaimedresource.ReclaimedResourc
8895
c.EnableReclaim = o.EnableReclaim
8996
c.ReservedResourceForReport = v1.ResourceList(o.ReservedResourceForReport)
9097
c.MinReclaimedResourceForReport = v1.ResourceList(o.MinReclaimedResourceForReport)
98+
c.MinIgnoredReclaimedResourceForReport = v1.ResourceList(o.MinIgnoredReclaimedResourceForReport)
9199
c.ReservedResourceForAllocate = v1.ResourceList(o.ReservedResourceForAllocate)
92100
c.MinReclaimedResourceForAllocate = v1.ResourceList(o.ReservedResourceForReclaimedCores)
93101

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ require (
1919
github.com/google/uuid v1.3.0
2020
github.com/h2non/gock v1.2.0
2121
github.com/klauspost/cpuid/v2 v2.2.6
22-
github.com/kubewharf/katalyst-api v0.5.2-0.20241210135216-5785b7552c05
22+
github.com/kubewharf/katalyst-api v0.5.2-0.20250317093030-1a5a0e1b5b70
2323
github.com/montanaflynn/stats v0.7.1
2424
github.com/opencontainers/runc v1.1.6
2525
github.com/opencontainers/selinux v1.10.0

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -570,8 +570,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
570570
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
571571
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
572572
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
573-
github.com/kubewharf/katalyst-api v0.5.2-0.20241210135216-5785b7552c05 h1:oV/CsCzr3T3WnWz91aZebSpbKiYFQSS564CgsCrD/QQ=
574-
github.com/kubewharf/katalyst-api v0.5.2-0.20241210135216-5785b7552c05/go.mod h1:Y2IeIorxQamF2a3oa0+URztl5QCSty6Jj3zD83R8J9k=
573+
github.com/kubewharf/katalyst-api v0.5.2-0.20250317093030-1a5a0e1b5b70 h1:4ePDOk8c8fE6ZDJcna2gbb7psIhBDBR14kbfWbJ0yfY=
574+
github.com/kubewharf/katalyst-api v0.5.2-0.20250317093030-1a5a0e1b5b70/go.mod h1:Y2IeIorxQamF2a3oa0+URztl5QCSty6Jj3zD83R8J9k=
575575
github.com/kubewharf/kubelet v1.24.6-kubewharf.9 h1:jOTYZt7h/J7I8xQMKMUcJjKf5UFBv37jHWvNp5VRFGc=
576576
github.com/kubewharf/kubelet v1.24.6-kubewharf.9/go.mod h1:MxbSZUx3wXztFneeelwWWlX7NAAStJ6expqq7gY2J3c=
577577
github.com/kyoh86/exportloopref v0.1.7/go.mod h1:h1rDl2Kdj97+Kwh4gdz3ujE7XHmH51Q0lUiZ1z4NLj8=

pkg/agent/sysadvisor/plugin/qosaware/reporter/headroom_reporter.go

Lines changed: 109 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,12 @@ import (
3636
"github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/qosaware/reporter/manager/resource"
3737
hmadvisor "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/qosaware/resource"
3838
"github.com/kubewharf/katalyst-core/pkg/config"
39+
"github.com/kubewharf/katalyst-core/pkg/config/agent/dynamic"
3940
"github.com/kubewharf/katalyst-core/pkg/metaserver"
4041
"github.com/kubewharf/katalyst-core/pkg/metrics"
4142
"github.com/kubewharf/katalyst-core/pkg/util"
43+
"github.com/kubewharf/katalyst-core/pkg/util/general"
44+
"github.com/kubewharf/katalyst-core/pkg/util/native"
4245
)
4346

4447
func init() {
@@ -50,6 +53,10 @@ const (
5053
headroomReporterPluginName = "headroom-reporter-plugin"
5154
)
5255

56+
const (
57+
metricsNameReclaimedResourceRevised = "reclaimed_resource_revised"
58+
)
59+
5360
type HeadroomResourceManager interface {
5461
manager.ResourceManager
5562
manager.NumaResourceManager
@@ -111,16 +118,21 @@ type reclaimedResource struct {
111118
capacity v1.ResourceList
112119
numaAllocatable map[int]v1.ResourceList
113120
numaCapacity map[int]v1.ResourceList
121+
122+
resourceNameMap map[v1.ResourceName]v1.ResourceName
123+
milliValue map[v1.ResourceName]bool
114124
}
115125

116126
type headroomReporterPlugin struct {
117127
sync.Mutex
118128
headroomManagers map[v1.ResourceName]manager.HeadroomManager
119129
numaSocketZoneNodeMap map[util.ZoneNode]util.ZoneNode
120130

121-
ctx context.Context
122-
cancel context.CancelFunc
123-
started bool
131+
dynamicConf *dynamic.DynamicAgentConfiguration
132+
ctx context.Context
133+
cancel context.CancelFunc
134+
emitter metrics.MetricEmitter
135+
started bool
124136
}
125137

126138
func newHeadroomReporterPlugin(emitter metrics.MetricEmitter, metaServer *metaserver.MetaServer,
@@ -131,6 +143,11 @@ func newHeadroomReporterPlugin(emitter metrics.MetricEmitter, metaServer *metase
131143
errList []error
132144
)
133145

146+
// init numa topo info by metaServer
147+
if metaServer == nil || metaServer.MachineInfo == nil {
148+
return nil, nil, fmt.Errorf("get metaserver machine info is nil")
149+
}
150+
134151
initializers := manager.GetRegisteredManagerInitializers()
135152
headroomManagers := make(map[v1.ResourceName]manager.HeadroomManager, len(initializers))
136153
for name, initializer := range initializers {
@@ -140,18 +157,15 @@ func newHeadroomReporterPlugin(emitter metrics.MetricEmitter, metaServer *metase
140157
}
141158
}
142159

143-
// init numa topo info by metaServer
144-
if metaServer == nil || metaServer.MachineInfo == nil {
145-
errList = append(errList, fmt.Errorf("get metaserver machine info is nil"))
146-
}
147-
148160
if len(errList) > 0 {
149161
return nil, nil, errors.NewAggregate(errList)
150162
}
151163

152164
reporter := &headroomReporterPlugin{
153165
headroomManagers: headroomManagers,
154166
numaSocketZoneNodeMap: util.GenerateNumaSocketZone(metaServer.MachineInfo.Topology),
167+
dynamicConf: conf.DynamicAgentConfiguration,
168+
emitter: emitter,
155169
}
156170
pluginWrapper, err := skeleton.NewRegistrationPluginWrapper(reporter, []string{conf.PluginRegistrationDir},
157171
func(key string, value int64) {
@@ -223,6 +237,12 @@ func (r *headroomReporterPlugin) GetReportContent(_ context.Context, _ *v1alpha1
223237
return nil, err
224238
}
225239

240+
// revise reclaimed resource to avoid resource fragmentation
241+
err = r.reviseReclaimedResource(res)
242+
if err != nil {
243+
return nil, err
244+
}
245+
226246
reportToCNR, err := r.getReportReclaimedResourceForCNR(res)
227247
if err != nil {
228248
return nil, err
@@ -256,47 +276,51 @@ func (r *headroomReporterPlugin) getReclaimedResource() (*reclaimedResource, err
256276
capacity := make(v1.ResourceList)
257277
numaAllocatable := make(map[int]v1.ResourceList)
258278
numaCapacity := make(map[int]v1.ResourceList)
259-
for resourceName, rm := range r.headroomManagers {
260-
allocatable[resourceName], err = rm.GetAllocatable()
279+
resourceNameMap := make(map[v1.ResourceName]v1.ResourceName)
280+
milliValue := make(map[v1.ResourceName]bool)
281+
for reportName, rm := range r.headroomManagers {
282+
// get origin resource name
283+
resourceNameMap[reportName] = rm.Name()
284+
milliValue[reportName] = rm.MilliValue()
285+
allocatable[reportName], err = rm.GetAllocatable()
261286
if err != nil {
262-
errList = append(errList, fmt.Errorf("get reclaimed %s allocatable failed: %s", resourceName, err))
287+
errList = append(errList, fmt.Errorf("get reclaimed %s allocatable failed: %s", reportName, err))
263288
}
264289

265-
capacity[resourceName], err = rm.GetCapacity()
290+
capacity[reportName], err = rm.GetCapacity()
266291
if err != nil {
267-
errList = append(errList, err, fmt.Errorf("get reclaimed %s capacity failed: %s", resourceName, err))
292+
errList = append(errList, err, fmt.Errorf("get reclaimed %s capacity failed: %s", reportName, err))
268293
}
269294

270295
// get allocatable per numa
271296
allocatableMap, err := rm.GetNumaAllocatable()
272297
if err != nil {
273-
errList = append(errList, fmt.Errorf("get reclaimed %s numa allocatable failed: %s", resourceName, err))
298+
errList = append(errList, fmt.Errorf("get reclaimed %s numa allocatable failed: %s", reportName, err))
274299
} else {
275300
for numaID, quantity := range allocatableMap {
276301
perNumaAllocatable, ok := numaAllocatable[numaID]
277302
if !ok {
278303
perNumaAllocatable = make(v1.ResourceList)
279304
numaAllocatable[numaID] = perNumaAllocatable
280305
}
281-
perNumaAllocatable[resourceName] = quantity
306+
perNumaAllocatable[reportName] = quantity
282307
}
283308
}
284309

285310
// get capacity per numa
286311
capacityMap, err := rm.GetNumaCapacity()
287312
if err != nil {
288-
errList = append(errList, fmt.Errorf("get reclaimed %s numa capacity failed: %s", resourceName, err))
313+
errList = append(errList, fmt.Errorf("get reclaimed %s numa capacity failed: %s", reportName, err))
289314
} else {
290315
for numaID, quantity := range capacityMap {
291316
perNumaCapacity, ok := numaCapacity[numaID]
292317
if !ok {
293318
perNumaCapacity = make(v1.ResourceList)
294319
numaCapacity[numaID] = perNumaCapacity
295320
}
296-
perNumaCapacity[resourceName] = quantity
321+
perNumaCapacity[reportName] = quantity
297322
}
298323
}
299-
300324
}
301325

302326
if len(errList) > 0 {
@@ -308,7 +332,9 @@ func (r *headroomReporterPlugin) getReclaimedResource() (*reclaimedResource, err
308332
capacity: capacity,
309333
numaAllocatable: numaAllocatable,
310334
numaCapacity: numaCapacity,
311-
}, err
335+
milliValue: milliValue,
336+
resourceNameMap: resourceNameMap,
337+
}, nil
312338
}
313339

314340
func (r *headroomReporterPlugin) getReportReclaimedResourceForCNR(reclaimedResource *reclaimedResource) (*v1alpha1.ReportContent, error) {
@@ -385,3 +411,67 @@ func (r *headroomReporterPlugin) getReportNUMAReclaimedResource(reclaimedResourc
385411
Value: value,
386412
}, nil
387413
}
414+
415+
func (r *headroomReporterPlugin) reviseReclaimedResource(res *reclaimedResource) error {
416+
if res == nil {
417+
return fmt.Errorf("reclaimed resource is nil")
418+
}
419+
420+
conf := r.dynamicConf.GetDynamicConfiguration()
421+
reviseFunc := func(resList v1.ResourceList) bool {
422+
revise := false
423+
for reportName, quantity := range resList {
424+
resourceName, ok := res.resourceNameMap[reportName]
425+
if !ok {
426+
resourceName = reportName
427+
}
428+
429+
minIgnored, ok := conf.MinIgnoredReclaimedResourceForReport[resourceName]
430+
if ok {
431+
milliValue, ok := res.milliValue[reportName]
432+
if ok && milliValue {
433+
minIgnored = *apiresource.NewQuantity(minIgnored.MilliValue(), minIgnored.Format)
434+
}
435+
436+
if quantity.Cmp(minIgnored) <= 0 {
437+
revise = true
438+
break
439+
}
440+
}
441+
}
442+
443+
if revise {
444+
for resourceName := range resList {
445+
resList[resourceName] = apiresource.Quantity{}
446+
}
447+
}
448+
449+
return revise
450+
}
451+
452+
numaRevised := false
453+
for numaID := range res.numaAllocatable {
454+
if reviseFunc(res.numaAllocatable[numaID]) {
455+
numaRevised = true
456+
}
457+
}
458+
459+
if numaRevised {
460+
sumNUMAAllocatable := v1.ResourceList{}
461+
for _, allocatable := range res.numaAllocatable {
462+
sumNUMAAllocatable = native.AddResources(sumNUMAAllocatable, allocatable)
463+
}
464+
res.allocatable = sumNUMAAllocatable
465+
}
466+
467+
revised := reviseFunc(res.allocatable)
468+
if numaRevised || revised {
469+
general.InfoS("revised result",
470+
"allocatable", res.allocatable,
471+
"capacity", res.capacity,
472+
"numaAllocatable", res.numaAllocatable,
473+
"numaCapacity", res.numaCapacity)
474+
_ = r.emitter.StoreInt64(metricsNameReclaimedResourceRevised, 1, metrics.MetricTypeNameRaw)
475+
}
476+
return nil
477+
}

0 commit comments

Comments
 (0)