Skip to content

Commit fd243ab

Browse files
authored
Merge pull request #772 from funnydreamwinz/dev/fix-util-based-headroom
fix(sysadvisor): get util based headroom per-numa
2 parents ca7c39d + cf0a6cf commit fd243ab

File tree

3 files changed

+57
-17
lines changed

3 files changed

+57
-17
lines changed

pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_common.go

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,12 @@ func (ha *HeadroomAssemblerCommon) getHeadroomByUtil() (resource.Quantity, map[i
162162
MaxCapacity: dynamicConfig.MaxHeadroomCapacityRate * float64(ha.metaServer.MachineInfo.NumCores/ha.metaServer.NumNUMANodes),
163163
}
164164

165+
reclaimedCPUs, err := ha.getLastReclaimedCPUPerNUMA()
166+
if err != nil {
167+
general.Errorf("getLastReclaimedCPUPerNUMA failed: %v", err)
168+
return resource.Quantity{}, nil, err
169+
}
170+
165171
// get headroom per NUMA
166172
for _, numaID := range bindingNUMAs {
167173
cpuSet, ok := reclaimPoolInfo.TopologyAwareAssignments[numaID]
@@ -174,7 +180,9 @@ func (ha *HeadroomAssemblerCommon) getHeadroomByUtil() (resource.Quantity, map[i
174180
return resource.Quantity{}, nil, fmt.Errorf("get reclaim Metrics failed with numa %d: %v", numaID, err)
175181
}
176182

177-
headroom, err := ha.getUtilBasedHeadroom(options, reclaimMetrics)
183+
lastReclaimedCPUPerNumaForCalculate := make(map[int]float64)
184+
lastReclaimedCPUPerNumaForCalculate[numaID] = reclaimedCPUs[numaID]
185+
headroom, err := ha.getUtilBasedHeadroom(options, reclaimMetrics, lastReclaimedCPUPerNumaForCalculate)
178186
if err != nil {
179187
return resource.Quantity{}, nil, fmt.Errorf("get util-based headroom failed with numa %d: %v", numaID, err)
180188
}
@@ -186,13 +194,15 @@ func (ha *HeadroomAssemblerCommon) getHeadroomByUtil() (resource.Quantity, map[i
186194
// get global reclaim headroom
187195
if len(nonBindingNumas) > 0 {
188196
cpusets := machine.NewCPUSet()
197+
lastReclaimedCPUPerNumaForCalculate := make(map[int]float64)
189198
for _, numaID := range nonBindingNumas {
190199
cpuSet, ok := reclaimPoolInfo.TopologyAwareAssignments[numaID]
191200
if !ok {
192201
return resource.Quantity{}, nil, fmt.Errorf("reclaim pool NOT found TopologyAwareAssignments with numaID: %v", numaID)
193202
}
194203

195204
cpusets = cpusets.Union(cpuSet)
205+
lastReclaimedCPUPerNumaForCalculate[numaID] = reclaimedCPUs[numaID]
196206
}
197207

198208
reclaimMetrics, err := metricHelper.GetReclaimMetrics(cpusets, ha.getReclaimCgroupPath(), ha.metaServer.MetricsFetcher)
@@ -201,7 +211,7 @@ func (ha *HeadroomAssemblerCommon) getHeadroomByUtil() (resource.Quantity, map[i
201211
}
202212

203213
options.MaxCapacity *= float64(len(nonBindingNumas))
204-
headroom, err := ha.getUtilBasedHeadroom(options, reclaimMetrics)
214+
headroom, err := ha.getUtilBasedHeadroom(options, reclaimMetrics, lastReclaimedCPUPerNumaForCalculate)
205215
if err != nil {
206216
return resource.Quantity{}, nil, fmt.Errorf("get util-based headroom failed: %v", err)
207217
}

pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_common_util.go

Lines changed: 10 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import (
3030
"github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/qosaware/resource/helper"
3131
"github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/types"
3232
metaserverHelper "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/metric/helper"
33+
"github.com/kubewharf/katalyst-core/pkg/util"
3334
"github.com/kubewharf/katalyst-core/pkg/util/general"
3435
"github.com/kubewharf/katalyst-core/pkg/util/native"
3536
)
@@ -40,11 +41,8 @@ const (
4041

4142
func (ha *HeadroomAssemblerCommon) getUtilBasedHeadroom(options helper.UtilBasedCapacityOptions,
4243
reclaimMetrics *metaserverHelper.ReclaimMetrics,
44+
lastReclaimedCPUPerNumaForCalculate map[int]float64,
4345
) (resource.Quantity, error) {
44-
lastReclaimedCPU, err := ha.getLastReclaimedCPU()
45-
if err != nil {
46-
return resource.Quantity{}, err
47-
}
4846
if reclaimMetrics == nil {
4947
return resource.Quantity{}, fmt.Errorf("reclaimMetrics is nil")
5048
}
@@ -54,10 +52,14 @@ func (ha *HeadroomAssemblerCommon) getUtilBasedHeadroom(options helper.UtilBased
5452
}
5553

5654
util := reclaimMetrics.CgroupCPUUsage / reclaimMetrics.ReclaimedCoresSupply
55+
lastReclaimedCPU := 0.0
56+
for _, cpu := range lastReclaimedCPUPerNumaForCalculate {
57+
lastReclaimedCPU += cpu
58+
}
5759

5860
general.InfoS("getUtilBasedHeadroom", "reclaimedCoresSupply", reclaimMetrics.ReclaimedCoresSupply,
5961
"util", util, "reclaim PoolCPUUsage", reclaimMetrics.PoolCPUUsage, "reclaim CgroupCPUUsage", reclaimMetrics.CgroupCPUUsage,
60-
"lastReclaimedCPU", lastReclaimedCPU)
62+
"lastReclaimedCPUPerNUMA", lastReclaimedCPUPerNumaForCalculate)
6163

6264
headroom, err := helper.EstimateUtilBasedCapacity(
6365
options,
@@ -72,20 +74,13 @@ func (ha *HeadroomAssemblerCommon) getUtilBasedHeadroom(options helper.UtilBased
7274
return *resource.NewQuantity(int64(math.Ceil(headroom)), resource.DecimalSI), nil
7375
}
7476

75-
func (ha *HeadroomAssemblerCommon) getLastReclaimedCPU() (float64, error) {
77+
func (ha *HeadroomAssemblerCommon) getLastReclaimedCPUPerNUMA() (map[int]float64, error) {
7678
cnr, err := ha.metaServer.CNRFetcher.GetCNR(context.Background())
7779
if err != nil {
78-
return 0, err
79-
}
80-
81-
if cnr.Status.Resources.Allocatable != nil {
82-
if reclaimedMilliCPU, ok := (*cnr.Status.Resources.Allocatable)[consts.ReclaimedResourceMilliCPU]; ok {
83-
return float64(reclaimedMilliCPU.Value()) / 1000, nil
84-
}
80+
return nil, err
8581
}
8682

87-
klog.Errorf("cnr status resource allocatable reclaimed milli cpu not found")
88-
return 0, nil
83+
return util.GetReclaimedCPUPerNUMA(cnr.Status.TopologyZone), nil
8984
}
9085

9186
func (ha *HeadroomAssemblerCommon) getReclaimNUMABindingTopo(reclaimPool *types.PoolInfo) (bindingNUMAs, nonBindingNumas []int, err error) {

pkg/util/cnr_topology.go

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,13 @@ package util
1919
import (
2020
"fmt"
2121
"sort"
22+
"strconv"
2223

2324
v1 "k8s.io/api/core/v1"
25+
"k8s.io/klog/v2"
2426

2527
nodev1alpha1 "github.com/kubewharf/katalyst-api/pkg/apis/node/v1alpha1"
28+
"github.com/kubewharf/katalyst-api/pkg/consts"
2629
"github.com/kubewharf/katalyst-core/pkg/config/generic"
2730
"github.com/kubewharf/katalyst-core/pkg/util/qos"
2831
)
@@ -212,3 +215,35 @@ func ValidateSharedCoresWithNumaBindingPod(qosConf *generic.QoSConfiguration, po
212215

213216
return true, nil
214217
}
218+
219+
func GetReclaimedCPUPerNUMA(topologyZones []*nodev1alpha1.TopologyZone) map[int]float64 {
220+
numaMap := make(map[int]float64)
221+
for _, topologyZone := range topologyZones {
222+
if topologyZone.Type != nodev1alpha1.TopologyTypeSocket {
223+
continue
224+
}
225+
226+
for _, child := range topologyZone.Children {
227+
if child.Type != nodev1alpha1.TopologyTypeNuma {
228+
continue
229+
}
230+
231+
numaID, err := strconv.Atoi(child.Name)
232+
if err != nil {
233+
klog.Errorf("invalid numa name: %v, %v", child.Name, err)
234+
continue
235+
}
236+
237+
if child.Resources.Allocatable == nil {
238+
klog.Errorf("numa zone without allocatable resource: %d", numaID)
239+
continue
240+
}
241+
242+
if reclaimedMilliCPU, ok := (*child.Resources.Allocatable)[consts.ReclaimedResourceMilliCPU]; ok {
243+
numaMap[numaID] = float64(reclaimedMilliCPU.Value()) / 1000
244+
}
245+
}
246+
}
247+
248+
return numaMap
249+
}

0 commit comments

Comments
 (0)