Skip to content

Commit aabfc53

Browse files
committed
refine getUtilBasedHeadroom
When allow_shared_cores_overlap_reclaimed = true, the workloads running in the reclaim pool include both shared_cores and reclaimed_cores, so the resource supply computed for reclaimed_cores must take the shared_cores usage into account. Signed-off-by: linzhecheng <linzhecheng@bytedance.com>
1 parent df45505 commit aabfc53

File tree

5 files changed

+97
-40
lines changed

5 files changed

+97
-40
lines changed

pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_common.go

Lines changed: 9 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,11 @@ import (
2929
"github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/region"
3030
"github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/types"
3131
"github.com/kubewharf/katalyst-core/pkg/config"
32-
pkgconsts "github.com/kubewharf/katalyst-core/pkg/consts"
3332
"github.com/kubewharf/katalyst-core/pkg/metaserver"
33+
"github.com/kubewharf/katalyst-core/pkg/metaserver/agent/metric/helper"
3434
"github.com/kubewharf/katalyst-core/pkg/metrics"
3535
"github.com/kubewharf/katalyst-core/pkg/util/general"
3636
"github.com/kubewharf/katalyst-core/pkg/util/machine"
37-
"github.com/kubewharf/katalyst-core/pkg/util/metric"
3837
)
3938

4039
type HeadroomAssemblerCommon struct {
@@ -104,18 +103,9 @@ func (ha *HeadroomAssemblerCommon) GetHeadroom() (resource.Quantity, error) {
104103
emptyNUMAs = emptyNUMAs.Difference(r.GetBindingNumas())
105104
}
106105

107-
reclaimPoolUtil := 0.0
108-
109106
// add non binding reclaim pool size
110-
reclaimPoolInfo, ok := ha.metaReader.GetPoolInfo(state.PoolNameReclaim)
111-
if ok && reclaimPoolInfo != nil {
112-
113-
reclaimedMetrics, err := ha.getPoolMetrics(state.PoolNameReclaim)
114-
if err != nil {
115-
return resource.Quantity{}, err
116-
}
117-
reclaimPoolUtil = reclaimedMetrics.coreAvgUtil
118-
107+
reclaimPoolInfo, reclaimPoolExist := ha.metaReader.GetPoolInfo(state.PoolNameReclaim)
108+
if reclaimPoolExist && reclaimPoolInfo != nil {
119109
reclaimPoolNUMAs := machine.GetCPUAssignmentNUMAs(reclaimPoolInfo.TopologyAwareAssignments)
120110

121111
sharedCoresHeadroom := 0.0
@@ -150,31 +140,15 @@ func (ha *HeadroomAssemblerCommon) GetHeadroom() (resource.Quantity, error) {
150140
general.InfoS("[qosaware-cpu] headroom assembled", "headroomTotal", headroomTotal, "backoffRetries",
151141
ha.backoffRetries, "util based enabled", dynamicConfig.CPUUtilBasedConfiguration.Enable)
152142

153-
// if util based cpu headroom disable, just return total reclaim pool size as headroom
154-
if !dynamicConfig.CPUUtilBasedConfiguration.Enable {
143+
// if util-based cpu headroom is disabled or the reclaim pool does not exist, just return the total reclaim pool size as headroom
144+
if !dynamicConfig.CPUUtilBasedConfiguration.Enable || !reclaimPoolExist || reclaimPoolInfo == nil {
155145
return *resource.NewQuantity(int64(headroomTotal), resource.DecimalSI), nil
156146
}
157147

158-
return ha.getUtilBasedHeadroom(dynamicConfig, int(headroomTotal), reclaimPoolUtil)
159-
}
160-
161-
type poolMetrics struct {
162-
coreAvgUtil float64
163-
poolSize int
164-
}
165-
166-
// getPoolMetrics get reclaimed pool metrics, including the average utilization of each core in
167-
// the reclaimed pool and the size of the pool
168-
func (ha *HeadroomAssemblerCommon) getPoolMetrics(poolName string) (*poolMetrics, error) {
169-
reclaimedInfo, ok := ha.metaReader.GetPoolInfo(poolName)
170-
if !ok {
171-
return nil, fmt.Errorf("failed get reclaim pool info")
148+
reclaimMetrics, err := helper.GetReclaimMetrics(reclaimPoolInfo.TopologyAwareAssignments.MergeCPUSet(), ha.conf.ReclaimRelativeRootCgroupPath, ha.metaServer.MetricsFetcher)
149+
if err != nil {
150+
return resource.Quantity{}, err
172151
}
173152

174-
cpuSet := reclaimedInfo.TopologyAwareAssignments.MergeCPUSet()
175-
m := ha.metaServer.AggregateCoreMetric(cpuSet, pkgconsts.MetricCPUUsageRatio, metric.AggregatorAvg)
176-
return &poolMetrics{
177-
coreAvgUtil: m.Value,
178-
poolSize: cpuSet.Size(),
179-
}, nil
153+
return ha.getUtilBasedHeadroom(dynamicConfig, reclaimMetrics)
180154
}

pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_common_test.go

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,7 @@ func TestHeadroomAssemblerCommon_GetHeadroom(t *testing.T) {
144144
for i := 0; i < 10; i++ {
145145
store.SetCPUMetric(i, pkgconsts.MetricCPUUsageRatio, utilmetric.MetricData{Value: 0.3, Time: &now})
146146
}
147+
store.SetCgroupMetric("/kubepods/besteffort", pkgconsts.MetricCPUUsageCgroup, utilmetric.MetricData{Value: 3, Time: &now})
147148
},
148149
setMetaCache: func(cache *metacache.MetaCacheImp) {
149150
err := cache.SetPoolInfo(state.PoolNameReclaim, &types.PoolInfo{
@@ -194,8 +195,9 @@ func TestHeadroomAssemblerCommon_GetHeadroom(t *testing.T) {
194195
},
195196
setFakeMetric: func(store *metric.FakeMetricsFetcher) {
196197
for i := 0; i < 10; i++ {
197-
store.SetCPUMetric(i, pkgconsts.MetricCPUUsageRatio, utilmetric.MetricData{Value: 0.3, Time: &now})
198+
store.SetCPUMetric(i, pkgconsts.MetricCPUUsageRatio, utilmetric.MetricData{Value: 0.8, Time: &now})
198199
}
200+
store.SetCgroupMetric("/kubepods/besteffort", pkgconsts.MetricCPUUsageCgroup, utilmetric.MetricData{Value: 1, Time: &now})
199201
store.SetContainerMetric("pod1", "container1", metric_consts.MetricCPUUsageContainer, metric_util.MetricData{Value: 4})
200202
},
201203
setMetaCache: func(cache *metacache.MetaCacheImp) {
@@ -217,7 +219,7 @@ func TestHeadroomAssemblerCommon_GetHeadroom(t *testing.T) {
217219
})
218220
},
219221
},
220-
want: *resource.NewQuantity(13, resource.DecimalSI),
222+
want: *resource.NewQuantity(5, resource.DecimalSI),
221223
},
222224
{
223225
name: "disable util based",
@@ -251,6 +253,7 @@ func TestHeadroomAssemblerCommon_GetHeadroom(t *testing.T) {
251253
for i := 0; i < 10; i++ {
252254
store.SetCPUMetric(i, pkgconsts.MetricCPUUsageRatio, utilmetric.MetricData{Value: 0.3, Time: &now})
253255
}
256+
store.SetCgroupMetric("/kubepods/besteffort", pkgconsts.MetricCPUUsageCgroup, utilmetric.MetricData{Value: 3, Time: &now})
254257
},
255258
setMetaCache: func(cache *metacache.MetaCacheImp) {
256259
err := cache.SetPoolInfo(state.PoolNameReclaim, &types.PoolInfo{
@@ -296,6 +299,7 @@ func TestHeadroomAssemblerCommon_GetHeadroom(t *testing.T) {
296299
for i := 0; i < 10; i++ {
297300
store.SetCPUMetric(i, pkgconsts.MetricCPUUsageRatio, utilmetric.MetricData{Time: &now})
298301
}
302+
store.SetCgroupMetric("/kubepods/besteffort", pkgconsts.MetricCPUUsageCgroup, utilmetric.MetricData{Value: 0, Time: &now})
299303
},
300304
setMetaCache: func(cache *metacache.MetaCacheImp) {
301305
err := cache.SetPoolInfo(state.PoolNameReclaim, &types.PoolInfo{
@@ -341,6 +345,7 @@ func TestHeadroomAssemblerCommon_GetHeadroom(t *testing.T) {
341345
for i := 0; i < 96; i++ {
342346
store.SetCPUMetric(i, pkgconsts.MetricCPUUsageRatio, utilmetric.MetricData{Value: 0.9, Time: &now})
343347
}
348+
store.SetCgroupMetric("/kubepods/besteffort", pkgconsts.MetricCPUUsageCgroup, utilmetric.MetricData{Value: 9, Time: &now})
344349
},
345350
setMetaCache: func(cache *metacache.MetaCacheImp) {
346351
err := cache.SetPoolInfo(state.PoolNameReclaim, &types.PoolInfo{
@@ -388,6 +393,7 @@ func TestHeadroomAssemblerCommon_GetHeadroom(t *testing.T) {
388393
for i := 0; i < 96; i++ {
389394
store.SetCPUMetric(i, pkgconsts.MetricCPUUsageRatio, utilmetric.MetricData{Value: 0.3, Time: &now})
390395
}
396+
store.SetCgroupMetric("/kubepods/besteffort", pkgconsts.MetricCPUUsageCgroup, utilmetric.MetricData{Value: 28.8, Time: &now})
391397
},
392398
setMetaCache: func(cache *metacache.MetaCacheImp) {
393399
err := cache.SetPoolInfo(state.PoolNameReclaim, &types.PoolInfo{

pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_common_util.go

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,22 +18,39 @@ package headroomassembler
1818

1919
import (
2020
"context"
21+
"fmt"
22+
"math"
2123

2224
"k8s.io/apimachinery/pkg/api/resource"
2325
"k8s.io/klog/v2"
2426

2527
"github.com/kubewharf/katalyst-api/pkg/consts"
2628
"github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/qosaware/resource/helper"
2729
"github.com/kubewharf/katalyst-core/pkg/config/agent/dynamic"
30+
metaserverHelper "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/metric/helper"
31+
"github.com/kubewharf/katalyst-core/pkg/util/general"
2832
)
2933

3034
func (ha *HeadroomAssemblerCommon) getUtilBasedHeadroom(dynamicConfig *dynamic.Configuration,
31-
poolSize int, util float64,
35+
reclaimMetrics *metaserverHelper.ReclaimMetrics,
3236
) (resource.Quantity, error) {
3337
lastReclaimedCPU, err := ha.getLastReclaimedCPU()
3438
if err != nil {
3539
return resource.Quantity{}, err
3640
}
41+
if reclaimMetrics == nil {
42+
return resource.Quantity{}, fmt.Errorf("reclaimMetrics is nil")
43+
}
44+
45+
if reclaimMetrics.ReclaimedCoresSupply == 0 {
46+
return *resource.NewQuantity(0, resource.DecimalSI), nil
47+
}
48+
49+
util := reclaimMetrics.CgroupCPUUsage / reclaimMetrics.ReclaimedCoresSupply
50+
51+
general.InfoS("getUtilBasedHeadroom", "reclaimedCoresSupply", reclaimMetrics.ReclaimedCoresSupply,
52+
"util", util, "reclaim PoolCPUUsage", reclaimMetrics.PoolCPUUsage, "reclaim CgroupCPUUsage", reclaimMetrics.CgroupCPUUsage,
53+
"lastReclaimedCPU", lastReclaimedCPU)
3754

3855
headroom, err := helper.EstimateUtilBasedCapacity(
3956
helper.UtilBasedCapacityOptions{
@@ -42,15 +59,15 @@ func (ha *HeadroomAssemblerCommon) getUtilBasedHeadroom(dynamicConfig *dynamic.C
4259
MaxOversoldRate: dynamicConfig.MaxOversoldRate,
4360
MaxCapacity: dynamicConfig.MaxHeadroomCapacityRate * float64(ha.metaServer.MachineInfo.NumCores),
4461
},
45-
float64(poolSize),
62+
reclaimMetrics.ReclaimedCoresSupply,
4663
util,
4764
lastReclaimedCPU,
4865
)
4966
if err != nil {
5067
return resource.Quantity{}, err
5168
}
5269

53-
return *resource.NewQuantity(int64(headroom), resource.DecimalSI), nil
70+
return *resource.NewQuantity(int64(math.Ceil(headroom)), resource.DecimalSI), nil
5471
}
5572

5673
func (ha *HeadroomAssemblerCommon) getLastReclaimedCPU() (float64, error) {

pkg/agent/sysadvisor/plugin/qosaware/resource/helper/estimation_canonical.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,8 @@ func EstimateUtilBasedCapacity(options UtilBasedCapacityOptions, resourceSupply,
217217
oversold = resourceSupply * (options.MaxUtilization - currentUtilization)
218218
}
219219

220+
// TODO: consider cpu PSI
221+
220222
result = math.Max(lastCapacityResult+oversold, resourceSupply)
221223
result = math.Min(result, resourceSupply*options.MaxOversoldRate)
222224
if options.MaxCapacity > 0 {
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
/*
2+
Copyright 2022 The Katalyst Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package helper
18+
19+
import (
20+
pkgconsts "github.com/kubewharf/katalyst-core/pkg/consts"
21+
"github.com/kubewharf/katalyst-core/pkg/metaserver/agent/metric/types"
22+
"github.com/kubewharf/katalyst-core/pkg/util/general"
23+
"github.com/kubewharf/katalyst-core/pkg/util/machine"
24+
"github.com/kubewharf/katalyst-core/pkg/util/metric"
25+
)
26+
27+
// ReclaimMetrics bundles the CPU metrics of the reclaim pool that
// GetReclaimMetrics computes for a given cpu set and cgroup path.
type ReclaimMetrics struct {
28+
// PoolCPUUsage is the sum of the per-core MetricCPUUsageRatio values over the cpus in the reclaim pool.
29+
PoolCPUUsage float64
30+
// CgroupCPUUsage is the MetricCPUUsageCgroup value of the root cgroup for reclaim pods.
31+
CgroupCPUUsage float64
32+
// Size is the reclaim pool size, i.e. the number of cpus in the pool's cpu set.
33+
Size int
34+
// ReclaimedCoresSupply is the CPU resource that can actually be supplied to reclaimed cores:
// the idle part of the pool (Size - PoolCPUUsage, floored at 0) plus CgroupCPUUsage.
35+
ReclaimedCoresSupply float64
36+
}
37+
38+
// GetReclaimMetrics returns the reclaim CPU metrics for the given cpus and cgroupPath.
//
// The pool usage is the sum (metric.AggregatorSum) of MetricCPUUsageRatio over cpus,
// and the cgroup usage is the MetricCPUUsageCgroup value read for cgroupPath.
// If reading the cgroup metric fails, that error is returned unchanged together
// with a nil result. The result of AggregateCoreMetric is used without any error check.
39+
func GetReclaimMetrics(cpus machine.CPUSet, cgroupPath string, metricsFetcher types.MetricsFetcher) (*ReclaimMetrics, error) {
40+
data := metricsFetcher.AggregateCoreMetric(cpus, pkgconsts.MetricCPUUsageRatio, metric.AggregatorSum)
41+
poolCPUUsage := data.Value
42+
43+
data, err := metricsFetcher.GetCgroupMetric(cgroupPath, pkgconsts.MetricCPUUsageCgroup)
44+
if err != nil {
45+
return nil, err
46+
}
47+
cgroupCPUUsage := data.Value
48+
49+
// when shared_cores overlap reclaimed_cores, the CPU resource that can actually be supplied to reclaimed cores
// is the idle part of the pool (core count minus summed usage ratio) plus the cpu usage of reclaimed_cores;
// the idle part goes through general.MaxFloat64 with 0, presumably to keep it non-negative (confirm against the helper)
50+
reclaimedCoresSupply := general.MaxFloat64(float64(cpus.Size())-poolCPUUsage, 0) + cgroupCPUUsage
51+
52+
return &ReclaimMetrics{
53+
PoolCPUUsage: poolCPUUsage,
54+
CgroupCPUUsage: cgroupCPUUsage,
55+
Size: cpus.Size(),
56+
ReclaimedCoresSupply: reclaimedCoresSupply,
57+
}, nil
58+
}

0 commit comments

Comments
 (0)