
Commit 7d9189e

Merge pull request #663 from cheney-lin/dev/refine_util_based

refine pressure_supression eviction and getUtilBasedHeadroom

2 parents: df45505 + 3f4179f

File tree: 7 files changed, +221 −104 lines (three of the changed files are shown below).

pkg/agent/qrm-plugins/cpu/dynamicpolicy/cpueviction/strategy/pressure_suppression.go

Lines changed: 23 additions & 7 deletions

@@ -31,6 +31,7 @@ import (
     "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/cpu/dynamicpolicy/state"
     "github.com/kubewharf/katalyst-core/pkg/config"
     "github.com/kubewharf/katalyst-core/pkg/metaserver"
+    "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/metric/helper"
     "github.com/kubewharf/katalyst-core/pkg/metrics"
     "github.com/kubewharf/katalyst-core/pkg/util/general"
     "github.com/kubewharf/katalyst-core/pkg/util/native"
@@ -40,18 +41,20 @@ import (
 const EvictionNameSuppression = "cpu-pressure-suppression-plugin"
 
 type CPUPressureSuppression struct {
-    conf  *config.Configuration
-    state state.ReadonlyState
+    conf       *config.Configuration
+    state      state.ReadonlyState
+    metaServer *metaserver.MetaServer
 
     lastToleranceTime sync.Map
 }
 
-func NewCPUPressureSuppressionEviction(_ metrics.MetricEmitter, _ *metaserver.MetaServer,
+func NewCPUPressureSuppressionEviction(_ metrics.MetricEmitter, metaServer *metaserver.MetaServer,
     conf *config.Configuration, state state.ReadonlyState,
 ) (CPUPressureEviction, error) {
     return &CPUPressureSuppression{
-        conf:  conf,
-        state: state,
+        conf:       conf,
+        state:      state,
+        metaServer: metaServer,
     }, nil
 }
@@ -90,6 +93,11 @@ func (p *CPUPressureSuppression) GetEvictPods(_ context.Context, request *plugin
         return &pluginapi.GetEvictPodsResponse{}, nil
     }
 
+    reclaimMetrics, err := helper.GetReclaimMetrics(poolCPUSet, p.conf.ReclaimRelativeRootCgroupPath, p.metaServer.MetricsFetcher)
+    if err != nil {
+        return nil, fmt.Errorf("get reclaim metrics failed: %s", err)
+    }
+
     filteredPods := native.FilterPods(request.ActivePods, p.conf.CheckReclaimedQoSForPod)
     if len(filteredPods) == 0 {
         return &pluginapi.GetEvictPodsResponse{}, nil
@@ -107,13 +115,21 @@ func (p *CPUPressureSuppression) GetEvictPods(_ context.Context, request *plugin
     for _, pod := range filteredPods {
         totalCPURequest.Add(native.CPUQuantityGetter()(native.SumUpPodRequestResources(pod)))
     }
-    general.Infof("total reclaim cpu request is %v, reclaim pool size is %v", totalCPURequest.String(), poolSize)
+
+    general.InfoS("info", "reclaim cpu request", totalCPURequest.String(),
+        "reclaim pool size", poolSize, "reclaimedCoresSupply", reclaimMetrics.ReclaimedCoresSupply,
+        "reclaimPoolUsage", reclaimMetrics.PoolCPUUsage, "reclaimedCoresUsage", reclaimMetrics.CgroupCPUUsage)
 
     now := time.Now()
     var evictPods []*v1alpha1.EvictPod
     for _, pod := range filteredPods {
         key := native.GenerateUniqObjectNameKey(pod)
-        poolSuppressionRate := float64(totalCPURequest.Value()) / float64(poolSize)
+        poolSuppressionRate := 0.0
+        if reclaimMetrics.ReclaimedCoresSupply == 0 {
+            poolSuppressionRate = math.MaxFloat64
+        } else {
+            poolSuppressionRate = float64(totalCPURequest.Value()) / reclaimMetrics.ReclaimedCoresSupply
+        }
 
         if podToleranceRate := p.getPodToleranceRate(pod, dynamicConfig.MaxSuppressionToleranceRate); podToleranceRate < poolSuppressionRate {
             last, _ := p.lastToleranceTime.LoadOrStore(key, now)
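
The functional change in this file is the denominator of the suppression rate: the plugin now divides the reclaimed pods' total CPU request by the measured ReclaimedCoresSupply from the reclaim metrics instead of the nominal pool size, and saturates the rate when the supply is zero. A minimal runnable sketch of that guard, where totalCPURequest and reclaimedCoresSupply are hypothetical stand-ins for the values the plugin derives from the pods' summed requests and helper.GetReclaimMetrics:

// suppression_rate_sketch.go: sketch of the refined suppression-rate guard.
package main

import (
    "fmt"
    "math"
)

func poolSuppressionRate(totalCPURequest, reclaimedCoresSupply float64) float64 {
    // Zero supply means any request is infinitely over-committed, so the
    // rate saturates and every finite pod tolerance rate is exceeded.
    if reclaimedCoresSupply == 0 {
        return math.MaxFloat64
    }
    return totalCPURequest / reclaimedCoresSupply
}

func main() {
    fmt.Println(poolSuppressionRate(8, 4)) // 2: requests are double the supply
    fmt.Println(poolSuppressionRate(8, 0)) // 1.7976931348623157e+308 (math.MaxFloat64)
}

A pod is then evictable when its per-pod tolerance rate falls below this pool-wide rate, which is why the zero-supply case flags every reclaimed pod.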

pkg/agent/qrm-plugins/cpu/dynamicpolicy/cpueviction/strategy/pressure_suppression_test.go

Lines changed: 101 additions & 57 deletions

@@ -36,9 +36,11 @@ import (
     evictionpluginapi "github.com/kubewharf/katalyst-api/pkg/protocol/evictionplugin/v1alpha1"
     qrmstate "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/cpu/dynamicpolicy/state"
     "github.com/kubewharf/katalyst-core/pkg/config"
+    pkgconsts "github.com/kubewharf/katalyst-core/pkg/consts"
     "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/metric"
     "github.com/kubewharf/katalyst-core/pkg/metrics"
     "github.com/kubewharf/katalyst-core/pkg/util/machine"
+    utilmetric "github.com/kubewharf/katalyst-core/pkg/util/metric"
 )
 
 const (
@@ -53,6 +55,7 @@ func makeSuppressionEvictionConf(cpuMaxSuppressionToleranceRate float64,
     conf.GetDynamicConfiguration().EnableSuppressionEviction = true
     conf.GetDynamicConfiguration().MaxSuppressionToleranceRate = cpuMaxSuppressionToleranceRate
     conf.GetDynamicConfiguration().MinSuppressionToleranceDuration = cpuMinSuppressionToleranceDuration
+    conf.ReclaimRelativeRootCgroupPath = "test"
     return conf
 }
 
@@ -77,15 +80,7 @@ func TestCPUPressureSuppression_GetEvictPods(t *testing.T) {
 
     as := require.New(t)
 
-    cpuTopology, err := machine.GenerateDummyCPUTopology(16, 2, 4)
-    as.Nil(err)
-    conf := makeSuppressionEvictionConf(defaultCPUMaxSuppressionToleranceRate, defaultCPUMinSuppressionToleranceDuration)
-    metaServer := makeMetaServer(metric.NewFakeMetricsFetcher(metrics.DummyMetrics{}), cpuTopology)
-    stateImpl, err := makeState(cpuTopology)
-    as.Nil(err)
-
-    plugin, _ := NewCPUPressureSuppressionEviction(metrics.DummyMetrics{}, metaServer, conf, stateImpl)
-    as.NotNil(plugin)
+    now := time.Now()
 
     pod1UID := string(uuid.NewUUID())
     pod1Name := "pod-1"
@@ -95,6 +90,7 @@ func TestCPUPressureSuppression_GetEvictPods(t *testing.T) {
     tests := []struct {
         name               string
         podEntries         qrmstate.PodEntries
+        setFakeMetric      func(store *metric.FakeMetricsFetcher)
         wantEvictPodUIDSet sets.String
     }{
         {
@@ -157,6 +153,20 @@ func TestCPUPressureSuppression_GetEvictPods(t *testing.T) {
                 },
             },
             wantEvictPodUIDSet: sets.NewString(),
+            setFakeMetric: func(store *metric.FakeMetricsFetcher) {
+                store.SetCPUMetric(1, pkgconsts.MetricCPUUsageRatio, utilmetric.MetricData{Value: 0.5, Time: &now})
+                store.SetCPUMetric(3, pkgconsts.MetricCPUUsageRatio, utilmetric.MetricData{Value: 0.5, Time: &now})
+                store.SetCPUMetric(4, pkgconsts.MetricCPUUsageRatio, utilmetric.MetricData{Value: 0.5, Time: &now})
+                store.SetCPUMetric(5, pkgconsts.MetricCPUUsageRatio, utilmetric.MetricData{Value: 0.5, Time: &now})
+                store.SetCPUMetric(6, pkgconsts.MetricCPUUsageRatio, utilmetric.MetricData{Value: 0.5, Time: &now})
+                store.SetCPUMetric(9, pkgconsts.MetricCPUUsageRatio, utilmetric.MetricData{Value: 0.5, Time: &now})
+                store.SetCPUMetric(11, pkgconsts.MetricCPUUsageRatio, utilmetric.MetricData{Value: 0.5, Time: &now})
+                store.SetCPUMetric(12, pkgconsts.MetricCPUUsageRatio, utilmetric.MetricData{Value: 0.5, Time: &now})
+                store.SetCPUMetric(13, pkgconsts.MetricCPUUsageRatio, utilmetric.MetricData{Value: 0.5, Time: &now})
+                store.SetCPUMetric(14, pkgconsts.MetricCPUUsageRatio, utilmetric.MetricData{Value: 0.5, Time: &now})
+
+                store.SetCgroupMetric("test", pkgconsts.MetricCPUUsageCgroup, utilmetric.MetricData{Value: 5, Time: &now})
+            },
         },
         {
             name: "over tolerance rate",
@@ -253,72 +263,106 @@
                 },
             },
             wantEvictPodUIDSet: sets.NewString(pod1UID),
+            setFakeMetric: func(store *metric.FakeMetricsFetcher) {
+                store.SetCPUMetric(1, pkgconsts.MetricCPUUsageRatio, utilmetric.MetricData{Value: 0.5, Time: &now})
+                store.SetCPUMetric(3, pkgconsts.MetricCPUUsageRatio, utilmetric.MetricData{Value: 0.5, Time: &now})
+                store.SetCPUMetric(4, pkgconsts.MetricCPUUsageRatio, utilmetric.MetricData{Value: 0.5, Time: &now})
+                store.SetCPUMetric(5, pkgconsts.MetricCPUUsageRatio, utilmetric.MetricData{Value: 0.5, Time: &now})
+                store.SetCPUMetric(6, pkgconsts.MetricCPUUsageRatio, utilmetric.MetricData{Value: 0.5, Time: &now})
+                store.SetCPUMetric(9, pkgconsts.MetricCPUUsageRatio, utilmetric.MetricData{Value: 0.5, Time: &now})
+                store.SetCPUMetric(11, pkgconsts.MetricCPUUsageRatio, utilmetric.MetricData{Value: 0.5, Time: &now})
+                store.SetCPUMetric(12, pkgconsts.MetricCPUUsageRatio, utilmetric.MetricData{Value: 0.5, Time: &now})
+                store.SetCPUMetric(13, pkgconsts.MetricCPUUsageRatio, utilmetric.MetricData{Value: 0.5, Time: &now})
+                store.SetCPUMetric(14, pkgconsts.MetricCPUUsageRatio, utilmetric.MetricData{Value: 0.5, Time: &now})
+
+                store.SetCgroupMetric("test", pkgconsts.MetricCPUUsageCgroup, utilmetric.MetricData{Value: 5, Time: &now})
+            },
         },
     }
 
     for _, tt := range tests {
-        stateImpl, err := makeState(cpuTopology)
-        as.Nil(err)
+        tt := tt
+        t.Run(tt.name, func(t *testing.T) {
+            t.Parallel()
 
-        pods := make([]*v1.Pod, 0, len(tt.podEntries))
+            cpuTopology, err := machine.GenerateDummyCPUTopology(16, 2, 4)
+            as.Nil(err)
+            conf := makeSuppressionEvictionConf(defaultCPUMaxSuppressionToleranceRate, defaultCPUMinSuppressionToleranceDuration)
 
-        for entryName, entries := range tt.podEntries {
-            for subEntryName, entry := range entries {
-                stateImpl.SetAllocationInfo(entryName, subEntryName, entry)
+            metricsFetcher := metric.NewFakeMetricsFetcher(metrics.DummyMetrics{})
+            store := metricsFetcher.(*metric.FakeMetricsFetcher)
 
-                if entries.IsPoolEntry() {
-                    continue
-                }
+            metaServer := makeMetaServer(metricsFetcher, cpuTopology)
+            stateImpl, err := makeState(cpuTopology)
+            as.Nil(err)
 
-                pod := &v1.Pod{
-                    ObjectMeta: metav1.ObjectMeta{
-                        UID:         types.UID(entry.PodUid),
-                        Name:        entry.PodName,
-                        Namespace:   entry.PodNamespace,
-                        Annotations: maputil.CopySS(entry.Annotations),
-                        Labels:      maputil.CopySS(entry.Labels),
-                    },
-                    Spec: v1.PodSpec{
-                        Containers: []v1.Container{
-                            {
-                                Name: entry.ContainerName,
-                                Resources: v1.ResourceRequirements{
-                                    Requests: v1.ResourceList{
-                                        apiconsts.ReclaimedResourceMilliCPU: *resource.NewQuantity(int64(entry.RequestQuantity*1000), resource.DecimalSI),
-                                    },
-                                    Limits: v1.ResourceList{
-                                        apiconsts.ReclaimedResourceMilliCPU: *resource.NewQuantity(int64(entry.RequestQuantity*1000), resource.DecimalSI),
+            plugin, _ := NewCPUPressureSuppressionEviction(metrics.DummyMetrics{}, metaServer, conf, stateImpl)
+            as.NotNil(plugin)
+
+            pods := make([]*v1.Pod, 0, len(tt.podEntries))
+
+            if tt.setFakeMetric != nil {
+                tt.setFakeMetric(store)
+            }
+
+            for entryName, entries := range tt.podEntries {
+                for subEntryName, entry := range entries {
+                    stateImpl.SetAllocationInfo(entryName, subEntryName, entry)
+
+                    if entries.IsPoolEntry() {
+                        continue
+                    }
+
+                    pod := &v1.Pod{
+                        ObjectMeta: metav1.ObjectMeta{
+                            UID:         types.UID(entry.PodUid),
+                            Name:        entry.PodName,
+                            Namespace:   entry.PodNamespace,
+                            Annotations: maputil.CopySS(entry.Annotations),
+                            Labels:      maputil.CopySS(entry.Labels),
+                        },
+                        Spec: v1.PodSpec{
+                            Containers: []v1.Container{
+                                {
+                                    Name: entry.ContainerName,
+                                    Resources: v1.ResourceRequirements{
+                                        Requests: v1.ResourceList{
+                                            apiconsts.ReclaimedResourceMilliCPU: *resource.NewQuantity(int64(entry.RequestQuantity*1000), resource.DecimalSI),
+                                        },
+                                        Limits: v1.ResourceList{
+                                            apiconsts.ReclaimedResourceMilliCPU: *resource.NewQuantity(int64(entry.RequestQuantity*1000), resource.DecimalSI),
+                                        },
                                     },
                                 },
                             },
                         },
-                    },
-                }
+                    }
 
-                pods = append(pods, pod)
+                    pods = append(pods, pod)
+                }
             }
-        }
 
-        plugin.(*CPUPressureSuppression).state = stateImpl
+            plugin.(*CPUPressureSuppression).state = stateImpl
 
-        resp, err := plugin.GetEvictPods(context.TODO(), &evictionpluginapi.GetEvictPodsRequest{
-            ActivePods: pods,
-        })
-        assert.NoError(t, err)
-        assert.NotNil(t, resp)
+            resp, err := plugin.GetEvictPods(context.TODO(), &evictionpluginapi.GetEvictPodsRequest{
+                ActivePods: pods,
+            })
+            assert.NoError(t, err)
+            assert.NotNil(t, resp)
+
+            time.Sleep(defaultCPUMinSuppressionToleranceDuration)
 
-        time.Sleep(defaultCPUMinSuppressionToleranceDuration)
+            resp, err = plugin.GetEvictPods(context.TODO(), &evictionpluginapi.GetEvictPodsRequest{
+                ActivePods: pods,
+            })
+            assert.NoError(t, err)
+            assert.NotNil(t, resp)
 
-        resp, err = plugin.GetEvictPods(context.TODO(), &evictionpluginapi.GetEvictPodsRequest{
-            ActivePods: pods,
+            evictPodUIDSet := sets.String{}
+            for _, pod := range resp.EvictPods {
+                evictPodUIDSet.Insert(string(pod.Pod.GetUID()))
+            }
+            assert.Equal(t, tt.wantEvictPodUIDSet, evictPodUIDSet)
         })
-        assert.NoError(t, err)
-        assert.NotNil(t, resp)
-
-        evictPodUIDSet := sets.String{}
-        for _, pod := range resp.EvictPods {
-            evictPodUIDSet.Insert(string(pod.Pod.GetUID()))
-        }
-        assert.Equal(t, tt.wantEvictPodUIDSet, evictPodUIDSet)
     }
 }
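
Beyond wiring fake metrics into each case, the test loop above is reshaped into parallel table-driven subtests. The tt := tt rebinding matters here: before Go 1.22's per-iteration loop variables, the range variable was reused across iterations, so each parallel closure must capture its own copy. The pattern in isolation, with placeholder cases:

// pattern_sketch_test.go: isolated sketch of the parallel subtest pattern
// the test adopts; the cases and body are placeholders.
package main

import "testing"

func TestPattern(t *testing.T) {
    tests := []struct{ name string }{{name: "a"}, {name: "b"}}
    for _, tt := range tests {
        tt := tt // capture a per-iteration copy for the parallel closure
        t.Run(tt.name, func(t *testing.T) {
            t.Parallel() // subtest runs concurrently with its siblings
            _ = tt.name  // safe: tt is this iteration's copy
        })
    }
}

Moving the topology, config, metaserver, and plugin construction inside t.Run also gives each subtest independent state, which is what makes t.Parallel() safe here.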

pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_common.go

Lines changed: 9 additions & 35 deletions

@@ -29,12 +29,11 @@ import (
     "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/region"
     "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/types"
     "github.com/kubewharf/katalyst-core/pkg/config"
-    pkgconsts "github.com/kubewharf/katalyst-core/pkg/consts"
     "github.com/kubewharf/katalyst-core/pkg/metaserver"
+    "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/metric/helper"
     "github.com/kubewharf/katalyst-core/pkg/metrics"
     "github.com/kubewharf/katalyst-core/pkg/util/general"
     "github.com/kubewharf/katalyst-core/pkg/util/machine"
-    "github.com/kubewharf/katalyst-core/pkg/util/metric"
 )
 
 type HeadroomAssemblerCommon struct {
@@ -104,18 +103,9 @@ func (ha *HeadroomAssemblerCommon) GetHeadroom() (resource.Quantity, error) {
         emptyNUMAs = emptyNUMAs.Difference(r.GetBindingNumas())
     }
 
-    reclaimPoolUtil := 0.0
-
     // add non binding reclaim pool size
-    reclaimPoolInfo, ok := ha.metaReader.GetPoolInfo(state.PoolNameReclaim)
-    if ok && reclaimPoolInfo != nil {
-
-        reclaimedMetrics, err := ha.getPoolMetrics(state.PoolNameReclaim)
-        if err != nil {
-            return resource.Quantity{}, err
-        }
-        reclaimPoolUtil = reclaimedMetrics.coreAvgUtil
-
+    reclaimPoolInfo, reclaimPoolExist := ha.metaReader.GetPoolInfo(state.PoolNameReclaim)
+    if reclaimPoolExist && reclaimPoolInfo != nil {
         reclaimPoolNUMAs := machine.GetCPUAssignmentNUMAs(reclaimPoolInfo.TopologyAwareAssignments)
 
         sharedCoresHeadroom := 0.0
@@ -150,31 +140,15 @@
     general.InfoS("[qosaware-cpu] headroom assembled", "headroomTotal", headroomTotal, "backoffRetries",
         ha.backoffRetries, "util based enabled", dynamicConfig.CPUUtilBasedConfiguration.Enable)
 
-    // if util based cpu headroom disable, just return total reclaim pool size as headroom
-    if !dynamicConfig.CPUUtilBasedConfiguration.Enable {
+    // if util based cpu headroom disable or reclaim pool not existed, just return total reclaim pool size as headroom
+    if !dynamicConfig.CPUUtilBasedConfiguration.Enable || !reclaimPoolExist || reclaimPoolInfo == nil {
         return *resource.NewQuantity(int64(headroomTotal), resource.DecimalSI), nil
     }
 
-    return ha.getUtilBasedHeadroom(dynamicConfig, int(headroomTotal), reclaimPoolUtil)
-}
-
-type poolMetrics struct {
-    coreAvgUtil float64
-    poolSize    int
-}
-
-// getPoolMetrics get reclaimed pool metrics, including the average utilization of each core in
-// the reclaimed pool and the size of the pool
-func (ha *HeadroomAssemblerCommon) getPoolMetrics(poolName string) (*poolMetrics, error) {
-    reclaimedInfo, ok := ha.metaReader.GetPoolInfo(poolName)
-    if !ok {
-        return nil, fmt.Errorf("failed get reclaim pool info")
+    reclaimMetrics, err := helper.GetReclaimMetrics(reclaimPoolInfo.TopologyAwareAssignments.MergeCPUSet(), ha.conf.ReclaimRelativeRootCgroupPath, ha.metaServer.MetricsFetcher)
+    if err != nil {
+        return resource.Quantity{}, err
     }
 
-    cpuSet := reclaimedInfo.TopologyAwareAssignments.MergeCPUSet()
-    m := ha.metaServer.AggregateCoreMetric(cpuSet, pkgconsts.MetricCPUUsageRatio, metric.AggregatorAvg)
-    return &poolMetrics{
-        coreAvgUtil: m.Value,
-        poolSize:    cpuSet.Size(),
-    }, nil
+    return ha.getUtilBasedHeadroom(dynamicConfig, reclaimMetrics)
 }
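
This change folds the file-local getPoolMetrics helper into the shared helper.GetReclaimMetrics (the same helper the eviction plugin now calls) and extends the fallback branch to cover a missing reclaim pool. The resulting decision flow, sketched with simplified types; utilBasedHeadroom below is a hypothetical stand-in for getUtilBasedHeadroom, whose body this diff does not show:

// headroom_flow_sketch.go: sketch of the reshaped GetHeadroom decision flow.
package main

import "fmt"

// reclaimMetrics mirrors, loosely, the fields logged by the eviction plugin.
type reclaimMetrics struct {
    reclaimedCoresSupply float64
    poolCPUUsage         float64
}

// headroom falls back to the plain pool-size headroom when util-based
// headroom is disabled or there is no reclaim pool to measure; otherwise it
// derives headroom from the measured reclaim metrics.
func headroom(utilBasedEnabled, reclaimPoolExist bool, headroomTotal float64, m reclaimMetrics) float64 {
    if !utilBasedEnabled || !reclaimPoolExist {
        return headroomTotal
    }
    return utilBasedHeadroom(m)
}

// utilBasedHeadroom is hypothetical: it treats unused supply as headroom,
// standing in for whatever getUtilBasedHeadroom actually computes.
func utilBasedHeadroom(m reclaimMetrics) float64 {
    return m.reclaimedCoresSupply - m.poolCPUUsage
}

func main() {
    m := reclaimMetrics{reclaimedCoresSupply: 8, poolCPUUsage: 3}
    fmt.Println(headroom(false, true, 10, m)) // 10: util-based disabled, nominal size wins
    fmt.Println(headroom(true, true, 10, m))  // 5: measured supply minus usage
}

Passing the whole reclaimMetrics struct to getUtilBasedHeadroom, instead of the old (headroomTotal, reclaimPoolUtil) pair, is what lets both call sites share one metrics source.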
