Skip to content

Commit 4a40e7e

Browse files
authored
Remove patching with null value for descheduling strategy (intel#108)
Signed-off-by: Madalina Lazar <madalina.lazar@intel.com>
1 parent 9a41b72 commit 4a40e7e

File tree

2 files changed

+227
-55
lines changed

2 files changed

+227
-55
lines changed

telemetry-aware-scheduling/pkg/strategies/deschedule/enforce.go

Lines changed: 19 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,10 @@ import (
1717
)
1818

1919
const (
20-
l2 = 2
21-
l4 = 4
20+
l2 = 2
21+
l4 = 4
22+
failNodeListCleanUpMessage = "failed to list nodes during clean-up"
23+
failNodeListEnforceMessage = "failed to list all nodes during enforce"
2224
)
2325

2426
var errNull = errors.New("")
@@ -40,7 +42,7 @@ func (d *Strategy) Cleanup(enforcer *strategy.MetricEnforcer, policyName string)
4042
msg := fmt.Sprintf("cannot list nodes: %v", err)
4143
klog.V(l2).InfoS(msg, "component", "controller")
4244

43-
return fmt.Errorf("failed to cleanup node labels: %w", err)
45+
return fmt.Errorf("%s: %w", failNodeListCleanUpMessage, err)
4446
}
4547

4648
for _, node := range nodes.Items {
@@ -74,7 +76,7 @@ func (d *Strategy) Enforce(enforcer *strategy.MetricEnforcer, cache cache.Reader
7476
msg := fmt.Sprintf("cannot list nodes: %v", err)
7577
klog.V(l2).InfoS(msg, "component", "controller")
7678

77-
return -1, fmt.Errorf("failed to enforce strategy: %w", err)
79+
return -1, fmt.Errorf("%s: %w", failNodeListEnforceMessage, err)
7880
}
7981

8082
list := d.nodeStatusForStrategy(enforcer, cache)
@@ -147,31 +149,28 @@ func (d *Strategy) updateNodeLabels(enforcer *strategy.MetricEnforcer, viols vio
147149

148150
for policyName := range nonViolatedPolicies {
149151
if _, ok := node.Labels[policyName]; ok {
150-
// There is a duplication of work here - both label added as null and label removed. Due to some oddness in behaviour on remove label.
151-
//TODO: Decide which behaviour is better. This leaves a constant label on every node for every strategy in the enforcer.
152152
payload = append(payload,
153153
patchValue{
154-
Op: "remove",
155-
Path: "/metadata/labels/" + policyName,
154+
Op: "remove",
155+
Path: "/metadata/labels/" + policyName,
156+
Value: "",
156157
})
157-
payload = append(payload, patchValue{
158-
Op: "add",
159-
Path: "/metadata/labels/" + policyName,
160-
Value: "null",
161-
})
162158
}
163159
totalViolations++
164160
}
165161

166-
err := d.patchNode(node.Name, enforcer, payload)
167-
if err != nil {
168-
if len(labelErrs) == 0 {
169-
labelErrs = "could not label: "
170-
}
162+
if len(payload) != 0 {
163+
err := d.patchNode(node.Name, enforcer, payload)
171164

172-
klog.V(l4).InfoS(err.Error(), "component", "controller")
165+
if err != nil {
166+
if len(labelErrs) == 0 {
167+
labelErrs = "could not label: "
168+
}
173169

174-
labelErrs = labelErrs + node.Name + ": [ " + violatedPolicies + " ]; "
170+
klog.V(l4).InfoS(err.Error(), "component", "controller")
171+
172+
labelErrs = labelErrs + node.Name + ": [ " + violatedPolicies + " ]; "
173+
}
175174
}
176175

177176
if len(violatedPolicies) > 0 {

telemetry-aware-scheduling/pkg/strategies/deschedule/enforce_test.go

Lines changed: 208 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,16 @@ package deschedule
22

33
import (
44
"context"
5+
"errors"
56
"fmt"
7+
"reflect"
8+
"strings"
69
"testing"
710
"time"
811

9-
"k8s.io/klog/v2"
12+
"k8s.io/apimachinery/pkg/runtime"
13+
"k8s.io/client-go/kubernetes/typed/core/v1/fake"
14+
k8stesting "k8s.io/client-go/testing"
1015

1116
"github.com/intel/platform-aware-scheduling/telemetry-aware-scheduling/pkg/cache"
1217
"github.com/intel/platform-aware-scheduling/telemetry-aware-scheduling/pkg/metrics"
@@ -19,81 +24,249 @@ import (
1924
_ "k8s.io/client-go/plugin/pkg/client/auth"
2025
)
2126

27+
var errMockTest = errors.New("error when calling list")
28+
29+
type expected struct {
30+
nodes map[string]map[string]string // node name: string -> labels: map[string]string
31+
labeledNodes map[string]map[string]string // node name: string -> labels: map[string]string
32+
}
33+
34+
type CacheMetric struct {
35+
metricName string
36+
metricValue int64
37+
}
38+
39+
func assertViolatingNodes(t *testing.T, nodeList *v1.NodeList, wantNodes map[string]map[string]string) {
40+
t.Helper()
41+
42+
nodes := nodeList.Items
43+
// check lengths are equal
44+
if len(nodes) != len(wantNodes) {
45+
t.Errorf("Number of violating nodes returned: %v not as expected: %v", len(nodes), len(wantNodes))
46+
}
47+
48+
// check if the nodes are similar
49+
for _, node := range nodes {
50+
currentNodeName := node.Name
51+
currentNodeLabels := node.Labels
52+
53+
if wantNodes[currentNodeName] == nil {
54+
t.Errorf("Expected to find node %s in list of expected nodes, but wasn't there.", currentNodeName)
55+
}
56+
57+
expectedNodeLabels := wantNodes[node.Name]
58+
if !reflect.DeepEqual(expectedNodeLabels, currentNodeLabels) {
59+
t.Errorf("Labels for node were different, expected %v got: %v", expectedNodeLabels, currentNodeLabels)
60+
}
61+
}
62+
}
63+
2264
func TestDescheduleStrategy_Enforce(t *testing.T) {
2365
type args struct {
2466
enforcer *strategy.MetricEnforcer
2567
cache cache.ReaderWriter
2668
}
2769

28-
type expected struct {
29-
nodeNames []string
30-
}
70+
clientWithListNodeException := testclient.NewSimpleClientset()
71+
clientWithListNodeException.CoreV1().(*fake.FakeCoreV1).PrependReactor("list", "nodes",
72+
func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) {
73+
return true, &v1.NodeList{}, errMockTest
74+
})
3175

3276
tests := []struct {
33-
name string
34-
d *Strategy
35-
node *v1.Node
36-
args args
37-
wantErr bool
38-
want expected
77+
name string
78+
d *Strategy
79+
nodes []*v1.Node
80+
args args
81+
wantErr bool
82+
want expected
83+
cacheMetrics map[string]CacheMetric // node name: string -> metric : { metricName, metricValue }
84+
wantErrMessageToken string
3985
}{
4086
{name: "node label test",
4187
d: &Strategy{PolicyName: "deschedule-test", Rules: []telpol.TASPolicyRule{
4288
{Metricname: "memory", Operator: "GreaterThan", Target: 1},
4389
{Metricname: "cpu", Operator: "LessThan", Target: 10}}},
44-
node: &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node-1", Labels: map[string]string{"deschedule-test": ""}}},
90+
nodes: []*v1.Node{{ObjectMeta: metav1.ObjectMeta{Name: "node-1", Labels: map[string]string{"deschedule-test": "", "node-1-label": "test"}}},
91+
{ObjectMeta: metav1.ObjectMeta{Name: "node-2", Labels: map[string]string{"deschedule-test": "violating", "node-2-label": "test"}}},
92+
{ObjectMeta: metav1.ObjectMeta{Name: "node-3", Labels: map[string]string{"node-3-label": "test"}}}},
93+
cacheMetrics: map[string]CacheMetric{"node-2": {"cpu", 5}, "node-3": {"memory", 100}},
4594
args: args{enforcer: strategy.NewEnforcer(testclient.NewSimpleClientset()),
4695
cache: cache.MockEmptySelfUpdatingCache()},
47-
want: expected{nodeNames: []string{"node-1"}}},
96+
want: expected{
97+
nodes: map[string]map[string]string{"node-1": {"node-1-label": "test"},
98+
"node-2": {"deschedule-test": "violating", "node-2-label": "test"},
99+
"node-3": {"deschedule-test": "violating", "node-3-label": "test"}},
100+
labeledNodes: map[string]map[string]string{"node-2": {"deschedule-test": "violating", "node-2-label": "test"},
101+
"node-3": {"deschedule-test": "violating", "node-3-label": "test"}},
102+
}},
48103
{name: "node unlabel test",
49104
d: &Strategy{PolicyName: "deschedule-test", Rules: []telpol.TASPolicyRule{
50105
{Metricname: "memory", Operator: "GreaterThan", Target: 1000},
51106
{Metricname: "cpu", Operator: "LessThan", Target: 10}}},
52-
node: &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node-1", Labels: map[string]string{"deschedule-test": "violating"}}},
107+
nodes: []*v1.Node{{ObjectMeta: metav1.ObjectMeta{Name: "node-1", Labels: map[string]string{"deschedule-test": "violating", "node-1-label": "test"}}},
108+
{ObjectMeta: metav1.ObjectMeta{Name: "node-2", Labels: map[string]string{"deschedule-test": "violating", "node-2-label": "test"}}},
109+
{ObjectMeta: metav1.ObjectMeta{Name: "node-3", Labels: map[string]string{"node-3-label": "test"}}}},
110+
cacheMetrics: map[string]CacheMetric{"node-2": {"cpu", 11}, "node-3": {"memory", 100}},
53111
args: args{enforcer: strategy.NewEnforcer(testclient.NewSimpleClientset()),
54112
cache: cache.MockEmptySelfUpdatingCache()},
55-
want: expected{nodeNames: []string{}}},
113+
want: expected{
114+
nodes: map[string]map[string]string{"node-1": {"node-1-label": "test"},
115+
"node-2": {"node-2-label": "test"},
116+
"node-3": {"node-3-label": "test"}},
117+
labeledNodes: map[string]map[string]string{}}},
118+
{name: "list nodes with exception",
119+
d: &Strategy{PolicyName: "deschedule-test", Rules: []telpol.TASPolicyRule{
120+
{Metricname: "memory", Operator: "GreaterThan", Target: 1000},
121+
{Metricname: "cpu", Operator: "LessThan", Target: 10}}},
122+
nodes: []*v1.Node{{ObjectMeta: metav1.ObjectMeta{Name: "node-1", Labels: map[string]string{"deschedule-test": "violating", "node-1-label": "test"}}},
123+
{ObjectMeta: metav1.ObjectMeta{Name: "node-2", Labels: map[string]string{"deschedule-test": "violating", "node-2-label": "test"}}},
124+
{ObjectMeta: metav1.ObjectMeta{Name: "node-3", Labels: map[string]string{"node-3-label": "test"}}}},
125+
cacheMetrics: map[string]CacheMetric{"node-2": {"cpu", 11}, "node-3": {"memory", 100}},
126+
args: args{enforcer: strategy.NewEnforcer(clientWithListNodeException),
127+
cache: cache.MockEmptySelfUpdatingCache()},
128+
want: expected{},
129+
wantErr: true,
130+
wantErrMessageToken: failNodeListEnforceMessage},
56131
}
57132
for _, tt := range tests {
58133
tt := tt
59134

60-
err := tt.args.cache.WriteMetric("memory", metrics.NodeMetricsInfo{
61-
"node-1": {Timestamp: time.Now(), Window: 1, Value: *resource.NewQuantity(100, resource.DecimalSI)}})
62-
if err != nil {
63-
t.Errorf("Cannot write metric to mock cach for test: %v", err)
135+
for metricNodeName, metric := range tt.cacheMetrics {
136+
err := tt.args.cache.WriteMetric(metric.metricName, metrics.NodeMetricsInfo{
137+
metricNodeName: {Timestamp: time.Now(), Window: 1, Value: *resource.NewQuantity(metric.metricValue, resource.DecimalSI)}})
138+
if err != nil {
139+
t.Errorf("Cannot write metric %s to mock cache for test: %v", metricNodeName, err)
140+
}
64141
}
65142

66-
_, err = tt.args.enforcer.KubeClient.CoreV1().Nodes().Create(context.TODO(), tt.node, metav1.CreateOptions{})
67-
if err != nil {
68-
t.Errorf("Cannot write metric to mock cach for test: %v", err)
143+
// create nodes
144+
for _, node := range tt.nodes {
145+
_, err := tt.args.enforcer.KubeClient.CoreV1().Nodes().Create(context.TODO(), node, metav1.CreateOptions{})
146+
if err != nil {
147+
t.Errorf("Cannot create node %s : %v", node.Name, err)
148+
}
69149
}
70150

71151
tt.args.enforcer.RegisterStrategyType(tt.d)
72152
tt.args.enforcer.AddStrategy(tt.d, tt.d.StrategyType())
73153
t.Run(tt.name, func(t *testing.T) {
74-
got := []string{}
75154
if _, err := tt.d.Enforce(tt.args.enforcer, tt.args.cache); (err != nil) != tt.wantErr {
76-
t.Errorf("Strategy.Enforce() error = %v, wantErr %v", err, tt.wantErr)
155+
if !strings.Contains(err.Error(), tt.wantErrMessageToken) {
156+
t.Errorf("Expecting output to match wantErr %v, instead got %v", tt.wantErrMessageToken, err)
157+
158+
return
159+
}
160+
t.Errorf("Unexpected exception while trying to call Enforce %v", err)
161+
162+
return
77163
}
78164

79-
labelledNodes, err := tt.args.enforcer.KubeClient.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{LabelSelector: "deschedule-test=violating"})
80-
if err != nil {
81-
if !tt.wantErr {
82-
t.Errorf("Strategy.Enforce() error = %v, wantErr %v", err, tt.wantErr)
165+
if !tt.wantErr {
166+
// violating nodes
167+
labeledNodes, err := tt.args.enforcer.KubeClient.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{LabelSelector: "deschedule-test=violating"})
168+
if err != nil && !tt.wantErr {
169+
t.Errorf("Unexpected exception while trying to fetch violating nodes %v", err)
83170

84171
return
85172
}
173+
assertViolatingNodes(t, labeledNodes, tt.want.labeledNodes)
174+
nodes, _ := tt.args.enforcer.KubeClient.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
175+
assertViolatingNodes(t, nodes, tt.want.nodes)
176+
}
177+
})
178+
}
179+
}
180+
181+
func TestDescheduleStrategy_Cleanup(t *testing.T) {
182+
type args struct {
183+
enforcer *strategy.MetricEnforcer
184+
cache cache.ReaderWriter
185+
}
186+
187+
clientWithException := testclient.NewSimpleClientset()
188+
clientWithException.CoreV1().(*fake.FakeCoreV1).PrependReactor("list", "nodes", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) {
189+
return true, &v1.NodeList{}, errMockTest
190+
})
191+
192+
tests := []struct {
193+
name string
194+
d *Strategy
195+
node *v1.Node
196+
args args
197+
wantErr bool
198+
wantErrMessageToken string
199+
want expected
200+
}{
201+
{name: "node with violating label",
202+
d: &Strategy{PolicyName: "deschedule-test", Rules: []telpol.TASPolicyRule{
203+
{Metricname: "memory", Operator: "GreaterThan", Target: 1},
204+
{Metricname: "cpu", Operator: "LessThan", Target: 10}}},
205+
node: &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node-1", Labels: map[string]string{"deschedule-test": "violating", "node-1-label": "test"}}},
206+
args: args{enforcer: strategy.NewEnforcer(testclient.NewSimpleClientset()),
207+
cache: cache.MockEmptySelfUpdatingCache()},
208+
want: expected{
209+
nodes: map[string]map[string]string{"node-1": {"node-1-label": "test"}},
210+
labeledNodes: map[string]map[string]string{},
211+
}},
212+
{name: "node without violating label",
213+
d: &Strategy{PolicyName: "deschedule-test", Rules: []telpol.TASPolicyRule{
214+
{Metricname: "memory", Operator: "GreaterThan", Target: 1000},
215+
{Metricname: "cpu", Operator: "LessThan", Target: 10}}},
216+
node: &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node-2", Labels: map[string]string{"deschedule-test": "", "node-2-label": "test"}}},
217+
args: args{enforcer: strategy.NewEnforcer(testclient.NewSimpleClientset()),
218+
cache: cache.MockEmptySelfUpdatingCache()},
219+
want: expected{
220+
nodes: map[string]map[string]string{"node-2": {"deschedule-test": "", "node-2-label": "test"}},
221+
labeledNodes: map[string]map[string]string{},
222+
}},
223+
{name: "list nodes throws an error",
224+
d: &Strategy{PolicyName: "deschedule-test", Rules: []telpol.TASPolicyRule{
225+
{Metricname: "memory", Operator: "GreaterThan", Target: 1000},
226+
{Metricname: "cpu", Operator: "LessThan", Target: 10}}},
227+
node: &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node-2", Labels: map[string]string{"deschedule-test": "", "test": "label"}}},
228+
args: args{enforcer: strategy.NewEnforcer(clientWithException),
229+
cache: cache.MockEmptySelfUpdatingCache()},
230+
wantErr: true,
231+
wantErrMessageToken: failNodeListCleanUpMessage,
232+
want: expected{}},
233+
}
234+
235+
for _, tt := range tests {
236+
tt := tt
237+
238+
_, err := tt.args.enforcer.KubeClient.CoreV1().Nodes().Create(context.TODO(), tt.node, metav1.CreateOptions{})
239+
if err != nil {
240+
t.Errorf("Cannot write metric to mock cach for test: %v", err)
241+
}
242+
243+
t.Run(tt.name, func(t *testing.T) {
244+
if err := tt.d.Cleanup(tt.args.enforcer, tt.d.PolicyName); (err != nil) != tt.wantErr {
245+
if !strings.Contains(fmt.Sprint(err.Error()), tt.wantErrMessageToken) {
246+
t.Errorf("Expecting output to match wantErr %v, instead got %v", tt.wantErrMessageToken, err)
247+
248+
return
249+
}
250+
t.Errorf("Strategy.Cleanup() unexpected error = %v, wantErr %v", err, tt.wantErr)
86251

87252
return
88253
}
89-
for _, node := range labelledNodes.Items {
90-
got = append(got, node.Name)
91-
}
92-
nodys, _ := tt.args.enforcer.KubeClient.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
93-
msg := fmt.Sprint(nodys.Items[0])
94-
klog.InfoS(msg, "component", "testing")
95-
if len(tt.want.nodeNames) != len(got) {
96-
t.Errorf("Number of pods returned: %v not as expected: %v", got, tt.want.nodeNames)
254+
255+
if !tt.wantErr {
256+
labelledNodes, err := tt.args.enforcer.KubeClient.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{LabelSelector: "deschedule-test=violating"})
257+
if err != nil {
258+
if !tt.wantErr {
259+
t.Errorf("Strategy.Enforce() error = %v, wantErr %v", err, tt.wantErr)
260+
261+
return
262+
}
263+
t.Errorf("Unexpected error encountered while trying to filter for the deschedule-test=violating label...")
264+
265+
return
266+
}
267+
assertViolatingNodes(t, labelledNodes, tt.want.labeledNodes)
268+
nodes, _ := tt.args.enforcer.KubeClient.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
269+
assertViolatingNodes(t, nodes, tt.want.nodes)
97270
}
98271
})
99272
}

0 commit comments

Comments
 (0)