Skip to content

Commit 3234be2

Browse files
authored
CP-53335, topology: do not raise exception when loading invalid distance matrices (NUMA) (#6249)
Unreachable nodes do not contain any CPUs, and therefore VCPUs cannot be scheduled on them. They marked with a value of (2ˆ32) - 1. Instead of failing to produce a NUMA object that allows for scheduling, create an object that contains only schedulable NUMA nodes. This means changing how the datastructures node_cpus and candidates are created to ignore the unreachable ones. Fixes two minor potential issues (distances being NaN, and adding duplicates to candidates, which adds to running time); and separates the xenopsd unit tests into 3 binaries for ease of testing. Fixes #6240
2 parents b81ff9d + 9badc14 commit 3234be2

File tree

7 files changed

+297
-151
lines changed

7 files changed

+297
-151
lines changed

ocaml/xenopsd/lib/topology.ml

Lines changed: 118 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,8 @@
1414

1515
module D = Debug.Make (struct let name = "topology" end)
1616

17-
open D
18-
1917
module CPUSet = struct
20-
include Set.Make (struct
21-
type t = int
22-
23-
let compare (x : int) (y : int) = compare x y
24-
end)
18+
include Set.Make (Int)
2519

2620
let pp_dump = Fmt.using to_seq Fmt.(Dump.seq int)
2721

@@ -95,27 +89,32 @@ let seq_range a b =
9589
let rec next i () = if i = b then Seq.Nil else Seq.Cons (i, next (i + 1)) in
9690
next a
9791

98-
(** [gen_2n n] Generates all non-empty subsets of the set of [n] nodes. *)
99-
let seq_gen_2n n =
92+
let seq_filteri p s =
93+
let rec loop i s () =
94+
match s () with
95+
| Seq.Nil ->
96+
Seq.Nil
97+
| Cons (hd, s) ->
98+
if p i hd then
99+
Cons (hd, loop (i + 1) s)
100+
else
101+
loop (i + 1) s ()
102+
in
103+
loop 0 s
104+
105+
(** [seq_all_subsets n] Generates all non-empty subsets of the [nodes] set. *)
106+
let seq_all_subsets nodes =
107+
let n = Seq.length nodes in
100108
(* A node can either be present in the output or not, so use a loop [1, 2^n)
101109
and have the [i]th bit determine that. *)
102-
let of_mask i =
103-
seq_range 0 n |> Seq.filter (fun bit -> (i lsr bit) land 1 = 1)
104-
in
110+
let of_mask i = nodes |> seq_filteri (fun bit _ -> (i lsr bit) land 1 = 1) in
105111
seq_range 1 (1 lsl n) |> Seq.map of_mask
106112

107113
(** [seq_sort ~cmp s] sorts [s] in a temporary place using [cmp]. *)
108114
let seq_sort ~cmp s =
109115
let a = Array.of_seq s in
110116
Array.fast_sort cmp a ; Array.to_seq a
111117

112-
(** [seq_append a b] is the sequence [a] followed by [b] *)
113-
let seq_append (a : 'a Seq.t) (b : 'a Seq.t) =
114-
let rec next v () =
115-
match v () with Seq.Nil -> b () | Seq.Cons (x, xs) -> Seq.Cons (x, next xs)
116-
in
117-
next a
118-
119118
module NUMA = struct
120119
type node = Node of int
121120

@@ -125,31 +124,43 @@ module NUMA = struct
125124
let compare (Node a) (Node b) = compare a b
126125
end)
127126

127+
(* -1 in 32 bits *)
128+
let unreachable_distance = 0xFFFFFFFF
129+
130+
let self_distance = 10
131+
128132
(* no mutation is exposed in the interface, therefore this is immutable *)
129133
type t = {
130134
distances: int array array
131135
; cpu_to_node: node array
132136
; node_cpus: CPUSet.t array
133137
; all: CPUSet.t
134138
; node_usage: int array
139+
(** Usage across nodes is meant to be balanced when choosing candidates for a VM *)
135140
; candidates: (float * node Seq.t) Seq.t
141+
(** Sequence of all subsets of nodes and the average distance within
142+
the subset, sorted by the latter in increasing order. *)
136143
}
137144

138145
let node_of_int i = Node i
139146

140147
let node_distances d nodes =
141-
let dists =
142-
nodes |> Seq.flat_map (fun n1 -> nodes |> Seq.map (fun n2 -> d.(n1).(n2)))
143-
in
144-
let count, max_dist, sum_dist =
145-
Seq.fold_left
146-
(fun (count, maxv, sum) e -> (count + 1, max maxv e, sum + e))
147-
(0, min_int, 0) dists
148-
in
149-
(* We want to minimize maximum distance first, and average distance next.
150-
When running the VM we don't know which pCPU it'll end up using, and want
151-
to limit the worst case performance. *)
152-
((max_dist, float sum_dist /. float count), nodes)
148+
if Seq.is_empty nodes then
149+
None
150+
else
151+
let dists =
152+
nodes
153+
|> Seq.flat_map (fun n1 -> nodes |> Seq.map (fun n2 -> d.(n1).(n2)))
154+
in
155+
let count, max_dist, sum_dist =
156+
Seq.fold_left
157+
(fun (count, maxv, sum) e -> (count + 1, max maxv e, sum + e))
158+
(0, min_int, 0) dists
159+
in
160+
(* We want to minimize maximum distance first, and average distance next.
161+
When running the VM we don't know which pCPU it'll end up using, and want
162+
to limit the worst case performance. *)
163+
Some ((max_dist, float sum_dist /. float count), nodes)
153164

154165
let dist_cmp (a1, _) (b1, _) = compare a1 b1
155166

@@ -159,60 +170,96 @@ module NUMA = struct
159170
[n*multiply ... n*multiply + multiply-1], except we always the add the
160171
single NUMA node combinations. *)
161172
(* make sure that single NUMA nodes are always present in the combinations *)
162-
let single_nodes =
173+
let distance_to_candidate d = (d, float d) in
174+
let valid_nodes =
163175
seq_range 0 (Array.length d)
164-
|> Seq.map (fun i -> ((10, 10.0), Seq.return i))
176+
|> Seq.filter_map (fun i ->
177+
let self = d.(i).(i) in
178+
if self <> unreachable_distance then
179+
Some i
180+
else
181+
None
182+
)
165183
in
166-
let numa_nodes = Array.length d in
184+
let numa_nodes = Seq.length valid_nodes in
167185
let nodes =
168-
if numa_nodes > 16 then
169-
(* try just the single nodes, and give up (use all nodes otherwise) to
170-
avoid exponential running time. We could do better here, e.g. by
186+
if numa_nodes > 16 then (
187+
(* Avoid generating too many candidates because of the exponential
188+
running time. We could do better here, e.g. by
171189
reducing the matrix *)
172-
single_nodes
173-
else
174-
numa_nodes
175-
|> seq_gen_2n
176-
|> Seq.map (node_distances d)
177-
|> seq_append single_nodes
190+
D.info
191+
"%s: More than 16 valid NUMA nodes detected, considering only \
192+
individual nodes."
193+
__FUNCTION__ ;
194+
valid_nodes
195+
|> Seq.map (fun i ->
196+
let self = d.(i).(i) in
197+
(distance_to_candidate self, Seq.return i)
198+
)
199+
) else
200+
valid_nodes |> seq_all_subsets |> Seq.filter_map (node_distances d)
178201
in
179202
nodes
180203
|> seq_sort ~cmp:dist_cmp
181204
|> Seq.map (fun ((_, avg), nodes) -> (avg, Seq.map (fun n -> Node n) nodes))
182205

183-
let pp_dump_distances = Fmt.(int |> Dump.array |> Dump.array)
184-
185206
let make ~distances ~cpu_to_node =
186-
debug "Distances: %s" (Fmt.to_to_string pp_dump_distances distances) ;
187-
debug "CPU2Node: %s" (Fmt.to_to_string Fmt.(Dump.array int) cpu_to_node) ;
207+
let ( let* ) = Option.bind in
188208
let node_cpus = Array.map (fun _ -> CPUSet.empty) distances in
209+
210+
(* nothing can be scheduled on unreachable nodes, remove them from the
211+
node_cpus *)
189212
Array.iteri
190-
(fun i node -> node_cpus.(node) <- CPUSet.add i node_cpus.(node))
191-
cpu_to_node ;
192-
Array.iteri
193-
(fun i row ->
194-
let d = distances.(i).(i) in
195-
if d <> 10 then
196-
invalid_arg
197-
(Printf.sprintf "NUMA distance from node to itself must be 10: %d" d) ;
198-
Array.iteri
199-
(fun _ d ->
200-
if d < 10 then
201-
invalid_arg (Printf.sprintf "NUMA distance must be >= 10: %d" d)
202-
)
203-
row
213+
(fun i node ->
214+
let self = distances.(node).(node) in
215+
if self <> unreachable_distance then
216+
node_cpus.(node) <- CPUSet.add i node_cpus.(node)
204217
)
205-
distances ;
206-
let all = Array.fold_left CPUSet.union CPUSet.empty node_cpus in
207-
let candidates = gen_candidates distances in
208-
{
218+
cpu_to_node ;
219+
220+
let* () =
221+
if Array.for_all (fun cpus -> CPUSet.is_empty cpus) node_cpus then (
222+
D.info
223+
"Not enabling NUMA: the ACPI SLIT only contains unreachable nodes." ;
224+
None
225+
) else
226+
Some ()
227+
in
228+
229+
let numa_matrix_is_reasonable =
209230
distances
210-
; cpu_to_node= Array.map node_of_int cpu_to_node
211-
; node_cpus
212-
; all
213-
; node_usage= Array.map (fun _ -> 0) distances
214-
; candidates
215-
}
231+
|> Array.to_seqi
232+
|> Seq.for_all (fun (i, row) ->
233+
let d = distances.(i).(i) in
234+
(d = unreachable_distance || d = self_distance)
235+
&& Array.for_all
236+
(fun d -> d >= self_distance || d = unreachable_distance)
237+
row
238+
)
239+
in
240+
241+
let* () =
242+
if not numa_matrix_is_reasonable then (
243+
D.info
244+
"Not enabling NUMA: the ACPI SLIT table contains values that are \
245+
invalid." ;
246+
None
247+
) else
248+
Some ()
249+
in
250+
251+
let candidates = gen_candidates distances in
252+
253+
let all = Array.fold_left CPUSet.union CPUSet.empty node_cpus in
254+
Some
255+
{
256+
distances
257+
; cpu_to_node= Array.map node_of_int cpu_to_node
258+
; node_cpus
259+
; all
260+
; node_usage= Array.map (fun _ -> 0) distances
261+
; candidates
262+
}
216263

217264
let distance t (Node a) (Node b) = t.distances.(a).(b)
218265

ocaml/xenopsd/lib/topology.mli

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ module NUMA : sig
7878
(** A NUMA node index. Distinct from an int to avoid mixing with CPU numbers *)
7979
type node = private Node of int
8080

81-
val make : distances:int array array -> cpu_to_node:int array -> t
81+
val make : distances:int array array -> cpu_to_node:int array -> t option
8282
(** [make distances cpu_to_node] stores the topology. [distances] is a square
8383
matrix [d] where [d.(i).(j)] is an approximation to how much slower it is
8484
to access memory from node [j] when running on node [i]. Distances are
@@ -97,8 +97,6 @@ module NUMA : sig
9797
9898
NUMA nodes without any CPUs are accepted (to handle hard affinities).
9999
100-
Raises [invalid_arg] if above constraints are not met
101-
102100
A typical matrix might look like this:
103101
104102
10 21

ocaml/xenopsd/test/dune

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,9 @@
11
(test
22
(name test)
3-
(modes exe)
3+
(modules :standard \ test_cpuid test_topology)
44
(package xapi-tools)
55
(libraries
66
alcotest
7-
cpuid
8-
97
fmt
108
result
119
rpclib.core
@@ -15,15 +13,27 @@
1513
xapi-idl.xen.interface.types
1614
xapi-log
1715
xapi-stdext-pervasives
18-
xapi-test-utils
1916
xapi_xenopsd
20-
xenstore_transport.unix
2117
)
2218
(preprocess
2319
(per_module ((pps ppx_deriving_rpc) Test))
2420
)
2521
)
2622

23+
(test
24+
(name test_cpuid)
25+
(modules test_cpuid)
26+
(package xapi-tools)
27+
(libraries alcotest cpuid xapi-test-utils xapi_xenopsd)
28+
)
29+
30+
(test
31+
(name test_topology)
32+
(modules test_topology)
33+
(package xapi-tools)
34+
(libraries alcotest fmt xapi-log xapi_xenopsd)
35+
)
36+
2737
(rule
2838
(alias runtest)
2939
(package xapi-tools)

ocaml/xenopsd/test/test.ml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1032,4 +1032,4 @@ let _ =
10321032
)
10331033
in
10341034
Debug.log_to_stdout () ;
1035-
Alcotest.run "xenops test" ([suite; Test_topology.suite] @ Test_cpuid.tests)
1035+
Alcotest.run "xenops test" [suite]

ocaml/xenopsd/test/test_cpuid.ml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -327,3 +327,5 @@ let tests =
327327
; ("equality", Equality.tests)
328328
; ("comparisons", Comparisons.tests)
329329
]
330+
331+
let () = Alcotest.run "CPUID" tests

0 commit comments

Comments
 (0)