CP-53335, topology: do not raise exception when loading invalid distance matrices (NUMA) (#6249)

psafont · web-flow · commit 3234be28c386 · 2025-02-03T17:22:37.000Z
Unreachable nodes do not contain any CPUs, and therefore VCPUs cannot be scheduled on them. They marked with a value of (2ˆ32) - 1. Instead of failing to produce a NUMA object that allows for scheduling, create an object that contains only schedulable NUMA nodes. This means changing how the datastructures node_cpus and candidates are created to ignore the unreachable ones. Fixes two minor potential issues (distances being NaN, and adding duplicates to candidates, which adds to running time); and separates the xenopsd unit tests into 3 binaries for ease of testing. Fixes #6240
diff --git a/ocaml/xenopsd/lib/topology.ml b/ocaml/xenopsd/lib/topology.ml
@@ -14,14 +14,8 @@
 
 module D = Debug.Make (struct let name = "topology" end)
 
-open D
-
 module CPUSet = struct
-  include Set.Make (struct
-    type t = int
-
-    let compare (x : int) (y : int) = compare x y
-  end)
+  include Set.Make (Int)
 
   let pp_dump = Fmt.using to_seq Fmt.(Dump.seq int)
 
@@ -95,27 +89,32 @@ let seq_range a b =
   let rec next i () = if i = b then Seq.Nil else Seq.Cons (i, next (i + 1)) in
   next a
 
-(** [gen_2n n] Generates all non-empty subsets of the set of [n] nodes. *)
-let seq_gen_2n n =
+let seq_filteri p s =
+  let rec loop i s () =
+    match s () with
+    | Seq.Nil ->
+        Seq.Nil
+    | Cons (hd, s) ->
+        if p i hd then
+          Cons (hd, loop (i + 1) s)
+        else
+          loop (i + 1) s ()
+  in
+  loop 0 s
+
+(** [seq_all_subsets n] Generates all non-empty subsets of the [nodes] set. *)
+let seq_all_subsets nodes =
+  let n = Seq.length nodes in
   (* A node can either be present in the output or not, so use a loop [1, 2^n)
      and have the [i]th bit determine that. *)
-  let of_mask i =
-    seq_range 0 n |> Seq.filter (fun bit -> (i lsr bit) land 1 = 1)
-  in
+  let of_mask i = nodes |> seq_filteri (fun bit _ -> (i lsr bit) land 1 = 1) in
   seq_range 1 (1 lsl n) |> Seq.map of_mask
 
 (** [seq_sort ~cmp s] sorts [s] in a temporary place using [cmp]. *)
 let seq_sort ~cmp s =
   let a = Array.of_seq s in
   Array.fast_sort cmp a ; Array.to_seq a
 
-(** [seq_append a b] is the sequence [a] followed by [b] *)
-let seq_append (a : 'a Seq.t) (b : 'a Seq.t) =
-  let rec next v () =
-    match v () with Seq.Nil -> b () | Seq.Cons (x, xs) -> Seq.Cons (x, next xs)
-  in
-  next a
-
 module NUMA = struct
   type node = Node of int
 
@@ -125,31 +124,43 @@ module NUMA = struct
     let compare (Node a) (Node b) = compare a b
   end)
 
+  (* -1 in 32 bits *)
+  let unreachable_distance = 0xFFFFFFFF
+
+  let self_distance = 10
+
   (* no mutation is exposed in the interface, therefore this is immutable *)
   type t = {
       distances: int array array
     ; cpu_to_node: node array
     ; node_cpus: CPUSet.t array
     ; all: CPUSet.t
     ; node_usage: int array
+          (** Usage across nodes is meant to be balanced when choosing candidates for a VM *)
     ; candidates: (float * node Seq.t) Seq.t
+          (** Sequence of all subsets of nodes and the average distance within
+              the subset, sorted by the latter in increasing order. *)
   }
 
   let node_of_int i = Node i
 
   let node_distances d nodes =
-    let dists =
-      nodes |> Seq.flat_map (fun n1 -> nodes |> Seq.map (fun n2 -> d.(n1).(n2)))
-    in
-    let count, max_dist, sum_dist =
-      Seq.fold_left
-        (fun (count, maxv, sum) e -> (count + 1, max maxv e, sum + e))
-        (0, min_int, 0) dists
-    in
-    (* We want to minimize maximum distance first, and average distance next.
-       When running the VM we don't know which pCPU it'll end up using, and want
-       to limit the worst case performance. *)
-    ((max_dist, float sum_dist /. float count), nodes)
+    if Seq.is_empty nodes then
+      None
+    else
+      let dists =
+        nodes
+        |> Seq.flat_map (fun n1 -> nodes |> Seq.map (fun n2 -> d.(n1).(n2)))
+      in
+      let count, max_dist, sum_dist =
+        Seq.fold_left
+          (fun (count, maxv, sum) e -> (count + 1, max maxv e, sum + e))
+          (0, min_int, 0) dists
+      in
+      (* We want to minimize maximum distance first, and average distance next.
+         When running the VM we don't know which pCPU it'll end up using, and want
+         to limit the worst case performance. *)
+      Some ((max_dist, float sum_dist /. float count), nodes)
 
   let dist_cmp (a1, _) (b1, _) = compare a1 b1
 
@@ -159,60 +170,96 @@ module NUMA = struct
        [n*multiply ... n*multiply + multiply-1], except we always the add the
        single NUMA node combinations. *)
     (* make sure that single NUMA nodes are always present in the combinations *)
-    let single_nodes =
+    let distance_to_candidate d = (d, float d) in
+    let valid_nodes =
       seq_range 0 (Array.length d)
-      |> Seq.map (fun i -> ((10, 10.0), Seq.return i))
+      |> Seq.filter_map (fun i ->
+             let self = d.(i).(i) in
+             if self <> unreachable_distance then
+               Some i
+             else
+               None
+         )
     in
-    let numa_nodes = Array.length d in
+    let numa_nodes = Seq.length valid_nodes in
     let nodes =
-      if numa_nodes > 16 then
-        (* try just the single nodes, and give up (use all nodes otherwise) to
-           avoid exponential running time. We could do better here, e.g. by
+      if numa_nodes > 16 then (
+        (* Avoid generating too many candidates because of the exponential
+           running time. We could do better here, e.g. by
            reducing the matrix *)
-        single_nodes
-      else
-        numa_nodes
-        |> seq_gen_2n
-        |> Seq.map (node_distances d)
-        |> seq_append single_nodes
+        D.info
+          "%s: More than 16 valid NUMA nodes detected, considering only \
+           individual nodes."
+          __FUNCTION__ ;
+        valid_nodes
+        |> Seq.map (fun i ->
+               let self = d.(i).(i) in
+               (distance_to_candidate self, Seq.return i)
+           )
+      ) else
+        valid_nodes |> seq_all_subsets |> Seq.filter_map (node_distances d)
     in
     nodes
     |> seq_sort ~cmp:dist_cmp
     |> Seq.map (fun ((_, avg), nodes) -> (avg, Seq.map (fun n -> Node n) nodes))
 
-  let pp_dump_distances = Fmt.(int |> Dump.array |> Dump.array)
-
   let make ~distances ~cpu_to_node =
-    debug "Distances: %s" (Fmt.to_to_string pp_dump_distances distances) ;
-    debug "CPU2Node: %s" (Fmt.to_to_string Fmt.(Dump.array int) cpu_to_node) ;
+    let ( let* ) = Option.bind in
     let node_cpus = Array.map (fun _ -> CPUSet.empty) distances in
+
+    (* nothing can be scheduled on unreachable nodes, remove them from the
+       node_cpus *)
     Array.iteri
-      (fun i node -> node_cpus.(node) <- CPUSet.add i node_cpus.(node))
-      cpu_to_node ;
-    Array.iteri
-      (fun i row ->
-        let d = distances.(i).(i) in
-        if d <> 10 then
-          invalid_arg
-            (Printf.sprintf "NUMA distance from node to itself must be 10: %d" d) ;
-        Array.iteri
-          (fun _ d ->
-            if d < 10 then
-              invalid_arg (Printf.sprintf "NUMA distance must be >= 10: %d" d)
-          )
-          row
+      (fun i node ->
+        let self = distances.(node).(node) in
+        if self <> unreachable_distance then
+          node_cpus.(node) <- CPUSet.add i node_cpus.(node)
       )
-      distances ;
-    let all = Array.fold_left CPUSet.union CPUSet.empty node_cpus in
-    let candidates = gen_candidates distances in
-    {
+      cpu_to_node ;
+
+    let* () =
+      if Array.for_all (fun cpus -> CPUSet.is_empty cpus) node_cpus then (
+        D.info
+          "Not enabling NUMA: the ACPI SLIT only contains unreachable nodes." ;
+        None
+      ) else
+        Some ()
+    in
+
+    let numa_matrix_is_reasonable =
       distances
-    ; cpu_to_node= Array.map node_of_int cpu_to_node
-    ; node_cpus
-    ; all
-    ; node_usage= Array.map (fun _ -> 0) distances
-    ; candidates
-    }
+      |> Array.to_seqi
+      |> Seq.for_all (fun (i, row) ->
+             let d = distances.(i).(i) in
+             (d = unreachable_distance || d = self_distance)
+             && Array.for_all
+                  (fun d -> d >= self_distance || d = unreachable_distance)
+                  row
+         )
+    in
+
+    let* () =
+      if not numa_matrix_is_reasonable then (
+        D.info
+          "Not enabling NUMA: the ACPI SLIT table contains values that are \
+           invalid." ;
+        None
+      ) else
+        Some ()
+    in
+
+    let candidates = gen_candidates distances in
+
+    let all = Array.fold_left CPUSet.union CPUSet.empty node_cpus in
+    Some
+      {
+        distances
+      ; cpu_to_node= Array.map node_of_int cpu_to_node
+      ; node_cpus
+      ; all
+      ; node_usage= Array.map (fun _ -> 0) distances
+      ; candidates
+      }
 
   let distance t (Node a) (Node b) = t.distances.(a).(b)
 
diff --git a/ocaml/xenopsd/lib/topology.mli b/ocaml/xenopsd/lib/topology.mli
@@ -78,7 +78,7 @@ module NUMA : sig
   (** A NUMA node index. Distinct from an int to avoid mixing with CPU numbers *)
   type node = private Node of int
 
-  val make : distances:int array array -> cpu_to_node:int array -> t
+  val make : distances:int array array -> cpu_to_node:int array -> t option
   (** [make distances cpu_to_node] stores the topology. [distances] is a square
       matrix [d] where [d.(i).(j)] is an approximation to how much slower it is
       to access memory from node [j] when running on node [i]. Distances are
@@ -97,8 +97,6 @@ module NUMA : sig
 
       NUMA nodes without any CPUs are accepted (to handle hard affinities).
 
-      Raises [invalid_arg] if above constraints are not met
-
       A typical matrix might look like this:
 
       10 21
diff --git a/ocaml/xenopsd/test/dune b/ocaml/xenopsd/test/dune
@@ -1,11 +1,9 @@
 (test
  (name test)
- (modes exe)
+ (modules :standard \ test_cpuid test_topology)
  (package xapi-tools)
  (libraries
   alcotest
-  cpuid
-  
   fmt
   result
   rpclib.core
@@ -15,15 +13,27 @@
   xapi-idl.xen.interface.types
   xapi-log
   xapi-stdext-pervasives
-  xapi-test-utils
   xapi_xenopsd
-  xenstore_transport.unix
  )
  (preprocess
   (per_module ((pps ppx_deriving_rpc) Test))
  )
 )
 
+(test
+  (name test_cpuid)
+  (modules test_cpuid)
+  (package xapi-tools)
+  (libraries alcotest cpuid xapi-test-utils xapi_xenopsd)
+)
+
+(test
+  (name test_topology)
+  (modules test_topology)
+  (package xapi-tools)
+  (libraries alcotest fmt xapi-log xapi_xenopsd)
+)
+
 (rule
   (alias runtest)
   (package xapi-tools)
diff --git a/ocaml/xenopsd/test/test.ml b/ocaml/xenopsd/test/test.ml
@@ -1032,4 +1032,4 @@ let _ =
     )
   in
   Debug.log_to_stdout () ;
-  Alcotest.run "xenops test" ([suite; Test_topology.suite] @ Test_cpuid.tests)
+  Alcotest.run "xenops test" [suite]
diff --git a/ocaml/xenopsd/test/test_cpuid.ml b/ocaml/xenopsd/test/test_cpuid.ml
@@ -327,3 +327,5 @@ let tests =
     ; ("equality", Equality.tests)
     ; ("comparisons", Comparisons.tests)
     ]
+
+let () = Alcotest.run "CPUID" tests
diff --git a/ocaml/xenopsd/test/test_topology.ml b/ocaml/xenopsd/test/test_topology.ml
diff --git a/ocaml/xenopsd/xc/domain.ml b/ocaml/xenopsd/xc/domain.ml

Original file line number	Diff line number	Diff line change
`@@ -1032,4 +1032,4 @@ let _ =`
`1032`	`1032`	`)`
`1033`	`1033`	`in`
`1034`	`1034`	`Debug.log_to_stdout () ;`
`1035`		`- Alcotest.run "xenops test" ([suite; Test_topology.suite] @ Test_cpuid.tests)`
	`1035`	`+ Alcotest.run "xenops test" [suite]`
Original file line number	Diff line number	Diff line change
`@@ -327,3 +327,5 @@ let tests =`
`327`	`327`	`; ("equality", Equality.tests)`
`328`	`328`	`; ("comparisons", Comparisons.tests)`
`329`	`329`	`]`
	`330`	`+`
	`331`	`+let () = Alcotest.run "CPUID" tests`