@@ -984,9 +984,134 @@ let numa_hierarchy =
984984 NUMA. make ~distances ~cpu_to_node ~node_cores
985985 )
986986
987+ module NUMAResources = struct
988+ module DomidMap = Map. Make (Int )
989+
990+ type node_memusage = int64 Topology.NUMA.NodeMap .t
991+
992+ type t = {
993+ host : Topology.NUMAResource .t Topology.NUMA.NodeMap .t
994+ ; estimated : node_memusage DomidMap .t
995+ ; outstanding_builds : node_memusage DomidMap .t
996+ }
997+
998+ let node_memusage_of_domidmap node t =
999+ DomidMap. fold
1000+ (fun _ nodemap acc ->
1001+ Topology.NUMA.NodeMap. find_opt node nodemap
1002+ |> Option. value ~default: 0L
1003+ |> Int64. add acc
1004+ )
1005+ t 0L
1006+
1007+ let to_nodes t =
1008+ let open Topology in
1009+ t.host
1010+ |> NUMA.NodeMap. mapi @@ fun node r ->
1011+ NUMAResource. shrink_memory r
1012+ (Int64. add
1013+ (node_memusage_of_domidmap node t.estimated)
1014+ (node_memusage_of_domidmap node t.outstanding_builds)
1015+ )
1016+
1017+ let of_host nodes =
1018+ {
1019+ host= nodes |> Topology.NUMA.NodeMap. of_seq
1020+ ; estimated= DomidMap. empty
1021+ ; outstanding_builds= DomidMap. empty
1022+ }
1023+
1024+ let empty =
1025+ {
1026+ host= Topology.NUMA.NodeMap. empty
1027+ ; estimated= DomidMap. empty
1028+ ; outstanding_builds= DomidMap. empty
1029+ }
1030+
1031+ let pp_dump_nodemap pp_elt =
1032+ let open Topology.NUMA in
1033+ Fmt. (Dump. iter_bindings NodeMap. iter (any " nodemap" ) pp_dump_node pp_elt)
1034+
1035+ let pp_dump_domidmap pp_elt =
1036+ Fmt. (Dump. iter_bindings DomidMap. iter (any " domidmap" ) int pp_elt)
1037+
1038+ let pp_dump_numa_resources =
1039+ Fmt.Dump. (
1040+ record
1041+ [
1042+ field " host"
1043+ (fun t -> t.host)
1044+ (pp_dump_nodemap Topology.NUMAResource. pp_dump)
1045+ ; field " estimated"
1046+ (fun t -> t.estimated)
1047+ (pp_dump_domidmap @@ pp_dump_nodemap Fmt. int64 )
1048+ ; field " outstanding_builds"
1049+ (fun t -> t.outstanding_builds)
1050+ (pp_dump_domidmap @@ pp_dump_nodemap Fmt. int64 )
1051+ ]
1052+ )
1053+
1054+ let add_outstanding domid plan t =
1055+ D. debug " domid %u: adding outstanding NUMA memory usage estimate: %s" domid
1056+ ((Fmt. to_to_string @@ pp_dump_nodemap Fmt. int64 ) plan) ;
1057+ {t with outstanding_builds= DomidMap. add domid plan t.outstanding_builds}
1058+
1059+ let remove_outstanding domid t =
1060+ D. debug " domid %u: removing outstanding NUMA memory usage estimate" domid ;
1061+ {t with outstanding_builds= DomidMap. remove domid t.outstanding_builds}
1062+
1063+ let transfer_outstanding domid t =
1064+ D. debug " domid %u: transfering outstanding NUMA memory usage estimate" domid ;
1065+ DomidMap. find_opt domid t.outstanding_builds
1066+ |> Option. fold ~none: t ~some: (fun plan ->
1067+ {
1068+ t with
1069+ estimated= DomidMap. add domid plan t.estimated
1070+ ; outstanding_builds= DomidMap. remove domid t.outstanding_builds
1071+ }
1072+ )
1073+
1074+ let estimate_plan nodes mem plan =
1075+ let open Topology in
1076+ let _, _, nodemap =
1077+ List. fold_left
1078+ (fun (mem , remaining_nodes , acc ) node ->
1079+ let per_node_usage =
1080+ Int64. div mem (remaining_nodes |> Int64. of_int)
1081+ in
1082+ let r = NUMA.NodeMap. find node nodes in
1083+ let allocated_from_node =
1084+ Int64. min per_node_usage r.NUMAResource. memfree
1085+ in
1086+ ( Int64. sub mem allocated_from_node
1087+ , remaining_nodes - 1
1088+ , NUMA.NodeMap. add node allocated_from_node acc
1089+ )
1090+ )
1091+ (mem, List. length plan, NUMA.NodeMap. empty)
1092+ plan
1093+ in
1094+ nodemap
1095+ end
1096+
9871097let numa_mutex = Mutex. create ()
9881098
989- let numa_resources = ref None
1099+ (* protected by numa_mutex *)
1100+ let numa_resources = ref NUMAResources. empty
1101+
1102+ let numa_reserve_node0 mem =
1103+ let reserved = Int64. shift_left 1L ! Xenopsd. numa_reserve_node0_dmaheap_bits in
1104+ let mem' = Int64. (max 0L (sub mem reserved)) in
1105+ (* Prefer nodes other than node0 when node0 is low on memory.
1106+ This is reserved for the DMA heap in Xen, and even if we try to make a VM
1107+ use it up, Xen will move some of its memory to other nodes to avoid
1108+ running out.
1109+ Without the claim API we have no way of forcing or querying this
1110+ (and even with the claim API it'd be desirable to avoid running out on
1111+ node0).
1112+ *)
1113+ D. debug " numa_reserve_node0, reducing free memory: %Lu -> %Lu" mem mem' ;
1114+ mem'
9901115
9911116let numa_init () =
9921117 let xcext = Xenctrlext. get_handle () in
@@ -1007,28 +1132,48 @@ let set_affinity = function
10071132 | Xenops_server. Soft ->
10081133 Xenctrlext. vcpu_setaffinity_soft
10091134
1010- let numa_placement domid ~vcpus ~cores ~memory affinity =
1135+ let numa_placement domid ~vcpus ~cores ~memory ~ required_free affinity =
10111136 let open Xenctrlext in
10121137 let open Topology in
1138+ let open NUMAResources in
10131139 with_lock numa_mutex (fun () ->
10141140 let ( let * ) = Option. bind in
10151141 let xcext = get_handle () in
10161142 let * host = Lazy. force numa_hierarchy in
10171143 let numa_meminfo = (numainfo xcext).memory |> Array. to_seq in
10181144 let nodes =
10191145 Seq. map2
1020- (fun node m -> NUMA. resource host node ~memory: m.memfree)
1146+ (fun node m ->
1147+ ( node
1148+ , let (NUMA. Node nodeid) = node in
1149+ let memory = m.memfree in
1150+ let memory =
1151+ if nodeid = 0 then
1152+ numa_reserve_node0 memory
1153+ else
1154+ memory
1155+ in
1156+ NUMA. resource host node ~memory
1157+ )
1158+ )
10211159 (NUMA. nodes host) numa_meminfo
10221160 in
10231161 let vm = NUMARequest. make ~memory ~vcpus ~cores in
1162+
1163+ let previous = ! numa_resources in
1164+ if DomidMap. is_empty previous.outstanding_builds then begin
1165+ (* no outstanding domain builds: synchronize actual free node memory
1166+ with host *)
1167+ numa_resources := of_host nodes ;
1168+ D. debug " no outstanding domain builds, dropping NUMA usage estimates"
1169+ end ;
1170+ D. debug " numa_resources(in): %s"
1171+ @@ Fmt. to_to_string pp_dump_numa_resources ! numa_resources ;
1172+
1173+ let estimated_nodes = to_nodes ! numa_resources in
10241174 let nodea =
1025- match ! numa_resources with
1026- | None ->
1027- Array. of_seq nodes
1028- | Some a ->
1029- Array. map2 NUMAResource. min_memory (Array. of_seq nodes) a
1175+ estimated_nodes |> NUMA.NodeMap. to_seq |> Seq. map snd |> Array. of_seq
10301176 in
1031- numa_resources := Some nodea ;
10321177 let cpu_affinity, memory_plan =
10331178 match Softaffinity. plan ~vm host nodea with
10341179 | None ->
@@ -1037,6 +1182,12 @@ let numa_placement domid ~vcpus ~cores ~memory affinity =
10371182 | Some (cpu_affinity , mem_plan ) ->
10381183 (Some cpu_affinity, mem_plan)
10391184 in
1185+ numa_resources :=
1186+ add_outstanding domid
1187+ (estimate_plan estimated_nodes required_free memory_plan)
1188+ ! numa_resources ;
1189+ D. debug " numa_resources(out): %s"
1190+ @@ Fmt. to_to_string pp_dump_numa_resources ! numa_resources ;
10401191 let set_vcpu_affinity = function
10411192 | None ->
10421193 D. debug " %s: not setting vcpu affinity for domain %d" __FUNCTION__
@@ -1177,6 +1328,7 @@ let build_pre ~xc ~xs ~vcpus ~memory ~hard_affinity domid =
11771328 in
11781329 numa_placement domid ~vcpus ~cores
11791330 ~memory: (Int64. mul memory.xen_max_mib 1048576L )
1331+ ~required_free: (Int64. mul memory.required_host_free_mib 1048576L )
11801332 affinity
11811333 |> Option. map fst
11821334 )
@@ -1358,6 +1510,11 @@ let build (task : Xenops_task.task_handle) ~xc ~xs ~store_domid ~console_domid
13581510 let target_mib = Memory. mib_of_kib_used target_kib in
13591511 (* Sanity check. *)
13601512 assert (target_mib < = static_max_mib) ;
1513+ let finally () =
1514+ with_lock numa_mutex @@ fun () ->
1515+ numa_resources := NUMAResources. remove_outstanding domid ! numa_resources
1516+ in
1517+ Fun. protect ~finally @@ fun () ->
13611518 let store_mfn, store_port, console_mfn, console_port, vm_stuff, domain_type =
13621519 match info.priv with
13631520 | BuildHVM hvminfo ->
@@ -1444,6 +1601,10 @@ let build (task : Xenops_task.task_handle) ~xc ~xs ~store_domid ~console_domid
14441601 )
14451602 in
14461603 let local_stuff = console_keys console_port console_mfn in
1604+ let () =
1605+ with_lock numa_mutex @@ fun () ->
1606+ numa_resources := NUMAResources. transfer_outstanding domid ! numa_resources
1607+ in
14471608 build_post ~xc ~xs ~target_mib ~static_max_mib domid domain_type store_mfn
14481609 store_port local_stuff vm_stuff
14491610
0 commit comments