From d5d976e02a0a35f2fbc7ef6d13ceca034b1ba05a Mon Sep 17 00:00:00 2001 From: Stephen Cheng Date: Wed, 14 Jan 2026 15:30:32 +0800 Subject: [PATCH 01/65] CA-422448: Write proxy credentials to repo file instead of command line DNF5 logs command-line arguments to /var/log/dnf5.log, exposing proxy_password when passed via `dnf config-manager setopt`. Write proxy credentials directly to the .repo file (mode 0o400) and remove them after sync completes to avoid password exposure in logs. Signed-off-by: Stephen Cheng --- ocaml/tests/test_repository_helpers.ml | 2 +- ocaml/xapi/repository.ml | 28 +++++++++++++------------- ocaml/xapi/repository_helpers.ml | 7 ++++--- 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/ocaml/tests/test_repository_helpers.ml b/ocaml/tests/test_repository_helpers.ml index fbc7618ce1b..97055e4958d 100644 --- a/ocaml/tests/test_repository_helpers.ml +++ b/ocaml/tests/test_repository_helpers.ml @@ -347,7 +347,7 @@ module WriteYumConfig = Generic.MakeStateless (struct try (* The path of file which will be written by write_yum_config *) write_yum_config ~source_url ~binary_url ~repo_gpgcheck:true - ~gpgkey_path:gpgkey_path' ~repo_name ; + ~gpgkey_path:gpgkey_path' ~repo_name () ; let in_ch = open_in repo_file_path in let content = read_from_in_channel "" in_ch in close_in in_ch ; finally () ; Ok content diff --git a/ocaml/xapi/repository.ml b/ocaml/xapi/repository.ml index c8066abe00d..5a8c185020b 100644 --- a/ocaml/xapi/repository.ml +++ b/ocaml/xapi/repository.ml @@ -224,9 +224,12 @@ let sync ~__context ~self ~token ~token_id ~username ~password = | s -> s in - let write_initial_yum_config ~binary_url = - write_yum_config ~source_url ~binary_url ~repo_gpgcheck:true ~gpgkey_path - ~repo_name + let proxy_config = + match use_proxy with true -> get_proxy_params ~__context | false -> [] + in + let write_initial_yum_config ?(proxy_config = proxy_config) ~binary_url () = + write_yum_config ~proxy_config ~source_url ~binary_url ~repo_gpgcheck:true + ~gpgkey_path ~repo_name () in Xapi_stdext_pervasives.Pervasiveext.finally (fun () -> @@ -256,7 +259,8 @@ let sync ~__context ~self ~token ~token_id ~username ~password = with_sync_client_auth client_auth @@ fun client_auth -> with_sync_server_auth server_auth @@ fun binary_url' -> write_initial_yum_config - ~binary_url:(Option.value binary_url' ~default:binary_url) ; + ~binary_url:(Option.value binary_url' ~default:binary_url) + () ; clean_yum_cache repo_name ; (* Remove imported YUM repository GPG key *) if Pkgs.manager = Yum then @@ -272,15 +276,10 @@ let sync ~__context ~self ~token ~token_id ~username ~password = | None -> [] in - let proxy_params = - match use_proxy with - | true -> - get_proxy_params ~__context - | false -> - [] - in - auth_params @ proxy_params |> fun x -> - config_repo x ; make_cache () ; sync_repo () + (* Proxy config is now written directly to repo file, so only pass + * auth_params to config_repo to avoid exposing credentials in + * command-line arguments which get logged by DNF5 *) + config_repo auth_params ; make_cache () ; sync_repo () ) (fun () -> (* Rewrite repo conf file as initial content to remove credential @@ -294,7 +293,8 @@ let sync ~__context ~self ~token ~token_id ~username ~password = *) match Pkgs.manager with | Yum -> - write_initial_yum_config ~binary_url + (* Write clean config without proxy credentials *) + write_initial_yum_config ~proxy_config:[] ~binary_url () | Dnf -> Unixext.unlink_safe !Xapi_globs.dnf_repo_config_file ) ; diff --git a/ocaml/xapi/repository_helpers.ml b/ocaml/xapi/repository_helpers.ml index 4e259e161ec..a4a80e39b34 100644 --- a/ocaml/xapi/repository_helpers.ml +++ b/ocaml/xapi/repository_helpers.ml @@ -310,8 +310,8 @@ let remove_repo_conf_file repo_name = in Unixext.unlink_safe path -let write_yum_config ~source_url ~binary_url ~repo_gpgcheck ~gpgkey_path - ~repo_name = +let write_yum_config ?(proxy_config = []) ~source_url ~binary_url ~repo_gpgcheck + ~gpgkey_path ~repo_name () = let file_path = Filename.concat !Xapi_globs.yum_repos_config_dir (repo_name ^ ".repo") in @@ -344,6 +344,7 @@ let write_yum_config ~source_url ~binary_url ~repo_gpgcheck ~gpgkey_path ; opt_gpgcheck ; opt_gpgkey ] + @ proxy_config in let content_of_source = match source_url with @@ -446,7 +447,7 @@ let with_local_repositories ~__context f = in remove_repo_conf_file repo_name ; write_yum_config ~source_url:None ~binary_url ~repo_gpgcheck:false - ~gpgkey_path ~repo_name ; + ~gpgkey_path ~repo_name () ; clean_yum_cache repo_name ; let Pkg_mgr.{cmd; params} = [ From 6b4237fdea1ba9d9d9686529b0666758cde76560 Mon Sep 17 00:00:00 2001 From: Stephen Cheng Date: Fri, 16 Jan 2026 15:46:09 +0800 Subject: [PATCH 02/65] Make proxy_config non-optional Signed-off-by: Stephen Cheng --- ocaml/tests/test_repository_helpers.ml | 4 ++-- ocaml/xapi/repository.ml | 11 +++++------ ocaml/xapi/repository_helpers.ml | 8 ++++---- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/ocaml/tests/test_repository_helpers.ml b/ocaml/tests/test_repository_helpers.ml index 97055e4958d..66c1f4b4b1e 100644 --- a/ocaml/tests/test_repository_helpers.ml +++ b/ocaml/tests/test_repository_helpers.ml @@ -346,8 +346,8 @@ module WriteYumConfig = Generic.MakeStateless (struct let gpgkey_path' = Option.value ~default:"" name in try (* The path of file which will be written by write_yum_config *) - write_yum_config ~source_url ~binary_url ~repo_gpgcheck:true - ~gpgkey_path:gpgkey_path' ~repo_name () ; + write_yum_config ~proxy_config:[] ~source_url ~binary_url + ~repo_gpgcheck:true ~gpgkey_path:gpgkey_path' ~repo_name ; let in_ch = open_in repo_file_path in let content = read_from_in_channel "" in_ch in close_in in_ch ; finally () ; Ok content diff --git a/ocaml/xapi/repository.ml b/ocaml/xapi/repository.ml index 5a8c185020b..9fb284fcc55 100644 --- a/ocaml/xapi/repository.ml +++ b/ocaml/xapi/repository.ml @@ -227,9 +227,9 @@ let sync ~__context ~self ~token ~token_id ~username ~password = let proxy_config = match use_proxy with true -> get_proxy_params ~__context | false -> [] in - let write_initial_yum_config ?(proxy_config = proxy_config) ~binary_url () = + let write_initial_yum_config ~proxy_config ~binary_url = write_yum_config ~proxy_config ~source_url ~binary_url ~repo_gpgcheck:true - ~gpgkey_path ~repo_name () + ~gpgkey_path ~repo_name in Xapi_stdext_pervasives.Pervasiveext.finally (fun () -> @@ -258,9 +258,8 @@ let sync ~__context ~self ~token ~token_id ~username ~password = with_sync_client_auth client_auth @@ fun client_auth -> with_sync_server_auth server_auth @@ fun binary_url' -> - write_initial_yum_config - ~binary_url:(Option.value binary_url' ~default:binary_url) - () ; + write_initial_yum_config ~proxy_config + ~binary_url:(Option.value binary_url' ~default:binary_url) ; clean_yum_cache repo_name ; (* Remove imported YUM repository GPG key *) if Pkgs.manager = Yum then @@ -294,7 +293,7 @@ let sync ~__context ~self ~token ~token_id ~username ~password = match Pkgs.manager with | Yum -> (* Write clean config without proxy credentials *) - write_initial_yum_config ~proxy_config:[] ~binary_url () + write_initial_yum_config ~proxy_config:[] ~binary_url | Dnf -> Unixext.unlink_safe !Xapi_globs.dnf_repo_config_file ) ; diff --git a/ocaml/xapi/repository_helpers.ml b/ocaml/xapi/repository_helpers.ml index a4a80e39b34..487e8e30f17 100644 --- a/ocaml/xapi/repository_helpers.ml +++ b/ocaml/xapi/repository_helpers.ml @@ -310,8 +310,8 @@ let remove_repo_conf_file repo_name = in Unixext.unlink_safe path -let write_yum_config ?(proxy_config = []) ~source_url ~binary_url ~repo_gpgcheck - ~gpgkey_path ~repo_name () = +let write_yum_config ~proxy_config ~source_url ~binary_url ~repo_gpgcheck + ~gpgkey_path ~repo_name = let file_path = Filename.concat !Xapi_globs.yum_repos_config_dir (repo_name ^ ".repo") in @@ -446,8 +446,8 @@ let with_local_repositories ~__context f = s in remove_repo_conf_file repo_name ; - write_yum_config ~source_url:None ~binary_url ~repo_gpgcheck:false - ~gpgkey_path ~repo_name () ; + write_yum_config ~proxy_config:[] ~source_url:None ~binary_url + ~repo_gpgcheck:false ~gpgkey_path ~repo_name ; clean_yum_cache repo_name ; let Pkg_mgr.{cmd; params} = [ From ed593d5d27a543624e21af8cc30cc84e6a470f05 Mon Sep 17 00:00:00 2001 From: Lin Liu Date: Thu, 15 Jan 2026 07:35:28 +0000 Subject: [PATCH 03/65] CP-311020: Design for enabling ldaps for external auth Signed-off-by: Lin Liu --- doc/content/design/external-auth-ldaps.md | 323 ++++++++++++++++++++++ 1 file changed, 323 insertions(+) create mode 100644 doc/content/design/external-auth-ldaps.md diff --git a/doc/content/design/external-auth-ldaps.md b/doc/content/design/external-auth-ldaps.md new file mode 100644 index 00000000000..1e8c6478312 --- /dev/null +++ b/doc/content/design/external-auth-ldaps.md @@ -0,0 +1,323 @@ +--- +title: Secure LDAP (LDAPS) Support for External Authentication +layout: default +design_doc: true +revision: 1 +status: draft +--- + +## Terminology + +| Term | Meaning | +|------|---------| +| AD | Windows Active Directory | +| samba/winbind | Client used in xapi to talk to AD | +| DC | Windows AD domain controller | +| ldap/ldaps | Lightweight Directory Access Protocol / over SSL | +| Joining host | The host joining to a pool | + +## 1. Background + +To integrate XenServer with AD, XenServer performs LDAP queries in the following use cases: + +- **Enable external authentication/Join domain**: Samba LDAP queries DC details +- **Session revalidation**: xapi queries user details (e.g., whether user is still valid, password expired, etc.) to decide whether to destroy a session + +Currently XenServer uses plain LDAP queries, which is a concern for some enterprise customers. + +## 2. Xapi Database + +### 2.1 External Auth Configuration + +External auth details are stored in the `host` (table) → `external_auth_configuration` (field). For example: + +``` +external_auth_configuration: { + domain: xenrt16718.local, + user: Administrator, + workgroup: XENRTXENRT16718, + netbios_name: genus-35-103d, + machine_pwd_last_change_time: 1767508709 +} +``` + +A new field `ldaps` (bool, optional) will be added to `external_auth_configuration` field to state whether LDAPS should be used instead of LDAP. If not set, LDAP will be used for backward compatibility. + +So the field will look like: + +``` +external_auth_configuration: { + domain: xenrt16718.local, + user: Administrator, + workgroup: XENRTXENRT16718, + netbios_name: genus-35-103d, + machine_pwd_last_change_time: 1767508709, + ldaps: true +} +``` + +### 2.2 Certificate + +To enforce security, if customer uses self-signed certificate, they need to upload the root CA certificate to XenServer, so XenServer can verify the certificate/public key used talking to DC for LDAPS. + +The [trusted-certificates.md](https://github.com/xapi-project/xen-api/blob/master/doc/content/design/trusted-certificates.md) design enhanced the `Certificate` table and introduced a new field `purpose` for security, which limits the certificate only for specific purpose. `ldaps` will be added to `purpose` field as a value for LDAPS. + +## 3. Interfaces + +### 3.1 pool.enable_external_auth + +To enable external auth, the current API arguments are as follows: + +- `pool` (Ref _pool): The pool whose external authentication should be enabled +- `config` (Map (String, String)): A list of key-values containing the configuration data +- `service_name` (String): The name of the service +- `auth_type` (String): The type of authentication (e.g., AD for Active Directory) + +For example: + +```bash +xe pool-enable-external-auth uuid= auth-type=AD service-name= config:user= config:pass= +``` + +This API signature does not change. Regarding the config map, one new option is added: + +- `config:ldaps`: whether LDAPS is required, default to `false` + - Set `client ldap sasl wrapping` to `ldaps` if true, `seal` otherwise + - This item will be stored in database in section 2.1 + +Given `ldaps` default to `false`, this feature is **NOT** enabled until explicitly set. + +### 3.2 Set/Get Pool LDAPS Status + +#### 3.2.1 pool.external_auth_set_ldaps + +User can specify LDAPS during join domain as in 3.1. + +For the existing joined domain, user can switch between LDAP and LDAPS with this new API. Args as follows: + +- `pool` (Ref _pool): pool to set LDAPS +- `ldaps` (Bool): whether LDAPS is required + +This API will set the `ldaps` in database (Refer to 2.1). + +This API performs following sanity check and rejects update if check fails: + +- AD has already been enabled +- Find proper certificate (Refer to 4.1 for the details) +- Do a `ldaps` query to embedded user `krbtgt` for the joined domain + +**Note:** +- This API allow en-entry for debug purpose +- Will not do the LDAPS query on the trusted domains, as xapi does not have trusted domain details +- The joined domain likely has multiple DCs. LDAPS query tries every DC of the domain. Check pass if LDAPS query succeeds on any DC of the domain. This implies iterate and locate a DC supporting LDAPS (with proper certificate trust setup) before LDAPS query. However, this does not introduce performance problems as the LDAPS query happens in backend and refreshes result into XAPI DB +- Pool coordinator dispatches this API request to every host, and only succeeds if all hosts pass the check +- This API needs to be synced with other APIs. For example, `authenticate_username_password` should fail if this API is performing checking and configuration + +This API will refresh `winbind` configuration (Refer to 4.1). + +So following xe command can be used to switch between LDAP and LDAPS: + +```bash +xe pool-external-auth-set-ldaps uuid= ldaps= +``` + +#### 3.2.2 Get Pool LDAPS Status + +xapi generates a get message for each field automatically. To query the LDAPS status, client only needs to query the get method of `host` (class) → `external-auth-configuration` (field), and parse the result. The example as follows: + +```bash +xe host-param-get uuid= param-name=external-auth-configuration +``` + +### 3.3 Install Certificate + +If the certificate for LDAPS in DC is signed by a private CA (vs a trusted public CA), user needs to import their Root or Intermediate CA certificate into XenServer. + +`pool.install_trusted_certificate` can install the certificate with following parameters, refer to [trusted-certificates.md](https://github.com/xapi-project/xen-api/blob/master/doc/content/design/trusted-certificates.md) for the details: + +- `session` (ref session_id): reference to a valid session +- `self` (ref Pool): reference to the pool +- `ca` (boolean): should always be `true` for `ldaps`. xapi should reject this CA otherwise +- `cert` (string): the trusted certificate in PEM format +- `purpose` (string list): the purposes of this cert. It can be one of following: + - contain `ldaps` if for specific this specific purpose + - empty set, thus would take as general purpose CA. It will be used for `ldaps` if no `ldaps` specific purpose found + +**Note:** If the DCs (of joining domain and trusted domain in use) are signed by different CAs, all the CAs need to be uploaded to XenServer. + +## 4. Configuration Item + +To enforce LDAPS, following are required: + +- Samba needs to be updated to 4.21+ (Already done) +- LDAPS needs to be enabled in smb.conf + +### 4.1 Samba Configuration + +#### 4.1.1 smb.conf + +To enforce LDAPS, xapi just passthrough the configuration to winbind. Following configuration needs to be updated to `/etc/samba/smb.conf`, details refer to [smb.conf](https://www.samba.org/samba/docs/current/man-html/smb.conf.5.html): + +```ini +client ldap sasl wrapping = +tls verify peer = ca_and_name_if_available +tls trust system cas = yes +tls cafile = /etc/trusted-certs/ca-bundle-[ldaps|general].pem +``` + +- Switch between `ldap` and `ldaps` will flip `client ldap sasl wrapping` between `seal` and `ldaps` +- `tls cafile` points to a CA bundle used to verify DC certs. Details refer to 4.1.2 + +#### 4.1.2 Certificate Selection + +This design is following [trusted-certificates.md](https://github.com/xapi-project/xen-api/blob/master/doc/content/design/trusted-certificates.md): + +- Use `/etc/trusted-certs/ca-bundle-ldaps.pem` if exists +- Fall back to `/etc/trusted-certs/ca-bundle-general.pem` if exists and previous not match +- Report error if none of above match + +**Note:** The selection/configuration is only refreshed on following cases: + +- xapi (re)start +- `pool.external_auth_set_ldaps` API +- (Re)join domain + +### 4.2 Xapi Configuration + +#### 4.2.1 winbind-tls-verify-peer + +For security, xapi asks winbind to verify CA certificate. `ca_and_name_if_available` is the default. + +However, user may want to disable this verification for debug purpose. + +`winbind-tls-verify-peer` is introduced for xapi configuration, and the possible values are `no_check`, `ca_only`, `ca_and_name_if_available`, `ca_and_name` and `as_strict_as_possible`. + The configured value will override `tls verify peer` value in xapi generated samba configuration. Refer to [smb.conf](https://www.samba.org/samba/docs/current/man-html/smb.conf.5.html) for the details. + + +**Note:** This item is not intended for public documentation. This is only for debug purpose, or system tuning for specific scenarios from engineering/support team. + +#### 4.2.2 ad-warning-message-interval + +xapi sends warning message to user with this interval on LDAP query failure. Default to 1 week. Refer to section "Session revalidate" for the details. + +## 5. Session Revalidate + +xapi LDAP queries domain user status (if user has been added to manage XenServer) at configurable interval, and destroys the session created by domain user if user no longer in healthy status. + +However, the LDAP query may fail due to various issues as follows: + +- Temporary network issues +- CA certificate is not properly configured, or expired, etc. + +Instead of destroying user session for stability, a warning message will be sent to user with the details at configurable interval `ad-warning-message-interval`. + +- If no LDAP error, do nothing +- If error happens, send the warning message if: + - first time see the error through xapi start up (so no need to persist last send time) or + - `current_time - last_sent_time > winbind_warning_message_interval` + +The message is defined as follows: +- name: AD_DC_LDAP_CHECK +- priority: Warning +- cls: `Host +- Body: LDAP(S) query check to `` of `` failed from `` of `` + +Note: +- The backend session revalidate check only performs on pool coordinator, thus the backend LDAP(S) query check only on coordinator +- `external_auth_set_ldaps` perform LDAP(S) query check on every host +- All previous AD_DC_LDAP_CHECK warning of a host will be cleaned on a successful LDAP(s) query from that host + +## 6. Pool Join/Leave + +### 6.1 pool.join + +#### 6.1.1 AD Pre-checks + +Currently the pool.join pre-check checks the following: + +- `external_auth_type`: whether joined AD +- `external_auth_service_name`: whether joined the same domain + +The pre-check is good enough, no matter whether `ldaps` is in use, as this ensures host can talk to AD. There are following cases: + +- pool with AD, joining host with same AD: check pass as before this design +- pool without AD, joining host without AD: check pass as before this design +- pool without AD, joining host with AD: check failed as before this design +- pool with AD, joining host without AD: + - LDAPS not enabled: joining host needs to join to same AD as before this design + - LDAPS enabled: joining host needs to enable AD without certificate check, details refer to 6.1.2 + +#### 6.1.2 Join Host to Pool with LDAPS Enabled + +When joining a host without AD to a pool with LDAPS enabled, the host may not have the (CA) certificate for the domain. It can be trivial to enforce customer to upload the CA certificate to every joining host, thus client would help to orchestrate certificates. + +The workflow: + +~~~mermaid + +sequenceDiagram +participant user as User +participant client as Client +participant join as Joining host +participant coor as Pool Coordinator +participant dc as AD/DC + +user->>client: pool.join +Note over client: precheck +alt precheck failed +client-->>user: precheck failed +end + +Note over client,coor: sync all ldaps certs +client->>coor: pool.download_trusted_certificate +coor-->>client: +client->>join: pool.install_trusted_certificate +join-->>client: + +user->>client: join domain username/password +client->>join: join domain username/password +join->>dc: join domain +dc-->>join: +join-->>client: + +client->>join: pool.join +Note over join,coor: join pool ops
certs sync +join-->>client: +client-->>user: pool.join succeed + +~~~ + +**Detailed Steps:** + +1. Client find proper `ldaps certs` from pool coordinator as `certs_pool` + - a. find all certs `ldaps in purpose` + - b. if no LDAPS certs, find all `general` certs +2. Client find all certs in joining host as `certs_joining_host` +3. Client identify the certs needs to be synced to joining host as `certs_to_sync = certs_pool - certs_joining_host` (certs in `certs_pool`, but not in `certs_joining_host`), the certs fingerprint should be used to identify the certs +4. Client download all `certs_to_sync`, `pool.download_trusted_certificate` from coordinator +5. Client upload all certs to joining pool, `pool.install_trusted_certificate` to joining pool, with the same purpose +6. Client trigger `pool.join` again with domain username and password +7. After pool.join: + - If pool.join failed, Client call `pool.uninstall_trusted_certificate` on joining host to revert the certs + - If pool.join succeed, do nothing as pool.join would sync the certs anyway + +### 6.2 pool.leave + +`pool.disable_external_auth` is called during pool leave, thus the `ldaps` status is cleaned. + +This design does not change it. + +## 7. Telemetry Support + +### 7.1 External Auth Enabled + +`host` (table) → `external_auth_type` (field) = `AD` + +### 7.2 LDAPS Enabled + +`host` (table) → `external_auth_configuration` (field) → `ldaps` (key) = `true` + +## References + +- [trusted-certificates.md](https://github.com/xapi-project/xen-api/blob/master/doc/content/design/trusted-certificates.md) +- [Samba smb.conf manual](https://www.samba.org/samba/docs/current/man-html/smb.conf.5.html) From 77c6bf308854524b5f24999fd4e64e164c6502ec Mon Sep 17 00:00:00 2001 From: Guillaume Date: Thu, 15 Jan 2026 13:34:47 +0100 Subject: [PATCH 04/65] Check that suspend SR has enough space to save VM state This patch adds a helper to compute the free space on a SR. It is used to check that the suspend SR has enough space when creating a snapshot with memory. If there is not enough space, SR_SUSPEND_SPACE_INSUFFICIENT is raised. Signed-off-by: Guillaume --- ocaml/idl/datamodel_errors.ml | 5 +++ ocaml/xapi-consts/api_errors.ml | 2 ++ ocaml/xapi/helpers.ml | 55 ++++++++++++++++++++++----------- ocaml/xapi/xapi_xenops.ml | 5 ++- ocaml/xapi/xha_statefile.ml | 9 ++---- 5 files changed, 51 insertions(+), 25 deletions(-) diff --git a/ocaml/idl/datamodel_errors.ml b/ocaml/idl/datamodel_errors.ml index c1b1f09e2b3..24262314274 100644 --- a/ocaml/idl/datamodel_errors.ml +++ b/ocaml/idl/datamodel_errors.ml @@ -1116,6 +1116,11 @@ let _ = "The source SR does not have sufficient temporary space available to \ proceed the operation." () ; + error Api_errors.sr_suspend_space_insufficient ["sr"] + ~doc: + "The suspend SR does not have sufficient free space to store the VM \ + suspend image required to complete a snapshot with memory." + () ; error Api_errors.pbd_exists ["sr"; "host"; "pbd"] ~doc:"A PBD already exists connecting the SR to the server." () ; error Api_errors.sr_has_pbd ["sr"] diff --git a/ocaml/xapi-consts/api_errors.ml b/ocaml/xapi-consts/api_errors.ml index e796369f583..8ca6eaafaa9 100644 --- a/ocaml/xapi-consts/api_errors.ml +++ b/ocaml/xapi-consts/api_errors.ml @@ -501,6 +501,8 @@ let sr_full = add_error "SR_FULL" let sr_source_space_insufficient = add_error "SR_SOURCE_SPACE_INSUFFICIENT" +let sr_suspend_space_insufficient = add_error "SR_SUSPEND_SPACE_INSUFFICIENT" + let sr_has_pbd = add_error "SR_HAS_PBD" let sr_requires_upgrade = add_error "SR_REQUIRES_UPGRADE" diff --git a/ocaml/xapi/helpers.ml b/ocaml/xapi/helpers.ml index 519fc0f2948..e571b962272 100644 --- a/ocaml/xapi/helpers.ml +++ b/ocaml/xapi/helpers.ml @@ -1127,8 +1127,14 @@ let check_sr_exists_for_host ~__context ~self ~host = else None +(* Returns the amount of free space for a given SR *) +let get_sr_free_space ~__context ~sr = + let size = Db.SR.get_physical_size ~__context ~self:sr in + let utilisation = Db.SR.get_physical_utilisation ~__context ~self:sr in + Int64.sub size utilisation + (* Returns an SR suitable for suspending this VM *) -let choose_suspend_sr ~__context ~vm = +let choose_suspend_sr ~__context ~vm ~required_space = (* If the VM.suspend_SR exists, use that. If it fails, try the Pool.suspend_image_SR. *) (* If that fails, try the Host.suspend_image_SR. *) let vm_sr = Db.VM.get_suspend_SR ~__context ~self:vm in @@ -1136,23 +1142,36 @@ let choose_suspend_sr ~__context ~vm = let pool_sr = Db.Pool.get_suspend_image_SR ~__context ~self:pool in let resident_on = Db.VM.get_resident_on ~__context ~self:vm in let host_sr = Db.Host.get_suspend_image_sr ~__context ~self:resident_on in - match - ( check_sr_exists_for_host ~__context ~self:vm_sr ~host:resident_on - , check_sr_exists_for_host ~__context ~self:pool_sr ~host:resident_on - , check_sr_exists_for_host ~__context ~self:host_sr ~host:resident_on - ) - with - | Some x, _, _ -> - x - | _, Some x, _ -> - x - | _, _, Some x -> - x - | None, None, None -> - raise - (Api_errors.Server_error - (Api_errors.vm_no_suspend_sr, [Ref.string_of vm]) - ) + let sr = + match + ( check_sr_exists_for_host ~__context ~self:vm_sr ~host:resident_on + , check_sr_exists_for_host ~__context ~self:pool_sr ~host:resident_on + , check_sr_exists_for_host ~__context ~self:host_sr ~host:resident_on + ) + with + | Some x, _, _ -> + x + | _, Some x, _ -> + x + | _, _, Some x -> + x + | None, None, None -> + raise + (Api_errors.Server_error + (Api_errors.vm_no_suspend_sr, [Ref.string_of vm]) + ) + in + let free_space = get_sr_free_space ~__context ~sr in + if free_space < required_space then ( + let sr_str = Ref.string_of sr in + error "%s: SR %s free=%Ld needed=%Ld" __FUNCTION__ sr_str free_space + required_space ; + raise + (Api_errors.Server_error + (Api_errors.sr_suspend_space_insufficient, [sr_str]) + ) + ) else + sr (* return the operations filtered for cancels functions *) let cancel_tasks ~__context ~ops ~all_tasks_in_db diff --git a/ocaml/xapi/xapi_xenops.ml b/ocaml/xapi/xapi_xenops.ml index c3a0687fb5e..f63e3ccf7d9 100644 --- a/ocaml/xapi/xapi_xenops.ml +++ b/ocaml/xapi/xapi_xenops.ml @@ -4028,7 +4028,10 @@ let suspend ~__context ~self = in Int64.(ram |> add vgpu |> add 104857600L) in - let suspend_SR = Helpers.choose_suspend_sr ~__context ~vm:self in + let suspend_SR = + Helpers.choose_suspend_sr ~__context ~vm:self + ~required_space:space_needed + in let sm_config = [ (Constants._sm_vm_hint, id) diff --git a/ocaml/xapi/xha_statefile.ml b/ocaml/xapi/xha_statefile.ml index 54428684a44..0c5baf6ab65 100644 --- a/ocaml/xapi/xha_statefile.ml +++ b/ocaml/xapi/xha_statefile.ml @@ -58,14 +58,11 @@ let ha_fits_sr ~__context ~what ~sr ~typ ~minimum_size = | [] -> debug "no suitable existing %s found; would have to create a fresh one" what ; - let self = sr in - let size = Db.SR.get_physical_size ~__context ~self in - let utilisation = Db.SR.get_physical_utilisation ~__context ~self in - let free_space = Int64.sub size utilisation in + let free_space = Helpers.get_sr_free_space ~__context ~sr in if free_space < minimum_sr_size then ( let sr = Ref.string_of sr in - info "%s: SR %s size=%Ld utilisation=%Ld free=%Ld needed=%Ld" - __FUNCTION__ sr size utilisation free_space minimum_sr_size ; + info "%s: SR %s free=%Ld needed=%Ld" __FUNCTION__ sr free_space + minimum_sr_size ; raise (Api_errors.Server_error (Api_errors.sr_source_space_insufficient, [sr]) From 1370eccd597f450572d57de0298dc037a31bd6c7 Mon Sep 17 00:00:00 2001 From: Marcus Granado Date: Thu, 24 Jul 2025 14:15:40 +0000 Subject: [PATCH 05/65] CP-309060: Domain CPU RRD3 metric - numa_node_nonaffine_vcpus Adding a new CPU RRD metric: "numa_node_nonaffine_vcpus" per domain as fraction of vCPU time running outside of vCPU affinity. Signed-off-by: Marcus Granado --- ocaml/xcp-rrdd/bin/rrdp-cpu/rrdp_cpu.ml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/ocaml/xcp-rrdd/bin/rrdp-cpu/rrdp_cpu.ml b/ocaml/xcp-rrdd/bin/rrdp-cpu/rrdp_cpu.ml index 0cea2e90295..c03406f2609 100644 --- a/ocaml/xcp-rrdd/bin/rrdp-cpu/rrdp_cpu.ml +++ b/ocaml/xcp-rrdd/bin/rrdp-cpu/rrdp_cpu.ml @@ -80,6 +80,24 @@ let dss_vcpus xc doms = ) ] in + let nonaffine_vcpus_ds = + match ri.Xenctrl.Runstateinfo.V2.running with + | 0L -> + [] + | _ -> + [ + ( Rrd.VM uuid + , Ds.ds_make ~name:"numa_node_nonaffine_vcpus" ~units:"(fraction)" + ~value: + (Rrd.VT_Float + (Int64.to_float ri.Xenctrl.Runstateinfo.V2.nonaffine /. 1.0e9) + ) + ~description: + "Fraction of vCPU time running outside of vCPU soft-affinity" + ~ty:Rrd.Derive ~default:false ~min:0.0 ~max:1.0 () + ) + ] + in ( Rrd.VM uuid , Ds.ds_make ~name:"runstate_fullrun" ~units:"(fraction)" ~value: @@ -172,6 +190,7 @@ let dss_vcpus xc doms = ) :: dss @ runnable_vcpus_ds + @ nonaffine_vcpus_ds with _ -> dss in try cpus 0 dss with _ -> dss From a5163593ba6e6d8f8c854e6644a089b546cb549b Mon Sep 17 00:00:00 2001 From: Christian Lindig Date: Wed, 29 Oct 2025 15:43:42 +0000 Subject: [PATCH 06/65] xenopsd/xc: adapt claim_pages to new single numa node version (CP-53658) Now the numa node needs to be passed. A special value of (~0U) is used to signify that no node is meant to be used. Since this is arch-dependent, and contained in a long in x86_64, an int is used to encode the value. Also remove the exception that was guarding the codepath to use this case Signed-off-by: Pau Ruiz Safont Signed-off-by: Christian Lindig --- ocaml/libs/xenctrl-ext/xenctrlext.ml | 1 - ocaml/libs/xenctrl-ext/xenctrlext_stubs.c | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/ocaml/libs/xenctrl-ext/xenctrlext.ml b/ocaml/libs/xenctrl-ext/xenctrlext.ml index 31360870fbc..419907ff268 100644 --- a/ocaml/libs/xenctrl-ext/xenctrlext.ml +++ b/ocaml/libs/xenctrl-ext/xenctrlext.ml @@ -124,7 +124,6 @@ end exception Not_available let domain_claim_pages handle domid ?(numa_node = NumaNode.none) nr_pages = - if numa_node <> NumaNode.none then raise Not_available ; stub_domain_claim_pages handle domid numa_node nr_pages let get_nr_nodes handle = diff --git a/ocaml/libs/xenctrl-ext/xenctrlext_stubs.c b/ocaml/libs/xenctrl-ext/xenctrlext_stubs.c index 5da78215acc..c250a955059 100644 --- a/ocaml/libs/xenctrl-ext/xenctrlext_stubs.c +++ b/ocaml/libs/xenctrl-ext/xenctrlext_stubs.c @@ -681,11 +681,11 @@ CAMLprim value stub_xenctrlext_domain_claim_pages(value xch_val, value domid_val int retval, the_errno; xc_interface* xch = xch_of_val(xch_val); uint32_t domid = Int_val(domid_val); - // unsigned int numa_node = Int_val(numa_node_val); + unsigned int numa_node = Int_val(numa_node_val); unsigned long nr_pages = Long_val(nr_pages_val); caml_release_runtime_system(); - retval = xc_domain_claim_pages(xch, domid, /*numa_node,*/ nr_pages); + retval = xc_domain_claim_pages_node(xch, domid, numa_node, nr_pages); the_errno = errno; caml_acquire_runtime_system(); From 7153d5dde283d59b032f602176db7494dc2e3241 Mon Sep 17 00:00:00 2001 From: Pau Ruiz Safont Date: Thu, 19 Jun 2025 13:27:20 +0100 Subject: [PATCH 07/65] xenopsd-xc: do not try keep track of free memory when planning NUMA nodes (CA-411684) Free memory is now properly accounted for because the memory pages are claimed within the NUMA mutex, so there's no need to have double tracking. On top of that, this code never increased the free memory, which means that it always reached a point where it was impossible to allocate a domain into a single numa node. Signed-off-by: Pau Ruiz Safont --- ocaml/xenopsd/xc/domain.ml | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/ocaml/xenopsd/xc/domain.ml b/ocaml/xenopsd/xc/domain.ml index 1cdc9a87785..1dbc52c2cab 100644 --- a/ocaml/xenopsd/xc/domain.ml +++ b/ocaml/xenopsd/xc/domain.ml @@ -987,8 +987,6 @@ let numa_hierarchy = let numa_mutex = Mutex.create () -let numa_resources = ref None - let numa_init () = let xcext = Xenctrlext.get_handle () in let host = Lazy.force numa_hierarchy in @@ -1022,14 +1020,7 @@ let numa_placement domid ~vcpus ~cores ~memory affinity = (NUMA.nodes host) numa_meminfo in let vm = NUMARequest.make ~memory ~vcpus ~cores in - let nodea = - match !numa_resources with - | None -> - Array.of_seq nodes - | Some a -> - Array.map2 NUMAResource.min_memory (Array.of_seq nodes) a - in - numa_resources := Some nodea ; + let nodea = Array.of_seq nodes in let cpu_affinity, memory_plan = match Softaffinity.plan ~vm host nodea with | None -> From 6f750b8c17d93345ac5afec252c4938ab89ba263 Mon Sep 17 00:00:00 2001 From: Marcus Granado Date: Tue, 1 Jul 2025 20:15:12 +0000 Subject: [PATCH 08/65] CP-54238: RRD4: rebase over rrdp_squeezed.ml Signed-off-by: Marcus Granado --- .../bin/rrdp-squeezed/rrdp_squeezed.ml | 59 ++++++++++++++++++- 1 file changed, 56 insertions(+), 3 deletions(-) diff --git a/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml b/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml index df49dca259f..b36dfdff902 100644 --- a/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml +++ b/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml @@ -252,9 +252,61 @@ let free_other uuid free = ~value:(Rrd.VT_Int64 free) ~ty:Rrd.Gauge ~min:0.0 ~default:true () ) +let dss_numa_info xc dom uuid domid = + try + + let host_nr_nodes = Xenctrlext.(get_handle () |> get_nr_nodes) in + let vm_nodes = + + Xenctrl.domain_get_numa_info_node_pages xc domid + in + let dss_memory_numa_nodes_of_vm (dss, nr_nodes_used_by_vm) + (node_id, tot_pages_per_node) = + (* + for each numa node used by the host, show the + corresponding amount of memory used by the vm + *) + let is_node_used_by_vm = tot_pages_per_node > 0L in + let is_node_used_by_host = node_id < host_nr_nodes in + if is_node_used_by_host then + ( ( Rrd.VM uuid + , Ds.ds_make + ~name:(Printf.sprintf "memory_numa_node_%d" node_id) + ~units:"B" + ~description: + (Printf.sprintf "Memory from NUMA node %d used by VM" node_id) + ~value:(Rrd.VT_Int64 (Int64.mul tot_pages_per_node 4096L)) + ~min:0.0 ~ty:Rrd.Gauge ~default:false () + ) + :: dss + (* remember the number of nodes used by vm *) + , nr_nodes_used_by_vm + if is_node_used_by_vm then 1 else 0 + ) + else + (dss, nr_nodes_used_by_vm) + in + let dss_numa_nodes_of_vm (dss, nr_nodes_used_by_vm) = + ( Rrd.VM uuid + , Ds.ds_make + ~name:(Printf.sprintf "numa_nodes") + ~units:"count" + ~description:(Printf.sprintf "Number of NUMA nodes used by VM") + ~value:(Rrd.VT_Int64 (Int64.of_int nr_nodes_used_by_vm)) + ~min:0.0 ~ty:Rrd.Gauge ~default:false () + ) + :: (List.rev dss) + in + vm_nodes.Xenctrl.tot_pages_per_node + |> Array.mapi (fun i x -> (i, x)) + |> Array.fold_left dss_memory_numa_nodes_of_vm ([], 0) + |> dss_numa_nodes_of_vm + with e -> + D.debug "dss_numa_info: %s" (Printexc.to_string e) ; + [] + let get_list f = Option.to_list (f ()) -let generate_vm_sources domains = +let generate_vm_sources xc domains = let metrics_of ((dom, uuid, domid), {target; free; _}) = let target () = Option.map @@ -288,6 +340,7 @@ let generate_vm_sources domains = ~value:(Rrd.VT_Int64 memory) ~ty:Rrd.Gauge ~min:0.0 ~default:true () ) in + let get_list_numa_info = dss_numa_info xc dom uuid domid in (* CA-34383: Memory updates from paused domains serve no useful purpose. During a migrate such updates can also cause undesirable discontinuities in the observed value of memory_actual. Hence, we @@ -295,14 +348,14 @@ let generate_vm_sources domains = if dom.Xenctrl.paused then [] else - get_list target @ get_list free @ get_list total + get_list target @ get_list free @ get_list total @ get_list_numa_info in List.concat_map metrics_of domains let generate_sources xc () = let domain_stats = get_domain_stats xc in - generate_host_sources xc domain_stats @ generate_vm_sources domain_stats + generate_host_sources xc domain_stats @ generate_vm_sources xc domain_stats (** The json-like serialization for 3 dss in dss_mem_vms takes 622 bytes. These bytes plus some overhead make 1024 bytes an upper bound. *) From 9f7172860b28bb0add118320cd3bd617b224396a Mon Sep 17 00:00:00 2001 From: Marcus Granado Date: Tue, 1 Jul 2025 20:25:18 +0000 Subject: [PATCH 09/65] CA-412929: work around a small amount of internal pages in unclaimed node Signed-off-by: Marcus Granado --- ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml b/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml index b36dfdff902..9feca170218 100644 --- a/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml +++ b/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml @@ -266,7 +266,7 @@ let dss_numa_info xc dom uuid domid = for each numa node used by the host, show the corresponding amount of memory used by the vm *) - let is_node_used_by_vm = tot_pages_per_node > 0L in + let is_node_used_by_vm = tot_pages_per_node > 4096L in let is_node_used_by_host = node_id < host_nr_nodes in if is_node_used_by_host then ( ( Rrd.VM uuid From ef10c4fd5cc484143a09b35b9bf080a64637224a Mon Sep 17 00:00:00 2001 From: Marcus Granado Date: Wed, 10 Dec 2025 15:11:34 +0000 Subject: [PATCH 10/65] CP-310822: RRD4 :link Xenctrlext in rrdp-squeezed Signed-off-by: Marcus Granado --- ocaml/xcp-rrdd/bin/rrdp-squeezed/dune | 1 + 1 file changed, 1 insertion(+) diff --git a/ocaml/xcp-rrdd/bin/rrdp-squeezed/dune b/ocaml/xcp-rrdd/bin/rrdp-squeezed/dune index 75c8e1f5ab5..ee5f217f137 100644 --- a/ocaml/xcp-rrdd/bin/rrdp-squeezed/dune +++ b/ocaml/xcp-rrdd/bin/rrdp-squeezed/dune @@ -13,6 +13,7 @@ xapi-log xapi-rrd xenctrl + xenctrl_ext xenstore xenstore.unix xenstore_transport From e6e90d6f3d2da8dd432b71d70583e113aadc8f13 Mon Sep 17 00:00:00 2001 From: Rob Hoes Date: Tue, 20 Jan 2026 10:42:29 +0000 Subject: [PATCH 11/65] Update datamodel lifecycle Signed-off-by: Rob Hoes --- ocaml/idl/datamodel_lifecycle.ml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ocaml/idl/datamodel_lifecycle.ml b/ocaml/idl/datamodel_lifecycle.ml index 8cfdf21cef2..ffd0064859a 100644 --- a/ocaml/idl/datamodel_lifecycle.ml +++ b/ocaml/idl/datamodel_lifecycle.ml @@ -138,11 +138,11 @@ let prototyped_of_field = function | "VM_guest_metrics", "netbios_name" -> Some "24.28.0" | "VM_metrics", "numa_node_memory" -> - Some "26.1.0-next" + Some "26.2.0" | "VM_metrics", "numa_nodes" -> - Some "26.1.0-next" + Some "26.2.0" | "VM_metrics", "numa_optimised" -> - Some "26.1.0-next" + Some "26.2.0" | "VM", "groups" -> Some "24.19.1" | "VM", "pending_guidances_full" -> From ea41b659e44badd5599bc3e2fd0a989d020628a3 Mon Sep 17 00:00:00 2001 From: Marcus Granado Date: Tue, 20 Jan 2026 18:20:43 +0000 Subject: [PATCH 12/65] CP-53658: only use xc_domain_claim_pages_node if defined this avoids a link error in systems that don't have this function: /usr/bin/ld: ocaml/libs/xenctrl-ext/libxenctrl_ext_stubs.a(xenctrlext_stubs.o): in function `stub_xenctrlext_domain_claim_pages': xen-api/_build/default/ocaml/libs/xenctrl-ext/xenctrlext_stubs.c:688: undefined reference to `xc_domain_claim_pages_node' set errno to ENOSYS if not defined, to keep behaviour consistent with other stubs in this file. keep CAMLparam4 in both cases to avoid unused parameter warnings. Signed-off-by: Marcus Granado --- ocaml/libs/xenctrl-ext/xenctrlext_stubs.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ocaml/libs/xenctrl-ext/xenctrlext_stubs.c b/ocaml/libs/xenctrl-ext/xenctrlext_stubs.c index c250a955059..f4dd234b780 100644 --- a/ocaml/libs/xenctrl-ext/xenctrlext_stubs.c +++ b/ocaml/libs/xenctrl-ext/xenctrlext_stubs.c @@ -678,6 +678,7 @@ CAMLprim value stub_xenctrlext_domain_claim_pages(value xch_val, value domid_val value numa_node_val, value nr_pages_val) { CAMLparam4(xch_val, domid_val, numa_node_val, nr_pages_val); +#ifdef XEN_DOMCTL_NUMA_OP_GET_NODE_PAGES int retval, the_errno; xc_interface* xch = xch_of_val(xch_val); uint32_t domid = Int_val(domid_val); @@ -694,6 +695,9 @@ CAMLprim value stub_xenctrlext_domain_claim_pages(value xch_val, value domid_val "Error when trying to claim memory pages"); } CAMLreturn(Val_unit); +#else + raise_unix_errno_msg(ENOSYS, "xc_domain_claim_pages_node"); +#endif } #ifdef XEN_DOMCTL_NUMA_OP_GET_NODE_PAGES From 80045e85a1934e187deeb340b388a75a72d0889f Mon Sep 17 00:00:00 2001 From: Marcus Granado Date: Tue, 20 Jan 2026 19:26:28 +0000 Subject: [PATCH 13/65] reformat code using latest ocamlformat 0.28.1 Signed-off-by: Marcus Granado --- ocaml/xcp-rrdd/bin/rrdp-cpu/rrdp_cpu.ml | 10 +++++++--- .../xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml | 17 +++++++++-------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/ocaml/xcp-rrdd/bin/rrdp-cpu/rrdp_cpu.ml b/ocaml/xcp-rrdd/bin/rrdp-cpu/rrdp_cpu.ml index c03406f2609..763aaaed5d8 100644 --- a/ocaml/xcp-rrdd/bin/rrdp-cpu/rrdp_cpu.ml +++ b/ocaml/xcp-rrdd/bin/rrdp-cpu/rrdp_cpu.ml @@ -87,13 +87,17 @@ let dss_vcpus xc doms = | _ -> [ ( Rrd.VM uuid - , Ds.ds_make ~name:"numa_node_nonaffine_vcpus" ~units:"(fraction)" + , Ds.ds_make ~name:"numa_node_nonaffine_vcpus" + ~units:"(fraction)" ~value: (Rrd.VT_Float - (Int64.to_float ri.Xenctrl.Runstateinfo.V2.nonaffine /. 1.0e9) + (Int64.to_float ri.Xenctrl.Runstateinfo.V2.nonaffine + /. 1.0e9 + ) ) ~description: - "Fraction of vCPU time running outside of vCPU soft-affinity" + "Fraction of vCPU time running outside of vCPU \ + soft-affinity" ~ty:Rrd.Derive ~default:false ~min:0.0 ~max:1.0 () ) ] diff --git a/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml b/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml index 9feca170218..f0809671da5 100644 --- a/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml +++ b/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml @@ -254,12 +254,8 @@ let free_other uuid free = let dss_numa_info xc dom uuid domid = try - let host_nr_nodes = Xenctrlext.(get_handle () |> get_nr_nodes) in - let vm_nodes = - - Xenctrl.domain_get_numa_info_node_pages xc domid - in + let vm_nodes = Xenctrl.domain_get_numa_info_node_pages xc domid in let dss_memory_numa_nodes_of_vm (dss, nr_nodes_used_by_vm) (node_id, tot_pages_per_node) = (* @@ -279,8 +275,13 @@ let dss_numa_info xc dom uuid domid = ~min:0.0 ~ty:Rrd.Gauge ~default:false () ) :: dss - (* remember the number of nodes used by vm *) - , nr_nodes_used_by_vm + if is_node_used_by_vm then 1 else 0 + (* remember the number of nodes used by vm *) + , nr_nodes_used_by_vm + + + if is_node_used_by_vm then + 1 + else + 0 ) else (dss, nr_nodes_used_by_vm) @@ -294,7 +295,7 @@ let dss_numa_info xc dom uuid domid = ~value:(Rrd.VT_Int64 (Int64.of_int nr_nodes_used_by_vm)) ~min:0.0 ~ty:Rrd.Gauge ~default:false () ) - :: (List.rev dss) + :: List.rev dss in vm_nodes.Xenctrl.tot_pages_per_node |> Array.mapi (fun i x -> (i, x)) From 4eb2ca1151f0bcd6c2c838d758d655181a50f03a Mon Sep 17 00:00:00 2001 From: Marcus Granado Date: Tue, 20 Jan 2026 21:07:50 +0000 Subject: [PATCH 14/65] CP-310822: use only xenctrlext Signed-off-by: Marcus Granado --- ocaml/libs/xenctrl-ext/xenctrlext.mli | 8 ++++++++ .../xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml | 17 ++++++++++------- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/ocaml/libs/xenctrl-ext/xenctrlext.mli b/ocaml/libs/xenctrl-ext/xenctrlext.mli index 11d6ad8ad66..1851b3d1b12 100644 --- a/ocaml/libs/xenctrl-ext/xenctrlext.mli +++ b/ocaml/libs/xenctrl-ext/xenctrlext.mli @@ -109,6 +109,14 @@ val get_nr_nodes : handle -> int (** Returns the count of NUMA nodes available in the system. *) module DomainNuma : sig + type domain_numainfo_node_pages = { + tot_pages_per_node: int64 array (* page=4k bytes *) + } + + external domain_get_numa_info_node_pages : + handle -> int -> domain_numainfo_node_pages + = "stub_xc_domain_numa_get_node_pages_wrapper" + type t = {optimised: bool; nodes: int; memory: int64 array (* bytes *)} val state : handle -> domid:int -> t diff --git a/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml b/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml index f0809671da5..951b7a8b1d5 100644 --- a/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml +++ b/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml @@ -252,10 +252,13 @@ let free_other uuid free = ~value:(Rrd.VT_Int64 free) ~ty:Rrd.Gauge ~min:0.0 ~default:true () ) -let dss_numa_info xc dom uuid domid = +let dss_numa_info uuid domid = try - let host_nr_nodes = Xenctrlext.(get_handle () |> get_nr_nodes) in - let vm_nodes = Xenctrl.domain_get_numa_info_node_pages xc domid in + let handle = Xenctrlext.get_handle () in + let host_nr_nodes = Xenctrlext.get_nr_nodes handle in + let vm_nodes = + Xenctrlext.DomainNuma.domain_get_numa_info_node_pages handle domid + in let dss_memory_numa_nodes_of_vm (dss, nr_nodes_used_by_vm) (node_id, tot_pages_per_node) = (* @@ -297,7 +300,7 @@ let dss_numa_info xc dom uuid domid = ) :: List.rev dss in - vm_nodes.Xenctrl.tot_pages_per_node + vm_nodes.Xenctrlext.DomainNuma.tot_pages_per_node |> Array.mapi (fun i x -> (i, x)) |> Array.fold_left dss_memory_numa_nodes_of_vm ([], 0) |> dss_numa_nodes_of_vm @@ -307,7 +310,7 @@ let dss_numa_info xc dom uuid domid = let get_list f = Option.to_list (f ()) -let generate_vm_sources xc domains = +let generate_vm_sources domains = let metrics_of ((dom, uuid, domid), {target; free; _}) = let target () = Option.map @@ -341,7 +344,7 @@ let generate_vm_sources xc domains = ~value:(Rrd.VT_Int64 memory) ~ty:Rrd.Gauge ~min:0.0 ~default:true () ) in - let get_list_numa_info = dss_numa_info xc dom uuid domid in + let get_list_numa_info = dss_numa_info uuid domid in (* CA-34383: Memory updates from paused domains serve no useful purpose. During a migrate such updates can also cause undesirable discontinuities in the observed value of memory_actual. Hence, we @@ -356,7 +359,7 @@ let generate_vm_sources xc domains = let generate_sources xc () = let domain_stats = get_domain_stats xc in - generate_host_sources xc domain_stats @ generate_vm_sources xc domain_stats + generate_host_sources xc domain_stats @ generate_vm_sources domain_stats (** The json-like serialization for 3 dss in dss_mem_vms takes 622 bytes. These bytes plus some overhead make 1024 bytes an upper bound. *) From d73437c5782ca132557845c838e904308240c37b Mon Sep 17 00:00:00 2001 From: Lin Liu Date: Tue, 13 Jan 2026 08:39:25 +0000 Subject: [PATCH 15/65] CA-422713: XSI-2105: Pool.join failed due to AD status corrupt The target pool has leaved AD, the joining host leave AD as well. However, the AD status is somehow corrupt - external_auth_type is empty, this is expected - external_auth_service_name is a valid domain This confused pool.join as it thinks AD is not enabled, but somehow joined to a domain. - Normal domain leave does not resolve the issue, and it does not join domain - Join domain again(failed) does not resolve it neither, as xapi will restore to the current value before join on failed. This commit introduce force option to host.disable_external_auth API to force clean up to recover host BTW, current code try to keep them consistent already, but not atomic. Signed-off-by: Lin Liu --- ocaml/idl/datamodel_host.ml | 7 +++++ ocaml/xapi-cli-server/cli_operations.ml | 2 +- ocaml/xapi/message_forwarding.ml | 8 +++--- ocaml/xapi/xapi_host.ml | 34 ++++++++----------------- ocaml/xapi/xapi_host.mli | 2 ++ ocaml/xapi/xapi_pool.ml | 18 ++++++++++--- 6 files changed, 41 insertions(+), 30 deletions(-) diff --git a/ocaml/idl/datamodel_host.ml b/ocaml/idl/datamodel_host.ml index c8d5743b6ae..d4fad6e7d8d 100644 --- a/ocaml/idl/datamodel_host.ml +++ b/ocaml/idl/datamodel_host.ml @@ -2033,6 +2033,13 @@ let disable_external_auth = ; param_release= george_release ; param_default= Some (VMap []) } + ; { + param_type= Bool + ; param_name= "force" + ; param_doc= "Disable external auth even when not enabled" + ; param_release= numbered_release "26.2.0-next" + ; param_default= Some (VBool false) + } ] ~doc:"This call disables external authentication on the local host" ~allowed_roles:_R_POOL_ADMIN () diff --git a/ocaml/xapi-cli-server/cli_operations.ml b/ocaml/xapi-cli-server/cli_operations.ml index b20ed934107..e0921800b21 100644 --- a/ocaml/xapi-cli-server/cli_operations.ml +++ b/ocaml/xapi-cli-server/cli_operations.ml @@ -7089,7 +7089,7 @@ let host_disable_external_auth _printer rpc session_id params = let host_uuid = List.assoc "host-uuid" params in let host = Client.Host.get_by_uuid ~rpc ~session_id ~uuid:host_uuid in let config = read_map_params "config" params in - Client.Host.disable_external_auth ~rpc ~session_id ~host ~config + Client.Host.disable_external_auth ~rpc ~session_id ~host ~config ~force:true let host_refresh_pack_info _printer rpc session_id params = let host_uuid = List.assoc "host-uuid" params in diff --git a/ocaml/xapi/message_forwarding.ml b/ocaml/xapi/message_forwarding.ml index 1d8d228cc80..83786bd8afd 100644 --- a/ocaml/xapi/message_forwarding.ml +++ b/ocaml/xapi/message_forwarding.ml @@ -3730,11 +3730,13 @@ functor in do_op_on ~local_fn ~__context ~host ~remote_fn - let disable_external_auth ~__context ~host ~config = + let disable_external_auth ~__context ~host ~config ~force = info "Host.disable_external_auth: host = '%s'" (host_uuid ~__context host) ; - let local_fn = Local.Host.disable_external_auth ~host ~config in - let remote_fn = Client.Host.disable_external_auth ~host ~config in + let local_fn = Local.Host.disable_external_auth ~host ~config ~force in + let remote_fn = + Client.Host.disable_external_auth ~host ~config ~force + in do_op_on ~local_fn ~__context ~host ~remote_fn let install_ca_certificate ~__context ~host ~name ~cert = diff --git a/ocaml/xapi/xapi_host.ml b/ocaml/xapi/xapi_host.ml index 08ded94e240..f6b1109d6fd 100644 --- a/ocaml/xapi/xapi_host.ml +++ b/ocaml/xapi/xapi_host.ml @@ -1889,13 +1889,13 @@ let enable_external_auth ~__context ~host ~config ~service_name ~auth_type = (* CP-718: Disables external auth/directory service for host *) let disable_external_auth_common ?(during_pool_eject = false) ~__context ~host - ~config () = + ~config ~force () = (* CP-825: Serialize execution of host-enable-extauth and host-disable-extauth *) (* we need to protect against concurrent access to the host.external_auth_type variable *) with_lock serialize_host_enable_disable_extauth (fun () -> let host_name_label = Db.Host.get_name_label ~__context ~self:host in let auth_type = Db.Host.get_external_auth_type ~__context ~self:host in - if auth_type = "" then + if auth_type = "" && not force then (* nothing to do, external authentication is already disabled *) let msg = "external authentication service is already disabled" in debug "Failed to disable external authentication in host %s: %s" @@ -1936,6 +1936,8 @@ let disable_external_auth_common ?(during_pool_eject = false) ~__context ~host , [msg] ) ) + | Extauth_is_disabled -> + Some Extauth_is_disabled | e -> (*absorb any exception*) debug @@ -1957,19 +1959,6 @@ let disable_external_auth_common ?(during_pool_eject = false) ~__context ~host Xapi_globs.event_hook_auth_on_xapi_initialize_succeeded := true ; (* succeeds because there's no need to initialize anymore *) - - (* If any cache is present, clear it in order to ensure cached - logins don't persist after disabling external - authentication. *) - Xapi_session.clear_external_auth_cache () ; - - (* 3. CP-703: we always revalidate all sessions after the external authentication has been disabled *) - (* so that all sessions that were externally authenticated will be destroyed *) - debug - "calling revalidate_all_sessions after disabling external auth for \ - host %s" - host_name_label ; - Xapi_session.revalidate_all_sessions ~__context ; if not during_pool_eject then (* CA-28168 *) (* CA-24856: detect non-homogeneous external-authentication config in this host *) @@ -1978,19 +1967,18 @@ let disable_external_auth_common ?(during_pool_eject = false) ~__context ~host if auth_type = Xapi_globs.auth_type_AD then Extauth_ad.stop_backend_daemon ~wait_until_success:false ; match plugin_disable_failure with - | None -> + (* we do not want to stop pool_eject and permit Extauth_is_disabled during force *) + | Some e when during_pool_eject || (e = Extauth_is_disabled && force) -> () | Some e -> - if not during_pool_eject then - raise e (* bubble up plugin's on_disable exception *) - else - () - (* we do not want to stop pool_eject *) + raise e + | None -> + () ) -let disable_external_auth ~__context ~host ~config = +let disable_external_auth ~__context ~host ~config ~force = disable_external_auth_common ~during_pool_eject:false ~__context ~host ~config - () + ~force () module Static_vdis_list = Xapi_database.Static_vdis_list diff --git a/ocaml/xapi/xapi_host.mli b/ocaml/xapi/xapi_host.mli index d01bc221305..f153d37422a 100644 --- a/ocaml/xapi/xapi_host.mli +++ b/ocaml/xapi/xapi_host.mli @@ -361,6 +361,7 @@ val disable_external_auth_common : -> __context:Context.t -> host:API.ref_host -> config:(string * string) list + -> force:bool -> unit -> unit @@ -368,6 +369,7 @@ val disable_external_auth : __context:Context.t -> host:API.ref_host -> config:(string * string) list + -> force:bool -> unit (** {2 Static VDIs} *) diff --git a/ocaml/xapi/xapi_pool.ml b/ocaml/xapi/xapi_pool.ml index ef4abe5cd07..5214fb3998a 100644 --- a/ocaml/xapi/xapi_pool.ml +++ b/ocaml/xapi/xapi_pool.ml @@ -2037,7 +2037,7 @@ let eject_self ~__context ~host = (* disable the external authentication of this slave being ejected *) (* this call will return an exception if something goes wrong *) Xapi_host.disable_external_auth_common ~during_pool_eject:true ~__context - ~host ~config:[] () ; + ~host ~config:[] ~force:false () ; (* FIXME: in the future, we should send the windows AD admin/pass here *) (* in order to remove the slave from the AD database during pool-eject *) @@ -2973,7 +2973,7 @@ let enable_external_auth ~__context ~pool:_ ~config ~service_name ~auth_type = (* best-effort attempt to disable all enabled hosts, swallowing any exceptions *) try call_fn_on_host ~__context - (Client.Host.disable_external_auth ~config) + (Client.Host.disable_external_auth ~config ~force:false) host with e -> debug @@ -3041,7 +3041,7 @@ let disable_external_auth ~__context ~pool:_ ~config = (* forward the call to the host in the pool *) try call_fn_on_host ~__context - (Client.Host.disable_external_auth ~config) + (Client.Host.disable_external_auth ~config ~force:false) host ; (* no failed host to add to the filtered list, just visit next host *) (host, "", "") @@ -3100,9 +3100,21 @@ let disable_external_auth ~__context ~pool:_ ~config = ) ) ) else (* OK *) + ( + (* If any cache is present, clear it in order to ensure cached + logins don't persist after disabling external + authentication. *) + Xapi_session.clear_external_auth_cache () ; + + (* CP-703: we always revalidate all sessions after the external authentication has been disabled *) + (* so that all sessions that were externally authenticated will be destroyed *) + debug "calling revalidate_all_sessions after disabling external auth" ; + Xapi_session.revalidate_all_sessions ~__context ; + debug "The external authentication of all hosts in the pool was disabled \ successfully" + ) ) (* CA-24856: detect non-homogeneous external-authentication config in pool *) From be5420b794f5fa086e2c1e1473c80cb6f07b8095 Mon Sep 17 00:00:00 2001 From: Christian Lindig Date: Thu, 15 Jan 2026 15:55:05 +0000 Subject: [PATCH 16/65] CA-419840 mark CD VBD as empty when its VDI is removed When a CDR is removed from an ISO SR the corresponding VDI is deleted. So far we relied on the DB GC to mark the VBD as empty. This creates a window for a race where the VDI/CD is reported as present when in fact it is not. So mark the VBD as empty as early as possible. Signed-off-by: Christian Lindig --- ocaml/xapi/xapi_vdi.ml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/ocaml/xapi/xapi_vdi.ml b/ocaml/xapi/xapi_vdi.ml index be82fc7450f..1d4b881c418 100644 --- a/ocaml/xapi/xapi_vdi.ml +++ b/ocaml/xapi/xapi_vdi.ml @@ -739,6 +739,18 @@ let db_forget ~__context ~vdi = debug "db_forget uuid=%s ref=%s" (Db.VDI.get_uuid ~__context ~self:vdi) (Ref.string_of vdi) ; + (* CA-419840 mark VBD as empty when it is a CDR *) + ( Db.VDI.get_VBDs ~__context ~self:vdi |> function + | [] -> + debug "%s: no VBD for VDI %s" __FUNCTION__ (Ref.string_of vdi) + | self :: _ when self = Ref.null -> + warn "%s: NULL VBD for VDI %s" __FUNCTION__ (Ref.string_of vdi) + | self :: _ when Db.VBD.get_type ~__context ~self = `CD -> + Db.VBD.set_VDI ~__context ~self ~value:Ref.null ; + Db.VBD.set_empty ~__context ~self ~value:true + | _ -> + () (* not a CDR *) + ) ; Db.VDI.destroy ~__context ~self:vdi let introduce ~__context ~uuid ~name_label ~name_description ~sR ~_type From aabf39f0f2eacf80e0b4a8020b68949ed191f731 Mon Sep 17 00:00:00 2001 From: Rob Hoes Date: Tue, 20 Jan 2026 12:54:30 +0000 Subject: [PATCH 17/65] CP-311102: Make migration timeouts configurable Also improve the logging a little (e.g. log "suspend" rather than "shutdown" when suspending). Signed-off-by: Rob Hoes --- ocaml/xenopsd/lib/xenopsd.ml | 15 +++++++++++++++ ocaml/xenopsd/xc/domain.ml | 11 +++++++---- ocaml/xenopsd/xc/xenops_server_xen.ml | 19 ++++++++++++++++--- ocaml/xenopsd/xenopsd.conf | 6 ++++++ 4 files changed, 44 insertions(+), 7 deletions(-) diff --git a/ocaml/xenopsd/lib/xenopsd.ml b/ocaml/xenopsd/lib/xenopsd.ml index 226a686c283..8c92caf854b 100644 --- a/ocaml/xenopsd/lib/xenopsd.ml +++ b/ocaml/xenopsd/lib/xenopsd.ml @@ -77,6 +77,10 @@ let vm_guest_agent_xenstore_quota = ref 128 let vm_guest_agent_xenstore_quota_warn_interval = ref 3600 +let vm_suspend_timeout = ref 1200. + +let vm_suspend_ack_timeout = ref 30. + let oxenstored_conf = ref "/etc/xen/oxenstored.conf" let for_each_line path f = @@ -320,6 +324,17 @@ let options = , (fun () -> string_of_bool !Xenops_server.xenopsd_vbd_plug_unplug_legacy) , "False if we want to split the plug atomic into attach/activate" ) + ; ( "vm-suspend-timeout" + , Arg.Set_float vm_suspend_timeout + , (fun () -> string_of_float !vm_suspend_timeout) + , "Timeout in seconds for a VM to suspend after acknowledging a suspend \ + request" + ) + ; ( "vm-suspend-ack-timeout" + , Arg.Set_float vm_suspend_ack_timeout + , (fun () -> string_of_float !vm_suspend_ack_timeout) + , "Timeout in seconds for a VM to acknowledge a suspend request" + ) ] let path () = Filename.concat !sockets_path "xenopsd" diff --git a/ocaml/xenopsd/xc/domain.ml b/ocaml/xenopsd/xc/domain.ml index 1cdc9a87785..9c9c6946ab3 100644 --- a/ocaml/xenopsd/xc/domain.ml +++ b/ocaml/xenopsd/xc/domain.ml @@ -778,8 +778,11 @@ let shutdown_wait_for_ack (t : Xenops_task.task_handle) ~timeout ~xc ~xs domid Xenctrl.domain_shutdown xc domid (shutdown_to_xc_shutdown req) ) else ( debug - "VM = %s; domid = %d; Waiting for domain to acknowledge shutdown request" - uuid domid ; + "VM = %s; domid = %d; Waiting for domain to acknowledge %s request \ + (timeout = %.0fs)" + uuid domid + (string_of_shutdown_reason req) + timeout ; let path = control_shutdown ~xs domid in let cancel = Domain domid in if @@ -788,8 +791,8 @@ let shutdown_wait_for_ack (t : Xenops_task.task_handle) ~timeout ~xc ~xs domid [Watch.key_to_disappear path] t ~xs ~timeout () then - info "VM = %s; domid = %d; Domain acknowledged shutdown request" uuid - domid + info "VM = %s; domid = %d; Domain acknowledged %s request" uuid domid + (string_of_shutdown_reason req) else debug "VM = %s; domid = %d; Domain disappeared" uuid domid ) diff --git a/ocaml/xenopsd/xc/xenops_server_xen.ml b/ocaml/xenopsd/xc/xenops_server_xen.ml index 03ce150ae26..173b4b46353 100644 --- a/ocaml/xenopsd/xc/xenops_server_xen.ml +++ b/ocaml/xenopsd/xc/xenops_server_xen.ml @@ -2859,7 +2859,9 @@ module VM = struct not ( with_tracing ~task ~name:"VM_save_domain_suspend_callback_request_shutdown" - @@ fun () -> request_shutdown task vm Suspend 30. + @@ fun () -> + request_shutdown task vm Suspend + !Xenopsd.vm_suspend_ack_timeout ) then raise (Xenopsd_error Failed_to_acknowledge_suspend_request) ; @@ -2870,14 +2872,25 @@ module VM = struct | _ -> () ) ; + let suspend_timeout = !Xenopsd.vm_suspend_timeout in if not ( with_tracing ~task ~name:"VM_save_domain_suspend_callback_wait_shutdown" - @@ fun () -> wait_shutdown task vm Suspend 1200. + @@ fun () -> + debug + "VM = %s; domid = %d; Waiting for domain to suspend \ + (timeout = %.0fs)" + vm.Vm.id domid suspend_timeout ; + wait_shutdown task vm Suspend suspend_timeout ) then - raise (Xenopsd_error (Failed_to_suspend (vm.Vm.id, 1200.))) + raise + (Xenopsd_error + (Failed_to_suspend (vm.Vm.id, suspend_timeout)) + ) + else + debug "VM = %s; domid = %d; Domain suspended" vm.Vm.id domid ) ; (* Record the final memory usage of the domain so we know how much to allocate for the resume *) diff --git a/ocaml/xenopsd/xenopsd.conf b/ocaml/xenopsd/xenopsd.conf index e1c3c87c7cb..f8a81ba06c0 100644 --- a/ocaml/xenopsd/xenopsd.conf +++ b/ocaml/xenopsd/xenopsd.conf @@ -116,3 +116,9 @@ disable-logging-for=http tracing tracing_export # On Intel a similar effect is already achieved with iPAT in Xen, # but setting this to 0 works on Intel too. # xen-platform-pci-bar-uc=false + +# Timeout in seconds for a VM to suspend after acknowledging a suspend request +# vm-suspend-timeout=1200 + +# Timeout in seconds for a VM to acknowledge a suspend request +# vm-suspend-ack-timeout=30 From 01c9a6d3802b3d5b5723d9337b3a134a351a15e7 Mon Sep 17 00:00:00 2001 From: Alex Brett Date: Tue, 20 Jan 2026 15:28:37 +0000 Subject: [PATCH 18/65] CA-423064: Trigger group upgrades in addition to package upgrades In the event we add a package to one of the yum repository groups, that has no packages requiring it, dnf5 will not install it by default (as it has not implemented the `upgrade_group_objects_upgrade` configuration option - see https://dnf5.readthedocs.io/en/latest/dnf5.conf-todo.5.html). We do not wish to be reliant on having to ensure all new packages are required by an existing package, as such (at least until an implementation of the above option is done in dnf5) trigger a `group upgrade` in addition to the `upgrade`. Signed-off-by: Alex Brett --- ocaml/tests/test_pkg_mgr.ml | 48 +++++++++++++++++++++++++++++++++++++ ocaml/xapi/pkg_mgr.ml | 17 +++++++++++++ ocaml/xapi/pkg_mgr.mli | 3 +++ ocaml/xapi/repository.ml | 11 +++++++-- 4 files changed, 77 insertions(+), 2 deletions(-) diff --git a/ocaml/tests/test_pkg_mgr.ml b/ocaml/tests/test_pkg_mgr.ml index 3593e01d014..8fd7bd38f40 100644 --- a/ocaml/tests/test_pkg_mgr.ml +++ b/ocaml/tests/test_pkg_mgr.ml @@ -159,6 +159,29 @@ let test_dnf_apply_upgrades = ) ] +let test_dnf_apply_group_upgrades = + [ + ( "" + , `Quick + , check + { + cmd= !Xapi_globs.dnf_cmd + ; params= + [ + "-y" + ; "--disablerepo=*" + ; "--enablerepo=testrepo1,testrepo2" + ; "group" + ; "upgrade" + ; "*" + ] + } + (Pkg_mgr.Dnf_cmd.apply_group_upgrade + ~repositories:["testrepo1"; "testrepo2"] + ) + ) + ] + let test_yum_repo_query_installed = [ ( "" @@ -286,6 +309,29 @@ let test_yum_apply_upgrades = ) ] +let test_yum_apply_group_upgrades = + [ + ( "" + , `Quick + , check + { + cmd= !Xapi_globs.yum_cmd + ; params= + [ + "-y" + ; "--disablerepo=*" + ; "--enablerepo=testrepo1,testrepo2" + ; "group" + ; "upgrade" + ; "*" + ] + } + (Pkg_mgr.Yum_cmd.apply_group_upgrade + ~repositories:["testrepo1"; "testrepo2"] + ) + ) + ] + let test_yum_repo_query_updates = [ ( "" @@ -320,6 +366,7 @@ let tests = ; ("test_dnf_cofig_repo", test_dnf_config_repo) ; ("test_dnf_sync_repo", test_dnf_sync_repo) ; ("test_dnf_apply_upgrades", test_dnf_apply_upgrades) + ; ("test_dnf_apply_group_upgrades", test_dnf_apply_group_upgrades) ; ("test_yum_repo_query_installed", test_yum_repo_query_installed) ; ("test_yum_clean_all_cache", test_yum_clean_all_cache) ; ("test_yum_clean_repo_cache", test_yum_clean_repo_cache) @@ -327,6 +374,7 @@ let tests = ; ("test_yum_cofig_repo", test_yum_config_repo) ; ("test_yum_sync_repo", test_yum_sync_repo) ; ("test_yum_apply_upgrades", test_yum_apply_upgrades) + ; ("test_yum_apply_group_upgrades", test_yum_apply_group_upgrades) ; ("test_yum_repo_query_updates", test_yum_repo_query_updates) ] diff --git a/ocaml/xapi/pkg_mgr.ml b/ocaml/xapi/pkg_mgr.ml index ed2550ffe2a..ffd39995310 100644 --- a/ocaml/xapi/pkg_mgr.ml +++ b/ocaml/xapi/pkg_mgr.ml @@ -52,6 +52,8 @@ module type S = sig val sync_repo : repo_name:string -> cmd_line val apply_upgrade : repositories:string list -> cmd_line + + val apply_group_upgrade : repositories:string list -> cmd_line end module type Args = sig @@ -86,6 +88,8 @@ module type Args = sig val sync_repo : string -> string list val apply_upgrade : string list -> string list + + val apply_group_upgrade : string list -> string list end let repoquery_sep = ":|" @@ -161,6 +165,16 @@ module Common_args = struct ; Printf.sprintf "--enablerepo=%s" (String.concat "," repositories) ; "upgrade" ] + + let apply_group_upgrade repositories = + [ + "-y" + ; "--disablerepo=*" + ; Printf.sprintf "--enablerepo=%s" (String.concat "," repositories) + ; "group" + ; "upgrade" + ; "*" + ] end module Yum_args : Args = struct @@ -305,6 +319,9 @@ module Cmd_line (M : Args) : S = struct let apply_upgrade ~repositories = {cmd= M.pkg_cmd; params= M.apply_upgrade repositories} + + let apply_group_upgrade ~repositories = + {cmd= M.pkg_cmd; params= M.apply_group_upgrade repositories} end module Yum_cmd = Cmd_line (Yum_args) diff --git a/ocaml/xapi/pkg_mgr.mli b/ocaml/xapi/pkg_mgr.mli index a2849fb4ce3..448d3651ea4 100644 --- a/ocaml/xapi/pkg_mgr.mli +++ b/ocaml/xapi/pkg_mgr.mli @@ -67,6 +67,9 @@ module type S = sig val apply_upgrade : repositories:string list -> cmd_line (** Command line and arguments to apply upgrades from repos *) + + val apply_group_upgrade : repositories:string list -> cmd_line + (** Command line and arguments to apply group upgrades from repos *) end (** Exposed only for unittest, do not use the modules directly diff --git a/ocaml/xapi/repository.ml b/ocaml/xapi/repository.ml index 9fb284fcc55..9a7783eb98a 100644 --- a/ocaml/xapi/repository.ml +++ b/ocaml/xapi/repository.ml @@ -679,8 +679,15 @@ let get_pool_updates_in_json ~__context ~hosts = let apply ~__context ~host = (* This function runs on member host *) with_local_repositories ~__context (fun repositories -> - let Pkg_mgr.{cmd; params} = Pkgs.apply_upgrade ~repositories in - try ignore (Helpers.call_script cmd params) + let upgrade () = + let Pkg_mgr.{cmd; params} = Pkgs.apply_upgrade ~repositories in + ignore (Helpers.call_script cmd params) + in + let group_upgrade () = + let Pkg_mgr.{cmd; params} = Pkgs.apply_group_upgrade ~repositories in + ignore (Helpers.call_script cmd params) + in + try upgrade () ; group_upgrade () with e -> let host' = Ref.string_of host in error "Failed to apply updates on host ref='%s': %s" host' From 73a3ed393d20fd5db4e1d2909a331b58e9a4ed13 Mon Sep 17 00:00:00 2001 From: Pau Ruiz Safont Date: Wed, 21 Jan 2026 11:57:50 +0000 Subject: [PATCH 19/65] datamodel_errors: generalize error for sr_suspend_space_insufficient The error can be produced while doing a suspend, not only on checkpoints. Remove the last sentence so it applied in both cases Signed-off-by: Pau Ruiz Safont --- ocaml/idl/datamodel_errors.ml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocaml/idl/datamodel_errors.ml b/ocaml/idl/datamodel_errors.ml index 24262314274..f739665a836 100644 --- a/ocaml/idl/datamodel_errors.ml +++ b/ocaml/idl/datamodel_errors.ml @@ -1119,7 +1119,7 @@ let _ = error Api_errors.sr_suspend_space_insufficient ["sr"] ~doc: "The suspend SR does not have sufficient free space to store the VM \ - suspend image required to complete a snapshot with memory." + suspend image." () ; error Api_errors.pbd_exists ["sr"; "host"; "pbd"] ~doc:"A PBD already exists connecting the SR to the server." () ; From 32adc7d9e4f335e9d1e8f2efd908d89105cf5af3 Mon Sep 17 00:00:00 2001 From: Alex Brett Date: Tue, 20 Jan 2026 16:58:19 +0000 Subject: [PATCH 20/65] CA-423064: Also check group upgrade when determining updates available Signed-off-by: Alex Brett --- ocaml/xapi/pkg_mgr.ml | 25 +++++++++++++++++++++++++ ocaml/xapi/pkg_mgr.mli | 4 ++++ ocaml/xapi/repository.ml | 16 +++++++++++++++- ocaml/xapi/repository_helpers.ml | 28 ++++++++++++++++++++++++++++ 4 files changed, 72 insertions(+), 1 deletion(-) diff --git a/ocaml/xapi/pkg_mgr.ml b/ocaml/xapi/pkg_mgr.ml index ffd39995310..af140d5a48c 100644 --- a/ocaml/xapi/pkg_mgr.ml +++ b/ocaml/xapi/pkg_mgr.ml @@ -37,6 +37,9 @@ module type S = sig val get_updates_from_upgrade_dry_run : repositories:string list -> cmd_line + val get_updates_from_group_upgrade_dry_run : + repositories:string list -> cmd_line + val is_obsoleted : pkg_name:string -> repositories:string list -> cmd_line val repoquery_updates : repositories:string list -> cmd_line @@ -73,6 +76,8 @@ module type Args = sig val get_updates_from_upgrade_dry_run : string list -> string list + val get_updates_from_group_upgrade_dry_run : string list -> string list + val is_obsoleted : string -> string list -> string list val repoquery_updates : string list -> string list @@ -129,6 +134,16 @@ module Common_args = struct ; "upgrade" ] + let get_updates_from_group_upgrade_dry_run repositories = + [ + "--disablerepo=*" + ; Printf.sprintf "--enablerepo=%s" (String.concat "," repositories) + ; "--assumeno" + ; "group" + ; "upgrade" + ; "*" + ] + let repoquery repositories = [ "--disablerepo=*" @@ -193,6 +208,10 @@ module Yum_args : Args = struct let get_updates_from_upgrade_dry_run repositories = ["--quiet"] @ Common_args.get_updates_from_upgrade_dry_run repositories + let get_updates_from_group_upgrade_dry_run repositories = + ["--quiet"] + @ Common_args.get_updates_from_group_upgrade_dry_run repositories + let is_obsoleted pkg_name repositories = ["--all"] @ Common_args.is_obsoleted pkg_name repositories @ ["--plugins"] @@ -297,6 +316,12 @@ module Cmd_line (M : Args) : S = struct let get_updates_from_upgrade_dry_run ~repositories = {cmd= M.pkg_cmd; params= M.get_updates_from_upgrade_dry_run repositories} + let get_updates_from_group_upgrade_dry_run ~repositories = + { + cmd= M.pkg_cmd + ; params= M.get_updates_from_group_upgrade_dry_run repositories + } + let is_obsoleted ~pkg_name ~repositories = {cmd= M.repoquery_cmd; params= M.is_obsoleted pkg_name repositories} diff --git a/ocaml/xapi/pkg_mgr.mli b/ocaml/xapi/pkg_mgr.mli index 448d3651ea4..059943c1f10 100644 --- a/ocaml/xapi/pkg_mgr.mli +++ b/ocaml/xapi/pkg_mgr.mli @@ -41,6 +41,10 @@ module type S = sig val get_updates_from_upgrade_dry_run : repositories:string list -> cmd_line (** Command line and arguments to dry run an upgrade, with repositories enabled *) + val get_updates_from_group_upgrade_dry_run : + repositories:string list -> cmd_line + (** Command line and arguments to dry run a group upgrade, with repositories enabled *) + val is_obsoleted : pkg_name:string -> repositories:string list -> cmd_line (** Command line and arguments to check whether a package is obsoleted by any other * package in given repositories *) diff --git a/ocaml/xapi/repository.ml b/ocaml/xapi/repository.ml index 9a7783eb98a..e87809e21cb 100644 --- a/ocaml/xapi/repository.ml +++ b/ocaml/xapi/repository.ml @@ -558,6 +558,20 @@ let get_host_updates_in_json ~__context ~installed = let latest_updates' = get_updates_from_yum_upgrade_dry_run repositories in + let latest_updates_group' = + get_updates_from_yum_group_upgrade_dry_run repositories + in + let latest_updates_combined' = + match (latest_updates', latest_updates_group') with + | Some pkgs', Some group_pkgs' -> + Some (List.sort_uniq compare (pkgs' @ group_pkgs')) + | Some pkgs', None -> + Some pkgs' + | None, Some group_pkgs' -> + Some group_pkgs' + | None, None -> + None + in let latest_updates'' = get_updates_from_repoquery repositories in (* To ensure the updating function will not strand, use redundant * functions to get the update/installation list. @@ -569,7 +583,7 @@ let get_host_updates_in_json ~__context ~installed = let fail_on_error = Xapi_fist.fail_on_error_in_yum_upgrade_dry_run () in let latest_updates = get_latest_updates_from_redundancy ~fail_on_error - ~pkgs:latest_updates' ~fallback_pkgs:latest_updates'' + ~pkgs:latest_updates_combined' ~fallback_pkgs:latest_updates'' in List.iter (fun r -> clean_yum_cache r) repositories ; let latest_updates_in_json = diff --git a/ocaml/xapi/repository_helpers.ml b/ocaml/xapi/repository_helpers.ml index 487e8e30f17..1afa819a437 100644 --- a/ocaml/xapi/repository_helpers.ml +++ b/ocaml/xapi/repository_helpers.ml @@ -771,8 +771,10 @@ module YumUpgradeOutput = struct | false -> ( take_till is_eol <* end_of_line >>= function | ( "Installing:" + | "Installing group/module packages:" | "Updating:" | "Upgrading:" + | "Upgrading groups:" | "Removing:" | "Reinstalling:" | "Downgrading:" @@ -890,6 +892,7 @@ module YumUpgradeOutput = struct |> List.filter (fun (section, _) -> match section with | "Installing:" + | "Installing group/module packages:" | "Updating:" | "Upgrading:" | "Installing for dependencies:" @@ -958,6 +961,31 @@ let get_updates_from_yum_upgrade_dry_run repositories = error "%s" (ExnHelper.string_of_exn e) ; None +let get_updates_from_yum_group_upgrade_dry_run repositories = + let Pkg_mgr.{cmd; params} = + Pkgs.get_updates_from_group_upgrade_dry_run ~repositories + in + match Forkhelpers.execute_command_get_output cmd params with + | _, _ -> + Some [] + | exception Forkhelpers.Spawn_internal_error (stderr, stdout, Unix.WEXITED 1) + -> ( + (*Yum put the details to stderr while dnf to stdout*) + (match Pkgs.manager with Yum -> stderr | Dnf -> stdout) + |> YumUpgradeOutput.parse_output_of_dry_run + |> function + | Ok (pkgs, Some txn_file) -> + Unixext.unlink_safe txn_file ; + Some pkgs + | Ok (pkgs, None) -> + Some pkgs + | Error msg -> + error "%s" msg ; None + ) + | exception e -> + error "%s" (ExnHelper.string_of_exn e) ; + None + let get_latest_updates_from_redundancy ~fail_on_error ~pkgs ~fallback_pkgs = let err = "Failed to parse output of 'yum upgrade (dry run)' correctly" in let get_latest_updates_from_redundancy' ~fail_on_error ~pkgs ~fallback_pkgs = From ab74c8894e206325caa0f0e8352c33014e1b0b2f Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Wed, 21 Jan 2026 09:52:21 +0000 Subject: [PATCH 21/65] qcow-stream-tool: Use tail-recursive functions in read_headers Otherwise we overflow the stack on large QCOW files: qcow-stream-tool: internal error, uncaught exception: Stack overflow Raised by primitive operation at Dune__exe__Qcow_stream_tool.Impl.read_headers.(fun) in file "ocaml/qcow-stream-tool/qcow_stream_tool.ml", line 23, characters 12-24 Called from Stdlib__List.map in file "list.ml", line 92, characters 32-39 Called from Stdlib__List.map in file "list.ml", line 92, characters 32-39 .... Going through map->seq->list transformations is faster than reversing a list with List.rev + List.rev_map. Signed-off-by: Andrii Sultanov --- ocaml/qcow-stream-tool/qcow_stream_tool.ml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ocaml/qcow-stream-tool/qcow_stream_tool.ml b/ocaml/qcow-stream-tool/qcow_stream_tool.ml index 41b57c9a366..b8605c2f44e 100644 --- a/ocaml/qcow-stream-tool/qcow_stream_tool.ml +++ b/ocaml/qcow-stream-tool/qcow_stream_tool.ml @@ -12,17 +12,18 @@ module Impl = struct let* virtual_size, cluster_bits, _, data_cluster_map = Qcow_stream.start_stream_decode fd in - let clusters = Qcow_types.Cluster.Map.bindings data_cluster_map in + (* TODO: List.map becomes tail-recursive in OCaml 5.1, and could be used here instead *) let clusters = - List.map - (fun (_, virt_address) -> + data_cluster_map + |> Qcow_types.Cluster.Map.to_seq + |> Seq.map (fun (_, virt_address) -> let ( >> ) = Int64.shift_right_logical in let address = Int64.to_int (virt_address >> Int32.to_int cluster_bits) in `Int address - ) - clusters + ) + |> List.of_seq in let json = `Assoc From 40759d890ffb7333f69fea44a26e0b1ce19d643a Mon Sep 17 00:00:00 2001 From: Sebastien Marie Date: Wed, 21 Jan 2026 21:05:42 +0100 Subject: [PATCH 22/65] Don't depend on LANG for running tests I have test failing because of incorrect output: 'Aucun fichier ou dossier de ce nom' instead of 'No such file or directory' Ensure dune is called with LANG=C to avoid such problem. Signed-off-by: Sebastien Marie --- dune | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/dune b/dune index ac7f4810205..4810b2793b0 100644 --- a/dune +++ b/dune @@ -3,10 +3,13 @@ (ocamlopt_flags (:standard -g -p -w -39)) (flags (:standard -w -39)) ) - (dev (flags (:standard -g -w -39))) + (dev + (flags (:standard -g -w -39)) + (env-vars (LANG C))) (release (flags (:standard -w -39-6@5)) - (env-vars (ALCOTEST_COMPACT 1)) + (env-vars (ALCOTEST_COMPACT 1) + (LANG C)) ) ) From 4db0de7717c18b31a01475d8e257d80b43c15b55 Mon Sep 17 00:00:00 2001 From: Christian Lindig Date: Fri, 23 Jan 2026 15:35:36 +0000 Subject: [PATCH 23/65] Fix -Wreturn-type warning in xenctrlext_stubs.c Signed-off-by: Christian Lindig --- ocaml/libs/xenctrl-ext/xenctrlext_stubs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocaml/libs/xenctrl-ext/xenctrlext_stubs.c b/ocaml/libs/xenctrl-ext/xenctrlext_stubs.c index f4dd234b780..88fae69f98e 100644 --- a/ocaml/libs/xenctrl-ext/xenctrlext_stubs.c +++ b/ocaml/libs/xenctrl-ext/xenctrlext_stubs.c @@ -694,10 +694,10 @@ CAMLprim value stub_xenctrlext_domain_claim_pages(value xch_val, value domid_val raise_unix_errno_msg(the_errno, "Error when trying to claim memory pages"); } - CAMLreturn(Val_unit); #else raise_unix_errno_msg(ENOSYS, "xc_domain_claim_pages_node"); #endif + CAMLreturn(Val_unit); } #ifdef XEN_DOMCTL_NUMA_OP_GET_NODE_PAGES From 1b37d8e0510a3d0b11ab3b98d1c63f83f4bd4cae Mon Sep 17 00:00:00 2001 From: Alex Brett Date: Sat, 24 Jan 2026 22:35:08 +0000 Subject: [PATCH 24/65] CA-423213: Fix bundle URL construct Without the host parameter, Uri.make returns `file:/path/to/file`. dnf expects the `file:///path/to/file` variant - using the single slash variant leads to a failure to locate the signature file. Signed-off-by: Alex Brett --- ocaml/xapi/repository.ml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/ocaml/xapi/repository.ml b/ocaml/xapi/repository.ml index 9fb284fcc55..4c669f42316 100644 --- a/ocaml/xapi/repository.ml +++ b/ocaml/xapi/repository.ml @@ -169,7 +169,14 @@ let sync ~__context ~self ~token ~token_id ~username ~password = ) | `bundle -> let uri = - Uri.make ~scheme:"file" ~path:!Xapi_globs.bundle_repository_dir () + (* dnf requires a URI that is file:///absolute/path, using the + * file:/absolute/path variant will result in a failure to locate + * the signature file, thus the host parameter to Uri.make is + * required + *) + Uri.make ~scheme:"file" ~host:"" + ~path:!Xapi_globs.bundle_repository_dir + () in (Uri.to_string uri, None, false, NoAuth, NoAuth) | `remote_pool -> From 09483d8edce28c61000796927201921dbb0b16ef Mon Sep 17 00:00:00 2001 From: Lin Liu Date: Tue, 27 Jan 2026 03:03:43 +0000 Subject: [PATCH 25/65] CP-311020: Add force option to external_auth_set_ldaps for debug Signed-off-by: Lin Liu --- doc/content/design/external-auth-ldaps.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/content/design/external-auth-ldaps.md b/doc/content/design/external-auth-ldaps.md index 1e8c6478312..b3ab6cdae26 100644 --- a/doc/content/design/external-auth-ldaps.md +++ b/doc/content/design/external-auth-ldaps.md @@ -97,18 +97,20 @@ For the existing joined domain, user can switch between LDAP and LDAPS with this - `pool` (Ref _pool): pool to set LDAPS - `ldaps` (Bool): whether LDAPS is required +- `force` (Bool): whether to set ldaps even when ldaps is currently set This API will set the `ldaps` in database (Refer to 2.1). This API performs following sanity check and rejects update if check fails: - AD has already been enabled +- ldaps has already been enabled without force - Find proper certificate (Refer to 4.1 for the details) - Do a `ldaps` query to embedded user `krbtgt` for the joined domain **Note:** -- This API allow en-entry for debug purpose -- Will not do the LDAPS query on the trusted domains, as xapi does not have trusted domain details +- This API allow re-entry with `force` to perform an extra `ldaps ping` for debug purpose +- This API will not do the LDAPS query on the trusted domains, as xapi does not have trusted domain details - The joined domain likely has multiple DCs. LDAPS query tries every DC of the domain. Check pass if LDAPS query succeeds on any DC of the domain. This implies iterate and locate a DC supporting LDAPS (with proper certificate trust setup) before LDAPS query. However, this does not introduce performance problems as the LDAPS query happens in backend and refreshes result into XAPI DB - Pool coordinator dispatches this API request to every host, and only succeeds if all hosts pass the check - This API needs to be synced with other APIs. For example, `authenticate_username_password` should fail if this API is performing checking and configuration From c1d44032d11d7d7307a578c2fdb8b307782f29aa Mon Sep 17 00:00:00 2001 From: Ming Lu Date: Tue, 27 Jan 2026 16:41:13 +0800 Subject: [PATCH 26/65] Don't use CRLs for pool internal host-host TLS communications As these TLS communications use 'verifyPeer=yes' actually while applying CRLs requires root CA certificates and 'verifyChain=yes'. Signed-off-by: Ming Lu --- ocaml/libs/stunnel/stunnel.ml | 31 +++++++++++++++++++++---------- ocaml/libs/stunnel/stunnel.mli | 1 + 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/ocaml/libs/stunnel/stunnel.ml b/ocaml/libs/stunnel/stunnel.ml index 1b6e9c3ee18..adfa670ba5d 100644 --- a/ocaml/libs/stunnel/stunnel.ml +++ b/ocaml/libs/stunnel/stunnel.ml @@ -122,6 +122,7 @@ type verification_config = { sni: string option ; verify: verify ; cert_bundle_path: string + ; crl_dir: string option } type t = { @@ -140,6 +141,7 @@ let appliance = sni= None ; verify= CheckHost ; cert_bundle_path= "/etc/stunnel/xapi-stunnel-ca-bundle.pem" + ; crl_dir= Some crl_path } let pool = @@ -147,10 +149,16 @@ let pool = sni= Some "pool" ; verify= VerifyPeer ; cert_bundle_path= "/etc/stunnel/xapi-pool-ca-bundle.pem" + ; crl_dir= None } let external_host ext_host_cert_file = - {sni= None; verify= VerifyPeer; cert_bundle_path= ext_host_cert_file} + { + sni= None + ; verify= VerifyPeer + ; cert_bundle_path= ext_host_cert_file + ; crl_dir= None + } let debug_conf_of_bool verbose : string = if verbose then @@ -219,7 +227,7 @@ let config_file ?(accept = None) config host port = ; ( match config with | None -> [] - | Some {sni; verify; cert_bundle_path} -> + | Some {sni; verify; cert_bundle_path; crl_dir} -> List.rev_append ( match verify with | VerifyPeer -> @@ -234,14 +242,17 @@ let config_file ?(accept = None) config host port = ; "# the cert of the server we connect to" ; (match sni with None -> "" | Some s -> sprintf "sni = %s" s) ; sprintf "CAfile=%s" cert_bundle_path - ; ( match Sys.readdir crl_path with - | [||] -> - "" - | _ -> - sprintf "CRLpath=%s" crl_path - | exception _ -> - "" - ) + ; Option.fold ~none:"" + ~some:(fun crl_dir -> + match Sys.readdir crl_dir with + | [||] -> + "" + | _ -> + sprintf "CRLpath=%s" crl_dir + | exception _ -> + "" + ) + crl_dir ] ) ; [""] diff --git a/ocaml/libs/stunnel/stunnel.mli b/ocaml/libs/stunnel/stunnel.mli index ac7d08a0af5..7d7ffae02c3 100644 --- a/ocaml/libs/stunnel/stunnel.mli +++ b/ocaml/libs/stunnel/stunnel.mli @@ -40,6 +40,7 @@ type verification_config = { sni: string option ; verify: verify ; cert_bundle_path: string + ; crl_dir: string option } (** Represents an active stunnel connection *) From 91dec395427824f707d061ec8b4f320e689b031e Mon Sep 17 00:00:00 2001 From: Pau Ruiz Safont Date: Tue, 27 Jan 2026 10:33:51 +0000 Subject: [PATCH 27/65] stunnel: add doccoments to the configuration functions Otherwise it might not be clear why 3 different configurations are needed. Signed-off-by: Pau Ruiz Safont --- ocaml/libs/stunnel/stunnel_client.mli | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ocaml/libs/stunnel/stunnel_client.mli b/ocaml/libs/stunnel/stunnel_client.mli index b3dd0392bf4..e9812c6cdb1 100644 --- a/ocaml/libs/stunnel/stunnel_client.mli +++ b/ocaml/libs/stunnel/stunnel_client.mli @@ -17,7 +17,14 @@ val get_verify_by_default : unit -> bool val set_verify_by_default : bool -> unit val pool : unit -> Stunnel.verification_config option +(** [pool ()] returns the configuration that's meant to be used to connect to + other xapi hosts in the pool *) val appliance : unit -> Stunnel.verification_config option +(** [appliance ()] returns the configuration that's meant to be used to connect + to appliances providing services, like WLB or a licensing server. *) val external_host : string -> Stunnel.verification_config option +(** [external_host path] returns the configuration that's meant to be used to connect to + a xapi hosts outside the pool. This is useful, for example, to provide an + update repository to download updates from. *) From 4275f76a6c9c5e470b5513b356fb5adf28773bcb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= Date: Thu, 22 Jan 2026 10:20:15 +0000 Subject: [PATCH 28/65] CA-423173: XAPI underestimates low memory emergency pool size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The only Xen command-line related to this is `low_mem_virq_limit`, which is 64MiB. A new quicktest has shown that we are sometimes off by ~10MiB or more, and get failures booting VMs even after `assert_can_boot_here` said yes. Sometimes the error messages can be quite ugly, internal xenguest/xenopsd errors, instead of HOST_NOT_ENOUGH_FREE_MEMORY. Signed-off-by: Edwin Török --- ocaml/squeezed/src/squeeze_xen.ml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocaml/squeezed/src/squeeze_xen.ml b/ocaml/squeezed/src/squeeze_xen.ml index 31bac6df75b..b8a3f600a2b 100644 --- a/ocaml/squeezed/src/squeeze_xen.ml +++ b/ocaml/squeezed/src/squeeze_xen.ml @@ -61,7 +61,8 @@ let ( -* ) = Int64.sub let mib = 1024L (** Same as xen commandline *) -let low_mem_emergency_pool = 1L ** mib +(** CA-423173: this is `low_mem_virq_limit` Xen, default 64MiB *) +let low_mem_emergency_pool = 64L ** mib (** Return the extra amount we always add onto maxmem *) let xen_max_offset_kib domain_type = From e2db96cf2818097b7a3866f807f69f4e0f1243fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= Date: Tue, 27 Jan 2026 13:59:07 +0000 Subject: [PATCH 29/65] [maintenance]: fix formatting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Edwin Török --- ocaml/squeezed/src/squeeze_xen.ml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocaml/squeezed/src/squeeze_xen.ml b/ocaml/squeezed/src/squeeze_xen.ml index b8a3f600a2b..b4d9cc54172 100644 --- a/ocaml/squeezed/src/squeeze_xen.ml +++ b/ocaml/squeezed/src/squeeze_xen.ml @@ -60,8 +60,8 @@ let ( -* ) = Int64.sub let mib = 1024L -(** Same as xen commandline *) -(** CA-423173: this is `low_mem_virq_limit` Xen, default 64MiB *) +(** Same as xen commandline + CA-423173: this is `low_mem_virq_limit` Xen, default 64MiB *) let low_mem_emergency_pool = 64L ** mib (** Return the extra amount we always add onto maxmem *) From ec3bd4a8874b21fb2184f39ebb0baf026f80d132 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= Date: Thu, 22 Jan 2026 09:56:58 +0000 Subject: [PATCH 30/65] CA-423172: Xen uses ~294 pages/vCPU, not 256 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Measured the actual increase in host memory usage when increasing the number of vCPUs on a VM from 1 to 64: ``` vcpu,memory_overhead_pages,coeff 1,264,264 2,558,279 3,776,258.667 4,1032,258 5,1350,270 6,1614,269 7,1878,268.286 8,2056,257 9,2406,267.333 10,2670,267 11,2934,266.727 12,3198,266.5 13,3462,266.308 14,3726,266.143 15,3990,266 16,4254,265.875 17,4518,265.765 18,4782,265.667 19,5046,265.579 20,5310,265.5 21,5574,265.429 22,5838,265.364 23,6102,265.304 24,6366,265.25 25,6630,265.2 26,6894,265.154 27,7158,265.111 28,7422,265.071 29,7686,265.034 30,7952,265.067 31,8216,265.032 32,8480,265 33,8744,264.97 34,9009,264.971 35,9276,265.029 36,9543,265.083 37,9810,265.135 38,10076,265.158 39,10340,265.128 40,10604,265.1 41,10869,265.098 42,11133,265.071 43,11397,265.047 44,11662,265.045 45,11925,265 46,12191,265.022 47,12454,264.979 0,30,0 1,294,294 2,558,279 3,822,274 4,1086,271.5 5,1350,270 6,1614,269 7,1878,268.286 8,2142,267.75 9,2406,267.333 10,2670,267 11,2934,266.727 12,3198,266.5 13,3462,266.308 14,3726,266.143 15,3990,266 16,4254,265.875 17,4518,265.765 18,4782,265.667 19,5046,265.579 20,5310,265.5 21,5574,265.429 22,5838,265.364 23,6102,265.304 24,6366,265.25 25,6630,265.2 26,6894,265.154 27,7158,265.111 28,7422,265.071 29,7686,265.034 30,7952,265.067 31,8216,265.032 32,8480,265 33,8744,264.97 34,9011,265.029 35,9278,265.086 36,9546,265.167 37,9811,265.162 38,10076,265.158 39,10340,265.128 40,10603,265.075 41,10869,265.098 42,11132,265.048 43,11397,265.047 44,11663,265.068 45,11925,265 46,12191,265.022 47,12456,265.021 [INFO]VM memory_overhead_pages = ... + vcpu * 294 =~ ... + vcpu * 294 ``` We already allocate 256 pages/vcpu as part of shadow, so we need an extra 294-256=38 pages/vcpu. This can lead to internal errors raised by xenguest, or NOT_ENOUGH_FREE_MEMORY errors raised by xenopsd, after `assert_can_boot_here` has already replied yes, even when booting VMs sequentially. It could also lead XAPI to choose the wrong host to evacuate a VM too, which could lead to RPU migration failures. This is a pre-existing bug, affecting both the versions of Xen in XS8 and XS9. Cannot allocate this from shadow, because otherwise the memory usage would never converge (Xen doesn't allocate these from shadow). On another host the measured overhead is less, take the maximum for now: ``` [INFO]VM memory_overhead_pages = ... + vcpu * 264.067 =~ ... + vcpu * 265 ``` Also the amount of shadow memory reserved is nearly twice as much as needed, especially that shadow is compiled out of Xen, but overestimates are OK, and we might fix that separately. Signed-off-by: Edwin Török --- ocaml/xapi-idl/memory/memory.ml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ocaml/xapi-idl/memory/memory.ml b/ocaml/xapi-idl/memory/memory.ml index 99951f7e3e8..45c221c8f94 100644 --- a/ocaml/xapi-idl/memory/memory.ml +++ b/ocaml/xapi-idl/memory/memory.ml @@ -218,6 +218,8 @@ module Memory_model (D : MEMORY_MODEL_DATA) = struct D.extra_internal_mib +++ D.extra_external_mib +++ shadow_mib static_max_mib vcpu_count multiplier + (* CA-423172: Xen uses some extra memory/vCPU *) + +++ mib_of_pages_used (38L *** Int64.of_int vcpu_count) +++ D.shim_mib static_max_mib let footprint_mib target_mib static_max_mib vcpu_count multiplier = From f40cc4834056f35c19b009ea78313ee6fc05ea21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= Date: Thu, 18 Dec 2025 13:53:53 +0000 Subject: [PATCH 31/65] CA-422187: only ENOMEM is retrieable when a single-node NUMA claim fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Edwin Török --- ocaml/xenopsd/xc/domain.ml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocaml/xenopsd/xc/domain.ml b/ocaml/xenopsd/xc/domain.ml index 4016fcda86e..92e587c2b10 100644 --- a/ocaml/xenopsd/xc/domain.ml +++ b/ocaml/xenopsd/xc/domain.ml @@ -1071,7 +1071,7 @@ let numa_placement domid ~vcpus ~cores ~memory affinity = D.debug "NUMAClaim domid %d: local claim not available" domid ; set_vcpu_affinity cpu_affinity ; None - | Xenctrlext.Unix_error (errno, _) -> + | Xenctrlext.Unix_error ((Unix.ENOMEM as errno), _) -> D.info "%s: unable to claim enough memory, domain %d won't be hosted in a \ single NUMA node. (error %s)" From 0707b37b9766427b084436e8ce7fbca6cb2aec16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= Date: Tue, 16 Dec 2025 14:03:07 +0000 Subject: [PATCH 32/65] CA-422187,CA-422188: either always use claims or never use claims MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Do not mix using claims with not using claims. Xen cannot currently guarantee that it'll honour a VM's memory claim, unless all other VMs also use claims. Global claims have existed since a long time in Xen, so this should be safe to do on both XS8 and XS9. Safer defaults for global claims: Xen may have already allocated some memory for the domain, and the overhead is only an estimate. A global claim failing is a hard failure, so instead use a more conservative estimate: `memory.build_start_mib`. This is similar to `required_host_free_mib`, but doesn't take overhead into account. Eventually we'd want to have another argument to the create hypercall that tells it what NUMA node(s) to use, and then we can include all the overhead too there. For the single node claim keep the amount as it was, it is only a best effort claim. Do not claim shadow_mib, it has already been allocated: When rebooting lots of VMs in parallel we might run out of memory and fail to boot all the VMs again. This is because we overestimate the amount of memory required, and claim too much. That memory is released when the domain build finishes, but when building domains in parallel it'll temporarily result in an out of memory error. Instead try to claim only what is left to be allocated: the p2m map and shadow map have already been allocated by this point, i.e. claim just the bare minimum. Signed-off-by: Edwin Török --- ocaml/xapi-idl/memory/memory.ml | 5 ++++- ocaml/xenopsd/xc/domain.ml | 31 ++++++++++++++++++++++++++----- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/ocaml/xapi-idl/memory/memory.ml b/ocaml/xapi-idl/memory/memory.ml index 45c221c8f94..9c30f9fa06f 100644 --- a/ocaml/xapi-idl/memory/memory.ml +++ b/ocaml/xapi-idl/memory/memory.ml @@ -185,6 +185,7 @@ type memory_config = { ; shadow_mib: int64 ; required_host_free_mib: int64 ; overhead_mib: int64 + ; build_claim_pages: int64 } module Memory_model (D : MEMORY_MODEL_DATA) = struct @@ -228,14 +229,16 @@ module Memory_model (D : MEMORY_MODEL_DATA) = struct let shadow_multiplier_default = 1.0 let full_config static_max_mib video_mib target_mib vcpus shadow_multiplier = + let build_start_mib = build_start_mib static_max_mib target_mib video_mib in { build_max_mib= build_max_mib static_max_mib video_mib - ; build_start_mib= build_start_mib static_max_mib target_mib video_mib + ; build_start_mib ; xen_max_mib= xen_max_mib static_max_mib ; shadow_mib= shadow_mib static_max_mib vcpus shadow_multiplier ; required_host_free_mib= footprint_mib target_mib static_max_mib vcpus shadow_multiplier ; overhead_mib= overhead_mib static_max_mib vcpus shadow_multiplier + ; build_claim_pages= pages_of_mib build_start_mib } end diff --git a/ocaml/xenopsd/xc/domain.ml b/ocaml/xenopsd/xc/domain.ml index 92e587c2b10..470225c5d7a 100644 --- a/ocaml/xenopsd/xc/domain.ml +++ b/ocaml/xenopsd/xc/domain.ml @@ -1057,7 +1057,7 @@ let numa_placement domid ~vcpus ~cores ~memory affinity = __FUNCTION__ domid ; None in - let nr_pages = Int64.div memory 4096L |> Int64.to_int in + let nr_pages = Memory.pages_of_bytes_used memory |> Int64.to_int in try D.debug "NUMAClaim domid %d: local claim on node %d: %d pages" domid node nr_pages ; @@ -1170,10 +1170,31 @@ let build_pre ~xc ~xs ~vcpus ~memory ~hard_affinity domid = and cores = Xenops_server.cores_of_numa_affinity_policy pin ~vcpus in - numa_placement domid ~vcpus ~cores - ~memory:(Int64.mul memory.xen_max_mib 1048576L) - affinity - |> Option.map fst + let memory = Memory.bytes_of_pages memory.build_claim_pages in + match numa_placement domid ~vcpus ~cores ~memory affinity with + | None -> + (* Always perform a global claim when NUMA placement is + enabled, and single node claims failed or were + unavailable: + This tries to ensures that memory allocated for this + domain won't use up memory claimed by other domains. + If claims are mixed with non-claims then Xen can't + currently guarantee that it would honour the existing + claims. + A failure here is a hard failure: we'd fail allocating + memory later anyway + *) + let nr_pages = + Memory.pages_of_bytes_used memory |> Int64.to_int + in + let xcext = Xenctrlext.get_handle () in + D.debug "NUMAClaim domid %d: global claim: %d pages" domid + nr_pages ; + Xenctrlext.domain_claim_pages xcext domid + ~numa_node:Xenctrlext.NumaNode.none nr_pages ; + None + | Some (plan, _) -> + Some plan ) in let store_chan, console_chan = create_channels ~xc uuid domid in From be13737329cb7f0c9f602e2391a065c791c7e29e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= Date: Fri, 19 Dec 2025 11:34:26 +0000 Subject: [PATCH 33/65] CA-422187: more accurate debug messages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Also always print memory free statistics when `wait_xen_free_mem` is called. Turns out `scrub_pages` is always 0, since this never got implemented in Xen (it is hardcoded to 0). Signed-off-by: Edwin Török --- ocaml/xenopsd/xc/domain.ml | 12 +++++++++++- ocaml/xenopsd/xc/xenops_server_xen.ml | 18 ++++++++++++++++-- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/ocaml/xenopsd/xc/domain.ml b/ocaml/xenopsd/xc/domain.ml index 470225c5d7a..12a10461f3a 100644 --- a/ocaml/xenopsd/xc/domain.ml +++ b/ocaml/xenopsd/xc/domain.ml @@ -246,7 +246,10 @@ let wait_xen_free_mem ~xc ?(maximum_wait_time_seconds = 64) required_memory_kib in (* At exponentially increasing intervals, write *) (* a debug message saying how long we've waited: *) - if is_power_of_2 accumulated_wait_time_seconds then + if + accumulated_wait_time_seconds = 0 + || is_power_of_2 accumulated_wait_time_seconds + then debug "Waited %i second(s) for memory to become available: %Ld KiB free, %Ld \ KiB scrub, %Ld KiB required" @@ -1083,6 +1086,13 @@ let numa_placement domid ~vcpus ~cores ~memory affinity = let build_pre ~xc ~xs ~vcpus ~memory ~hard_affinity domid = let open Memory in let uuid = get_uuid ~xc domid in + debug + "VM = %s, build_max_mib = %Ld, build_start_mib = %Ld, xen_max_mib =\n\ + \ %Ld, shadow_mib = %Ld, required_host_free_mib = %Ld, overhead_mib = \ + %Ld" + (Uuidx.to_string uuid) memory.build_max_mib memory.build_start_mib + memory.xen_max_mib memory.shadow_mib memory.required_host_free_mib + memory.overhead_mib ; debug "VM = %s; domid = %d; waiting for %Ld MiB of free host memory" (Uuidx.to_string uuid) domid memory.required_host_free_mib ; (* CA-39743: Wait, if necessary, for the Xen scrubber to catch up. *) diff --git a/ocaml/xenopsd/xc/xenops_server_xen.ml b/ocaml/xenopsd/xc/xenops_server_xen.ml index 173b4b46353..343e46166ff 100644 --- a/ocaml/xenopsd/xc/xenops_server_xen.ml +++ b/ocaml/xenopsd/xc/xenops_server_xen.ml @@ -1848,6 +1848,10 @@ module VM = struct let dbg = Xenops_task.get_dbg task in Mem.with_reservation dbg min_kib max_kib (fun target_plus_overhead_kib reservation_id -> + debug + "VM = %s, memory [%Ld KiB, %Ld KiB], \ + target_plus_overhead=%Ld KiB" + vm.Vm.id min_kib max_kib target_plus_overhead_kib ; let domain_config, persistent = match persistent.VmExtra.domain_config with | Some dc -> @@ -1888,6 +1892,9 @@ module VM = struct let target_bytes = target_plus_overhead_bytes --- overhead_bytes in + debug + "VM = %s, memory target_bytes = %Ld, dynamic max = %Ld" + vm.Vm.id target_bytes vm.memory_dynamic_max ; min vm.memory_dynamic_max target_bytes in set_initial_target ~xs domid (Int64.div initial_target 1024L) ; @@ -3005,6 +3012,7 @@ module VM = struct | _ -> "" in + debug "VM = %s, initial_target = %Ld" vm.Vm.id initial_target ; ({x with Domain.memory_target= initial_target}, timeoffset) in let vtpm = vtpm_of ~vm in @@ -3144,7 +3152,10 @@ module VM = struct let memory_actual = let pages = Int64.of_nativeint di.Xenctrl.total_memory_pages in let kib = Xenctrl.pages_to_kib pages in - Memory.bytes_of_kib kib + let bytes = Memory.bytes_of_kib kib in + D.debug "VM %s memory actual: %Ld pages = %Ld KiB = %Ld bytes" + (Uuidm.to_string uuid) pages kib bytes ; + bytes in let memory_limit = (* The maximum amount of memory the domain can consume is the max @@ -3167,7 +3178,10 @@ module VM = struct in (* CA-31764: may be larger than static_max if maxmem has been increased to initial-reservation. *) - max memory_actual max_memory_bytes + let result = max memory_actual max_memory_bytes in + D.debug "VM %s memory limit = %Ld bytes" (Uuidm.to_string uuid) + result ; + result in let rtc = try From 7238e5f7d10d52ef72dae19af17cd4bb6fb06df9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= Date: Thu, 8 Jan 2026 10:16:00 +0000 Subject: [PATCH 34/65] CA-422187: plumb migration pages through MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We need to reserve the exact same amount of pages on the destination as we had on the source. The amount we reserved when initially booting the domain was only an estimate, but when migrating we know the exact amount. Signed-off-by: Edwin Török --- ocaml/xenopsd/lib/xenops_server.ml | 40 ++++++++++++++++---- ocaml/xenopsd/lib/xenops_server_plugin.ml | 1 + ocaml/xenopsd/lib/xenops_server_simulator.ml | 2 +- ocaml/xenopsd/xc/domain.ml | 22 ++++++++++- ocaml/xenopsd/xc/domain.mli | 2 + ocaml/xenopsd/xc/xenops_server_xen.ml | 30 ++++++++++++--- 6 files changed, 82 insertions(+), 15 deletions(-) diff --git a/ocaml/xenopsd/lib/xenops_server.ml b/ocaml/xenopsd/lib/xenops_server.ml index 55e5844ee29..7e7951d5cb2 100644 --- a/ocaml/xenopsd/lib/xenops_server.ml +++ b/ocaml/xenopsd/lib/xenops_server.ml @@ -155,7 +155,8 @@ type atomic = | VM_create_device_model of (Vm.id * bool) | VM_destroy_device_model of Vm.id | VM_destroy of Vm.id - | VM_create of (Vm.id * int64 option * Vm.id option * bool) (*no_sharept*) + | VM_create of (Vm.id * (int64 * int64 option) option * Vm.id option * bool) + (*no_sharept*) | VM_build of (Vm.id * bool) | VM_shutdown_domain of (Vm.id * shutdown_request * float) | VM_s3suspend of Vm.id @@ -330,6 +331,7 @@ type vm_receive_op = { ; vmr_socket: Unix.file_descr ; vmr_handshake: string option (** handshake protocol *) ; vmr_compressed: bool + ; vmr_memory_total_source: int64 option [@default None] } [@@deriving rpcty] @@ -2317,19 +2319,26 @@ let rec perform_atomic ~progress_callback ?result (op : atomic) | VM_destroy id -> debug "VM.destroy %s" id ; B.VM.destroy t (VM_DB.read_exn id) - | VM_create (id, memory_upper_bound, final_id, no_sharept) -> + | VM_create (id, memory_upper_bound_and_source, final_id, no_sharept) -> let num_of_vbds = List.length (VBD_DB.vbds id) in let num_of_vifs = List.length (VIF_DB.vifs id) in + let memory_upper_bound = Option.map fst memory_upper_bound_and_source + and memory_total_source = + Option.map snd memory_upper_bound_and_source |> Option.join + in debug - "VM.create %s memory_upper_bound = %s, num_of_vbds = %d, num_of_vifs = \ - %d" + "VM.create %s memory_upper_bound = %s, memory_total_source = %s, \ + num_of_vbds = %d, num_of_vifs = %d" id (Option.value ~default:"None" (Option.map Int64.to_string memory_upper_bound) ) + (Option.value ~default:"None" + (Option.map Int64.to_string memory_total_source) + ) num_of_vbds num_of_vifs ; - B.VM.create t memory_upper_bound (VM_DB.read_exn id) final_id no_sharept - num_of_vbds num_of_vifs + B.VM.create t memory_upper_bound memory_total_source (VM_DB.read_exn id) + final_id no_sharept num_of_vbds num_of_vifs | VM_build (id, force) -> debug "VM.build %s" id ; let vbds : Vbd.t list = VBD_DB.vbds id |> vbd_plug_order in @@ -2897,7 +2906,10 @@ and perform_exn ?result (op : operation) (t : Xenops_task.task_handle) : unit = Request.write (fun _ -> ()) request fd in do_request vm_fd - [("memory_limit", Int64.to_string state.Vm.memory_limit)] + [ + ("memory_limit", Int64.to_string state.Vm.memory_limit) + ; ("memory_total_source", Int64.to_string state.Vm.memory_actual) + ] url ; let first_handshake () = ( match Handshake.recv vm_fd with @@ -3004,6 +3016,7 @@ and perform_exn ?result (op : operation) (t : Xenops_task.task_handle) : unit = vmr_id= id ; vmr_final_id= final_id ; vmr_memory_limit= memory_limit + ; vmr_memory_total_source= memory_total_source ; vmr_socket= s ; vmr_handshake= handshake ; vmr_compressed @@ -3084,7 +3097,14 @@ and perform_exn ?result (op : operation) (t : Xenops_task.task_handle) : unit = ) in perform_atomics - ([VM_create (id, Some memory_limit, Some final_id, no_sharept)] + ([ + VM_create + ( id + , Some (memory_limit, memory_total_source) + , Some final_id + , no_sharept + ) + ] (* Perform as many operations as possible on the destination domain before pausing the original domain *) @ atomics_of_operation (VM_restore_vifs id) @@ -3902,6 +3922,9 @@ module VM = struct let module Response = Cohttp.Response.Make (Cohttp_posix_io.Unbuffered_IO) in let dbg = List.assoc "dbg" cookies in let memory_limit = List.assoc "memory_limit" cookies |> Int64.of_string in + let memory_total_source = + List.assoc_opt "memory_total_source" cookies |> Option.map Int64.of_string + in let handshake = List.assoc_opt cookie_mem_migration cookies in let compressed_memory = get_compression cookies in Debug.with_thread_associated dbg @@ -3932,6 +3955,7 @@ module VM = struct ; vmr_socket= transferred_fd ; vmr_handshake= handshake ; vmr_compressed= compressed_memory + ; vmr_memory_total_source= memory_total_source } in let task = diff --git a/ocaml/xenopsd/lib/xenops_server_plugin.ml b/ocaml/xenopsd/lib/xenops_server_plugin.ml index 9c741786678..ed2eaf2817f 100644 --- a/ocaml/xenopsd/lib/xenops_server_plugin.ml +++ b/ocaml/xenopsd/lib/xenops_server_plugin.ml @@ -81,6 +81,7 @@ module type S = sig val create : Xenops_task.task_handle -> int64 option + -> int64 option -> Vm.t -> Vm.id option -> bool (* no_sharept*) diff --git a/ocaml/xenopsd/lib/xenops_server_simulator.ml b/ocaml/xenopsd/lib/xenops_server_simulator.ml index 8b7450f03e8..5256f1a91d1 100644 --- a/ocaml/xenopsd/lib/xenops_server_simulator.ml +++ b/ocaml/xenopsd/lib/xenops_server_simulator.ml @@ -566,7 +566,7 @@ module VM = struct let remove _vm = () - let create _ memory_limit vm _ _ _ _ = + let create _ memory_limit _ vm _ _ _ _ = with_lock m (create_nolock memory_limit vm) let destroy _ vm = with_lock m (destroy_nolock vm) diff --git a/ocaml/xenopsd/xc/domain.ml b/ocaml/xenopsd/xc/domain.ml index 12a10461f3a..065a4d02826 100644 --- a/ocaml/xenopsd/xc/domain.ml +++ b/ocaml/xenopsd/xc/domain.ml @@ -152,6 +152,8 @@ type builder_spec_info = type build_info = { memory_max: int64 (** memory max in kilobytes *) ; memory_target: int64 (** memory target in kilobytes *) + ; memory_total_source: int64 option + (** amount of memory to claim (during migration) *) ; kernel: string (** in hvm case, point to hvmloader *) ; vcpus: int (** vcpus max *) ; priv: builder_spec_info @@ -1180,7 +1182,14 @@ let build_pre ~xc ~xs ~vcpus ~memory ~hard_affinity domid = and cores = Xenops_server.cores_of_numa_affinity_policy pin ~vcpus in - let memory = Memory.bytes_of_pages memory.build_claim_pages in + + let build_claim_bytes = + Memory.bytes_of_pages memory.build_claim_pages + in + D.debug "VM = %s; domid = %d; will claim %Ld bytes = %Ld pages" + (Uuidx.to_string uuid) domid build_claim_bytes + memory.build_claim_pages ; + let memory = build_claim_bytes in match numa_placement domid ~vcpus ~cores ~memory affinity with | None -> (* Always perform a global claim when NUMA placement is @@ -1896,6 +1905,17 @@ let restore (task : Xenops_task.task_handle) ~xc ~xs ~dm ~timeoffset ~extras maybe_ca_140252_workaround ~xc ~vcpus domid ; (memory, vm_stuff, `pvh) in + let memory = + match info.memory_total_source with + | None -> + memory + | Some kib -> + let build_claim_pages = Memory.pages_of_kib_used kib in + let bytes = Memory.bytes_of_kib kib in + debug "Domid %d: memory_total_source = %Ld bytes = %Ld KiB = %Ld pages" + domid bytes kib build_claim_pages ; + Memory.{memory with build_claim_pages} + in let store_port, console_port, numa_placements = build_pre ~xc ~xs ~memory ~vcpus ~hard_affinity:info.hard_affinity domid in diff --git a/ocaml/xenopsd/xc/domain.mli b/ocaml/xenopsd/xc/domain.mli index 574782fdcec..9c71f78fdea 100644 --- a/ocaml/xenopsd/xc/domain.mli +++ b/ocaml/xenopsd/xc/domain.mli @@ -130,6 +130,8 @@ val builder_spec_info : builder_spec_info Rpc.Types.def type build_info = { memory_max: int64 (** memory max in kilobytes *) ; memory_target: int64 (** memory target in kilobytes *) + ; memory_total_source: int64 option + (** memory used on source during migration/resume in kilobytes *) ; kernel: string (** image to load. In HVM case, point to hvmloader *) ; vcpus: int (** vcpus max *) ; priv: builder_spec_info diff --git a/ocaml/xenopsd/xc/xenops_server_xen.ml b/ocaml/xenopsd/xc/xenops_server_xen.ml index 343e46166ff..d3bd53127ce 100644 --- a/ocaml/xenopsd/xc/xenops_server_xen.ml +++ b/ocaml/xenopsd/xc/xenops_server_xen.ml @@ -1637,6 +1637,7 @@ module VM = struct { Domain.memory_max= vm.memory_static_max /// 1024L ; memory_target= vm.memory_dynamic_min /// 1024L + ; memory_total_source= None ; kernel= "" ; vcpus= vm.vcpu_max ; priv= builder_spec_info @@ -1747,8 +1748,8 @@ module VM = struct in (device_id, revision) - let create_exn task memory_upper_bound vm final_id no_sharept num_of_vbds - num_of_vifs = + let create_exn task memory_upper_bound memory_total_source vm final_id + no_sharept num_of_vbds num_of_vifs = let k = vm.Vm.id in with_xc_and_xs (fun xc xs -> (* Ensure the DB contains something for this VM - this is to avoid a @@ -1821,27 +1822,30 @@ module VM = struct needed. If we are live migrating then we will only know an upper bound. If we are starting from scratch then we have a free choice. *) - let min_bytes, max_bytes = + let min_bytes, max_bytes, memory_total_source_bytes = match memory_upper_bound with | Some x -> debug "VM = %s; using memory_upper_bound = %Ld" vm.Vm.id x ; - (x, x) + (x, x, memory_total_source) | None -> if resuming then ( debug "VM = %s; using stored suspend_memory_bytes = %Ld" vm.Vm.id persistent.VmExtra.suspend_memory_bytes ; ( persistent.VmExtra.suspend_memory_bytes , persistent.VmExtra.suspend_memory_bytes + , Some persistent.VmExtra.suspend_memory_bytes ) ) else ( debug "VM = %s; using memory_dynamic_min = %Ld and \ memory_dynamic_max = %Ld" vm.Vm.id vm.memory_dynamic_min vm.memory_dynamic_max ; - (vm.memory_dynamic_min, vm.memory_dynamic_max) + (vm.memory_dynamic_min, vm.memory_dynamic_max, None) ) in let min_kib = kib_of_bytes_used (min_bytes +++ overhead_bytes) + and memory_total_source_kib = + Option.map kib_of_bytes_used memory_total_source_bytes and max_kib = kib_of_bytes_used (max_bytes +++ overhead_bytes) in (* XXX: we would like to be able to cancel an in-progress with_reservation *) @@ -1897,6 +1901,21 @@ module VM = struct vm.Vm.id target_bytes vm.memory_dynamic_max ; min vm.memory_dynamic_max target_bytes in + let persistent = + match persistent with + | {VmExtra.build_info= Some x; _} as t -> + { + t with + build_info= + Some + { + x with + memory_total_source= memory_total_source_kib + } + } + | _ -> + persistent + in set_initial_target ~xs domid (Int64.div initial_target 1024L) ; (* Log uses of obsolete option *) if vm.suppress_spurious_page_faults then @@ -2374,6 +2393,7 @@ module VM = struct { Domain.memory_max= static_max_kib ; memory_target= initial_target + ; memory_total_source= None ; kernel ; vcpus= vm.vcpu_max ; priv From 7adae02970581aa2541ac87282d65f9916692471 Mon Sep 17 00:00:00 2001 From: Christian Lindig Date: Mon, 26 Jan 2026 16:28:39 +0000 Subject: [PATCH 35/65] CP-311165 XSI-1958 rely on Linux guest to announce control features We want to rely on a Linux guest kernel to announce supported control features like suspend. In the unlikely case that a guest Linux kernel does not support this, provide a flag to re-enable previous behavior: the toolstack assumes a set of supported features. Linux kernel 4.9 and above announce features. The flag is false by default, as such introducing the new behaviour. This closes a window for a data race that caused XSI-1958: the toolstack announced control features before the Linux guest kernel was ready and announced them a moment later. A request coming in before the kernel was ready caused the request to be missed by the kernel. Previous behavior and timing is below: observe that dom0 announces the features before domain 7. Some details are omitted. 2026-01-27T10:22:04 dt034 oxd: DOM0 write /l/d/7/control/feature-suspend 1 2026-01-27T10:22:04 dt034 oxd: DOM0 write /l/d/7/control/feature-poweroff 1 2026-01-27T10:22:04 dt034 oxd: DOM0 write /l/d/7/control/feature-reboot 1 2026-01-27T10:22:04 dt034 oxd: DOM0 write /l/d/7/control/feature-vcpu-hotplug 1 2026-01-27T10:22:06 dt034 oxd: DOM7 write control/feature-poweroff 1 2026-01-27T10:22:06 dt034 oxd: DOM7 write control/feature-reboot 1 2026-01-27T10:22:06 dt034 oxd: DOM7 write control/feature-suspend 1 2026-01-27T10:22:10 dt034 oxd: DOM7 write control/feature-balloon 1 Signed-off-by: Christian Lindig --- ocaml/xenopsd/lib/xenopsd.ml | 9 +++++++++ ocaml/xenopsd/xc/device.ml | 14 +++++++++++--- ocaml/xenopsd/xenopsd.conf | 8 ++++++++ 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/ocaml/xenopsd/lib/xenopsd.ml b/ocaml/xenopsd/lib/xenopsd.ml index 8c92caf854b..8245dacddfa 100644 --- a/ocaml/xenopsd/lib/xenopsd.ml +++ b/ocaml/xenopsd/lib/xenopsd.ml @@ -81,6 +81,8 @@ let vm_suspend_timeout = ref 1200. let vm_suspend_ack_timeout = ref 30. +let linux_assume_ctrl_features = ref false + let oxenstored_conf = ref "/etc/xen/oxenstored.conf" let for_each_line path f = @@ -324,6 +326,13 @@ let options = , (fun () -> string_of_bool !Xenops_server.xenopsd_vbd_plug_unplug_legacy) , "False if we want to split the plug atomic into attach/activate" ) + ; ( "linux-assume-control-features" + , Arg.Bool (fun b -> linux_assume_ctrl_features := b) + , (fun () -> string_of_bool !linux_assume_ctrl_features) + , "To support old Linux guest kernels, assume the kernel supports certain \ + control features without announcing them. Linux kernels 4.9 and later \ + announce support in xenstore." + ) ; ( "vm-suspend-timeout" , Arg.Set_float vm_suspend_timeout , (fun () -> string_of_float !vm_suspend_timeout) diff --git a/ocaml/xenopsd/xc/device.ml b/ocaml/xenopsd/xc/device.ml index adedf798d6f..25671be5a49 100644 --- a/ocaml/xenopsd/xc/device.ml +++ b/ocaml/xenopsd/xc/device.ml @@ -2980,6 +2980,16 @@ module Backend = struct ) in let xen_platform_pv_driver_info pv_info = + (* we would like to keep the assumptions about supported + features small and rather have the Linux kernel or a guest agent + announce the actually supported featuresi. See commit message *) + let features = + match !Xenopsd.linux_assume_ctrl_features with + | true -> + ["suspend"; "poweroff"; "reboot"; "vcpu-hotplug"] + | false -> + ["vcpu-hotplug"] + in with_xs (fun xs -> let is_hvm_linux {product_num; build_num} = let _XEN_IOPORT_LINUX_PRODNUM = 3 in @@ -2992,9 +3002,7 @@ module Backend = struct (Printf.sprintf "/local/domain/%d/%s%s" domid prefix x) "1" in - List.iter - (write_local_domain "control/feature-") - ["suspend"; "poweroff"; "reboot"; "vcpu-hotplug"] ; + List.iter (write_local_domain "control/feature-") features ; List.iter (write_local_domain "data/") ["updated"] ) ) diff --git a/ocaml/xenopsd/xenopsd.conf b/ocaml/xenopsd/xenopsd.conf index f8a81ba06c0..99669c84558 100644 --- a/ocaml/xenopsd/xenopsd.conf +++ b/ocaml/xenopsd/xenopsd.conf @@ -122,3 +122,11 @@ disable-logging-for=http tracing tracing_export # Timeout in seconds for a VM to acknowledge a suspend request # vm-suspend-ack-timeout=30 +# + +# A Linux guest kernel 4.9+ announces the control features it supports in +# xenstore. If it does not, xenopsd has to assume it supports a range of +# operations. The default "false" is to rely on guest kernels to announce +# support. Use "true" to assume support even when unannounced. +# linux-assume-control-features = false + From bf61d51826a4037568360663cea69d952f78776a Mon Sep 17 00:00:00 2001 From: Lin Liu Date: Thu, 29 Jan 2026 06:19:31 +0000 Subject: [PATCH 36/65] CP-311215: Remove legacy PBIS code PBIS was dropped since XS8, Here remove the legacy code to keep clean Signed-off-by: Lin Liu --- ocaml/tests/suite_alcotest.ml | 1 - ocaml/tests/test_extauth_plugin_ADpbis.ml | 132 --- ocaml/xapi/extauth_ad.ml | 12 +- ocaml/xapi/extauth_plugin_ADpbis.ml | 1201 --------------------- quality-gate.sh | 4 +- 5 files changed, 3 insertions(+), 1347 deletions(-) delete mode 100644 ocaml/tests/test_extauth_plugin_ADpbis.ml delete mode 100644 ocaml/xapi/extauth_plugin_ADpbis.ml diff --git a/ocaml/tests/suite_alcotest.ml b/ocaml/tests/suite_alcotest.ml index 5cbd192e9a3..62deb8e71ae 100644 --- a/ocaml/tests/suite_alcotest.ml +++ b/ocaml/tests/suite_alcotest.ml @@ -62,7 +62,6 @@ let () = @ Test_vm.tests @ Test_dbsync_master.tests @ Test_pvs_cache_storage.tests - @ Test_extauth_plugin_ADpbis.tests @ Test_helpers.tests @ Test_datamodel_lifecycle.tests @ Test_psr.tests diff --git a/ocaml/tests/test_extauth_plugin_ADpbis.ml b/ocaml/tests/test_extauth_plugin_ADpbis.ml deleted file mode 100644 index cab78883907..00000000000 --- a/ocaml/tests/test_extauth_plugin_ADpbis.ml +++ /dev/null @@ -1,132 +0,0 @@ -(* - * Copyright (C) Citrix Systems Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; version 2.1 only. with the special - * exception on linking described in file LICENSE. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - *) - -open Test_highlevel - -module PbisAuthErrorsCatch = Generic.MakeStateless (struct - module Io = struct - type input_t = string list - - type output_t = Auth_signature.auth_service_error_tag - - let string_of_input_t = Test_printers.(list string) - - let string_of_output_t output = - match output with - | Auth_signature.E_GENERIC -> - "E_GENERIC" - | Auth_signature.E_LOOKUP -> - "E_LOOKUP" - | Auth_signature.E_DENIED -> - "E_DENIED" - | Auth_signature.E_CREDENTIALS -> - "E_CREDENTIALS" - | Auth_signature.E_UNAVAILABLE -> - "E_UNAVAILABLE" - | Auth_signature.E_INVALID_OU -> - "E_INVALID_OU" - | Auth_signature.E_INVALID_ACCOUNT -> - "E_INVALID_ACCOUNT" - end - - let transform = Extauth_plugin_ADpbis.match_error_tag - - let tests = - `QuickAndAutoDocumented - [ - ([], Auth_signature.E_GENERIC) - ; ([""; ""], Auth_signature.E_GENERIC) - ; ([""; "some words"], Auth_signature.E_GENERIC) - ; ([""; "DNS_ERROR_BAD_PACKET"], Auth_signature.E_LOOKUP) - ; ([""; "LW_ERROR_PASSWORD_MISMATCH"], Auth_signature.E_CREDENTIALS) - ; ([""; "LW_ERROR_INVALID_ACCOUNT"], Auth_signature.E_INVALID_ACCOUNT) - ; ([""; "LW_ERROR_ACCESS_DENIED"], Auth_signature.E_DENIED) - ; ([""; "LW_ERROR_DOMAIN_IS_OFFLINE"], Auth_signature.E_UNAVAILABLE) - ; ([""; "LW_ERROR_INVALID_OU"], Auth_signature.E_INVALID_OU) - ; ([""; "prefixDNS_ERROR_BAD_PACKETsuffix"], Auth_signature.E_GENERIC) - ; ([""; "prefix_DNS_ERROR_BAD_PACKET_suffix"], Auth_signature.E_GENERIC) - ; ([""; "prefix(DNS_ERROR_BAD_PACKET)suffix"], Auth_signature.E_LOOKUP) - ; ([""; "prefix.DNS_ERROR_BAD_PACKET.suffix"], Auth_signature.E_LOOKUP) - ; ([""; "prefix DNS_ERROR_BAD_PACKET suffix"], Auth_signature.E_LOOKUP) - ; ([""; "prefix\tDNS_ERROR_BAD_PACKET\tsuffix"], Auth_signature.E_LOOKUP) - ] -end) - -module PbisExtractSid = Generic.MakeStateless (struct - module Io = struct - type input_t = (string * string) list - - type output_t = string list - - let string_of_input_t = Test_printers.(list (pair string string)) - - let string_of_output_t = Test_printers.(list string) - end - - let transform = Extauth_plugin_ADpbis.extract_sid_from_group_list - - let tests = - `QuickAndAutoDocumented - [ - ([(" ", " ")], []) - ; ([("Exception", "Remote connection shutdown!")], []) - ; ( [ - ("Number of groups found for user 'testAD@BLE'", "0") - ; ("Error", "No record found") - ] - , [] - ) - ; ( [ - ("Number of groups found for user 'admin@NVC'", "1") - ; ( "" - , "Group[1 of 1] name = NVC\\testg(ab) (gid = 564135020, sid = \ - S-1-5-21-1171552557-368733809-2946345504-1132)" - ) - ] - , ["S-1-5-21-1171552557-368733809-2946345504-1132"] - ) - ; ( [ - ("Number of groups found for user 'cnk3@UN'", "1") - ; ( "" - , "Group[1 of 1] name = UN\\KnmOJ (gid = 492513842, sid = \ - S-1-5-31-5921451325-154521381-3135732118-4527)" - ) - ] - , ["S-1-5-31-5921451325-154521381-3135732118-4527"] - ) - ; ( [ - ("Number of groups found for user 'test@testdomain'", "2") - ; ( "" - , "Group[1 of 2] name = testdomain\\dnsadmins (gid = 580912206, \ - sid = S-1-5-21-791009147-1041474540-2433379237-1102)" - ) - ; ( "" - , "Group[2 of 2] name = testdomain\\domain+users (gid = 580911617, \ - sid = S-1-5-21-791009147-1041474540-2433379237-513)" - ) - ] - , [ - "S-1-5-21-791009147-1041474540-2433379237-1102" - ; "S-1-5-21-791009147-1041474540-2433379237-513" - ] - ) - ] -end) - -let tests = - make_suite "extauth_ADpbis_" - [ - ("pbis_auth_errors_catch", PbisAuthErrorsCatch.tests) - ; ("pbis_extract_sid", PbisExtractSid.tests) - ] diff --git a/ocaml/xapi/extauth_ad.ml b/ocaml/xapi/extauth_ad.ml index b6b2cca66c1..5218b426cb5 100644 --- a/ocaml/xapi/extauth_ad.ml +++ b/ocaml/xapi/extauth_ad.ml @@ -22,11 +22,9 @@ open D module AD_type = struct exception Unknown_AD_type of string - type t = Pbis | Winbind + type t = Winbind let of_string = function - | "pbis" -> - Pbis | "winbind" -> Winbind | _ as at -> @@ -36,20 +34,14 @@ end module AD = struct let start_backend_daemon ~wait_until_success = function - | AD_type.Pbis -> - Extauth_plugin_ADpbis.Lwsmd.start ~wait_until_success ~timeout:5. | AD_type.Winbind -> Extauth_plugin_ADwinbind.Winbind.start ~wait_until_success ~timeout:5. let stop_backend_daemon ~wait_until_success = function - | AD_type.Pbis -> - Extauth_plugin_ADpbis.Lwsmd.stop ~wait_until_success ~timeout:3. | AD_type.Winbind -> Extauth_plugin_ADwinbind.Winbind.stop ~wait_until_success ~timeout:3. let init_service ~__context = function - | AD_type.Pbis -> - Extauth_plugin_ADpbis.Lwsmd.init_service ~__context | AD_type.Winbind -> Extauth_plugin_ADwinbind.Winbind.init_service ~__context end @@ -71,7 +63,5 @@ let init_service ~__context = let methods () = match !Xapi_globs.extauth_ad_backend |> AD_type.of_string with - | Pbis -> - Extauth_plugin_ADpbis.AuthADlw.methods | Winbind -> Extauth_plugin_ADwinbind.AuthADWinbind.methods diff --git a/ocaml/xapi/extauth_plugin_ADpbis.ml b/ocaml/xapi/extauth_plugin_ADpbis.ml deleted file mode 100644 index 63a5ab7ac81..00000000000 --- a/ocaml/xapi/extauth_plugin_ADpbis.ml +++ /dev/null @@ -1,1201 +0,0 @@ -(* - * Copyright (C) 2006-2009 Citrix Systems Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; version 2.1 only. with the special - * exception on linking described in file LICENSE. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - *) -(** - * @group Access Control -*) - -module D = Debug.Make (struct let name = "extauth_plugin_ADpbis" end) - -open D -open Xapi_stdext_std.Xstringext - -let ( let@ ) = ( @@ ) - -let finally = Xapi_stdext_pervasives.Pervasiveext.finally - -let with_lock = Xapi_stdext_threads.Threadext.Mutex.execute - -let lwsmd_service = "lwsmd" - -module Lwsmd = struct - (* This can be refined by Mtime.Span.hour when mtime is updated to 1.4.0 *) - let restart_interval = Int64.mul 3600L 1000000000L |> Mtime.Span.of_uint64_ns - - let next_check_point = - Mtime.add_span (Mtime_clock.now ()) restart_interval |> ref - - let is_ad_enabled ~__context = - ( Helpers.get_localhost ~__context |> fun self -> - Db.Host.get_external_auth_type ~__context ~self - ) - |> fun x -> x = Xapi_globs.auth_type_AD - - let enable_nsswitch () = - try - ignore - (Forkhelpers.execute_command_get_output - !Xapi_globs.domain_join_cli_cmd - ["configure"; "--enable"; "nsswitch"] - ) - with e -> - error "Fail to run %s with error %s" - !Xapi_globs.domain_join_cli_cmd - (ExnHelper.string_of_exn e) - - let stop ~timeout ~wait_until_success = - Xapi_systemctl.stop ~timeout ~wait_until_success lwsmd_service - - let start ~timeout ~wait_until_success = - Xapi_systemctl.start ~timeout ~wait_until_success lwsmd_service - - let restart ~timeout ~wait_until_success = - Xapi_systemctl.restart ~timeout ~wait_until_success lwsmd_service - - let restart_on_error () = - (* Only restart once within restart_interval *) - let now = Mtime_clock.now () in - match !next_check_point with - | Some check_point -> - if Mtime.is_later now ~than:check_point then ( - debug "Restart %s due to local server error" lwsmd_service ; - next_check_point := Mtime.add_span now restart_interval ; - restart ~timeout:0. ~wait_until_success:false - ) - | None -> - debug "next_check_point overflow" - - let init_service ~__context = - (* This function is called during xapi start *) - (* it will start lwsmd service if the host is authed with AD *) - (* Xapi does not wait lwsmd service to boot up success as following reasons - * 1. The waiting will slow down xapi bootup - * 2. Xapi still needs to boot up even lwsmd bootup fail - * 3. Xapi does not need to use lwsmd functionality during its bootup *) - if is_ad_enabled ~__context then ( - restart ~wait_until_success:false ~timeout:5. ; - (* Xapi help to enable nsswitch during bootup if it find the host is authed with AD - * nsswitch will be automatically enabled with command domainjoin-cli - * but this enabling is necessary when the host authed with AD upgrade - * As it will not run the domainjoin-cli command again *) - enable_nsswitch () - ) -end - -let match_error_tag (lines : string list) = - let err_catch_list = - [ - ("DNS_ERROR_BAD_PACKET", Auth_signature.E_LOOKUP) - ; ("LW_ERROR_PASSWORD_MISMATCH", Auth_signature.E_CREDENTIALS) - ; ("LW_ERROR_INVALID_ACCOUNT", Auth_signature.E_INVALID_ACCOUNT) - ; ("LW_ERROR_ACCESS_DENIED", Auth_signature.E_DENIED) - ; ("LW_ERROR_DOMAIN_IS_OFFLINE", Auth_signature.E_UNAVAILABLE) - ; ("LW_ERROR_INVALID_OU", Auth_signature.E_INVALID_OU) - (* More errors to be caught here *) - ] - in - let split_to_words str = - let seps = ['('; ')'; ' '; '\t'; '.'] in - String.split_f (fun s -> List.exists (fun sep -> sep = s) seps) str - in - let rec has_err lines err_pattern = - match lines with - | [] -> - false - | line :: rest -> ( - try - ignore (List.find (fun w -> w = err_pattern) (split_to_words line)) ; - true - with Not_found -> has_err rest err_pattern - ) - in - try - let _, errtag = - List.find - (fun (err_pattern, _) -> has_err lines err_pattern) - err_catch_list - in - errtag - with Not_found -> Auth_signature.E_GENERIC - -let extract_sid_from_group_list group_list = - List.map - (fun (_, v) -> - let v = String.replace ")" "" v in - let v = String.replace "sid =" "|" v in - let vs = String.split_f (fun c -> c = '|') v in - let sid = String.trim (List.nth vs 1) in - debug "extract_sid_from_group_list get sid=[%s]" sid ; - sid - ) - (List.filter (fun (n, _) -> n = "") group_list) - -let start_damon () = - try Lwsmd.start ~timeout:5. ~wait_until_success:true - with _ -> - raise - (Auth_signature.Auth_service_error - ( Auth_signature.E_GENERIC - , Printf.sprintf "Failed to start %s" lwsmd_service - ) - ) - -module AuthADlw : Auth_signature.AUTH_MODULE = struct - (* - * External Authentication Plugin component - * using AD/Pbis as a backend - * v1 14Nov14 phus.lu@citrix.com - * - *) - - let user_friendly_error_msg = - "The Active Directory Plug-in could not complete the command. Additional \ - information in the logs." - - let mutex_check_availability = - Locking_helpers.Named_mutex.create "IS_SERVER_AVAILABLE" - - let splitlines s = - String.split_f (fun c -> c = '\n') (String.replace "#012" "\n" s) - - let pbis_common_with_password (password : string) (pbis_cmd : string) - (pbis_args : string list) = - let debug_cmd = - pbis_cmd ^ " " ^ List.fold_left (fun p pp -> p ^ " " ^ pp) " " pbis_args - in - try - debug "execute %s" debug_cmd ; - let env = [|"PASSWORD=" ^ password|] in - let _ = Forkhelpers.execute_command_get_output ~env pbis_cmd pbis_args in - [] - with - | Forkhelpers.Spawn_internal_error (stderr, stdout, Unix.WEXITED n) -> - error "execute %s exited with code %d [stdout = '%s'; stderr = '%s']" - debug_cmd n stdout stderr ; - let lines = - List.filter - (fun l -> String.length l > 0) - (splitlines (stdout ^ stderr)) - in - let errmsg = List.hd (List.rev lines) in - let errtag = match_error_tag lines in - raise (Auth_signature.Auth_service_error (errtag, errmsg)) - | e -> - error "execute %s exited: %s" debug_cmd (ExnHelper.string_of_exn e) ; - raise - (Auth_signature.Auth_service_error - (Auth_signature.E_GENERIC, user_friendly_error_msg) - ) - - let pbis_config (name : string) (value : string) = - let pbis_cmd = "/opt/pbis/bin/config" in - let pbis_args = [name; value] in - let debug_cmd = pbis_cmd ^ " " ^ name ^ " " ^ value in - try - debug "execute %s" debug_cmd ; - let _ = Forkhelpers.execute_command_get_output pbis_cmd pbis_args in - () - with - | Forkhelpers.Spawn_internal_error (stderr, stdout, Unix.WEXITED n) -> - error "execute %s exited with code %d [stdout = '%s'; stderr = '%s']" - debug_cmd n stdout stderr ; - let lines = - List.filter - (fun l -> String.length l > 0) - (splitlines (stdout ^ stderr)) - in - let errmsg = List.hd (List.rev lines) in - raise - (Auth_signature.Auth_service_error (Auth_signature.E_GENERIC, errmsg)) - | e -> - error "execute %s exited: %s" debug_cmd (ExnHelper.string_of_exn e) ; - raise - (Auth_signature.Auth_service_error - (Auth_signature.E_GENERIC, user_friendly_error_msg) - ) - - let ensure_pbis_configured () = - pbis_config "SpaceReplacement" "+" ; - pbis_config "CreateHomeDir" "false" ; - pbis_config "SyncSystemTime" "false" ; - pbis_config "LdapSignAndSeal" "true" ; - pbis_config "CacheEntryExpiry" "300" ; - () - - let pbis_common ?(stdin_string = "") (pbis_cmd : string) - (pbis_args : string list) = - let debug_cmd = - pbis_cmd ^ " " ^ List.fold_left (fun p pp -> p ^ " " ^ pp) " " pbis_args - in - let debug_cmd = - if String.has_substr debug_cmd "--password" then - "(omitted for security)" - else - debug_cmd - in - (* stuff to clean up on the way out of the function: *) - let fds_to_close = ref [] in - let files_to_unlink = ref [] in - (* take care to close an fd only once *) - let close_fd fd = - if List.mem fd !fds_to_close then ( - Unix.close fd ; - fds_to_close := List.filter (fun x -> x <> fd) !fds_to_close - ) - in - (* take care to unlink a file only once *) - let unlink_file filename = - if List.mem filename !files_to_unlink then ( - Unix.unlink filename ; - files_to_unlink := List.filter (fun x -> x <> filename) !files_to_unlink - ) - in - (* guarantee to release all resources (files, fds) *) - let finalize () = - List.iter close_fd !fds_to_close ; - List.iter unlink_file !files_to_unlink - in - let finally_finalize f = finally f finalize in - let exited_code = ref 0 in - let output = ref "" in - finally_finalize (fun () -> - let _ = - try - debug "execute %s" debug_cmd ; - (* creates pipes between xapi and pbis process *) - let in_readme, in_writeme = Unix.pipe () in - fds_to_close := in_readme :: in_writeme :: !fds_to_close ; - let out_tmpfile = Filename.temp_file "pbis" ".out" in - files_to_unlink := out_tmpfile :: !files_to_unlink ; - let err_tmpfile = Filename.temp_file "pbis" ".err" in - files_to_unlink := err_tmpfile :: !files_to_unlink ; - let out_writeme = Unix.openfile out_tmpfile [Unix.O_WRONLY] 0o0 in - fds_to_close := out_writeme :: !fds_to_close ; - let err_writeme = Unix.openfile err_tmpfile [Unix.O_WRONLY] 0o0 in - fds_to_close := err_writeme :: !fds_to_close ; - let pid = - Forkhelpers.safe_close_and_exec (Some in_readme) - (Some out_writeme) (Some err_writeme) [] pbis_cmd pbis_args - in - finally - (fun () -> - debug "Created process pid %s for cmd %s" - (Forkhelpers.string_of_pidty pid) - debug_cmd ; - - (* Insert this delay to reproduce the cannot write to stdin bug: - Thread.delay 5.; *) - (* WARNING: we don't close the in_readme because otherwise in the case where the pbis - binary doesn't expect any input there is a race between it finishing (and closing the last - reference to the in_readme) and us attempting to write to in_writeme. If pbis wins the - race then our write will fail with EPIPE (Unix.error 31 in ocamlese). If we keep a reference - to in_readme then our write of "\n" will succeed. - - An alternative fix would be to not write anything when stdin_string = "" *) - - (* push stdin_string to recently created process' STDIN *) - try - (* usually, STDIN contains some sensitive data such as passwords that we do not want showing up in ps *) - (* or in the debug log via debug_cmd *) - let stdin_string = stdin_string ^ "\n" in - (*HACK:without \n, the pbis scripts don't return!*) - let (_ : int) = - Unix.write_substring in_writeme stdin_string 0 - (String.length stdin_string) - in - close_fd in_writeme - (* we need to close stdin, otherwise the unix cmd waits forever *) - with e -> - (* in_string is usually the password or other sensitive param, so never write it to debug or exn *) - debug "Error writing to stdin for cmd %s: %s" debug_cmd - (ExnHelper.string_of_exn e) ; - raise - (Auth_signature.Auth_service_error - (Auth_signature.E_GENERIC, ExnHelper.string_of_exn e) - ) - ) - (fun () -> - match Forkhelpers.waitpid pid with - | _, Unix.WEXITED n -> - exited_code := n ; - output := - Xapi_stdext_unix.Unixext.string_of_file out_tmpfile - ^ Xapi_stdext_unix.Unixext.string_of_file err_tmpfile - | _ -> - error "PBIS %s exit with WSTOPPED or WSIGNALED" debug_cmd ; - raise - (Auth_signature.Auth_service_error - (Auth_signature.E_GENERIC, user_friendly_error_msg) - ) - ) - with e -> - error "execute %s exited: %s" debug_cmd (ExnHelper.string_of_exn e) ; - raise - (Auth_signature.Auth_service_error - (Auth_signature.E_GENERIC, user_friendly_error_msg) - ) - in - if !exited_code <> 0 then ( - error "execute '%s': exit_code=[%d] output=[%s]" debug_cmd - !exited_code - (String.replace "\n" ";" !output) ; - let split_to_words s = - String.split_f (fun c -> c = '(' || c = ')' || c = '.' || c = ' ') s - in - let revlines = - List.rev - (List.filter (fun l -> String.length l > 0) (splitlines !output)) - in - let errmsg = List.hd revlines in - let errcodeline = - if List.length revlines > 1 then - List.nth revlines 1 - else - errmsg - in - let errcode = - List.hd - (List.filter - (fun w -> String.starts_with ~prefix:"LW_ERROR_" w) - (split_to_words errcodeline) - ) - in - debug "Pbis raised an error for cmd %s: (%s) %s" debug_cmd errcode - errmsg ; - match errcode with - | "LW_ERROR_INVALID_GROUP_INFO_LEVEL" -> - raise - (Auth_signature.Auth_service_error - (Auth_signature.E_GENERIC, errcode) - ) - (* For pbis_get_all_byid *) - | "LW_ERROR_NO_SUCH_USER" - | "LW_ERROR_NO_SUCH_GROUP" - | "LW_ERROR_NO_SUCH_OBJECT" -> - raise Not_found (* Subject_cannot_be_resolved *) - | "LW_ERROR_KRB5_CALL_FAILED" - | "LW_ERROR_PASSWORD_MISMATCH" - | "LW_ERROR_ACCOUNT_DISABLED" - | "LW_ERROR_NOT_HANDLED" -> - raise (Auth_signature.Auth_failure errmsg) - | "LW_ERROR_INVALID_OU" -> - raise - (Auth_signature.Auth_service_error - (Auth_signature.E_INVALID_OU, errmsg) - ) - | "LW_ERROR_INVALID_DOMAIN" -> - raise - (Auth_signature.Auth_service_error - (Auth_signature.E_GENERIC, errmsg) - ) - | "LW_ERROR_ERRNO_ECONNREFUSED" -> - (* CA-368806: Restart service to workaround pbis wedged *) - Lwsmd.restart_on_error () ; - raise - (Auth_signature.Auth_service_error - (Auth_signature.E_GENERIC, errmsg) - ) - | "LW_ERROR_LSA_SERVER_UNREACHABLE" | _ -> - raise - (Auth_signature.Auth_service_error - ( Auth_signature.E_GENERIC - , Printf.sprintf "(%s) %s" errcode errmsg - ) - ) - ) else - debug "execute %s: output length=[%d]" debug_cmd - (String.length !output) ; - let lines = - List.filter (fun l -> String.length l > 0) (splitlines !output) - in - let parse_line (acc, currkey) line = - let slices = String.split ~limit:2 ':' line in - debug "parse %s: currkey=[%s] line=[%s]" debug_cmd currkey line ; - if List.length slices > 1 then ( - let key = String.trim (List.hd slices) in - let value = String.trim (List.nth slices 1) in - debug "parse %s: key=[%s] value=[%s] currkey=[%s]" debug_cmd key - value currkey ; - if String.length value > 0 then - (acc @ [(key, value)], "") - else - (acc, key) - ) else - let key = currkey in - let value = String.trim line in - debug "parse %s: key=[%s] value=[%s] currkey=[%s]" debug_cmd key - value currkey ; - (acc @ [(key, value)], currkey) - in - let attrs, _ = List.fold_left parse_line ([], "") lines in - attrs - ) - - (* assoc list for caching pbis_common results, - item value is ((stdin_string, pbis_cmd, pbis_args), (unix_time, pbis_common_result)) - *) - let cache_of_pbis_common : - ((string * string * string list) * (float * (string * string) list)) list - ref = - ref [] - - let cache_of_pbis_common_m = Mutex.create () - - let pbis_common_with_cache ?(stdin_string = "") (pbis_cmd : string) - (pbis_args : string list) = - let expired = 120.0 in - let now = Unix.time () in - let cache_key = (stdin_string, pbis_cmd, pbis_args) in - let f () = - cache_of_pbis_common := - List.filter - (fun (_, (ts, _)) -> now -. ts < expired) - !cache_of_pbis_common ; - try - let _, result = List.assoc cache_key !cache_of_pbis_common in - debug "pbis_common_with_cache hit \"%s\" cache." pbis_cmd ; - result - with Not_found -> - let result = pbis_common ~stdin_string pbis_cmd pbis_args in - cache_of_pbis_common := - !cache_of_pbis_common @ [(cache_key, (Unix.time (), result))] ; - result - in - with_lock cache_of_pbis_common_m f - - let get_joined_domain_name () = - Server_helpers.exec_with_new_task "obtaining joined-domain name" - (fun __context -> - let host = Helpers.get_localhost ~__context in - (* the service_name always contains the domain name provided during domain-join *) - Db.Host.get_external_auth_service_name ~__context ~self:host - ) - - (* CP-842: when resolving AD usernames, make joined-domain prefix optional *) - let get_full_subject_name ?(use_nt_format = true) subject_name = - (* CA-27744: always use NT-style names by default *) - try - (* tests if the UPN account name separator @ is present in subject name *) - ignore (String.index subject_name '@') ; - (* we only reach this point if the separator @ is present in subject_name *) - (* nothing to do, we assume that subject_name already contains the domain name after @ *) - subject_name - with Not_found -> ( - try - (* if no UPN username separator @ was found *) - - (* tests if the NT account name separator \ is present in subject name *) - ignore (String.index subject_name '\\') ; - (* we only reach this point if the separator \ is present in subject_name *) - (* nothing to do, we assume that subject_name already contains the domain name before \ *) - subject_name - with Not_found -> - if - (* if neither the UPN separator @ nor the NT username separator \ was found *) - use_nt_format - then - (* the default: NT names is unique, whereas UPN ones are not (CA-27744) *) - (* we prepend the joined-domain name to the subjectname as an NT name: \ *) - get_joined_domain_name () ^ "\\" ^ subject_name - (* obs: (1) pbis accepts a fully qualified domain name with both formats and *) - (* (2) some pbis commands accept only the NT-format, such as find-group-by-name *) - else - (* UPN format not the default format (CA-27744) *) - (* we append the joined-domain name to the subjectname as a UPN name: @ *) - subject_name ^ "@" ^ get_joined_domain_name () - ) - - (* Converts from UPN format (user@domain.com) to legacy NT format (domain.com\user) *) - (* This function is a workaround to use find-group-by-name, which requires nt-format names) *) - (* For anything else, use the original UPN name *) - let convert_upn_to_nt_username subject_name = - try - (* test if the UPN account name separator @ is present in subject name *) - let i = String.index subject_name '@' in - (* we only reach this point if the separator @ is present in subject_name *) - (* when @ is present, we need to convert the UPN name to NT format *) - let user = String.sub subject_name 0 i in - let domain = - String.sub subject_name (i + 1) (String.length subject_name - i - 1) - in - domain ^ "\\" ^ user - with Not_found -> - (* if no UPN username separator @ was found *) - (* nothing to do in this case *) - subject_name - - let pbis_get_all_byid subject_id = - try - pbis_common_with_cache "/opt/pbis/bin/find-by-sid" - ["--level"; "2"; subject_id] - with - | Auth_signature.Auth_service_error - (Auth_signature.E_GENERIC, "LW_ERROR_INVALID_GROUP_INFO_LEVEL") - -> - pbis_common_with_cache "/opt/pbis/bin/find-by-sid" - ["--level"; "1"; subject_id] - - let pbis_get_group_sids_byname _subject_name = - let subject_name = get_full_subject_name _subject_name in - (* append domain if necessary *) - let subject_attrs = - pbis_common_with_cache "/opt/pbis/bin/list-groups-for-user" - ["--show-sid"; subject_name] - in - (* PBIS list-groups-for-user raw output like - Number of groups found for user 'test@testdomain' : 2 - Group[1 of 2] name = testdomain\dnsadmins (gid = 580912206, sid = S-1-5-21-791009147-1041474540-2433379237-1102) - Group[2 of 2] name = testdomain\domain+users (gid = 580911617, sid = S-1-5-21-791009147-1041474540-2433379237-513) - And pbis_common will return subject_attrs as - [("Number of groups found for user 'test@testdomain'", "2"), ("", line1), ("", line2) ... ("", lineN)] - *) - extract_sid_from_group_list subject_attrs - - (* general Pbis error *) - - let pbis_get_sid_byname _subject_name cmd = - let subject_name = get_full_subject_name _subject_name in - (* append domain if necessary *) - let subject_attrs = pbis_common cmd ["--level"; "1"; subject_name] in - (* find-user-by-name returns several lines. We ony need the SID *) - if List.mem_assoc "SID" subject_attrs then - List.assoc "SID" subject_attrs (* OK, return SID *) - else - (*no SID value returned*) - (* this should not have happend, pbis didn't return an SID field!! *) - let msg = - Printf.sprintf "Pbis didn't return an SID field for user %s" - subject_name - in - debug "Error pbis_get_sid_byname for subject name %s: %s" subject_name msg ; - raise (Auth_signature.Auth_service_error (Auth_signature.E_GENERIC, msg)) - - (* general Pbis error *) - - (* subject_id get_subject_identifier(string subject_name) - - Takes a subject_name (as may be entered into the XenCenter UI when defining subjects -- - see Access Control wiki page); and resolves it to a subject_id against the external - auth/directory service. - Raises Not_found (*Subject_cannot_be_resolved*) if authentication is not succesful. - *) - let get_subject_identifier ~__context _subject_name = - let@ __context = Context.with_tracing ~__context __FUNCTION__ in - - try - (* looks up list of users*) - let subject_name = get_full_subject_name _subject_name in - (* append domain if necessary *) - pbis_get_sid_byname subject_name "/opt/pbis/bin/find-user-by-name" - with _ -> - (* append domain if necessary, find-group-by-name only accepts nt-format names *) - let subject_name = - get_full_subject_name ~use_nt_format:true - (convert_upn_to_nt_username _subject_name) - in - (* looks up list of groups*) - pbis_get_sid_byname subject_name "/opt/pbis/bin/find-group-by-name" - - (* subject_id Authenticate_username_password(string username, string password) - - Takes a username and password, and tries to authenticate against an already configured - auth service (see XenAPI requirements Wiki page for details of how auth service configuration - takes place and the appropriate values are stored within the XenServer Metadata). - If authentication is successful then a subject_id is returned representing the account - corresponding to the supplied credentials (where the subject_id is in a namespace managed by - the auth module/service itself -- e.g. maybe a SID or something in the AD case). - Raises auth_failure if authentication is not successful - *) - - let authenticate_username_password ~__context username password = - let@ __context = Context.with_tracing ~__context __FUNCTION__ in - (* first, we try to authenticated user against our external user database *) - (* pbis_common will raise an Auth_failure if external authentication fails *) - let domain, user = - match String.split_f (fun c -> c = '\\') username with - | [domain; user] -> - (domain, user) - | [user] -> - (get_joined_domain_name (), user) - | _ -> - raise - (Auth_signature.Auth_service_error - (Auth_signature.E_GENERIC, "Invalid username " ^ username) - ) - in - let (_ : (string * string) list) = - pbis_common "/opt/pbis/bin/lsa" - [ - "authenticate-user" - ; "--user" - ; user - ; "--domain" - ; domain - ; "--password" - ; password - ] - in - (* no exception raised, then authentication succeeded, *) - (* now we return the authenticated user's id *) - get_subject_identifier ~__context (get_full_subject_name username) - - (* subject_id Authenticate_ticket(string ticket) - - As above but uses a ticket as credentials (i.e. for single sign-on) - *) - (* not implemented now, not needed for our tests, only for a *) - (* future single sign-on feature *) - let authenticate_ticket ~__context:_ _tgt = - failwith "extauth_plugin authenticate_ticket not implemented" - - (* ((string*string) list) query_subject_information(string subject_identifier) - - Takes a subject_identifier and returns the user record from the directory service as - key/value pairs. In the returned string*string map, there _must_ be a key called - subject_name that refers to the name of the account (e.g. the user or group name as may - be displayed in XenCenter). There is no other requirements to include fields from the user - record -- initially qI'd imagine that we wouldn't bother adding anything else here, but - it's a string*string list anyway for possible future expansion. - Raises Not_found (*Subject_cannot_be_resolved*) if subject_id cannot be resolved by external auth service - *) - let query_subject_information ~__context subject_identifier = - let@ __context = Context.with_tracing ~__context __FUNCTION__ in - let unmap_lw_space_chars lwname = - let defensive_copy = Bytes.of_string lwname in - (* CA-29006: map chars in names back to original space chars in windows-names *) - (* we use + as the pbis space-replacement because it's an invalid NT-username char in windows *) - (* the space-replacement char used by pbis is defined at /etc/pbis/lsassd.conf *) - let current_lw_space_replacement = '+' in - String.iteri - (fun i c -> - if c = current_lw_space_replacement then - Bytes.set defensive_copy i ' ' - else - () - ) - lwname ; - Bytes.unsafe_to_string defensive_copy - in - let get_value name ls = - if List.mem_assoc name ls then - List.assoc name ls - else - "" - in - let infolist = pbis_get_all_byid subject_identifier in - let subject_is_group = get_value "Uid" infolist = "" in - if subject_is_group then - (* subject is group *) - (* in this case, a few info fields are not available: UPN, Uid, Gecos, Account {disabled,expired,locked}, Password expired *) - [ - ("subject-name", unmap_lw_space_chars (get_value "Name" infolist)) - ; ("subject-gid", get_value "Gid" infolist) - ; ("subject-sid", get_value "SID" infolist) - ; ("subject-is-group", "true") - (*(* comma-separated list of subjects that are contained in this subject *) - ("contains-byname", List.fold_left (fun (n,v) m ->m^","^v) "" (List.filter (fun (n,v)->n="Members") infolist));*) - ] - else (* subject is user *) - let subject_name = unmap_lw_space_chars (get_value "Name" infolist) in - let subject_gecos = get_value "Gecos" infolist in - [ - ("subject-name", subject_name) - ; ("subject-upn", get_value "UPN" infolist) - ; ("subject-uid", get_value "Uid" infolist) - ; ("subject-gid", get_value "Gid" infolist) - ; ("subject-sid", get_value "SID" infolist) - ; ("subject-gecos", subject_gecos) - ; ( "subject-displayname" - , if subject_gecos = "" || subject_gecos = "" then - subject_name - else - subject_gecos - ) - ; (*("subject-homedir", get_value "Home dir" infolist);*) - (*("subject-shell", get_value "Shell" infolist);*) - ("subject-is-group", "false") - ; ( "subject-account-disabled" - , get_value "Account disabled (or locked)" infolist - ) - ; ("subject-account-expired", get_value "Account Expired" infolist) - ; ( "subject-account-locked" - , get_value "Account disabled (or locked)" infolist - ) - ; ("subject-password-expired", get_value "Password Expired" infolist) - ] - - (* (string list) query_group_membership(string subject_identifier) - - Takes a subject_identifier and returns its group membership (i.e. a list of subject - identifiers of the groups that the subject passed in belongs to). The set of groups returned - _must_ be transitively closed wrt the is_member_of relation if the external directory service - supports nested groups (as AD does for example) - *) - let query_group_membership ~__context subject_identifier = - let@ __context = Context.with_tracing ~__context __FUNCTION__ in - - let subject_info = - query_subject_information ~__context subject_identifier - in - if - List.assoc "subject-is-group" subject_info = "true" - (* this field is always present *) - then - (* subject is a group, so get_group_sids_byname will not work because pbis's list-groups *) - (* doesnt work if a group name is given as input *) - (* FIXME: default action for groups until workaround is found: return an empty list of membership groups *) - [] - else - (* subject is a user, list-groups and therefore get_group_sids_byname work fine *) - let subject_name = List.assoc "subject-name" subject_info in - (* CA-27744: always use NT-style names *) - let subject_sid_membership_list = - pbis_get_group_sids_byname subject_name - in - debug "Resolved %i group sids for subject %s (%s): %s" - (List.length subject_sid_membership_list) - subject_name subject_identifier - (List.fold_left - (fun p pp -> - if p = "" then - pp - else - p ^ "," ^ pp - ) - "" subject_sid_membership_list - ) ; - subject_sid_membership_list - - (* - In addition, there are some event hooks that auth modules implement as follows: -*) - - let _is_pbis_server_available ~__context max_tries = - (* we _need_ to use a username contained in our domain, otherwise the following tests won't work. - Microsoft KB/Q243330 article provides the KRBTGT account as a well-known built-in SID in AD - Microsoft KB/Q229909 article says that KRBTGT account cannot be renamed or enabled, making - it the perfect target for such a test using a username (Administrator account can be renamed) *) - let krbtgt = "KRBTGT" in - let try_clear_cache () = - (* the primary purpose of this function is to clear the cache so that - [ try_fetch_sid ] is forced to perform an end to end query to the - AD server. as such, we don't care if krbtgt was not originally in - the cache *) - match get_full_subject_name krbtgt with - | exception _ -> - info - "_is_pbis_server_available: failed to get full subject name for %s" - krbtgt ; - Error () - | full_username -> ( - match - ignore - (pbis_common "/opt/pbis/bin/ad-cache" - ["--delete-user"; "--name"; full_username] - ) - with - | () | (exception Not_found) -> - Ok () - | exception e -> - debug "Failed to remove user %s from cache: %s" full_username - (ExnHelper.string_of_exn e) ; - Error () - ) - in - let try_fetch_sid () = - try - let sid = get_subject_identifier ~__context krbtgt in - debug - "Request to external authentication server successful: user %s was \ - found" - krbtgt ; - let (_ : (string * string) list) = - query_subject_information ~__context sid - in - debug - "Request to external authentication server successful: sid %s was \ - found" - sid ; - Ok () - with - | Not_found -> - (* that means that pbis is responding to at least cached subject queries. - in this case, KRBTGT wasn't found in the AD domain. this usually indicates that the - AD domain is offline/inaccessible to pbis, which will cause problems, specially - to the ssh python hook-script, so we need to try again until KRBTGT is found, indicating - that the domain is online and accessible to pbis queries *) - debug - "Request to external authentication server returned KRBTGT \ - Not_found" ; - Error () - | e -> - debug - "Request to external authentication server failed for reason: %s" - (ExnHelper.string_of_exn e) ; - Error () - in - let rec go i = - if i > max_tries then ( - info - "Testing external authentication server failed after %i tries, \ - giving up!" - max_tries ; - false - ) else ( - debug - "Testing if external authentication server is accepting requests... \ - attempt %i of %i" - i max_tries ; - let ( >>= ) = Rresult.( >>= ) in - (* if we don't remove krbtgt from the cache before - query subject information about krbtgt, then - [ try_fetch_sid ] would erroneously return success - in the case that PBIS is running locally, but the - AD domain is offline *) - match try_clear_cache () >>= try_fetch_sid with - | Error () -> - Thread.delay 5.0 ; - (go [@tailcall]) (i + 1) - | Ok () -> - true - ) - in - go 0 - - let is_pbis_server_available ~__context max = - Locking_helpers.Named_mutex.execute mutex_check_availability (fun () -> - _is_pbis_server_available ~__context max - ) - - (* converts from domain.com\user to user@domain.com, in case domain.com is present in the subject_name *) - let convert_nt_to_upn_username subject_name = - try - (* test if the NT account name separator \ is present in subject name *) - let i = String.index subject_name '\\' in - (* we only reach this point if the separator \ is present in subject_name *) - (* when \ is present, we need to convert the NT name to UPN format *) - let domain = String.sub subject_name 0 i in - let user = - String.sub subject_name (i + 1) (String.length subject_name - i - 1) - in - user ^ "@" ^ domain - with Not_found -> - (* if no NT username separator \ was found *) - (* nothing to do in this case *) - subject_name - - (* unit on_enable(((string*string) list) config_params) - - Called internally by xapi _on each host_ when a client enables an external auth service for the - pool via the XenAPI [see AD integration wiki page]. The config_params here are the ones passed - by the client as part of the corresponding XenAPI call. - On receiving this hook, the auth module should: - (i) do whatever it needs to do (if anything) to register with the external auth/directory - service [using the config params supplied to get access] - (ii) Write the config_params that it needs to store persistently in the XenServer metadata - into the Pool.external_auth_configuration field. [Note - the rationale for making the plugin - write the config params it needs long-term into the XenServer metadata itself is so it can - explicitly filter any one-time credentials [like AD username/password for example] that it - does not need long-term.] - *) - let on_enable ~__context config_params = - let@ __context = Context.with_tracing ~__context __FUNCTION__ in - (* but in the ldap plugin, we should 'join the AD/kerberos domain', i.e. we should*) - (* basically: (1) create a machine account in the kerberos realm,*) - (* (2) store the machine account password somewhere locally (in a keytab) *) - start_damon () ; - if - not - (List.mem_assoc "user" config_params - && List.mem_assoc "pass" config_params - ) - then - raise - (Auth_signature.Auth_service_error - ( Auth_signature.E_GENERIC - , "enable requires two config params: user and pass." - ) - ) - else (* we have all the required parameters *) - let hostname = - Server_helpers.exec_with_new_task "retrieving hostname" - (fun __context -> - let host = Helpers.get_localhost ~__context in - Db.Host.get_hostname ~__context ~self:host - ) - in - if - String.fold_left (fun b ch -> b && ch >= '0' && ch <= '9') true hostname - then - raise - (Auth_signature.Auth_service_error - ( Auth_signature.E_GENERIC - , Printf.sprintf "hostname '%s' cannot contain only digits." - hostname - ) - ) - else - let domain = - let service_name = - Server_helpers.exec_with_new_task - "retrieving external_auth_service_name" (fun __context -> - let host = Helpers.get_localhost ~__context in - Db.Host.get_external_auth_service_name ~__context ~self:host - ) - in - (* legacy test: do we have domain name in config? - then config:domain must match service-name *) - if List.mem_assoc "domain" config_params then - let _domain = List.assoc "domain" config_params in - if service_name <> _domain then - raise - (Auth_signature.Auth_service_error - ( Auth_signature.E_GENERIC - , "if present, config:domain must match service-name." - ) - ) - else - service_name - else - (* if no config:domain provided, we simply use the string in service_name for the domain name *) - service_name - in - let _user = List.assoc "user" config_params in - let pass = List.assoc "pass" config_params in - let ou_conf, ou_params = - if List.mem_assoc "ou" config_params then - let ou = List.assoc "ou" config_params in - ([("ou", ou)], ["--ou"; ou]) - else - ([], []) - in - (* Adding the config parameter "config:disable_modules=X,Y,Z" - * will disable the modules X, Y and Z in domainjoin-cli. *) - let disabled_modules = - try - match List.assoc "disable_modules" config_params with - | "" -> - [] - | disabled_modules_string -> - String.split_f (fun c -> c = ',') disabled_modules_string - with Not_found -> [] - in - let disabled_module_params = - List.concat_map - (fun disabled_module -> ["--disable"; disabled_module]) - disabled_modules - in - (* we need to make sure that the user passed to domaijoin-cli command is in the UPN syntax (user@domain.com) *) - let user = convert_nt_to_upn_username _user in - (* execute the pbis domain join cmd *) - try - let (_ : (string * string) list) = - [ - ["join"] - ; ou_params - ; disabled_module_params - ; ["--ignore-pam"; "--notimesync"; domain; user] - ] - |> List.concat - |> pbis_common_with_password pass !Xapi_globs.domain_join_cli_cmd - in - let max_tries = 60 in - (* tests 60 x 5.0 seconds = 300 seconds = 5minutes trying *) - if not (is_pbis_server_available ~__context max_tries) then ( - let errmsg = - Printf.sprintf - "External authentication server not available after %i query \ - tests" - max_tries - in - debug "%s" errmsg ; - raise - (Auth_signature.Auth_service_error - (Auth_signature.E_UNAVAILABLE, errmsg) - ) - ) ; - (* OK SUCCESS, pbis has joined the AD domain successfully *) - (* write persistently the relevant config_params in the host.external_auth_configuration field *) - (* we should not store the user's (admin's) password !! *) - let extauthconf = [("domain", domain); ("user", user)] @ ou_conf in - Server_helpers.exec_with_new_task - "storing external_auth_configuration" (fun __context -> - let host = Helpers.get_localhost ~__context in - Db.Host.set_external_auth_configuration ~__context ~self:host - ~value:extauthconf ; - debug "added external_auth_configuration for host %s" - (Db.Host.get_name_label ~__context ~self:host) - ) ; - with_lock cache_of_pbis_common_m (fun _ -> cache_of_pbis_common := []) ; - ensure_pbis_configured () - with e -> - (*ERROR, we didn't join the AD domain*) - debug - "Error enabling external authentication for domain %s and user %s: \ - %s" - domain user - (ExnHelper.string_of_exn e) ; - raise e - - (* unit on_disable() - - Called internally by xapi _on each host_ when a client disables an auth service via the XenAPI. - The hook will be called _before_ the Pool configuration fields relating to the external-auth - service are cleared (i.e. so you can access the config params you need from the pool metadata - within the body of the on_disable method) - *) - let on_disable ~__context config_params = - let@ __context = Context.with_tracing ~__context __FUNCTION__ in - (* but in the ldap plugin, we should 'leave the AD/kerberos domain', i.e. we should *) - (* (1) remove the machine account from the kerberos realm, (2) remove the keytab locally *) - let pbis_failure = - try - ( if - not - (List.mem_assoc "user" config_params - && List.mem_assoc "pass" config_params - ) - then - (* no windows admin+pass have been provided: leave the pbis host in the AD database *) - (* execute the pbis domain-leave cmd *) - (* this function will raise an exception if something goes wrong *) - let (_ : (string * string) list) = - pbis_common !Xapi_globs.domain_join_cli_cmd ["leave"] - in - () - else - (* windows admin+pass have been provided: ask pbis to remove host from AD database *) - let _user = List.assoc "user" config_params in - let pass = List.assoc "pass" config_params in - (* we need to make sure that the user passed to domaijoin-cli command is in the UPN syntax (user@domain.com) *) - let user = - convert_nt_to_upn_username - (get_full_subject_name ~use_nt_format:false _user) - in - (* execute the pbis domain-leave cmd *) - (* this function will raise an exception if something goes wrong *) - let (_ : (string * string) list) = - pbis_common_with_password pass - !Xapi_globs.domain_join_cli_cmd - ["leave"; user] - in - () - ) ; - None (* no failure observed in pbis *) - with e -> - (* unexpected error disabling pbis *) - debug "Internal Pbis error when disabling external authentication: %s" - (ExnHelper.string_of_exn e) ; - (* CA-27627: we should NOT re-raise the exception e here, because otherwise we might get stuck, *) - (* without being able to disable an external authentication configuration, since the Pbis *) - (* behavior is outside our control. For instance, Pbis raises an exception during domain-leave *) - (* when the domain controller is offline, so it would be impossible to leave a domain that *) - (* has already been deleted. *) - (* Not re-raising an exception here is not too bad, because both ssh and xapi access to the AD/Pbis *) - (* commands will be disabled anyway by host.disable_external_auth. So, even though access to the external *) - (* authentication service might still be possible from Dom0 shell, it will not be possible *) - (* to login as an external user via ssh or to call external-authentication services via xapi/xe. *) - Some e - (* CA-28942: stores exception returned by pbis for later *) - in - (* We always do a manual clean-up of pbis, in order to restore Dom0 to its pre-pbis state *) - (* It doesn't matter if pbis succeeded or not *) - (* This disables Pbis even from Dom0 shell *) - debug "Doing a manual Pbis domain-leave cleanup..." ; - (* When pbis raises an exception during domain-leave, we try again, using *) - (* some of the command-line workarounds that Kyle describes in CA-27627: *) - let pbis_force_domain_leave_script = - "/opt/xensource/libexec/pbis-force-domain-leave" - in - ( try - let output, stderr = - Forkhelpers.execute_command_get_output pbis_force_domain_leave_script - [] - in - debug "execute %s: stdout=[%s],stderr=[%s]" - pbis_force_domain_leave_script - (String.replace "\n" ";" output) - (String.replace "\n" ";" stderr) - with e -> - debug "exception executing %s: %s" pbis_force_domain_leave_script - (ExnHelper.string_of_exn e) - ) ; - (* OK SUCCESS, pbis has left the AD domain successfully *) - (* remove persistently the relevant config_params in the host.external_auth_configuration field *) - Server_helpers.exec_with_new_task "removing external_auth_configuration" - (fun __context -> - let host = Helpers.get_localhost ~__context in - Db.Host.set_external_auth_configuration ~__context ~self:host ~value:[] ; - debug "removed external_auth_configuration for host %s" - (Db.Host.get_name_label ~__context ~self:host) - ) ; - match pbis_failure with - | None -> - () (* OK, return unit*) - | Some e -> - raise e - - (* bubble up pbis failure *) - - (* unit on_xapi_initialize(bool system_boot) - - Called internally by xapi whenever it starts up. The system_boot flag is true iff xapi is - starting for the first time after a host boot - *) - let on_xapi_initialize ~__context _system_boot = - let@ __context = Context.with_tracing ~__context __FUNCTION__ in - - (* the AD server is initialized outside xapi, by init.d scripts *) - - (* this function is called during xapi initialization in xapi.ml *) - - (* make sure that the AD/LSASS server is responding before returning *) - let max_tries = 12 in - (* tests 12 x 5.0 seconds = 60 seconds = up to 1 minute trying *) - if not (is_pbis_server_available ~__context max_tries) then ( - let errmsg = - Printf.sprintf - "External authentication server not available after %i query tests" - max_tries - in - debug "%s" errmsg ; - raise - (Auth_signature.Auth_service_error (Auth_signature.E_GENERIC, errmsg)) - ) ; - () - - (* unit on_xapi_exit() - - Called internally when xapi is doing a clean exit. - *) - let on_xapi_exit ~__context:_ () = - (* nothing to do here in this unix plugin *) - - (* in the ldap plugin, we should remove the tgt ticket in /tmp/krb5cc_0 *) - () - - (* Implement the single value required for the module signature *) - let methods = - Auth_signature. - { - authenticate_username_password - ; authenticate_ticket - ; get_subject_identifier - ; query_subject_information - ; query_group_membership - ; on_enable - ; on_disable - ; on_xapi_initialize - ; on_xapi_exit - } -end diff --git a/quality-gate.sh b/quality-gate.sh index c82a98ea57f..1c8da63f92d 100755 --- a/quality-gate.sh +++ b/quality-gate.sh @@ -3,7 +3,7 @@ set -e list-hd () { - N=244 + N=239 LIST_HD=$(git grep -r --count 'List.hd' -- **/*.ml | cut -d ':' -f 2 | paste -sd+ - | bc) if [ "$LIST_HD" -eq "$N" ]; then echo "OK counted $LIST_HD List.hd usages" @@ -25,7 +25,7 @@ verify-cert () { } mli-files () { - N=457 + N=456 X="ocaml/tests" X+="|ocaml/quicktest" X+="|ocaml/message-switch/core_test" From 57eb90335cc5a49cb2cd20dae61fa35838a79092 Mon Sep 17 00:00:00 2001 From: Pau Ruiz Safont Date: Mon, 12 Jan 2026 16:45:52 +0000 Subject: [PATCH 37/65] xapi_sm: remove nested call to serialize function This means that the complex logic to avoid nested calls to executing a mutex can be deleted, rendering the code easer to read. Signed-off-by: Pau Ruiz Safont --- ocaml/xapi/xapi_sm.ml | 80 +++++++++++++++++++------------------------ 1 file changed, 36 insertions(+), 44 deletions(-) diff --git a/ocaml/xapi/xapi_sm.ml b/ocaml/xapi/xapi_sm.ml index 19257cf8929..4e83673ab4b 100644 --- a/ocaml/xapi/xapi_sm.ml +++ b/ocaml/xapi/xapi_sm.ml @@ -148,51 +148,43 @@ let update_from_query_result ~__context (self, r) q_result = let is_v1 x = version_of_string x < [2; 0] -let _serialize_reg = +let with_lock = let lock = Mutex.create () in - let holder = ref None in - fun f -> - match !holder with - | Some t when t = Thread.self () -> - (* inside a nested layer where the lock is held by myself *) - f () - | _ -> - Xapi_stdext_threads.Threadext.Mutex.execute lock (fun () -> - holder := Some (Thread.self ()) ; - finally f (fun () -> holder := None) - ) - -let unregister_plugin ~__context q_result = - _serialize_reg (fun () -> - let open Storage_interface in - let driver = String.lowercase_ascii q_result.driver in - if is_v1 q_result.required_api_version then - info "Not unregistering SM plugin %s (required_api_version %s < 2.0)" - driver q_result.required_api_version - else - List.iter - (fun (rf, rc) -> - if rc.API.sM_type = driver then - try - info "Unregistering SM plugin %s (version %s)" driver - q_result.version ; - Db.SM.destroy ~__context ~self:rf - with e -> - warn "Ignore unregistering SM plugin failure: %s" - (Printexc.to_string e) - ) - (Db.SM.get_all_records ~__context) - ) + Xapi_stdext_threads.Threadext.Mutex.execute lock -let register_plugin ~__context q_result = - _serialize_reg (fun () -> - let open Storage_interface in - let driver = String.lowercase_ascii q_result.driver in - if is_v1 q_result.required_api_version then - info "Not registering SM plugin %s (required_api_version %s < 2.0)" - driver q_result.required_api_version - else ( - unregister_plugin ~__context q_result ; - create_from_query_result ~__context q_result +let _unregister_plugin ~__context q_result () = + let open Storage_interface in + let driver = String.lowercase_ascii q_result.driver in + if is_v1 q_result.required_api_version then + info "Not unregistering SM plugin %s (required_api_version %s < 2.0)" driver + q_result.required_api_version + else + List.iter + (fun (rf, rc) -> + if rc.API.sM_type = driver then + try + info "Unregistering SM plugin %s (version %s)" driver + q_result.version ; + Db.SM.destroy ~__context ~self:rf + with e -> + warn "Ignore unregistering SM plugin failure: %s" + (Printexc.to_string e) ) + (Db.SM.get_all_records ~__context) + +let _register_plugin ~__context q_result () = + let open Storage_interface in + let driver = String.lowercase_ascii q_result.driver in + if is_v1 q_result.required_api_version then + info "Not registering SM plugin %s (required_api_version %s < 2.0)" driver + q_result.required_api_version + else ( + _unregister_plugin ~__context q_result () ; + create_from_query_result ~__context q_result ) + +let unregister_plugin ~__context q_result () = + with_lock (_unregister_plugin ~__context q_result) + +let register_plugin ~__context q_result = + with_lock (_register_plugin ~__context q_result) From fa7b672301c6da757e2e17f35bd57ae8867bc6fb Mon Sep 17 00:00:00 2001 From: Pau Ruiz Safont Date: Tue, 13 Jan 2026 15:01:19 +0000 Subject: [PATCH 38/65] xapi_sm: add interface Signed-off-by: Pau Ruiz Safont --- ocaml/xapi/xapi_sm.ml | 4 ---- ocaml/xapi/xapi_sm.mli | 27 +++++++++++++++++++++++++++ quality-gate.sh | 2 +- 3 files changed, 28 insertions(+), 5 deletions(-) create mode 100644 ocaml/xapi/xapi_sm.mli diff --git a/ocaml/xapi/xapi_sm.ml b/ocaml/xapi/xapi_sm.ml index 4e83673ab4b..2672bda1079 100644 --- a/ocaml/xapi/xapi_sm.ml +++ b/ocaml/xapi/xapi_sm.ml @@ -20,12 +20,8 @@ module Listext = Xapi_stdext_std.Listext -let finally = Xapi_stdext_pervasives.Pervasiveext.finally - (* We treat versions as '.'-separated integer lists under the usual lexicographic ordering. *) -type version = int list - let version_of_string s = List.map int_of_string (String.split_on_char '.' s) module D = Debug.Make (struct let name = "xapi_sm" end) diff --git a/ocaml/xapi/xapi_sm.mli b/ocaml/xapi/xapi_sm.mli new file mode 100644 index 00000000000..f32847097e0 --- /dev/null +++ b/ocaml/xapi/xapi_sm.mli @@ -0,0 +1,27 @@ +(* Copyright (C) 2026 Vates. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; version 2.1 only. with the special + exception on linking described in file LICENSE. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. +*) + +val create_from_query_result : + __context:Context.t -> Storage_interface.query_result -> unit + +val update_from_query_result : + __context:Context.t + -> [`SM] API.Ref.t * API.sM_t + -> Storage_interface.query_result + -> unit + +val unregister_plugin : + __context:Context.t -> Storage_interface.query_result -> unit -> unit + +val register_plugin : + __context:Context.t -> Storage_interface.query_result -> unit diff --git a/quality-gate.sh b/quality-gate.sh index 1c8da63f92d..8778b15a03f 100755 --- a/quality-gate.sh +++ b/quality-gate.sh @@ -25,7 +25,7 @@ verify-cert () { } mli-files () { - N=456 + N=455 X="ocaml/tests" X+="|ocaml/quicktest" X+="|ocaml/message-switch/core_test" From a88ce1bd94692f4a14a1de481e1e856f3b9d38d5 Mon Sep 17 00:00:00 2001 From: Pau Ruiz Safont Date: Wed, 14 Jan 2026 10:02:23 +0000 Subject: [PATCH 39/65] listext: add better description to set_difference Signed-off-by: Pau Ruiz Safont --- ocaml/libs/xapi-stdext/lib/xapi-stdext-std/listext.mli | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocaml/libs/xapi-stdext/lib/xapi-stdext-std/listext.mli b/ocaml/libs/xapi-stdext/lib/xapi-stdext-std/listext.mli index ece679d5423..b58b8d84016 100644 --- a/ocaml/libs/xapi-stdext/lib/xapi-stdext-std/listext.mli +++ b/ocaml/libs/xapi-stdext/lib/xapi-stdext-std/listext.mli @@ -154,7 +154,8 @@ module List : sig val set_equiv : 'a list -> 'a list -> bool val set_difference : 'a list -> 'a list -> 'a list - (** Returns the set difference of two lists *) + (** [set_difference a b] returns the list with the elements in [a] that are + not present in [b] *) val intersect : 'a list -> 'a list -> 'a list (** Returns the intersection of two lists. *) From 8bcd6b55e68c1594ad25e5e319af6c8ea73644f1 Mon Sep 17 00:00:00 2001 From: Pau Ruiz Safont Date: Wed, 14 Jan 2026 10:18:25 +0000 Subject: [PATCH 40/65] storage_access: split the code off that queries message switch for SMAPIv2 plugins This helps to visualize the logic of on_xapi_start better with regards to the lists being handled in on_xapi_start Signed-off-by: Pau Ruiz Safont --- ocaml/xapi/storage_access.ml | 97 +++++++++++++++++++----------------- 1 file changed, 50 insertions(+), 47 deletions(-) diff --git a/ocaml/xapi/storage_access.ml b/ocaml/xapi/storage_access.ml index cd9b0aee515..0d4a1cc77cd 100644 --- a/ocaml/xapi/storage_access.ml +++ b/ocaml/xapi/storage_access.ml @@ -21,7 +21,8 @@ let finally = Xapi_stdext_pervasives.Pervasiveext.finally module XenAPI = Client.Client open Storage_interface -open Storage_utils + +let transform_storage_exn = Storage_utils.transform_storage_exn module D = Debug.Make (struct let name = "storage_access" end) @@ -103,8 +104,50 @@ let external_rpc queue_name uri = (* Internal exception, never escapes the module *) exception Message_switch_failure -(* We have to be careful in this function, because an exception raised from - here will cause the startup sequence to fail *) +(* We have to be careful in the call tree of on_xapi_start, because an + exception raised in it will cause the startup sequence to fail *) + +let get_smapiv2_drivers_from_switch () = + let module Client = Message_switch_unix.Protocol_unix.Client in + try + let ( >>| ) result f = + match Client.error_to_msg result with + | Error (`Msg x) -> + error "%s: Error %s while querying message switch queues" __FUNCTION__ + x ; + raise Message_switch_failure + | Ok x -> + f x + in + Client.connect ~switch:!Xcp_client.switch_path () >>| fun t -> + Client.list ~t ~prefix:!Storage_interface.queue_name ~filter:`Alive () + >>| fun running_smapiv2_driver_queues -> + running_smapiv2_driver_queues + (* The results include the prefix itself, but that is the main storage + queue, we don't need it *) + |> List.filter (( <> ) !Storage_interface.queue_name) + |> Listext.List.try_map (fun driver -> + (* Get the last component of the queue name: + org.xen.xapi.storage.sr_type -> sr_type *) + driver + |> String.split_on_char '.' + |> Listext.List.last + |> Option.to_result ~none:(Invalid_argument driver) + ) + |> function + | Ok drivers -> + drivers + | Error exn -> + raise exn + with + | Message_switch_failure -> + [] (* no more logging *) + | e -> + Backtrace.is_important e ; + error "Unexpected error querying the message switch: %s" + (Printexc.to_string e) ; + Debug.log_backtrace e (Backtrace.get e) ; + [] (** Synchronise the SM table with the SMAPIv1 plugins on the disk and the SMAPIv2 plugins mentioned in the configuration file whitelist. *) @@ -130,52 +173,12 @@ let on_xapi_start ~__context = List.map (fun (_, rc) -> rc.API.sR_type) (Db.SR.get_all_records ~__context) in let to_keep = configured_drivers @ in_use_drivers in - (* The SMAPIv2 drivers we know about *) - let smapiv2_drivers = Listext.List.set_difference to_keep smapiv1_drivers in (* Query the message switch to detect running SMAPIv2 plugins. *) let running_smapiv2_drivers = - if !Xcp_client.use_switch then ( - try - let open Message_switch_unix.Protocol_unix in - let ( >>| ) result f = - match Client.error_to_msg result with - | Error (`Msg x) -> - error "Error %s while querying message switch queues" x ; - raise Message_switch_failure - | Ok x -> - f x - in - Client.connect ~switch:!Xcp_client.switch_path () >>| fun t -> - Client.list ~t ~prefix:!Storage_interface.queue_name ~filter:`Alive () - >>| fun running_smapiv2_driver_queues -> - running_smapiv2_driver_queues - (* The results include the prefix itself, but that is the main storage - queue, we don't need it *) - |> List.filter (( <> ) !Storage_interface.queue_name) - |> Listext.List.try_map (fun driver -> - (* Get the last component of the queue name: - org.xen.xapi.storage.sr_type -> sr_type *) - driver - |> String.split_on_char '.' - |> Listext.List.last - |> Option.to_result ~none:(Invalid_argument driver) - ) - |> function - | Ok drivers -> - drivers - | Error exn -> - raise exn - with - | Message_switch_failure -> - [] (* no more logging *) - | e -> - Backtrace.is_important e ; - error "Unexpected error querying the message switch: %s" - (Printexc.to_string e) ; - Debug.log_backtrace e (Backtrace.get e) ; - [] - ) else - smapiv2_drivers + if !Xcp_client.use_switch then + get_smapiv2_drivers_from_switch () + else (* The SMAPIv2 drivers we know about *) + Listext.List.set_difference to_keep smapiv1_drivers in (* Add all the running SMAPIv2 drivers *) let to_keep = to_keep @ running_smapiv2_drivers in From e17607c8156a03a1ce99bbde48af4e3bc2f473a2 Mon Sep 17 00:00:00 2001 From: Pau Ruiz Safont Date: Wed, 14 Jan 2026 10:20:54 +0000 Subject: [PATCH 41/65] storage_access: share code for unregistering plugins This helps seeing the structure of on_xapi_start when reading the code Signed-off-by: Pau Ruiz Safont --- ocaml/xapi/storage_access.ml | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/ocaml/xapi/storage_access.ml b/ocaml/xapi/storage_access.ml index 0d4a1cc77cd..1cb020a62ce 100644 --- a/ocaml/xapi/storage_access.ml +++ b/ocaml/xapi/storage_access.ml @@ -149,6 +149,11 @@ let get_smapiv2_drivers_from_switch () = Debug.log_backtrace e (Backtrace.get e) ; [] +let log_and_unregister ~__context ~reason __FUN (self, rc) = + info "%s: unregistering SM plugin %s (%s) since %s" __FUN rc.API.sM_name_label + rc.API.sM_uuid reason ; + try Db.SM.destroy ~__context ~self with _ -> () + (** Synchronise the SM table with the SMAPIv1 plugins on the disk and the SMAPIv2 plugins mentioned in the configuration file whitelist. *) let on_xapi_start ~__context = @@ -182,26 +187,22 @@ let on_xapi_start ~__context = in (* Add all the running SMAPIv2 drivers *) let to_keep = to_keep @ running_smapiv2_drivers in + let unused = Listext.List.set_difference (List.map fst existing) to_keep in let unavailable = List.filter (fun (_, (_, rc)) -> not (is_available rc)) existing in (* Delete all records which aren't configured or in-use *) - List.iter - (fun ty -> - info - "Unregistering SM plugin %s since not in the whitelist and not in-use" - ty ; - let self, _ = List.assoc ty existing in - try Db.SM.destroy ~__context ~self with _ -> () - ) - (Listext.List.set_difference (List.map fst existing) to_keep) ; - List.iter - (fun (name, (self, rc)) -> - info "%s: unregistering SM plugin %s (%s) since it is unavailable" - __FUNCTION__ name rc.API.sM_uuid ; - try Db.SM.destroy ~__context ~self with _ -> () - ) - unavailable ; + let unregister_unused ty = + let sm = List.assoc ty existing in + let reason = "it's not in the allowed list and not in-use" in + log_and_unregister ~__context ~reason __FUNCTION__ sm + in + let unregister_unavailable (_, sm) = + let reason = "it's unavailable" in + log_and_unregister ~__context ~reason __FUNCTION__ sm + in + List.iter unregister_unused unused ; + List.iter unregister_unavailable unavailable ; (* Synchronize SMAPIv1 plugins *) From 183b64826d5a10612845e3cdfbbc230220c5f829 Mon Sep 17 00:00:00 2001 From: Pau Ruiz Safont Date: Wed, 14 Jan 2026 16:21:25 +0000 Subject: [PATCH 42/65] storage_access: avoid the addition of multiple SM with the same type When manipulating SMs, it's important to avoid more than one plugin per type of SM. Use sets for handling collections of types of SMs to avoid duplicated registrations at the end of the processing done for SMs, which happens on xapi statup. Also make sure that, when converting from SM type to SMs, all SMs for each type are handled, both for updating and registering. Previously some set-like operations where being used, but no explicit deduplication was being done. Signed-off-by: Pau Ruiz Safont --- ocaml/xapi/storage_access.ml | 69 +++++++++++++++++++++++------------- 1 file changed, 44 insertions(+), 25 deletions(-) diff --git a/ocaml/xapi/storage_access.ml b/ocaml/xapi/storage_access.ml index 1cb020a62ce..e047bcc2de5 100644 --- a/ocaml/xapi/storage_access.ml +++ b/ocaml/xapi/storage_access.ml @@ -154,6 +154,18 @@ let log_and_unregister ~__context ~reason __FUN (self, rc) = rc.API.sM_uuid reason ; try Db.SM.destroy ~__context ~self with _ -> () +module StringSet = Set.Make (String) + +let list_assoc_all a = + List.filter_map (fun (k, v) -> + if String.equal k a then + Some v + else + None + ) + +let ( let@ ) f x = f x + (** Synchronise the SM table with the SMAPIv1 plugins on the disk and the SMAPIv2 plugins mentioned in the configuration file whitelist. *) let on_xapi_start ~__context = @@ -168,63 +180,69 @@ let on_xapi_start ~__context = |> List.map (fun (rf, rc) -> (rc.API.sM_type, (rf, rc))) in let explicitly_configured_drivers = - List.filter_map - (function `Sm x -> Some x | _ -> None) - !Xapi_globs.sm_plugins + !Xapi_globs.sm_plugins + |> List.filter_map (function `Sm x -> Some x | _ -> None) + |> StringSet.of_list + in + let smapiv1_drivers = Sm.supported_drivers () |> StringSet.of_list in + let configured_drivers = + StringSet.union explicitly_configured_drivers smapiv1_drivers in - let smapiv1_drivers = Sm.supported_drivers () in - let configured_drivers = explicitly_configured_drivers @ smapiv1_drivers in let in_use_drivers = List.map (fun (_, rc) -> rc.API.sR_type) (Db.SR.get_all_records ~__context) + |> StringSet.of_list in - let to_keep = configured_drivers @ in_use_drivers in + let to_keep = StringSet.union configured_drivers in_use_drivers in (* Query the message switch to detect running SMAPIv2 plugins. *) let running_smapiv2_drivers = if !Xcp_client.use_switch then - get_smapiv2_drivers_from_switch () + get_smapiv2_drivers_from_switch () |> StringSet.of_list else (* The SMAPIv2 drivers we know about *) - Listext.List.set_difference to_keep smapiv1_drivers + StringSet.diff to_keep smapiv1_drivers in (* Add all the running SMAPIv2 drivers *) - let to_keep = to_keep @ running_smapiv2_drivers in - let unused = Listext.List.set_difference (List.map fst existing) to_keep in + let to_keep = StringSet.union to_keep running_smapiv2_drivers in + let existing_types = List.map fst existing |> StringSet.of_list in + let unused = StringSet.diff existing_types to_keep in let unavailable = List.filter (fun (_, (_, rc)) -> not (is_available rc)) existing in (* Delete all records which aren't configured or in-use *) let unregister_unused ty = - let sm = List.assoc ty existing in + let sms = list_assoc_all ty existing in let reason = "it's not in the allowed list and not in-use" in - log_and_unregister ~__context ~reason __FUNCTION__ sm + List.iter (log_and_unregister ~__context ~reason __FUNCTION__) sms in let unregister_unavailable (_, sm) = let reason = "it's unavailable" in log_and_unregister ~__context ~reason __FUNCTION__ sm in - List.iter unregister_unused unused ; + StringSet.iter unregister_unused unused ; List.iter unregister_unavailable unavailable ; (* Synchronize SMAPIv1 plugins *) (* Create all missing SMAPIv1 plugins *) - List.iter + StringSet.iter (fun ty -> let query_result = Sm.info_of_driver ty |> Smint.query_result_of_sr_driver_info in Xapi_sm.create_from_query_result ~__context query_result ) - (Listext.List.set_difference smapiv1_drivers (List.map fst existing)) ; + (StringSet.diff smapiv1_drivers existing_types) ; (* Update all existing SMAPIv1 plugins *) - List.iter + StringSet.iter (fun ty -> let query_result = Sm.info_of_driver ty |> Smint.query_result_of_sr_driver_info in - Xapi_sm.update_from_query_result ~__context (List.assoc ty existing) - query_result + list_assoc_all ty existing + |> List.iter (fun sm -> + Xapi_sm.update_from_query_result ~__context sm query_result + ) ) - (Listext.List.intersect smapiv1_drivers (List.map fst existing)) ; + (StringSet.inter smapiv1_drivers existing_types) ; (* Synchronize SMAPIv2 plugins *) @@ -245,18 +263,19 @@ let on_xapi_start ~__context = f query_result ) in - List.iter + StringSet.iter (fun ty -> with_query_result ty (Xapi_sm.create_from_query_result ~__context) ) - (Listext.List.set_difference running_smapiv2_drivers (List.map fst existing)) ; + (StringSet.diff running_smapiv2_drivers existing_types) ; (* Update all existing SMAPIv2 plugins *) - List.iter + StringSet.iter (fun ty -> - with_query_result ty - (Xapi_sm.update_from_query_result ~__context (List.assoc ty existing)) + let@ qr = with_query_result ty in + list_assoc_all ty existing + |> List.iter (fun sm -> Xapi_sm.update_from_query_result ~__context sm qr) ) - (Listext.List.intersect running_smapiv2_drivers (List.map fst existing)) + (StringSet.inter running_smapiv2_drivers existing_types) let bind ~__context ~pbd = let dbg = Context.string_of_task __context in From eb18ff9d11e64979aefdfc74107dc28ad05cb6df Mon Sep 17 00:00:00 2001 From: Pau Ruiz Safont Date: Tue, 20 Jan 2026 15:10:36 +0000 Subject: [PATCH 43/65] xapi_sm: Don't allow host_pending_features that are empty When none on of the hosts have pending features, a map was produced with all the hosts with an empty value for each after upgrading. Empty the map instead when that occurs. Signed-off-by: Pau Ruiz Safont --- ocaml/xapi/xapi_sm.ml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ocaml/xapi/xapi_sm.ml b/ocaml/xapi/xapi_sm.ml index 2672bda1079..dd8dc754858 100644 --- a/ocaml/xapi/xapi_sm.ml +++ b/ocaml/xapi/xapi_sm.ml @@ -95,6 +95,12 @@ let remove_valid_features_from_pending ~__context ~self valid_features = (h, Listext.List.set_difference pending_features valid_features) ) in + let new_pending_feature = + if List.for_all (fun (_, v) -> v = []) new_pending_feature then + [] + else + new_pending_feature + in Db.SM.set_host_pending_features ~__context ~self ~value:new_pending_feature let update_from_query_result ~__context (self, r) q_result = From bbd42f28e925363b81bbc603f0901c5053e74e53 Mon Sep 17 00:00:00 2001 From: Pau Ruiz Safont Date: Wed, 28 Jan 2026 15:06:02 +0000 Subject: [PATCH 44/65] storage_access: log when there are SM duplicates on startup This should help document the cases where the issue persists, even after the tweaks done to the function. Only some fields are kept because of their relevance to the issue, and to help identify them. The rest of the fields are less dynamic or hold much less significance. For example, capabilities is an unversioned field of features. Signed-off-by: Pau Ruiz Safont --- ocaml/xapi/storage_access.ml | 46 +++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/ocaml/xapi/storage_access.ml b/ocaml/xapi/storage_access.ml index e047bcc2de5..87e0b266e0f 100644 --- a/ocaml/xapi/storage_access.ml +++ b/ocaml/xapi/storage_access.ml @@ -169,6 +169,7 @@ let ( let@ ) f x = f x (** Synchronise the SM table with the SMAPIv1 plugins on the disk and the SMAPIv2 plugins mentioned in the configuration file whitelist. *) let on_xapi_start ~__context = + let __FUN = __FUNCTION__ in (* An SM is either implemented as a plugin - for which we check its presence, or via an API *) let is_available rc = @@ -275,7 +276,50 @@ let on_xapi_start ~__context = list_assoc_all ty existing |> List.iter (fun sm -> Xapi_sm.update_from_query_result ~__context sm qr) ) - (StringSet.inter running_smapiv2_drivers existing_types) + (StringSet.inter running_smapiv2_drivers existing_types) ; + + (* Warn in logs when there are still duplicates *) + let add_to_dups (last, dups) (_, curr) = + match (last.API.sM_type = curr.API.sM_type, dups) with + | false, _ -> + (curr, dups) + | true, x :: _ when x = last -> + (curr, curr :: dups) + | true, _ -> + (curr, curr :: last :: dups) + in + let find_all_duplicates lst = + lst + |> List.sort (fun (_, a_rc) (_, b_rc) -> + Stdlib.compare a_rc.API.sM_type b_rc.API.sM_type + ) + |> function + | [] -> + [] + | head :: rest -> + List.fold_left add_to_dups (snd head, []) rest |> snd + in + + let features_to_string feats = + Fmt.(to_to_string (Dump.list (Dump.pair string int64)) feats) + in + let plugin_to_string plugin = + Printf.sprintf "{ type:%s; name:%s; UUID:%s; features:%s; }" + plugin.API.sM_type plugin.API.sM_name_label plugin.API.sM_uuid + (features_to_string plugin.API.sM_features) + in + let log_plugins = function + | [] -> + () + | duplicates -> + let duplicates = + String.concat "\n; " (List.map plugin_to_string duplicates) + in + warn "%s: found duplicate SM plugins for the same type: [\n %s\n]" + __FUN duplicates + in + + Db.SM.get_all_records ~__context |> find_all_duplicates |> log_plugins let bind ~__context ~pbd = let dbg = Context.string_of_task __context in From 7eb72bd060c87d5cd4b974c3421cd0611f77ccd3 Mon Sep 17 00:00:00 2001 From: Christian Lindig Date: Thu, 29 Jan 2026 15:18:12 +0000 Subject: [PATCH 45/65] CA-414586 add message-limit to xapi.conf For better discoverarbility, add a line to xapi.conf for the message-limit parameter. Signed-off-by: Christian Lindig --- scripts/xapi.conf | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/xapi.conf b/scripts/xapi.conf index 8736fed6c0d..e8df3cf739b 100644 --- a/scripts/xapi.conf +++ b/scripts/xapi.conf @@ -389,6 +389,9 @@ sm-plugins=ext nfs iscsi lvmoiscsi dummy file hba rawhba udev iso lvm lvmohba lv # How often tracing will export spans to endpoints # export-interval = 30. +# maximium number of messages kept before oldest are removed +# message-limit = 10000 + # The file to check if host reboot required reboot_required_hfxs = /run/reboot-required.hfxs From 1bc5bd7c605334da8829958f89c3b45a0a1e1fc0 Mon Sep 17 00:00:00 2001 From: Lin Liu Date: Wed, 28 Jan 2026 05:55:21 +0000 Subject: [PATCH 46/65] CP-311169: samba: include /etc/samba/smb.extra.conf Samba has lots of feature and configurations, https://www.samba.org/samba/docs/current/man-html/smb.conf.5.html Some of them are passthrough from xapi configuration to samba However, this requires xapi rebuild and make xapi uncessary complicated. On the other side, sometimes we do need customer to tune some args, especially for debug purpose, and we do want such configuration can persist cross xapi restart. A new configuration /etc/samba/smb.extra.conf is introduced. The xapi generated/overrided /etc/samba/smb.conf include it. Customer can update this file with guide of support team for debug purpose Signed-off-by: Lin Liu --- ocaml/xapi/extauth_plugin_ADwinbind.ml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ocaml/xapi/extauth_plugin_ADwinbind.ml b/ocaml/xapi/extauth_plugin_ADwinbind.ml index 7e77f3a8502..523708526ff 100644 --- a/ocaml/xapi/extauth_plugin_ADwinbind.ml +++ b/ocaml/xapi/extauth_plugin_ADwinbind.ml @@ -780,6 +780,7 @@ let query_domain_workgroup ~domain = let config_winbind_daemon ~workgroup ~netbios_name ~domain = let smb_config = "/etc/samba/smb.conf" in + let extra_conf = "/etc/samba/smb.extra.conf" in let string_of_bool = function true -> "yes" | false -> "no" in let scan_trusted_domains = @@ -788,7 +789,8 @@ let config_winbind_daemon ~workgroup ~netbios_name ~domain = ( match (workgroup, netbios_name, domain) with | Some wkgroup, Some netbios, Some dom -> [ - "# autogenerated by xapi" + Printf.sprintf "# This file is managed by xapi, update %s instead" + extra_conf ; "[global]" ; "client use kerberos = required" ; "sync machine password to keytab = \ @@ -815,6 +817,7 @@ let config_winbind_daemon ~workgroup ~netbios_name ~domain = ; "idmap config * : backend = autorid" ; Printf.sprintf "idmap config * : range = %d-%d" 2_000_000 99_999_999 ; Printf.sprintf "log level = %s" (debug_level ()) + ; Printf.sprintf "include = %s" extra_conf ; "" (* Empty line at the end *) ] | _ -> From a7af61e756db8bb83387cf0ca5f0b15b824e844c Mon Sep 17 00:00:00 2001 From: Ming Lu Date: Mon, 2 Feb 2026 16:59:21 +0800 Subject: [PATCH 47/65] Update ocaml-version from 5.3.0 to 5.4.0 Signed-off-by: Ming Lu --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index a22f85dc72f..eda226ae94e 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -32,7 +32,7 @@ jobs: ocaml-version: "4.14.2" experimental: true - runs-on: "ubuntu-22.04" - ocaml-version: "5.3.0" + ocaml-version: "5.4.0" experimental: true continue-on-error: ${{ matrix.experimental }} From b6e2a9ce5bbd25ea9a39f894c54e9bb04927c97b Mon Sep 17 00:00:00 2001 From: Ming Lu Date: Mon, 2 Feb 2026 17:40:12 +0800 Subject: [PATCH 48/65] Don't block switching to a different edition when HA is enabled It assumes that the switching will not cause any differences on the availability of features required by HA. Signed-off-by: Ming Lu --- ocaml/xapi/xapi_host.ml | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/ocaml/xapi/xapi_host.ml b/ocaml/xapi/xapi_host.ml index f6b1109d6fd..7b9243d54fd 100644 --- a/ocaml/xapi/xapi_host.ml +++ b/ocaml/xapi/xapi_host.ml @@ -2149,21 +2149,13 @@ let apply_edition_internal ~__context ~host ~edition ~additional = ~additional:new_ed.additional_params let apply_edition ~__context ~host ~edition ~force = - (* if HA is enabled do not allow the edition to be changed *) - let pool = Helpers.get_pool ~__context in - if - Db.Pool.get_ha_enabled ~__context ~self:pool - && edition <> Db.Host.get_edition ~__context ~self:host - then - raise (Api_errors.Server_error (Api_errors.ha_is_enabled, [])) - else - let additional = - if force then - [("force", "true")] - else - [] - in - apply_edition_internal ~__context ~host ~edition ~additional + let additional = + if force then + [("force", "true")] + else + [] + in + apply_edition_internal ~__context ~host ~edition ~additional let license_add ~__context ~host ~contents = let license = From a3f89e4ded201c0533c83d17cb96127028c60e80 Mon Sep 17 00:00:00 2001 From: Marcus Granado Date: Sun, 1 Feb 2026 11:53:23 +0000 Subject: [PATCH 49/65] CP-423204: add new xenctrl field claimed to xenctrlext If new XEN_SYSCTL_numa_meminfo is available: * Use new function xc_numa_meminfo() that returns the new value node_meminfo.claimed. If not: * Fall back to previous behaviour using xc_numainfo() with default value of node_meminfo.claimed=0. both for compile time and runtime: * compile time checks for XEN_SYSCTL_numa_meminfo to use the new hypercall. * runtime checks for ENOSYS or EOPNOTSUPP before falling back to the old hypercall. Signed-off-by: Marcus Granado --- ocaml/libs/xenctrl-ext/xenctrlext.ml | 9 +++ ocaml/libs/xenctrl-ext/xenctrlext.mli | 6 ++ ocaml/libs/xenctrl-ext/xenctrlext_stubs.c | 98 +++++++++++++++++++++++ 3 files changed, 113 insertions(+) diff --git a/ocaml/libs/xenctrl-ext/xenctrlext.ml b/ocaml/libs/xenctrl-ext/xenctrlext.ml index 419907ff268..46f54824424 100644 --- a/ocaml/libs/xenctrl-ext/xenctrlext.ml +++ b/ocaml/libs/xenctrl-ext/xenctrlext.ml @@ -167,3 +167,12 @@ module DomainNuma = struct {optimised; nodes; memory} with Failure _ -> default end + +module HostNuma = struct + (* Numa state of a host *) + + type node_meminfo = {size: int64; free: int64; claimed: int64} + + external numa_get_meminfo : handle -> node_meminfo array + = "stub_xenctrlext_numa_meminfo" +end diff --git a/ocaml/libs/xenctrl-ext/xenctrlext.mli b/ocaml/libs/xenctrl-ext/xenctrlext.mli index 1851b3d1b12..435631a84ff 100644 --- a/ocaml/libs/xenctrl-ext/xenctrlext.mli +++ b/ocaml/libs/xenctrl-ext/xenctrlext.mli @@ -121,3 +121,9 @@ module DomainNuma : sig val state : handle -> domid:int -> t end + +module HostNuma : sig + type node_meminfo = {size: int64; free: int64; claimed: int64} + + val numa_get_meminfo : handle -> node_meminfo array +end diff --git a/ocaml/libs/xenctrl-ext/xenctrlext_stubs.c b/ocaml/libs/xenctrl-ext/xenctrlext_stubs.c index 88fae69f98e..2577239b561 100644 --- a/ocaml/libs/xenctrl-ext/xenctrlext_stubs.c +++ b/ocaml/libs/xenctrl-ext/xenctrlext_stubs.c @@ -715,6 +715,104 @@ CAMLprim value stub_xc_domain_numa_get_node_pages_wrapper(value xch_val, value d #endif } +/* + * Get NUMA memory info with claimed memory support + * + * Falls back to previous xc_numainfo with claimed=0 + * if XEN_SYSCTL_numa_meminfo is not available at compile time or runtime + */ +CAMLprim value stub_xenctrlext_numa_meminfo(value xch_val) +{ + CAMLparam1(xch_val); + CAMLlocal2(result, info); + xc_interface *xch = xch_of_val(xch_val); + unsigned int max_nodes = 0; + unsigned int i; + int ret; + +#ifdef XEN_SYSCTL_numa_meminfo + xen_sysctl_node_meminfo_t *meminfo = NULL; + + /* First call to get node count */ + caml_release_runtime_system(); + ret = xc_numa_meminfo(xch, &max_nodes, NULL); + caml_acquire_runtime_system(); + + if (ret == 0) { + /* New hypercall available, use it */ + meminfo = calloc(max_nodes, sizeof(*meminfo)); + if (!meminfo) + caml_raise_out_of_memory(); + + caml_release_runtime_system(); + ret = xc_numa_meminfo(xch, &max_nodes, meminfo); + caml_acquire_runtime_system(); + + if (ret < 0) { + int err = errno; + free(meminfo); + errno = err; + failwith_xc(xch); + } + + result = caml_alloc_tuple(max_nodes); + for (i = 0; i < max_nodes; i++) { + info = caml_alloc_tuple(3); + Store_field(info, 0, caml_copy_int64(meminfo[i].size)); + Store_field(info, 1, caml_copy_int64(meminfo[i].free)); + Store_field(info, 2, caml_copy_int64(meminfo[i].claimed)); + Store_field(result, i, info); + } + + free(meminfo); + CAMLreturn(result); + } + + /* If we get ENOSYS or EOPNOTSUPP, fall back to old hypercall */ + if (errno != ENOSYS && errno != EOPNOTSUPP) + failwith_xc(xch); +#endif + + /* Fallback: use xc_numainfo with claimed=0 */ + { + xc_meminfo_t *old_meminfo = NULL; + + caml_release_runtime_system(); + ret = xc_numainfo(xch, &max_nodes, NULL, NULL); + caml_acquire_runtime_system(); + + if (ret < 0) + failwith_xc(xch); + + old_meminfo = calloc(max_nodes, sizeof(*old_meminfo)); + if (!old_meminfo) + caml_raise_out_of_memory(); + + caml_release_runtime_system(); + ret = xc_numainfo(xch, &max_nodes, old_meminfo, NULL); + caml_acquire_runtime_system(); + + if (ret < 0) { + int err = errno; + free(old_meminfo); + errno = err; + failwith_xc(xch); + } + + result = caml_alloc_tuple(max_nodes); + for (i = 0; i < max_nodes; i++) { + info = caml_alloc_tuple(3); + Store_field(info, 0, caml_copy_int64(old_meminfo[i].memsize)); + Store_field(info, 1, caml_copy_int64(old_meminfo[i].memfree)); + Store_field(info, 2, caml_copy_int64(0)); /* claimed=0 */ + Store_field(result, i, info); + } + + free(old_meminfo); + CAMLreturn(result); + } +} + /* * Local variables: From 77ac3728b189b7af081ab1b49bf4502a1036326f Mon Sep 17 00:00:00 2001 From: Marcus Granado Date: Mon, 26 Jan 2026 19:06:34 +0000 Subject: [PATCH 50/65] CP-423204: use new xenctrlext field node_meminfo.claimed to calculate the available memory in the node that can actually be further claimed in the node. Signed-off-by: Marcus Granado --- ocaml/xenopsd/xc/domain.ml | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/ocaml/xenopsd/xc/domain.ml b/ocaml/xenopsd/xc/domain.ml index 065a4d02826..63f8cdd9fcc 100644 --- a/ocaml/xenopsd/xc/domain.ml +++ b/ocaml/xenopsd/xc/domain.ml @@ -995,16 +995,26 @@ let numa_hierarchy = let numa_mutex = Mutex.create () +let node_mem_claimable_for_new_vm ~node ~domid m = + let open Xenctrlext.HostNuma in + let nodeid = Fmt.str "%a" Topology.NUMA.pp_dump_node node in + let available = Int64.sub m.free m.claimed in + D.debug + "mem_claimable_for_new_vm: NUMA nodeid=%s, domid=%d: memfree=%Ld \ + memsize=%Ld claimed=%Ld: available=%Ld" + nodeid domid m.free m.size m.claimed available ; + available + let numa_init () = let xcext = Xenctrlext.get_handle () in let host = Lazy.force numa_hierarchy in - let mem = (Xenctrlext.numainfo xcext).memory in + let mem = Xenctrlext.HostNuma.numa_get_meminfo xcext in D.debug "Host NUMA information: %s" (Fmt.to_to_string (Fmt.Dump.option Topology.NUMA.pp_dump) host) ; Array.iteri (fun i m -> - let open Xenctrlext in - D.debug "NUMA node %d: %Ld/%Ld memory free" i m.memfree m.memsize + let open Xenctrlext.HostNuma in + D.debug "NUMA node %d: %Ld/%Ld/%Ld memory free" i m.free m.size m.claimed ) mem @@ -1021,10 +1031,13 @@ let numa_placement domid ~vcpus ~cores ~memory affinity = let ( let* ) = Option.bind in let xcext = get_handle () in let* host = Lazy.force numa_hierarchy in - let numa_meminfo = (numainfo xcext).memory |> Array.to_seq in + let numa_meminfo = HostNuma.numa_get_meminfo xcext |> Array.to_seq in let nodes = Seq.map2 - (fun node m -> NUMA.resource host node ~memory:m.memfree) + (fun node m -> + NUMA.resource host node + ~memory:(node_mem_claimable_for_new_vm ~node ~domid m) + ) (NUMA.nodes host) numa_meminfo in let vm = NUMARequest.make ~memory ~vcpus ~cores in From 93a054686cb785ef46318cadd714f6c1f97db2a0 Mon Sep 17 00:00:00 2001 From: Marcus Granado Date: Mon, 2 Feb 2026 14:54:07 +0000 Subject: [PATCH 51/65] CP-423204: use new xenctrlext function HostNuma.numa_get_meminfo in the remaining instances of the code using old numainfo.memory Signed-off-by: Marcus Granado --- ocaml/libs/xenctrl-ext/xenctrlext.ml | 22 +++++++++++----------- ocaml/xenopsd/xc/numa.ml | 18 ++++++++++-------- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/ocaml/libs/xenctrl-ext/xenctrlext.ml b/ocaml/libs/xenctrl-ext/xenctrlext.ml index 46f54824424..195682a8f2a 100644 --- a/ocaml/libs/xenctrl-ext/xenctrlext.ml +++ b/ocaml/libs/xenctrl-ext/xenctrlext.ml @@ -126,9 +126,18 @@ exception Not_available let domain_claim_pages handle domid ?(numa_node = NumaNode.none) nr_pages = stub_domain_claim_pages handle domid numa_node nr_pages +module HostNuma = struct + (* Numa state of a host *) + + type node_meminfo = {size: int64; free: int64; claimed: int64} + + external numa_get_meminfo : handle -> node_meminfo array + = "stub_xenctrlext_numa_meminfo" +end + let get_nr_nodes handle = - let info = numainfo handle in - Array.length info.memory + let meminfo = HostNuma.numa_get_meminfo handle in + Array.length meminfo module DomainNuma = struct (* Numa state of a domain *) @@ -167,12 +176,3 @@ module DomainNuma = struct {optimised; nodes; memory} with Failure _ -> default end - -module HostNuma = struct - (* Numa state of a host *) - - type node_meminfo = {size: int64; free: int64; claimed: int64} - - external numa_get_meminfo : handle -> node_meminfo array - = "stub_xenctrlext_numa_meminfo" -end diff --git a/ocaml/xenopsd/xc/numa.ml b/ocaml/xenopsd/xc/numa.ml index bb42101cc53..f05c2765eed 100644 --- a/ocaml/xenopsd/xc/numa.ml +++ b/ocaml/xenopsd/xc/numa.ml @@ -49,17 +49,17 @@ let human_readable_bytes quantity = else loop [] quantity binary_prefixes |> String.concat ", " -let get_memory () = - let {memory; _} = numainfo xc in - memory +let get_memory () = HostNuma.numa_get_meminfo xc let print_mem c mem = for i = 0 to Array.length mem - 1 do - let {memfree; memsize} = mem.(i) in - let memfree = human_readable_bytes memfree in - let memsize = human_readable_bytes memsize in + let {HostNuma.size; free; claimed} = mem.(i) in + let memfree = human_readable_bytes free in + let memsize = human_readable_bytes size in + let memclaimed = human_readable_bytes claimed in Logs.app (fun m -> - m "\t%d: %s free out of %s" i memfree memsize ~tags:(stamp c) + m "\t%d: %s free / %s claimed out of %s" i memfree memclaimed memsize + ~tags:(stamp c) ) done @@ -72,7 +72,9 @@ let print_diff_mem before after = let diff c old cur = let changed_yet = ref false in for i = 0 to Int.min (Array.length old) (Array.length cur) - 1 do - let {memfree= a_free; _}, {memfree= b_free; _} = (old.(i), cur.(i)) in + let {HostNuma.free= a_free; _}, {HostNuma.free= b_free; _} = + (old.(i), cur.(i)) + in if a_free <> b_free then ( if not !changed_yet then changed_yet := true ; let free = human_readable_bytes b_free in From 6faf396e99b1a41a4ae462f5931f6b10dd9867e7 Mon Sep 17 00:00:00 2001 From: Lin Liu Date: Mon, 2 Feb 2026 07:57:26 +0000 Subject: [PATCH 52/65] CP-311020: ldaps design: declare error codes Signed-off-by: Lin Liu --- doc/content/design/external-auth-ldaps.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/doc/content/design/external-auth-ldaps.md b/doc/content/design/external-auth-ldaps.md index b3ab6cdae26..616680c247f 100644 --- a/doc/content/design/external-auth-ldaps.md +++ b/doc/content/design/external-auth-ldaps.md @@ -66,6 +66,8 @@ The [trusted-certificates.md](https://github.com/xapi-project/xen-api/blob/maste ### 3.1 pool.enable_external_auth +#### 3.1.1 Interface + To enable external auth, the current API arguments are as follows: - `pool` (Ref _pool): The pool whose external authentication should be enabled @@ -87,9 +89,15 @@ This API signature does not change. Regarding the config map, one new option is Given `ldaps` default to `false`, this feature is **NOT** enabled until explicitly set. +#### 3.1.2 Error code +Following new error codes added to indicate ldaps enable related error +- AUTH_NO_CERT, no certs can be used for ldaps, refer to 4.1.2 for certs finding. +- AUTH_INVALID_CERT, found certs, but none of the certs can be used to connect to DC + ### 3.2 Set/Get Pool LDAPS Status #### 3.2.1 pool.external_auth_set_ldaps +#### 3.2.1.1 Interface User can specify LDAPS during join domain as in 3.1. @@ -123,6 +131,13 @@ So following xe command can be used to switch between LDAP and LDAPS: xe pool-external-auth-set-ldaps uuid= ldaps= ``` +#### 3.2.1.2 Error code +This API may raise following errors +- AUTH_NO_CERT, no certs found to enable ldaps, refer to 4.1.2 for certs finding +- AUTH_INVALID_CERT, found certs, but none of the certs can be used to connect to DC +- AUTH_IS_DISABLED, AD is not enabled +- AUTH_LDAPS_PING_FAILED, failed to do ldaps query on all DCs with valid certs + #### 3.2.2 Get Pool LDAPS Status xapi generates a get message for each field automatically. To query the LDAPS status, client only needs to query the get method of `host` (class) → `external-auth-configuration` (field), and parse the result. The example as follows: From 45e2ea94b906aee77125bc91342aff6f8fbf7745 Mon Sep 17 00:00:00 2001 From: Rob Hoes Date: Tue, 3 Feb 2026 11:59:40 +0000 Subject: [PATCH 53/65] CA-423369: fix suspend-SR space check Unfortunately, the SR size and utilisation fields are only set after an SR scan, and the new space check has been incorrectly rejecting suspends in some cases. Fix this by just catching the error from `VDI.create` and raising the new error to make it clear that it is the suspend SR that is out of space. Fixes 77c6bf308854524b5f24999fd4e64e164c6502ec. Signed-off-by: Rob Hoes --- ocaml/xapi/helpers.ml | 49 ++++++++++++++------------------------- ocaml/xapi/xapi_xenops.ml | 26 ++++++++++++++------- 2 files changed, 35 insertions(+), 40 deletions(-) diff --git a/ocaml/xapi/helpers.ml b/ocaml/xapi/helpers.ml index e571b962272..e4b46911b6f 100644 --- a/ocaml/xapi/helpers.ml +++ b/ocaml/xapi/helpers.ml @@ -1134,7 +1134,7 @@ let get_sr_free_space ~__context ~sr = Int64.sub size utilisation (* Returns an SR suitable for suspending this VM *) -let choose_suspend_sr ~__context ~vm ~required_space = +let choose_suspend_sr ~__context ~vm = (* If the VM.suspend_SR exists, use that. If it fails, try the Pool.suspend_image_SR. *) (* If that fails, try the Host.suspend_image_SR. *) let vm_sr = Db.VM.get_suspend_SR ~__context ~self:vm in @@ -1142,36 +1142,23 @@ let choose_suspend_sr ~__context ~vm ~required_space = let pool_sr = Db.Pool.get_suspend_image_SR ~__context ~self:pool in let resident_on = Db.VM.get_resident_on ~__context ~self:vm in let host_sr = Db.Host.get_suspend_image_sr ~__context ~self:resident_on in - let sr = - match - ( check_sr_exists_for_host ~__context ~self:vm_sr ~host:resident_on - , check_sr_exists_for_host ~__context ~self:pool_sr ~host:resident_on - , check_sr_exists_for_host ~__context ~self:host_sr ~host:resident_on - ) - with - | Some x, _, _ -> - x - | _, Some x, _ -> - x - | _, _, Some x -> - x - | None, None, None -> - raise - (Api_errors.Server_error - (Api_errors.vm_no_suspend_sr, [Ref.string_of vm]) - ) - in - let free_space = get_sr_free_space ~__context ~sr in - if free_space < required_space then ( - let sr_str = Ref.string_of sr in - error "%s: SR %s free=%Ld needed=%Ld" __FUNCTION__ sr_str free_space - required_space ; - raise - (Api_errors.Server_error - (Api_errors.sr_suspend_space_insufficient, [sr_str]) - ) - ) else - sr + match + ( check_sr_exists_for_host ~__context ~self:vm_sr ~host:resident_on + , check_sr_exists_for_host ~__context ~self:pool_sr ~host:resident_on + , check_sr_exists_for_host ~__context ~self:host_sr ~host:resident_on + ) + with + | Some x, _, _ -> + x + | _, Some x, _ -> + x + | _, _, Some x -> + x + | None, None, None -> + raise + (Api_errors.Server_error + (Api_errors.vm_no_suspend_sr, [Ref.string_of vm]) + ) (* return the operations filtered for cancels functions *) let cancel_tasks ~__context ~ops ~all_tasks_in_db diff --git a/ocaml/xapi/xapi_xenops.ml b/ocaml/xapi/xapi_xenops.ml index f63e3ccf7d9..fe861ab6c43 100644 --- a/ocaml/xapi/xapi_xenops.ml +++ b/ocaml/xapi/xapi_xenops.ml @@ -4028,10 +4028,7 @@ let suspend ~__context ~self = in Int64.(ram |> add vgpu |> add 104857600L) in - let suspend_SR = - Helpers.choose_suspend_sr ~__context ~vm:self - ~required_space:space_needed - in + let suspend_SR = Helpers.choose_suspend_sr ~__context ~vm:self in let sm_config = [ (Constants._sm_vm_hint, id) @@ -4041,11 +4038,22 @@ let suspend ~__context ~self = in Helpers.call_api_functions ~__context (fun rpc session_id -> let vdi = - XenAPI.VDI.create ~rpc ~session_id ~name_label:"Suspend image" - ~name_description:"Suspend image" ~sR:suspend_SR - ~virtual_size:space_needed ~sharable:false ~read_only:false - ~_type:`suspend ~other_config:[] ~xenstore_data:[] ~sm_config - ~tags:[] + try + XenAPI.VDI.create ~rpc ~session_id ~name_label:"Suspend image" + ~name_description:"Suspend image" ~sR:suspend_SR + ~virtual_size:space_needed ~sharable:false ~read_only:false + ~_type:`suspend ~other_config:[] ~xenstore_data:[] ~sm_config + ~tags:[] + with Api_errors.Server_error ("SR_BACKEND_FAILURE_44", _) -> + let sr_uuid = Db.SR.get_uuid ~__context ~self:suspend_SR in + error "Not enough free space on suspend SR %s; need %Ld B" sr_uuid + space_needed ; + raise + (Api_errors.Server_error + ( Api_errors.sr_suspend_space_insufficient + , [Ref.string_of suspend_SR] + ) + ) in let d = disk_of_vdi ~__context ~self:vdi |> Option.get in Db.VM.set_suspend_VDI ~__context ~self ~value:vdi ; From 464de44381553779ad769d6e13a4b09862731b8a Mon Sep 17 00:00:00 2001 From: Marcus Granado Date: Tue, 3 Feb 2026 15:58:58 +0000 Subject: [PATCH 54/65] CP-309998: ignore small amount of pages in other nodes rrdp-squeezed currently has a threshold of >4096 to count the number of nodes. This is to ignore small number of internal data structures that xen or other kernel devices may sometimes allocate for the VM outside the node where the VM's main memory is allocated. This is a temporary fix until we account in CP-311303 about these small number of pages that sometimes appear out of the VM's main node. In experiments, it's usually a single-digit number like 1. The maximum number observed was around ~2200 pages. Without this fix, the VM.numa_nodes calculation is different of the one returned by rrdp-squeezed, and VM.numa_nodes is over-sensitive to these small number of pages in other nodes. Signed-off-by: Marcus Granado --- ocaml/libs/xenctrl-ext/xenctrlext.ml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocaml/libs/xenctrl-ext/xenctrlext.ml b/ocaml/libs/xenctrl-ext/xenctrlext.ml index 419907ff268..2564e217d60 100644 --- a/ocaml/libs/xenctrl-ext/xenctrlext.ml +++ b/ocaml/libs/xenctrl-ext/xenctrlext.ml @@ -156,7 +156,7 @@ module DomainNuma = struct let nodes = Array.fold_left (fun n pages -> - if pages > 0L then + if pages > 4096L then n + 1 else n From 40b45f1b856d1c4bf1336e4aa072f073eeb23668 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= Date: Wed, 4 Feb 2026 17:32:27 +0000 Subject: [PATCH 55/65] CA-423574: Xenops_task: avoid losing stacktraces when tracing is enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Edwin Török --- ocaml/xenopsd/lib/xenops_task.ml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocaml/xenopsd/lib/xenops_task.ml b/ocaml/xenopsd/lib/xenops_task.ml index f2c3993cf39..b6aad308536 100644 --- a/ocaml/xenopsd/lib/xenops_task.ml +++ b/ocaml/xenopsd/lib/xenops_task.ml @@ -106,9 +106,10 @@ let with_tracing ~name ~task f = result with exn -> let backtrace = Printexc.get_raw_backtrace () in + Backtrace.is_important exn ; let error = (exn, backtrace) in let _ : (Span.t option, exn) result = Tracer.finish span ~error in - raise exn + Printexc.raise_with_backtrace exn backtrace ) | Error e -> D.warn "Failed to start tracing: %s" (Printexc.to_string e) ; From 3401127e6db73bf0d74b5e1e977242c5eb03c777 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= Date: Wed, 4 Feb 2026 17:32:56 +0000 Subject: [PATCH 56/65] CA-432574: Xenopsd: avoid losing stacktraces in with_reservation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Edwin Török --- ocaml/xenopsd/xc/xenops_server_xen.ml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ocaml/xenopsd/xc/xenops_server_xen.ml b/ocaml/xenopsd/xc/xenops_server_xen.ml index d3bd53127ce..bc98189e761 100644 --- a/ocaml/xenopsd/xc/xenops_server_xen.ml +++ b/ocaml/xenopsd/xc/xenops_server_xen.ml @@ -658,7 +658,11 @@ module Mem = struct ~default:(min, ("none", min)) (reserve_memory_range dbg min max) in - try f amount id with e -> delete_reservation dbg id ; raise e + try f amount id + with e -> + let bt = Printexc.get_raw_backtrace () in + delete_reservation dbg id ; + Printexc.raise_with_backtrace e bt (** Transfer this 'reservation' to the given domain id *) let transfer_reservation_to_domain_exn dbg domid (reservation_id, amount) = From 0c3aaca7d46e23a62ba04e0e846adc01d45ce3b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= Date: Wed, 4 Feb 2026 16:41:15 +0000 Subject: [PATCH 57/65] CP-311475: do not change a domain's memory allocation while it is being built MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a domain takes a long time to be built (e.g. >1TiB) then squeezed might run and attempt to change maxmem, causing the domain build to fail to complete. ``` 2026-02-04T13:59:12.915844+00:00 orca squeezed: [debug||9 ||squeeze_xen] Xenctrl.domain_setmaxmem domid=717 max=6370254848 (was=0) 2026-02-04T13:59:22.878301+00:00 orca squeezed: [debug||3 ||squeeze_xen] Xenctrl.domain_setmaxmem domid=717 max=2075287552 (was=6370254848) ``` Squeezed shouldn't change the maxmem setting on domains that have never been run (other than to initialize it if 0). In fact another module in Squeezed had code to detect whether a domain has ever been run, which has been replaced with checking whether it has an active balloon driver (if it hasn't reported a balloon driver it is still not very safe to change it too early). But that check missed one place that was still setting maxmem, ignoring the balloon driver's presence. Fix this (hopefully last!) place: if there is no balloon driver and we attempt to decrease maxmem then just log a message instead. Fixes: 9819bdbf22 ("CA-32810: prevent the memory ballooning daemon capping a domain's memory usage before it has written feature-balloon.") Signed-off-by: Edwin Török --- ocaml/squeezed/src/squeeze_xen.ml | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/ocaml/squeezed/src/squeeze_xen.ml b/ocaml/squeezed/src/squeeze_xen.ml index b4d9cc54172..b24ea7b3931 100644 --- a/ocaml/squeezed/src/squeeze_xen.ml +++ b/ocaml/squeezed/src/squeeze_xen.ml @@ -794,7 +794,8 @@ let make_host ~verbose ~xc = (fun domain -> let domid = domain.Squeeze.domid and target_kib = domain.Squeeze.target_kib in - if target_kib < Domain.get_maxmem xc domid then + if target_kib < Domain.get_maxmem xc domid && domain.Squeeze.can_balloon + then Domain.set_maxmem_noexn xc domid target_kib ) domains ; @@ -828,16 +829,14 @@ let execute_action ~xc action = "Not setting target for domid: %d since no feature-balloon. Setting \ maxmem to %Ld" domid target_kib - ) else ( - if can_balloon then - Domain.set_target_noexn cnx domid target_kib - else - debug - "Not setting target for domid: %d since no feature-balloon. Setting \ - maxmem to %Ld" - domid target_kib ; + ) else if can_balloon then begin + Domain.set_target_noexn cnx domid target_kib ; Domain.set_maxmem_noexn cnx domid target_kib - ) + end else + debug + "Not setting target for domid: %d since no feature-balloon. Not \ + setting maxmem to %Ld" + domid target_kib with e -> debug "Failed to reset balloon target (domid: %d) (target: %Ld): %s" action.Squeeze.action_domid action.Squeeze.new_target_kib From 57adc227fed9fd821984103efa751087529b779e Mon Sep 17 00:00:00 2001 From: Bengang Yuan Date: Thu, 5 Feb 2026 08:06:20 +0000 Subject: [PATCH 58/65] XSI-2128: Ignore RBAC when destroying internal tasks When calling `VDI.copy` or `VDI.pool_migrate` with `vm_power_admin` role, xapi may forward the operation to a remote host. In this case, xapi creates a pool session on the remote host and create a new task. When the operation completes, `try_internal_async` uses the user's session to destroy the task that was created by an internal pool session, but the user doesn't have the permission to destory other user's task (task.destroy/any), so it fails. Solution: This is an internal cleanup operation, so it doesn't need user RBAC restriction and checking. Ignore RBAC when destroying internal tasks by calling Db_actions.DB_Action.Task.destroy directly. Signed-off-by: Bengang Yuan --- ocaml/xapi/helpers.ml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocaml/xapi/helpers.ml b/ocaml/xapi/helpers.ml index e4b46911b6f..e791dc72c3a 100644 --- a/ocaml/xapi/helpers.ml +++ b/ocaml/xapi/helpers.ml @@ -2025,7 +2025,7 @@ let try_internal_async ~__context (marshaller : Rpc.t -> 'b) ) (fun () -> info "try_internal_async: destroying task: t = ( %s )" ref ; - TaskHelper.destroy ~__context t + Db.Task.destroy ~__context ~self:t ) module PoolSecret : sig From 89be6eeeeccb6455cf0a82fbb38d1b014555123b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= Date: Wed, 4 Feb 2026 17:49:49 +0000 Subject: [PATCH 59/65] CA-423576: cli_progress_bar: move into its own internal library MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This will be reused by some test code. Signed-off-by: Edwin Török --- ocaml/xapi-cli-server/dune | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ocaml/xapi-cli-server/dune b/ocaml/xapi-cli-server/dune index fb2a713dcf2..2f4e326cdd6 100644 --- a/ocaml/xapi-cli-server/dune +++ b/ocaml/xapi-cli-server/dune @@ -8,9 +8,15 @@ (run %{gen} utils --filter-internal --filter closed))) ) +(library + (name cli_progress_bar) + (modules cli_progress_bar) + ) + (library (name xapi_cli_server) (modes best) + (modules (:standard \ cli_progress_bar)) (libraries astring base64 @@ -37,6 +43,7 @@ xapi-client xapi-cli-protocol xapi_aux + cli_progress_bar clock xapi-stdext-pervasives xapi-stdext-std From 8f216251559e640cfc55325749ed15a475d4b0eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= Date: Mon, 2 Feb 2026 14:05:56 +0000 Subject: [PATCH 60/65] CA-423576: cli_progress_bar: use monotonic time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Got some `String.blit` exceptions, which could've happened if the calculated time was negative, and thus exceeded 10 digits when printed with `eta`. Use monotonic time instead, so that we don't crash when the clock is adjusted. Signed-off-by: Edwin Török --- ocaml/xapi-cli-server/cli_progress_bar.ml | 12 +++++++----- ocaml/xapi-cli-server/dune | 1 + 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/ocaml/xapi-cli-server/cli_progress_bar.ml b/ocaml/xapi-cli-server/cli_progress_bar.ml index 8febbc248aa..0ea35b4af3f 100644 --- a/ocaml/xapi-cli-server/cli_progress_bar.ml +++ b/ocaml/xapi-cli-server/cli_progress_bar.ml @@ -25,7 +25,7 @@ module Make (T : Floatable) = struct ; width: int ; line: bytes ; mutable spin_index: int - ; start_time: float + ; start_time: Mtime_clock.counter ; mutable summarised: bool } @@ -44,7 +44,7 @@ module Make (T : Floatable) = struct String.blit prefix_s 0 line 0 prefix ; String.blit suffix_s 0 line (width - suffix - 1) suffix ; let spin_index = 0 in - let start_time = Unix.gettimeofday () in + let start_time = Mtime_clock.counter () in { max_value ; current_value @@ -55,6 +55,9 @@ module Make (T : Floatable) = struct ; summarised= false } + let elapsed t = + 1e-9 *. (Mtime_clock.count t.start_time |> Mtime.Span.to_float_ns) + let percent t = int_of_float T.(to_float t.current_value /. to_float t.max_value *. 100.) @@ -73,7 +76,7 @@ module Make (T : Floatable) = struct Printf.sprintf "%02d:%02d:%02d" h m s let eta t = - let time_so_far = Unix.gettimeofday () -. t.start_time in + let time_so_far = elapsed t in let total_time = T.(to_float t.max_value /. to_float t.current_value) *. time_so_far in @@ -108,8 +111,7 @@ module Make (T : Floatable) = struct let summarise t = if not t.summarised then ( t.summarised <- true ; - Printf.sprintf "Total time: %s\n" - (hms (int_of_float (Unix.gettimeofday () -. t.start_time))) + Printf.sprintf "Total time: %s\n" (t |> elapsed |> int_of_float |> hms) ) else "" end diff --git a/ocaml/xapi-cli-server/dune b/ocaml/xapi-cli-server/dune index 2f4e326cdd6..b3c014ec56f 100644 --- a/ocaml/xapi-cli-server/dune +++ b/ocaml/xapi-cli-server/dune @@ -11,6 +11,7 @@ (library (name cli_progress_bar) (modules cli_progress_bar) + (libraries mtime mtime.clock.os) ) (library From 23fa680e8ec7423b9434957703b0dc11a9d0a422 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= Date: Mon, 2 Feb 2026 14:28:05 +0000 Subject: [PATCH 61/65] CA-423576: cli_progress_bar: do not crash when ETA > 99h or infinite MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ETA is copied into a preallocated bytes buffer of fixed length. When the ETA exceeds 99h then drop the seconds and report just `hh:mm`. If it still doesn't fit then replace it with a static string '++:++:++' (this would only happen if the ETA is >11 years, although it could happen if an operation became stuck. The operation could still recover and then the ETA will print a number). This could also happen if a task makes near 0 progress by the time we attempt to print progress, which would result in a near infinite ETA. Signed-off-by: Edwin Török --- ocaml/xapi-cli-server/cli_progress_bar.ml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/ocaml/xapi-cli-server/cli_progress_bar.ml b/ocaml/xapi-cli-server/cli_progress_bar.ml index 0ea35b4af3f..8f40d0644d0 100644 --- a/ocaml/xapi-cli-server/cli_progress_bar.ml +++ b/ocaml/xapi-cli-server/cli_progress_bar.ml @@ -73,7 +73,17 @@ module Make (T : Floatable) = struct let h = secs / 3600 in let m = secs mod 3600 / 60 in let s = secs mod 60 in - Printf.sprintf "%02d:%02d:%02d" h m s + let str = Printf.sprintf "%02d:%02d:%02d" h m s in + if String.length str > 8 then + (* negative or > 99 hours *) + let str = Printf.sprintf "%05d:%02d" h m in + if String.length str > 8 then + (* still too long, >11 years *) + "++:++:++" + else + str + else + str let eta t = let time_so_far = elapsed t in From b9cd1e556f6a46e599b8c6aa79a0e15ab48f284d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= Date: Thu, 5 Feb 2026 09:26:25 +0000 Subject: [PATCH 62/65] CA-423576: print more detailed total time when a progress bar finishes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently total time is printed as `hh:mm:ss`, and that will be 00:00:00 if an operation takes <1s. But that is confusing when an operation takes 0.9s for example. Using Mtime.Span.pp instead, which prints values at an appropriate scale based on their magnitude. Signed-off-by: Edwin Török --- ocaml/xapi-cli-server/cli_progress_bar.ml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocaml/xapi-cli-server/cli_progress_bar.ml b/ocaml/xapi-cli-server/cli_progress_bar.ml index 8f40d0644d0..76e7f1f9fd3 100644 --- a/ocaml/xapi-cli-server/cli_progress_bar.ml +++ b/ocaml/xapi-cli-server/cli_progress_bar.ml @@ -121,7 +121,8 @@ module Make (T : Floatable) = struct let summarise t = if not t.summarised then ( t.summarised <- true ; - Printf.sprintf "Total time: %s\n" (t |> elapsed |> int_of_float |> hms) + Format.asprintf "Total time: %a@." Mtime.Span.pp + (Mtime_clock.count t.start_time) ) else "" end From d51aad70b41f712c52bf2b9b2f7913aee3432602 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= Date: Fri, 6 Feb 2026 14:13:32 +0000 Subject: [PATCH 63/65] CA-423574: avoid losing backtraces when reraising MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Xenopsd reraises some exceptions in a different, simplified form. But this needs to retain the stacktrace from the original place that raised the first exception, otherwise it might be hard to debug. Signed-off-by: Edwin Török --- ocaml/xenopsd/xc/xenops_server_xen.ml | 76 +++++++++++++++------------ 1 file changed, 43 insertions(+), 33 deletions(-) diff --git a/ocaml/xenopsd/xc/xenops_server_xen.ml b/ocaml/xenopsd/xc/xenops_server_xen.ml index bc98189e761..f9eae92e8de 100644 --- a/ocaml/xenopsd/xc/xenops_server_xen.ml +++ b/ocaml/xenopsd/xc/xenops_server_xen.ml @@ -31,14 +31,17 @@ let finally = Xapi_stdext_pervasives.Pervasiveext.finally let with_lock = Xapi_stdext_threads.Threadext.Mutex.execute -let internal_error fmt = +let internal_error' e fmt = Printf.ksprintf (fun str -> error "%s" str ; - raise (Xenopsd_error (Internal_error str)) + let err = Xenopsd_error (Internal_error str) in + match e with None -> raise err | Some e -> Backtrace.reraise e err ) fmt +let internal_error fmt = internal_error' None fmt + (* libxl_internal.h:DISABLE_UDEV_PATH *) let disable_udev_path = "libxl/disable_udev" @@ -269,8 +272,9 @@ let di_of_uuid ~xc uuid = let domid_list = String.concat ", " (List.map domid_of_di possible) in let uuid' = Uuidx.to_string uuid in internal_error "More than one domain with uuid %s: (%s)" uuid' domid_list - | exception Failure r -> - internal_error "%s" r + | exception (Failure r as e) -> + Backtrace.is_important e ; + internal_error' (Some e) "%s" r let domid_of_uuid ~xs uuid = (* We don't fully control the domain lifecycle because libxenguest will @@ -2621,36 +2625,42 @@ module VM = struct try build_domain_exn xc xs domid task vm vbds vifs vgpus vusbs extras force - with - | Bootloader.Bad_sexpr x -> - internal_error "VM = %s; domid = %d; Bootloader.Bad_sexpr %s" - vm.Vm.id domid x - | Bootloader.Bad_error x -> - internal_error "VM = %s; domid = %d; Bootloader.Bad_error %s" - vm.Vm.id domid x - | Bootloader.Unknown_bootloader x -> - internal_error - "VM = %s; domid = %d; Bootloader.Unknown_bootloader %s" vm.Vm.id - domid x - | Bootloader.Error_from_bootloader x -> - let m = - Printf.sprintf - "VM = %s; domid = %d; Bootloader.Error_from_bootloader %s" + with e -> ( + Backtrace.is_important e ; + let internal_error fmt = internal_error' (Some e) fmt in + match e with + | Bootloader.Bad_sexpr x -> + internal_error "VM = %s; domid = %d; Bootloader.Bad_sexpr %s" vm.Vm.id domid x - in - debug "%s" m ; - raise (Xenopsd_error (Bootloader_error (vm.Vm.id, x))) - | Domain.Not_enough_memory m -> - debug - "VM = %s; domid = %d; Domain.Not_enough_memory. Needed: %Ld bytes" - vm.Vm.id domid m ; - raise (Xenopsd_error (Not_enough_memory m)) - | e -> - let m = - Printf.sprintf "VM = %s; domid = %d; Error: %s" vm.Vm.id domid - (Printexc.to_string e) - in - debug "%s" m ; raise e + | Bootloader.Bad_error x -> + internal_error "VM = %s; domid = %d; Bootloader.Bad_error %s" + vm.Vm.id domid x + | Bootloader.Unknown_bootloader x -> + internal_error + "VM = %s; domid = %d; Bootloader.Unknown_bootloader %s" vm.Vm.id + domid x + | Bootloader.Error_from_bootloader x -> + let m = + Printf.sprintf + "VM = %s; domid = %d; Bootloader.Error_from_bootloader %s" + vm.Vm.id domid x + in + debug "%s" m ; + Backtrace.reraise e + (Xenopsd_error (Bootloader_error (vm.Vm.id, x))) + | Domain.Not_enough_memory m -> + debug + "VM = %s; domid = %d; Domain.Not_enough_memory. Needed: %Ld \ + bytes" + vm.Vm.id domid m ; + Backtrace.reraise e (Xenopsd_error (Not_enough_memory m)) + | e -> + let m = + Printf.sprintf "VM = %s; domid = %d; Error: %s" vm.Vm.id domid + (Printexc.to_string e) + in + debug "%s" m ; Backtrace.reraise e e + ) ) (fun () -> clean_memory_reservation task di.Xenctrl.domid) From 67b740d361eeaccadbca605887f97cbac567695d Mon Sep 17 00:00:00 2001 From: Christian Lindig Date: Fri, 6 Feb 2026 10:57:42 +0000 Subject: [PATCH 64/65] CP-311125 unhide AD cache pool parameters in XE CLI External authentication using Active Directory can use a cache to improve performance. The corresponding fields on the pool object so far were hidden. This patch makes them visible, and as a consequence more easily accessible from the CLI and its autocompletion. Signed-off-by: Christian Lindig --- ocaml/idl/datamodel_pool.ml | 6 +++--- ocaml/xapi-cli-server/records.ml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ocaml/idl/datamodel_pool.ml b/ocaml/idl/datamodel_pool.ml index 843ed3b19c5..89b09301fb5 100644 --- a/ocaml/idl/datamodel_pool.ml +++ b/ocaml/idl/datamodel_pool.ml @@ -1516,7 +1516,7 @@ let set_ext_auth_cache_enabled = , "Specifies whether caching is enabled for external authentication" ) ] - ~hide_from_docs:true ~allowed_roles:_R_POOL_OP () + ~hide_from_docs:false ~allowed_roles:_R_POOL_OP () let set_ext_auth_cache_size = call ~name:"set_ext_auth_cache_size" ~lifecycle:[] @@ -1525,7 +1525,7 @@ let set_ext_auth_cache_size = (Ref _pool, "self", "The pool") ; (Int, "value", "The capacity of the external authentication cache") ] - ~hide_from_docs:true ~allowed_roles:_R_POOL_OP () + ~hide_from_docs:false ~allowed_roles:_R_POOL_OP () let set_ext_auth_cache_expiry = call ~name:"set_ext_auth_cache_expiry" ~lifecycle:[] @@ -1538,7 +1538,7 @@ let set_ext_auth_cache_expiry = seconds - 300 seconds, i.e. 5 minutes, is the default value)" ) ] - ~hide_from_docs:true ~allowed_roles:_R_POOL_OP () + ~hide_from_docs:false ~allowed_roles:_R_POOL_OP () let pool_guest_secureboot_readiness = Enum diff --git a/ocaml/xapi-cli-server/records.ml b/ocaml/xapi-cli-server/records.ml index d7f3cdf421d..9e076ac23d7 100644 --- a/ocaml/xapi-cli-server/records.ml +++ b/ocaml/xapi-cli-server/records.ml @@ -1520,7 +1520,7 @@ let pool_record rpc session_id pool = ~get:(fun () -> get_from_map (x ()).API.pool_recommendations) ~get_map:(fun () -> (x ()).API.pool_recommendations) () - ; make_field ~name:"ext-auth-cache-enabled" ~hidden:true + ; make_field ~name:"ext-auth-cache-enabled" ~hidden:false ~get:(fun () -> (x ()).API.pool_ext_auth_cache_enabled |> string_of_bool ) @@ -1529,14 +1529,14 @@ let pool_record rpc session_id pool = ~value:(bool_of_string v) ) () - ; make_field ~name:"ext-auth-cache-size" ~hidden:true + ; make_field ~name:"ext-auth-cache-size" ~hidden:false ~get:(fun () -> (x ()).API.pool_ext_auth_cache_size |> Int64.to_string) ~set:(fun v -> Client.Pool.set_ext_auth_cache_size ~rpc ~session_id ~self:pool ~value:(Int64.of_string v) ) () - ; make_field ~name:"ext-auth-cache-expiry" ~hidden:true + ; make_field ~name:"ext-auth-cache-expiry" ~hidden:false ~get:(fun () -> (x ()).API.pool_ext_auth_cache_expiry |> Int64.to_string ) From d9b4b436ee7321f05ca16f1810cd6d9425cd958f Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Mon, 9 Feb 2026 15:13:56 +0000 Subject: [PATCH 65/65] xapi: Wrap {vhd,qcow}-tool read_header invocation in a thread This allows it to proceed in parallel with the parse side, not deadlocking on the filled pipe. Signed-off-by: Andrii Sultanov --- ocaml/xapi/qcow_tool_wrapper.ml | 18 ++++++++++++------ ocaml/xapi/vhd_tool_wrapper.ml | 17 ++++++++++++----- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/ocaml/xapi/qcow_tool_wrapper.ml b/ocaml/xapi/qcow_tool_wrapper.ml index c1abd72972f..c04617f4fa6 100644 --- a/ocaml/xapi/qcow_tool_wrapper.ml +++ b/ocaml/xapi/qcow_tool_wrapper.ml @@ -27,12 +27,18 @@ let read_header qcow_path = let pipe_reader, pipe_writer = Unix.pipe ~cloexec:true () in let progress_cb _ = () in - Xapi_stdext_pervasives.Pervasiveext.finally - (fun () -> - Vhd_qcow_parsing.run_tool qcow_tool progress_cb args - ~output_fd:pipe_writer - ) - (fun () -> Unix.close pipe_writer) ; + let (_ : Thread.t) = + Thread.create + (fun () -> + Xapi_stdext_pervasives.Pervasiveext.finally + (fun () -> + Vhd_qcow_parsing.run_tool qcow_tool progress_cb args + ~output_fd:pipe_writer + ) + (fun () -> Unix.close pipe_writer) + ) + () + in pipe_reader let parse_header qcow_path = diff --git a/ocaml/xapi/vhd_tool_wrapper.ml b/ocaml/xapi/vhd_tool_wrapper.ml index e38679b77fe..64afa6b4522 100644 --- a/ocaml/xapi/vhd_tool_wrapper.ml +++ b/ocaml/xapi/vhd_tool_wrapper.ml @@ -122,11 +122,18 @@ let read_vhd_header path = let pipe_reader, pipe_writer = Unix.pipe ~cloexec:true () in let progress_cb _ = () in - Xapi_stdext_pervasives.Pervasiveext.finally - (fun () -> - Vhd_qcow_parsing.run_tool vhd_tool progress_cb args ~output_fd:pipe_writer - ) - (fun () -> Unix.close pipe_writer) ; + let (_ : Thread.t) = + Thread.create + (fun () -> + Xapi_stdext_pervasives.Pervasiveext.finally + (fun () -> + Vhd_qcow_parsing.run_tool vhd_tool progress_cb args + ~output_fd:pipe_writer + ) + (fun () -> Unix.close pipe_writer) + ) + () + in pipe_reader let parse_header vhd_path =