diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index a22f85dc72f..eda226ae94e 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -32,7 +32,7 @@ jobs: ocaml-version: "4.14.2" experimental: true - runs-on: "ubuntu-22.04" - ocaml-version: "5.3.0" + ocaml-version: "5.4.0" experimental: true continue-on-error: ${{ matrix.experimental }} diff --git a/doc/content/design/external-auth-ldaps.md b/doc/content/design/external-auth-ldaps.md new file mode 100644 index 00000000000..616680c247f --- /dev/null +++ b/doc/content/design/external-auth-ldaps.md @@ -0,0 +1,340 @@ +--- +title: Secure LDAP (LDAPS) Support for External Authentication +layout: default +design_doc: true +revision: 1 +status: draft +--- + +## Terminology + +| Term | Meaning | +|------|---------| +| AD | Windows Active Directory | +| samba/winbind | Client used in xapi to talk to AD | +| DC | Windows AD domain controller | +| ldap/ldaps | Lightweight Directory Access Protocol / over SSL | +| Joining host | The host joining to a pool | + +## 1. Background + +To integrate XenServer with AD, XenServer performs LDAP queries in the following use cases: + +- **Enable external authentication/Join domain**: Samba LDAP queries DC details +- **Session revalidation**: xapi queries user details (e.g., whether user is still valid, password expired, etc.) to decide whether to destroy a session + +Currently XenServer uses plain LDAP queries, which is a concern for some enterprise customers. + +## 2. Xapi Database + +### 2.1 External Auth Configuration + +External auth details are stored in the `host` (table) → `external_auth_configuration` (field). For example: + +``` +external_auth_configuration: { + domain: xenrt16718.local, + user: Administrator, + workgroup: XENRTXENRT16718, + netbios_name: genus-35-103d, + machine_pwd_last_change_time: 1767508709 +} +``` + +A new field `ldaps` (bool, optional) will be added to `external_auth_configuration` field to state whether LDAPS should be used instead of LDAP. If not set, LDAP will be used for backward compatibility. + +So the field will look like: + +``` +external_auth_configuration: { + domain: xenrt16718.local, + user: Administrator, + workgroup: XENRTXENRT16718, + netbios_name: genus-35-103d, + machine_pwd_last_change_time: 1767508709, + ldaps: true +} +``` + +### 2.2 Certificate + +To enforce security, if customer uses self-signed certificate, they need to upload the root CA certificate to XenServer, so XenServer can verify the certificate/public key used talking to DC for LDAPS. + +The [trusted-certificates.md](https://github.com/xapi-project/xen-api/blob/master/doc/content/design/trusted-certificates.md) design enhanced the `Certificate` table and introduced a new field `purpose` for security, which limits the certificate only for specific purpose. `ldaps` will be added to `purpose` field as a value for LDAPS. + +## 3. Interfaces + +### 3.1 pool.enable_external_auth + +#### 3.1.1 Interface + +To enable external auth, the current API arguments are as follows: + +- `pool` (Ref _pool): The pool whose external authentication should be enabled +- `config` (Map (String, String)): A list of key-values containing the configuration data +- `service_name` (String): The name of the service +- `auth_type` (String): The type of authentication (e.g., AD for Active Directory) + +For example: + +```bash +xe pool-enable-external-auth uuid= auth-type=AD service-name= config:user= config:pass= +``` + +This API signature does not change. Regarding the config map, one new option is added: + +- `config:ldaps`: whether LDAPS is required, default to `false` + - Set `client ldap sasl wrapping` to `ldaps` if true, `seal` otherwise + - This item will be stored in database in section 2.1 + +Given `ldaps` default to `false`, this feature is **NOT** enabled until explicitly set. + +#### 3.1.2 Error code +Following new error codes added to indicate ldaps enable related error +- AUTH_NO_CERT, no certs can be used for ldaps, refer to 4.1.2 for certs finding. +- AUTH_INVALID_CERT, found certs, but none of the certs can be used to connect to DC + +### 3.2 Set/Get Pool LDAPS Status + +#### 3.2.1 pool.external_auth_set_ldaps +#### 3.2.1.1 Interface + +User can specify LDAPS during join domain as in 3.1. + +For the existing joined domain, user can switch between LDAP and LDAPS with this new API. Args as follows: + +- `pool` (Ref _pool): pool to set LDAPS +- `ldaps` (Bool): whether LDAPS is required +- `force` (Bool): whether to set ldaps even when ldaps is currently set + +This API will set the `ldaps` in database (Refer to 2.1). + +This API performs following sanity check and rejects update if check fails: + +- AD has already been enabled +- ldaps has already been enabled without force +- Find proper certificate (Refer to 4.1 for the details) +- Do a `ldaps` query to embedded user `krbtgt` for the joined domain + +**Note:** +- This API allow re-entry with `force` to perform an extra `ldaps ping` for debug purpose +- This API will not do the LDAPS query on the trusted domains, as xapi does not have trusted domain details +- The joined domain likely has multiple DCs. LDAPS query tries every DC of the domain. Check pass if LDAPS query succeeds on any DC of the domain. This implies iterate and locate a DC supporting LDAPS (with proper certificate trust setup) before LDAPS query. However, this does not introduce performance problems as the LDAPS query happens in backend and refreshes result into XAPI DB +- Pool coordinator dispatches this API request to every host, and only succeeds if all hosts pass the check +- This API needs to be synced with other APIs. For example, `authenticate_username_password` should fail if this API is performing checking and configuration + +This API will refresh `winbind` configuration (Refer to 4.1). + +So following xe command can be used to switch between LDAP and LDAPS: + +```bash +xe pool-external-auth-set-ldaps uuid= ldaps= +``` + +#### 3.2.1.2 Error code +This API may raise following errors +- AUTH_NO_CERT, no certs found to enable ldaps, refer to 4.1.2 for certs finding +- AUTH_INVALID_CERT, found certs, but none of the certs can be used to connect to DC +- AUTH_IS_DISABLED, AD is not enabled +- AUTH_LDAPS_PING_FAILED, failed to do ldaps query on all DCs with valid certs + +#### 3.2.2 Get Pool LDAPS Status + +xapi generates a get message for each field automatically. To query the LDAPS status, client only needs to query the get method of `host` (class) → `external-auth-configuration` (field), and parse the result. The example as follows: + +```bash +xe host-param-get uuid= param-name=external-auth-configuration +``` + +### 3.3 Install Certificate + +If the certificate for LDAPS in DC is signed by a private CA (vs a trusted public CA), user needs to import their Root or Intermediate CA certificate into XenServer. + +`pool.install_trusted_certificate` can install the certificate with following parameters, refer to [trusted-certificates.md](https://github.com/xapi-project/xen-api/blob/master/doc/content/design/trusted-certificates.md) for the details: + +- `session` (ref session_id): reference to a valid session +- `self` (ref Pool): reference to the pool +- `ca` (boolean): should always be `true` for `ldaps`. xapi should reject this CA otherwise +- `cert` (string): the trusted certificate in PEM format +- `purpose` (string list): the purposes of this cert. It can be one of following: + - contain `ldaps` if for specific this specific purpose + - empty set, thus would take as general purpose CA. It will be used for `ldaps` if no `ldaps` specific purpose found + +**Note:** If the DCs (of joining domain and trusted domain in use) are signed by different CAs, all the CAs need to be uploaded to XenServer. + +## 4. Configuration Item + +To enforce LDAPS, following are required: + +- Samba needs to be updated to 4.21+ (Already done) +- LDAPS needs to be enabled in smb.conf + +### 4.1 Samba Configuration + +#### 4.1.1 smb.conf + +To enforce LDAPS, xapi just passthrough the configuration to winbind. Following configuration needs to be updated to `/etc/samba/smb.conf`, details refer to [smb.conf](https://www.samba.org/samba/docs/current/man-html/smb.conf.5.html): + +```ini +client ldap sasl wrapping = +tls verify peer = ca_and_name_if_available +tls trust system cas = yes +tls cafile = /etc/trusted-certs/ca-bundle-[ldaps|general].pem +``` + +- Switch between `ldap` and `ldaps` will flip `client ldap sasl wrapping` between `seal` and `ldaps` +- `tls cafile` points to a CA bundle used to verify DC certs. Details refer to 4.1.2 + +#### 4.1.2 Certificate Selection + +This design is following [trusted-certificates.md](https://github.com/xapi-project/xen-api/blob/master/doc/content/design/trusted-certificates.md): + +- Use `/etc/trusted-certs/ca-bundle-ldaps.pem` if exists +- Fall back to `/etc/trusted-certs/ca-bundle-general.pem` if exists and previous not match +- Report error if none of above match + +**Note:** The selection/configuration is only refreshed on following cases: + +- xapi (re)start +- `pool.external_auth_set_ldaps` API +- (Re)join domain + +### 4.2 Xapi Configuration + +#### 4.2.1 winbind-tls-verify-peer + +For security, xapi asks winbind to verify CA certificate. `ca_and_name_if_available` is the default. + +However, user may want to disable this verification for debug purpose. + +`winbind-tls-verify-peer` is introduced for xapi configuration, and the possible values are `no_check`, `ca_only`, `ca_and_name_if_available`, `ca_and_name` and `as_strict_as_possible`. + The configured value will override `tls verify peer` value in xapi generated samba configuration. Refer to [smb.conf](https://www.samba.org/samba/docs/current/man-html/smb.conf.5.html) for the details. + + +**Note:** This item is not intended for public documentation. This is only for debug purpose, or system tuning for specific scenarios from engineering/support team. + +#### 4.2.2 ad-warning-message-interval + +xapi sends warning message to user with this interval on LDAP query failure. Default to 1 week. Refer to section "Session revalidate" for the details. + +## 5. Session Revalidate + +xapi LDAP queries domain user status (if user has been added to manage XenServer) at configurable interval, and destroys the session created by domain user if user no longer in healthy status. + +However, the LDAP query may fail due to various issues as follows: + +- Temporary network issues +- CA certificate is not properly configured, or expired, etc. + +Instead of destroying user session for stability, a warning message will be sent to user with the details at configurable interval `ad-warning-message-interval`. + +- If no LDAP error, do nothing +- If error happens, send the warning message if: + - first time see the error through xapi start up (so no need to persist last send time) or + - `current_time - last_sent_time > winbind_warning_message_interval` + +The message is defined as follows: +- name: AD_DC_LDAP_CHECK +- priority: Warning +- cls: `Host +- Body: LDAP(S) query check to `` of `` failed from `` of `` + +Note: +- The backend session revalidate check only performs on pool coordinator, thus the backend LDAP(S) query check only on coordinator +- `external_auth_set_ldaps` perform LDAP(S) query check on every host +- All previous AD_DC_LDAP_CHECK warning of a host will be cleaned on a successful LDAP(s) query from that host + +## 6. Pool Join/Leave + +### 6.1 pool.join + +#### 6.1.1 AD Pre-checks + +Currently the pool.join pre-check checks the following: + +- `external_auth_type`: whether joined AD +- `external_auth_service_name`: whether joined the same domain + +The pre-check is good enough, no matter whether `ldaps` is in use, as this ensures host can talk to AD. There are following cases: + +- pool with AD, joining host with same AD: check pass as before this design +- pool without AD, joining host without AD: check pass as before this design +- pool without AD, joining host with AD: check failed as before this design +- pool with AD, joining host without AD: + - LDAPS not enabled: joining host needs to join to same AD as before this design + - LDAPS enabled: joining host needs to enable AD without certificate check, details refer to 6.1.2 + +#### 6.1.2 Join Host to Pool with LDAPS Enabled + +When joining a host without AD to a pool with LDAPS enabled, the host may not have the (CA) certificate for the domain. It can be trivial to enforce customer to upload the CA certificate to every joining host, thus client would help to orchestrate certificates. + +The workflow: + +~~~mermaid + +sequenceDiagram +participant user as User +participant client as Client +participant join as Joining host +participant coor as Pool Coordinator +participant dc as AD/DC + +user->>client: pool.join +Note over client: precheck +alt precheck failed +client-->>user: precheck failed +end + +Note over client,coor: sync all ldaps certs +client->>coor: pool.download_trusted_certificate +coor-->>client: +client->>join: pool.install_trusted_certificate +join-->>client: + +user->>client: join domain username/password +client->>join: join domain username/password +join->>dc: join domain +dc-->>join: +join-->>client: + +client->>join: pool.join +Note over join,coor: join pool ops
certs sync +join-->>client: +client-->>user: pool.join succeed + +~~~ + +**Detailed Steps:** + +1. Client find proper `ldaps certs` from pool coordinator as `certs_pool` + - a. find all certs `ldaps in purpose` + - b. if no LDAPS certs, find all `general` certs +2. Client find all certs in joining host as `certs_joining_host` +3. Client identify the certs needs to be synced to joining host as `certs_to_sync = certs_pool - certs_joining_host` (certs in `certs_pool`, but not in `certs_joining_host`), the certs fingerprint should be used to identify the certs +4. Client download all `certs_to_sync`, `pool.download_trusted_certificate` from coordinator +5. Client upload all certs to joining pool, `pool.install_trusted_certificate` to joining pool, with the same purpose +6. Client trigger `pool.join` again with domain username and password +7. After pool.join: + - If pool.join failed, Client call `pool.uninstall_trusted_certificate` on joining host to revert the certs + - If pool.join succeed, do nothing as pool.join would sync the certs anyway + +### 6.2 pool.leave + +`pool.disable_external_auth` is called during pool leave, thus the `ldaps` status is cleaned. + +This design does not change it. + +## 7. Telemetry Support + +### 7.1 External Auth Enabled + +`host` (table) → `external_auth_type` (field) = `AD` + +### 7.2 LDAPS Enabled + +`host` (table) → `external_auth_configuration` (field) → `ldaps` (key) = `true` + +## References + +- [trusted-certificates.md](https://github.com/xapi-project/xen-api/blob/master/doc/content/design/trusted-certificates.md) +- [Samba smb.conf manual](https://www.samba.org/samba/docs/current/man-html/smb.conf.5.html) diff --git a/dune b/dune index ac7f4810205..4810b2793b0 100644 --- a/dune +++ b/dune @@ -3,10 +3,13 @@ (ocamlopt_flags (:standard -g -p -w -39)) (flags (:standard -w -39)) ) - (dev (flags (:standard -g -w -39))) + (dev + (flags (:standard -g -w -39)) + (env-vars (LANG C))) (release (flags (:standard -w -39-6@5)) - (env-vars (ALCOTEST_COMPACT 1)) + (env-vars (ALCOTEST_COMPACT 1) + (LANG C)) ) ) diff --git a/ocaml/idl/datamodel_errors.ml b/ocaml/idl/datamodel_errors.ml index 24262314274..f739665a836 100644 --- a/ocaml/idl/datamodel_errors.ml +++ b/ocaml/idl/datamodel_errors.ml @@ -1119,7 +1119,7 @@ let _ = error Api_errors.sr_suspend_space_insufficient ["sr"] ~doc: "The suspend SR does not have sufficient free space to store the VM \ - suspend image required to complete a snapshot with memory." + suspend image." () ; error Api_errors.pbd_exists ["sr"; "host"; "pbd"] ~doc:"A PBD already exists connecting the SR to the server." () ; diff --git a/ocaml/idl/datamodel_host.ml b/ocaml/idl/datamodel_host.ml index c8d5743b6ae..d4fad6e7d8d 100644 --- a/ocaml/idl/datamodel_host.ml +++ b/ocaml/idl/datamodel_host.ml @@ -2033,6 +2033,13 @@ let disable_external_auth = ; param_release= george_release ; param_default= Some (VMap []) } + ; { + param_type= Bool + ; param_name= "force" + ; param_doc= "Disable external auth even when not enabled" + ; param_release= numbered_release "26.2.0-next" + ; param_default= Some (VBool false) + } ] ~doc:"This call disables external authentication on the local host" ~allowed_roles:_R_POOL_ADMIN () diff --git a/ocaml/idl/datamodel_lifecycle.ml b/ocaml/idl/datamodel_lifecycle.ml index 8cfdf21cef2..ffd0064859a 100644 --- a/ocaml/idl/datamodel_lifecycle.ml +++ b/ocaml/idl/datamodel_lifecycle.ml @@ -138,11 +138,11 @@ let prototyped_of_field = function | "VM_guest_metrics", "netbios_name" -> Some "24.28.0" | "VM_metrics", "numa_node_memory" -> - Some "26.1.0-next" + Some "26.2.0" | "VM_metrics", "numa_nodes" -> - Some "26.1.0-next" + Some "26.2.0" | "VM_metrics", "numa_optimised" -> - Some "26.1.0-next" + Some "26.2.0" | "VM", "groups" -> Some "24.19.1" | "VM", "pending_guidances_full" -> diff --git a/ocaml/idl/datamodel_pool.ml b/ocaml/idl/datamodel_pool.ml index 843ed3b19c5..89b09301fb5 100644 --- a/ocaml/idl/datamodel_pool.ml +++ b/ocaml/idl/datamodel_pool.ml @@ -1516,7 +1516,7 @@ let set_ext_auth_cache_enabled = , "Specifies whether caching is enabled for external authentication" ) ] - ~hide_from_docs:true ~allowed_roles:_R_POOL_OP () + ~hide_from_docs:false ~allowed_roles:_R_POOL_OP () let set_ext_auth_cache_size = call ~name:"set_ext_auth_cache_size" ~lifecycle:[] @@ -1525,7 +1525,7 @@ let set_ext_auth_cache_size = (Ref _pool, "self", "The pool") ; (Int, "value", "The capacity of the external authentication cache") ] - ~hide_from_docs:true ~allowed_roles:_R_POOL_OP () + ~hide_from_docs:false ~allowed_roles:_R_POOL_OP () let set_ext_auth_cache_expiry = call ~name:"set_ext_auth_cache_expiry" ~lifecycle:[] @@ -1538,7 +1538,7 @@ let set_ext_auth_cache_expiry = seconds - 300 seconds, i.e. 5 minutes, is the default value)" ) ] - ~hide_from_docs:true ~allowed_roles:_R_POOL_OP () + ~hide_from_docs:false ~allowed_roles:_R_POOL_OP () let pool_guest_secureboot_readiness = Enum diff --git a/ocaml/libs/stunnel/stunnel.ml b/ocaml/libs/stunnel/stunnel.ml index 1b6e9c3ee18..adfa670ba5d 100644 --- a/ocaml/libs/stunnel/stunnel.ml +++ b/ocaml/libs/stunnel/stunnel.ml @@ -122,6 +122,7 @@ type verification_config = { sni: string option ; verify: verify ; cert_bundle_path: string + ; crl_dir: string option } type t = { @@ -140,6 +141,7 @@ let appliance = sni= None ; verify= CheckHost ; cert_bundle_path= "/etc/stunnel/xapi-stunnel-ca-bundle.pem" + ; crl_dir= Some crl_path } let pool = @@ -147,10 +149,16 @@ let pool = sni= Some "pool" ; verify= VerifyPeer ; cert_bundle_path= "/etc/stunnel/xapi-pool-ca-bundle.pem" + ; crl_dir= None } let external_host ext_host_cert_file = - {sni= None; verify= VerifyPeer; cert_bundle_path= ext_host_cert_file} + { + sni= None + ; verify= VerifyPeer + ; cert_bundle_path= ext_host_cert_file + ; crl_dir= None + } let debug_conf_of_bool verbose : string = if verbose then @@ -219,7 +227,7 @@ let config_file ?(accept = None) config host port = ; ( match config with | None -> [] - | Some {sni; verify; cert_bundle_path} -> + | Some {sni; verify; cert_bundle_path; crl_dir} -> List.rev_append ( match verify with | VerifyPeer -> @@ -234,14 +242,17 @@ let config_file ?(accept = None) config host port = ; "# the cert of the server we connect to" ; (match sni with None -> "" | Some s -> sprintf "sni = %s" s) ; sprintf "CAfile=%s" cert_bundle_path - ; ( match Sys.readdir crl_path with - | [||] -> - "" - | _ -> - sprintf "CRLpath=%s" crl_path - | exception _ -> - "" - ) + ; Option.fold ~none:"" + ~some:(fun crl_dir -> + match Sys.readdir crl_dir with + | [||] -> + "" + | _ -> + sprintf "CRLpath=%s" crl_dir + | exception _ -> + "" + ) + crl_dir ] ) ; [""] diff --git a/ocaml/libs/stunnel/stunnel.mli b/ocaml/libs/stunnel/stunnel.mli index ac7d08a0af5..7d7ffae02c3 100644 --- a/ocaml/libs/stunnel/stunnel.mli +++ b/ocaml/libs/stunnel/stunnel.mli @@ -40,6 +40,7 @@ type verification_config = { sni: string option ; verify: verify ; cert_bundle_path: string + ; crl_dir: string option } (** Represents an active stunnel connection *) diff --git a/ocaml/libs/stunnel/stunnel_client.mli b/ocaml/libs/stunnel/stunnel_client.mli index b3dd0392bf4..e9812c6cdb1 100644 --- a/ocaml/libs/stunnel/stunnel_client.mli +++ b/ocaml/libs/stunnel/stunnel_client.mli @@ -17,7 +17,14 @@ val get_verify_by_default : unit -> bool val set_verify_by_default : bool -> unit val pool : unit -> Stunnel.verification_config option +(** [pool ()] returns the configuration that's meant to be used to connect to + other xapi hosts in the pool *) val appliance : unit -> Stunnel.verification_config option +(** [appliance ()] returns the configuration that's meant to be used to connect + to appliances providing services, like WLB or a licensing server. *) val external_host : string -> Stunnel.verification_config option +(** [external_host path] returns the configuration that's meant to be used to connect to + a xapi hosts outside the pool. This is useful, for example, to provide an + update repository to download updates from. *) diff --git a/ocaml/libs/xapi-stdext/lib/xapi-stdext-std/listext.mli b/ocaml/libs/xapi-stdext/lib/xapi-stdext-std/listext.mli index ece679d5423..b58b8d84016 100644 --- a/ocaml/libs/xapi-stdext/lib/xapi-stdext-std/listext.mli +++ b/ocaml/libs/xapi-stdext/lib/xapi-stdext-std/listext.mli @@ -154,7 +154,8 @@ module List : sig val set_equiv : 'a list -> 'a list -> bool val set_difference : 'a list -> 'a list -> 'a list - (** Returns the set difference of two lists *) + (** [set_difference a b] returns the list with the elements in [a] that are + not present in [b] *) val intersect : 'a list -> 'a list -> 'a list (** Returns the intersection of two lists. *) diff --git a/ocaml/libs/xenctrl-ext/xenctrlext.ml b/ocaml/libs/xenctrl-ext/xenctrlext.ml index 31360870fbc..abdab1dbbb2 100644 --- a/ocaml/libs/xenctrl-ext/xenctrlext.ml +++ b/ocaml/libs/xenctrl-ext/xenctrlext.ml @@ -124,12 +124,20 @@ end exception Not_available let domain_claim_pages handle domid ?(numa_node = NumaNode.none) nr_pages = - if numa_node <> NumaNode.none then raise Not_available ; stub_domain_claim_pages handle domid numa_node nr_pages +module HostNuma = struct + (* Numa state of a host *) + + type node_meminfo = {size: int64; free: int64; claimed: int64} + + external numa_get_meminfo : handle -> node_meminfo array + = "stub_xenctrlext_numa_meminfo" +end + let get_nr_nodes handle = - let info = numainfo handle in - Array.length info.memory + let meminfo = HostNuma.numa_get_meminfo handle in + Array.length meminfo module DomainNuma = struct (* Numa state of a domain *) @@ -157,7 +165,7 @@ module DomainNuma = struct let nodes = Array.fold_left (fun n pages -> - if pages > 0L then + if pages > 4096L then n + 1 else n diff --git a/ocaml/libs/xenctrl-ext/xenctrlext.mli b/ocaml/libs/xenctrl-ext/xenctrlext.mli index 11d6ad8ad66..435631a84ff 100644 --- a/ocaml/libs/xenctrl-ext/xenctrlext.mli +++ b/ocaml/libs/xenctrl-ext/xenctrlext.mli @@ -109,7 +109,21 @@ val get_nr_nodes : handle -> int (** Returns the count of NUMA nodes available in the system. *) module DomainNuma : sig + type domain_numainfo_node_pages = { + tot_pages_per_node: int64 array (* page=4k bytes *) + } + + external domain_get_numa_info_node_pages : + handle -> int -> domain_numainfo_node_pages + = "stub_xc_domain_numa_get_node_pages_wrapper" + type t = {optimised: bool; nodes: int; memory: int64 array (* bytes *)} val state : handle -> domid:int -> t end + +module HostNuma : sig + type node_meminfo = {size: int64; free: int64; claimed: int64} + + val numa_get_meminfo : handle -> node_meminfo array +end diff --git a/ocaml/libs/xenctrl-ext/xenctrlext_stubs.c b/ocaml/libs/xenctrl-ext/xenctrlext_stubs.c index 5da78215acc..2577239b561 100644 --- a/ocaml/libs/xenctrl-ext/xenctrlext_stubs.c +++ b/ocaml/libs/xenctrl-ext/xenctrlext_stubs.c @@ -678,14 +678,15 @@ CAMLprim value stub_xenctrlext_domain_claim_pages(value xch_val, value domid_val value numa_node_val, value nr_pages_val) { CAMLparam4(xch_val, domid_val, numa_node_val, nr_pages_val); +#ifdef XEN_DOMCTL_NUMA_OP_GET_NODE_PAGES int retval, the_errno; xc_interface* xch = xch_of_val(xch_val); uint32_t domid = Int_val(domid_val); - // unsigned int numa_node = Int_val(numa_node_val); + unsigned int numa_node = Int_val(numa_node_val); unsigned long nr_pages = Long_val(nr_pages_val); caml_release_runtime_system(); - retval = xc_domain_claim_pages(xch, domid, /*numa_node,*/ nr_pages); + retval = xc_domain_claim_pages_node(xch, domid, numa_node, nr_pages); the_errno = errno; caml_acquire_runtime_system(); @@ -693,6 +694,9 @@ CAMLprim value stub_xenctrlext_domain_claim_pages(value xch_val, value domid_val raise_unix_errno_msg(the_errno, "Error when trying to claim memory pages"); } +#else + raise_unix_errno_msg(ENOSYS, "xc_domain_claim_pages_node"); +#endif CAMLreturn(Val_unit); } @@ -711,6 +715,104 @@ CAMLprim value stub_xc_domain_numa_get_node_pages_wrapper(value xch_val, value d #endif } +/* + * Get NUMA memory info with claimed memory support + * + * Falls back to previous xc_numainfo with claimed=0 + * if XEN_SYSCTL_numa_meminfo is not available at compile time or runtime + */ +CAMLprim value stub_xenctrlext_numa_meminfo(value xch_val) +{ + CAMLparam1(xch_val); + CAMLlocal2(result, info); + xc_interface *xch = xch_of_val(xch_val); + unsigned int max_nodes = 0; + unsigned int i; + int ret; + +#ifdef XEN_SYSCTL_numa_meminfo + xen_sysctl_node_meminfo_t *meminfo = NULL; + + /* First call to get node count */ + caml_release_runtime_system(); + ret = xc_numa_meminfo(xch, &max_nodes, NULL); + caml_acquire_runtime_system(); + + if (ret == 0) { + /* New hypercall available, use it */ + meminfo = calloc(max_nodes, sizeof(*meminfo)); + if (!meminfo) + caml_raise_out_of_memory(); + + caml_release_runtime_system(); + ret = xc_numa_meminfo(xch, &max_nodes, meminfo); + caml_acquire_runtime_system(); + + if (ret < 0) { + int err = errno; + free(meminfo); + errno = err; + failwith_xc(xch); + } + + result = caml_alloc_tuple(max_nodes); + for (i = 0; i < max_nodes; i++) { + info = caml_alloc_tuple(3); + Store_field(info, 0, caml_copy_int64(meminfo[i].size)); + Store_field(info, 1, caml_copy_int64(meminfo[i].free)); + Store_field(info, 2, caml_copy_int64(meminfo[i].claimed)); + Store_field(result, i, info); + } + + free(meminfo); + CAMLreturn(result); + } + + /* If we get ENOSYS or EOPNOTSUPP, fall back to old hypercall */ + if (errno != ENOSYS && errno != EOPNOTSUPP) + failwith_xc(xch); +#endif + + /* Fallback: use xc_numainfo with claimed=0 */ + { + xc_meminfo_t *old_meminfo = NULL; + + caml_release_runtime_system(); + ret = xc_numainfo(xch, &max_nodes, NULL, NULL); + caml_acquire_runtime_system(); + + if (ret < 0) + failwith_xc(xch); + + old_meminfo = calloc(max_nodes, sizeof(*old_meminfo)); + if (!old_meminfo) + caml_raise_out_of_memory(); + + caml_release_runtime_system(); + ret = xc_numainfo(xch, &max_nodes, old_meminfo, NULL); + caml_acquire_runtime_system(); + + if (ret < 0) { + int err = errno; + free(old_meminfo); + errno = err; + failwith_xc(xch); + } + + result = caml_alloc_tuple(max_nodes); + for (i = 0; i < max_nodes; i++) { + info = caml_alloc_tuple(3); + Store_field(info, 0, caml_copy_int64(old_meminfo[i].memsize)); + Store_field(info, 1, caml_copy_int64(old_meminfo[i].memfree)); + Store_field(info, 2, caml_copy_int64(0)); /* claimed=0 */ + Store_field(result, i, info); + } + + free(old_meminfo); + CAMLreturn(result); + } +} + /* * Local variables: diff --git a/ocaml/qcow-stream-tool/qcow_stream_tool.ml b/ocaml/qcow-stream-tool/qcow_stream_tool.ml index 41b57c9a366..b8605c2f44e 100644 --- a/ocaml/qcow-stream-tool/qcow_stream_tool.ml +++ b/ocaml/qcow-stream-tool/qcow_stream_tool.ml @@ -12,17 +12,18 @@ module Impl = struct let* virtual_size, cluster_bits, _, data_cluster_map = Qcow_stream.start_stream_decode fd in - let clusters = Qcow_types.Cluster.Map.bindings data_cluster_map in + (* TODO: List.map becomes tail-recursive in OCaml 5.1, and could be used here instead *) let clusters = - List.map - (fun (_, virt_address) -> + data_cluster_map + |> Qcow_types.Cluster.Map.to_seq + |> Seq.map (fun (_, virt_address) -> let ( >> ) = Int64.shift_right_logical in let address = Int64.to_int (virt_address >> Int32.to_int cluster_bits) in `Int address - ) - clusters + ) + |> List.of_seq in let json = `Assoc diff --git a/ocaml/squeezed/src/squeeze_xen.ml b/ocaml/squeezed/src/squeeze_xen.ml index 31bac6df75b..b4d9cc54172 100644 --- a/ocaml/squeezed/src/squeeze_xen.ml +++ b/ocaml/squeezed/src/squeeze_xen.ml @@ -60,8 +60,9 @@ let ( -* ) = Int64.sub let mib = 1024L -(** Same as xen commandline *) -let low_mem_emergency_pool = 1L ** mib +(** Same as xen commandline + CA-423173: this is `low_mem_virq_limit` Xen, default 64MiB *) +let low_mem_emergency_pool = 64L ** mib (** Return the extra amount we always add onto maxmem *) let xen_max_offset_kib domain_type = diff --git a/ocaml/tests/suite_alcotest.ml b/ocaml/tests/suite_alcotest.ml index 5cbd192e9a3..62deb8e71ae 100644 --- a/ocaml/tests/suite_alcotest.ml +++ b/ocaml/tests/suite_alcotest.ml @@ -62,7 +62,6 @@ let () = @ Test_vm.tests @ Test_dbsync_master.tests @ Test_pvs_cache_storage.tests - @ Test_extauth_plugin_ADpbis.tests @ Test_helpers.tests @ Test_datamodel_lifecycle.tests @ Test_psr.tests diff --git a/ocaml/tests/test_extauth_plugin_ADpbis.ml b/ocaml/tests/test_extauth_plugin_ADpbis.ml deleted file mode 100644 index cab78883907..00000000000 --- a/ocaml/tests/test_extauth_plugin_ADpbis.ml +++ /dev/null @@ -1,132 +0,0 @@ -(* - * Copyright (C) Citrix Systems Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; version 2.1 only. with the special - * exception on linking described in file LICENSE. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - *) - -open Test_highlevel - -module PbisAuthErrorsCatch = Generic.MakeStateless (struct - module Io = struct - type input_t = string list - - type output_t = Auth_signature.auth_service_error_tag - - let string_of_input_t = Test_printers.(list string) - - let string_of_output_t output = - match output with - | Auth_signature.E_GENERIC -> - "E_GENERIC" - | Auth_signature.E_LOOKUP -> - "E_LOOKUP" - | Auth_signature.E_DENIED -> - "E_DENIED" - | Auth_signature.E_CREDENTIALS -> - "E_CREDENTIALS" - | Auth_signature.E_UNAVAILABLE -> - "E_UNAVAILABLE" - | Auth_signature.E_INVALID_OU -> - "E_INVALID_OU" - | Auth_signature.E_INVALID_ACCOUNT -> - "E_INVALID_ACCOUNT" - end - - let transform = Extauth_plugin_ADpbis.match_error_tag - - let tests = - `QuickAndAutoDocumented - [ - ([], Auth_signature.E_GENERIC) - ; ([""; ""], Auth_signature.E_GENERIC) - ; ([""; "some words"], Auth_signature.E_GENERIC) - ; ([""; "DNS_ERROR_BAD_PACKET"], Auth_signature.E_LOOKUP) - ; ([""; "LW_ERROR_PASSWORD_MISMATCH"], Auth_signature.E_CREDENTIALS) - ; ([""; "LW_ERROR_INVALID_ACCOUNT"], Auth_signature.E_INVALID_ACCOUNT) - ; ([""; "LW_ERROR_ACCESS_DENIED"], Auth_signature.E_DENIED) - ; ([""; "LW_ERROR_DOMAIN_IS_OFFLINE"], Auth_signature.E_UNAVAILABLE) - ; ([""; "LW_ERROR_INVALID_OU"], Auth_signature.E_INVALID_OU) - ; ([""; "prefixDNS_ERROR_BAD_PACKETsuffix"], Auth_signature.E_GENERIC) - ; ([""; "prefix_DNS_ERROR_BAD_PACKET_suffix"], Auth_signature.E_GENERIC) - ; ([""; "prefix(DNS_ERROR_BAD_PACKET)suffix"], Auth_signature.E_LOOKUP) - ; ([""; "prefix.DNS_ERROR_BAD_PACKET.suffix"], Auth_signature.E_LOOKUP) - ; ([""; "prefix DNS_ERROR_BAD_PACKET suffix"], Auth_signature.E_LOOKUP) - ; ([""; "prefix\tDNS_ERROR_BAD_PACKET\tsuffix"], Auth_signature.E_LOOKUP) - ] -end) - -module PbisExtractSid = Generic.MakeStateless (struct - module Io = struct - type input_t = (string * string) list - - type output_t = string list - - let string_of_input_t = Test_printers.(list (pair string string)) - - let string_of_output_t = Test_printers.(list string) - end - - let transform = Extauth_plugin_ADpbis.extract_sid_from_group_list - - let tests = - `QuickAndAutoDocumented - [ - ([(" ", " ")], []) - ; ([("Exception", "Remote connection shutdown!")], []) - ; ( [ - ("Number of groups found for user 'testAD@BLE'", "0") - ; ("Error", "No record found") - ] - , [] - ) - ; ( [ - ("Number of groups found for user 'admin@NVC'", "1") - ; ( "" - , "Group[1 of 1] name = NVC\\testg(ab) (gid = 564135020, sid = \ - S-1-5-21-1171552557-368733809-2946345504-1132)" - ) - ] - , ["S-1-5-21-1171552557-368733809-2946345504-1132"] - ) - ; ( [ - ("Number of groups found for user 'cnk3@UN'", "1") - ; ( "" - , "Group[1 of 1] name = UN\\KnmOJ (gid = 492513842, sid = \ - S-1-5-31-5921451325-154521381-3135732118-4527)" - ) - ] - , ["S-1-5-31-5921451325-154521381-3135732118-4527"] - ) - ; ( [ - ("Number of groups found for user 'test@testdomain'", "2") - ; ( "" - , "Group[1 of 2] name = testdomain\\dnsadmins (gid = 580912206, \ - sid = S-1-5-21-791009147-1041474540-2433379237-1102)" - ) - ; ( "" - , "Group[2 of 2] name = testdomain\\domain+users (gid = 580911617, \ - sid = S-1-5-21-791009147-1041474540-2433379237-513)" - ) - ] - , [ - "S-1-5-21-791009147-1041474540-2433379237-1102" - ; "S-1-5-21-791009147-1041474540-2433379237-513" - ] - ) - ] -end) - -let tests = - make_suite "extauth_ADpbis_" - [ - ("pbis_auth_errors_catch", PbisAuthErrorsCatch.tests) - ; ("pbis_extract_sid", PbisExtractSid.tests) - ] diff --git a/ocaml/tests/test_pkg_mgr.ml b/ocaml/tests/test_pkg_mgr.ml index 3593e01d014..8fd7bd38f40 100644 --- a/ocaml/tests/test_pkg_mgr.ml +++ b/ocaml/tests/test_pkg_mgr.ml @@ -159,6 +159,29 @@ let test_dnf_apply_upgrades = ) ] +let test_dnf_apply_group_upgrades = + [ + ( "" + , `Quick + , check + { + cmd= !Xapi_globs.dnf_cmd + ; params= + [ + "-y" + ; "--disablerepo=*" + ; "--enablerepo=testrepo1,testrepo2" + ; "group" + ; "upgrade" + ; "*" + ] + } + (Pkg_mgr.Dnf_cmd.apply_group_upgrade + ~repositories:["testrepo1"; "testrepo2"] + ) + ) + ] + let test_yum_repo_query_installed = [ ( "" @@ -286,6 +309,29 @@ let test_yum_apply_upgrades = ) ] +let test_yum_apply_group_upgrades = + [ + ( "" + , `Quick + , check + { + cmd= !Xapi_globs.yum_cmd + ; params= + [ + "-y" + ; "--disablerepo=*" + ; "--enablerepo=testrepo1,testrepo2" + ; "group" + ; "upgrade" + ; "*" + ] + } + (Pkg_mgr.Yum_cmd.apply_group_upgrade + ~repositories:["testrepo1"; "testrepo2"] + ) + ) + ] + let test_yum_repo_query_updates = [ ( "" @@ -320,6 +366,7 @@ let tests = ; ("test_dnf_cofig_repo", test_dnf_config_repo) ; ("test_dnf_sync_repo", test_dnf_sync_repo) ; ("test_dnf_apply_upgrades", test_dnf_apply_upgrades) + ; ("test_dnf_apply_group_upgrades", test_dnf_apply_group_upgrades) ; ("test_yum_repo_query_installed", test_yum_repo_query_installed) ; ("test_yum_clean_all_cache", test_yum_clean_all_cache) ; ("test_yum_clean_repo_cache", test_yum_clean_repo_cache) @@ -327,6 +374,7 @@ let tests = ; ("test_yum_cofig_repo", test_yum_config_repo) ; ("test_yum_sync_repo", test_yum_sync_repo) ; ("test_yum_apply_upgrades", test_yum_apply_upgrades) + ; ("test_yum_apply_group_upgrades", test_yum_apply_group_upgrades) ; ("test_yum_repo_query_updates", test_yum_repo_query_updates) ] diff --git a/ocaml/xapi-cli-server/cli_operations.ml b/ocaml/xapi-cli-server/cli_operations.ml index b20ed934107..e0921800b21 100644 --- a/ocaml/xapi-cli-server/cli_operations.ml +++ b/ocaml/xapi-cli-server/cli_operations.ml @@ -7089,7 +7089,7 @@ let host_disable_external_auth _printer rpc session_id params = let host_uuid = List.assoc "host-uuid" params in let host = Client.Host.get_by_uuid ~rpc ~session_id ~uuid:host_uuid in let config = read_map_params "config" params in - Client.Host.disable_external_auth ~rpc ~session_id ~host ~config + Client.Host.disable_external_auth ~rpc ~session_id ~host ~config ~force:true let host_refresh_pack_info _printer rpc session_id params = let host_uuid = List.assoc "host-uuid" params in diff --git a/ocaml/xapi-cli-server/cli_progress_bar.ml b/ocaml/xapi-cli-server/cli_progress_bar.ml index 8febbc248aa..76e7f1f9fd3 100644 --- a/ocaml/xapi-cli-server/cli_progress_bar.ml +++ b/ocaml/xapi-cli-server/cli_progress_bar.ml @@ -25,7 +25,7 @@ module Make (T : Floatable) = struct ; width: int ; line: bytes ; mutable spin_index: int - ; start_time: float + ; start_time: Mtime_clock.counter ; mutable summarised: bool } @@ -44,7 +44,7 @@ module Make (T : Floatable) = struct String.blit prefix_s 0 line 0 prefix ; String.blit suffix_s 0 line (width - suffix - 1) suffix ; let spin_index = 0 in - let start_time = Unix.gettimeofday () in + let start_time = Mtime_clock.counter () in { max_value ; current_value @@ -55,6 +55,9 @@ module Make (T : Floatable) = struct ; summarised= false } + let elapsed t = + 1e-9 *. (Mtime_clock.count t.start_time |> Mtime.Span.to_float_ns) + let percent t = int_of_float T.(to_float t.current_value /. to_float t.max_value *. 100.) @@ -70,10 +73,20 @@ module Make (T : Floatable) = struct let h = secs / 3600 in let m = secs mod 3600 / 60 in let s = secs mod 60 in - Printf.sprintf "%02d:%02d:%02d" h m s + let str = Printf.sprintf "%02d:%02d:%02d" h m s in + if String.length str > 8 then + (* negative or > 99 hours *) + let str = Printf.sprintf "%05d:%02d" h m in + if String.length str > 8 then + (* still too long, >11 years *) + "++:++:++" + else + str + else + str let eta t = - let time_so_far = Unix.gettimeofday () -. t.start_time in + let time_so_far = elapsed t in let total_time = T.(to_float t.max_value /. to_float t.current_value) *. time_so_far in @@ -108,8 +121,8 @@ module Make (T : Floatable) = struct let summarise t = if not t.summarised then ( t.summarised <- true ; - Printf.sprintf "Total time: %s\n" - (hms (int_of_float (Unix.gettimeofday () -. t.start_time))) + Format.asprintf "Total time: %a@." Mtime.Span.pp + (Mtime_clock.count t.start_time) ) else "" end diff --git a/ocaml/xapi-cli-server/dune b/ocaml/xapi-cli-server/dune index fb2a713dcf2..b3c014ec56f 100644 --- a/ocaml/xapi-cli-server/dune +++ b/ocaml/xapi-cli-server/dune @@ -8,9 +8,16 @@ (run %{gen} utils --filter-internal --filter closed))) ) +(library + (name cli_progress_bar) + (modules cli_progress_bar) + (libraries mtime mtime.clock.os) + ) + (library (name xapi_cli_server) (modes best) + (modules (:standard \ cli_progress_bar)) (libraries astring base64 @@ -37,6 +44,7 @@ xapi-client xapi-cli-protocol xapi_aux + cli_progress_bar clock xapi-stdext-pervasives xapi-stdext-std diff --git a/ocaml/xapi-cli-server/records.ml b/ocaml/xapi-cli-server/records.ml index d7f3cdf421d..9e076ac23d7 100644 --- a/ocaml/xapi-cli-server/records.ml +++ b/ocaml/xapi-cli-server/records.ml @@ -1520,7 +1520,7 @@ let pool_record rpc session_id pool = ~get:(fun () -> get_from_map (x ()).API.pool_recommendations) ~get_map:(fun () -> (x ()).API.pool_recommendations) () - ; make_field ~name:"ext-auth-cache-enabled" ~hidden:true + ; make_field ~name:"ext-auth-cache-enabled" ~hidden:false ~get:(fun () -> (x ()).API.pool_ext_auth_cache_enabled |> string_of_bool ) @@ -1529,14 +1529,14 @@ let pool_record rpc session_id pool = ~value:(bool_of_string v) ) () - ; make_field ~name:"ext-auth-cache-size" ~hidden:true + ; make_field ~name:"ext-auth-cache-size" ~hidden:false ~get:(fun () -> (x ()).API.pool_ext_auth_cache_size |> Int64.to_string) ~set:(fun v -> Client.Pool.set_ext_auth_cache_size ~rpc ~session_id ~self:pool ~value:(Int64.of_string v) ) () - ; make_field ~name:"ext-auth-cache-expiry" ~hidden:true + ; make_field ~name:"ext-auth-cache-expiry" ~hidden:false ~get:(fun () -> (x ()).API.pool_ext_auth_cache_expiry |> Int64.to_string ) diff --git a/ocaml/xapi-idl/memory/memory.ml b/ocaml/xapi-idl/memory/memory.ml index 99951f7e3e8..9c30f9fa06f 100644 --- a/ocaml/xapi-idl/memory/memory.ml +++ b/ocaml/xapi-idl/memory/memory.ml @@ -185,6 +185,7 @@ type memory_config = { ; shadow_mib: int64 ; required_host_free_mib: int64 ; overhead_mib: int64 + ; build_claim_pages: int64 } module Memory_model (D : MEMORY_MODEL_DATA) = struct @@ -218,6 +219,8 @@ module Memory_model (D : MEMORY_MODEL_DATA) = struct D.extra_internal_mib +++ D.extra_external_mib +++ shadow_mib static_max_mib vcpu_count multiplier + (* CA-423172: Xen uses some extra memory/vCPU *) + +++ mib_of_pages_used (38L *** Int64.of_int vcpu_count) +++ D.shim_mib static_max_mib let footprint_mib target_mib static_max_mib vcpu_count multiplier = @@ -226,14 +229,16 @@ module Memory_model (D : MEMORY_MODEL_DATA) = struct let shadow_multiplier_default = 1.0 let full_config static_max_mib video_mib target_mib vcpus shadow_multiplier = + let build_start_mib = build_start_mib static_max_mib target_mib video_mib in { build_max_mib= build_max_mib static_max_mib video_mib - ; build_start_mib= build_start_mib static_max_mib target_mib video_mib + ; build_start_mib ; xen_max_mib= xen_max_mib static_max_mib ; shadow_mib= shadow_mib static_max_mib vcpus shadow_multiplier ; required_host_free_mib= footprint_mib target_mib static_max_mib vcpus shadow_multiplier ; overhead_mib= overhead_mib static_max_mib vcpus shadow_multiplier + ; build_claim_pages= pages_of_mib build_start_mib } end diff --git a/ocaml/xapi/extauth_ad.ml b/ocaml/xapi/extauth_ad.ml index b6b2cca66c1..5218b426cb5 100644 --- a/ocaml/xapi/extauth_ad.ml +++ b/ocaml/xapi/extauth_ad.ml @@ -22,11 +22,9 @@ open D module AD_type = struct exception Unknown_AD_type of string - type t = Pbis | Winbind + type t = Winbind let of_string = function - | "pbis" -> - Pbis | "winbind" -> Winbind | _ as at -> @@ -36,20 +34,14 @@ end module AD = struct let start_backend_daemon ~wait_until_success = function - | AD_type.Pbis -> - Extauth_plugin_ADpbis.Lwsmd.start ~wait_until_success ~timeout:5. | AD_type.Winbind -> Extauth_plugin_ADwinbind.Winbind.start ~wait_until_success ~timeout:5. let stop_backend_daemon ~wait_until_success = function - | AD_type.Pbis -> - Extauth_plugin_ADpbis.Lwsmd.stop ~wait_until_success ~timeout:3. | AD_type.Winbind -> Extauth_plugin_ADwinbind.Winbind.stop ~wait_until_success ~timeout:3. let init_service ~__context = function - | AD_type.Pbis -> - Extauth_plugin_ADpbis.Lwsmd.init_service ~__context | AD_type.Winbind -> Extauth_plugin_ADwinbind.Winbind.init_service ~__context end @@ -71,7 +63,5 @@ let init_service ~__context = let methods () = match !Xapi_globs.extauth_ad_backend |> AD_type.of_string with - | Pbis -> - Extauth_plugin_ADpbis.AuthADlw.methods | Winbind -> Extauth_plugin_ADwinbind.AuthADWinbind.methods diff --git a/ocaml/xapi/extauth_plugin_ADpbis.ml b/ocaml/xapi/extauth_plugin_ADpbis.ml deleted file mode 100644 index 63a5ab7ac81..00000000000 --- a/ocaml/xapi/extauth_plugin_ADpbis.ml +++ /dev/null @@ -1,1201 +0,0 @@ -(* - * Copyright (C) 2006-2009 Citrix Systems Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; version 2.1 only. with the special - * exception on linking described in file LICENSE. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - *) -(** - * @group Access Control -*) - -module D = Debug.Make (struct let name = "extauth_plugin_ADpbis" end) - -open D -open Xapi_stdext_std.Xstringext - -let ( let@ ) = ( @@ ) - -let finally = Xapi_stdext_pervasives.Pervasiveext.finally - -let with_lock = Xapi_stdext_threads.Threadext.Mutex.execute - -let lwsmd_service = "lwsmd" - -module Lwsmd = struct - (* This can be refined by Mtime.Span.hour when mtime is updated to 1.4.0 *) - let restart_interval = Int64.mul 3600L 1000000000L |> Mtime.Span.of_uint64_ns - - let next_check_point = - Mtime.add_span (Mtime_clock.now ()) restart_interval |> ref - - let is_ad_enabled ~__context = - ( Helpers.get_localhost ~__context |> fun self -> - Db.Host.get_external_auth_type ~__context ~self - ) - |> fun x -> x = Xapi_globs.auth_type_AD - - let enable_nsswitch () = - try - ignore - (Forkhelpers.execute_command_get_output - !Xapi_globs.domain_join_cli_cmd - ["configure"; "--enable"; "nsswitch"] - ) - with e -> - error "Fail to run %s with error %s" - !Xapi_globs.domain_join_cli_cmd - (ExnHelper.string_of_exn e) - - let stop ~timeout ~wait_until_success = - Xapi_systemctl.stop ~timeout ~wait_until_success lwsmd_service - - let start ~timeout ~wait_until_success = - Xapi_systemctl.start ~timeout ~wait_until_success lwsmd_service - - let restart ~timeout ~wait_until_success = - Xapi_systemctl.restart ~timeout ~wait_until_success lwsmd_service - - let restart_on_error () = - (* Only restart once within restart_interval *) - let now = Mtime_clock.now () in - match !next_check_point with - | Some check_point -> - if Mtime.is_later now ~than:check_point then ( - debug "Restart %s due to local server error" lwsmd_service ; - next_check_point := Mtime.add_span now restart_interval ; - restart ~timeout:0. ~wait_until_success:false - ) - | None -> - debug "next_check_point overflow" - - let init_service ~__context = - (* This function is called during xapi start *) - (* it will start lwsmd service if the host is authed with AD *) - (* Xapi does not wait lwsmd service to boot up success as following reasons - * 1. The waiting will slow down xapi bootup - * 2. Xapi still needs to boot up even lwsmd bootup fail - * 3. Xapi does not need to use lwsmd functionality during its bootup *) - if is_ad_enabled ~__context then ( - restart ~wait_until_success:false ~timeout:5. ; - (* Xapi help to enable nsswitch during bootup if it find the host is authed with AD - * nsswitch will be automatically enabled with command domainjoin-cli - * but this enabling is necessary when the host authed with AD upgrade - * As it will not run the domainjoin-cli command again *) - enable_nsswitch () - ) -end - -let match_error_tag (lines : string list) = - let err_catch_list = - [ - ("DNS_ERROR_BAD_PACKET", Auth_signature.E_LOOKUP) - ; ("LW_ERROR_PASSWORD_MISMATCH", Auth_signature.E_CREDENTIALS) - ; ("LW_ERROR_INVALID_ACCOUNT", Auth_signature.E_INVALID_ACCOUNT) - ; ("LW_ERROR_ACCESS_DENIED", Auth_signature.E_DENIED) - ; ("LW_ERROR_DOMAIN_IS_OFFLINE", Auth_signature.E_UNAVAILABLE) - ; ("LW_ERROR_INVALID_OU", Auth_signature.E_INVALID_OU) - (* More errors to be caught here *) - ] - in - let split_to_words str = - let seps = ['('; ')'; ' '; '\t'; '.'] in - String.split_f (fun s -> List.exists (fun sep -> sep = s) seps) str - in - let rec has_err lines err_pattern = - match lines with - | [] -> - false - | line :: rest -> ( - try - ignore (List.find (fun w -> w = err_pattern) (split_to_words line)) ; - true - with Not_found -> has_err rest err_pattern - ) - in - try - let _, errtag = - List.find - (fun (err_pattern, _) -> has_err lines err_pattern) - err_catch_list - in - errtag - with Not_found -> Auth_signature.E_GENERIC - -let extract_sid_from_group_list group_list = - List.map - (fun (_, v) -> - let v = String.replace ")" "" v in - let v = String.replace "sid =" "|" v in - let vs = String.split_f (fun c -> c = '|') v in - let sid = String.trim (List.nth vs 1) in - debug "extract_sid_from_group_list get sid=[%s]" sid ; - sid - ) - (List.filter (fun (n, _) -> n = "") group_list) - -let start_damon () = - try Lwsmd.start ~timeout:5. ~wait_until_success:true - with _ -> - raise - (Auth_signature.Auth_service_error - ( Auth_signature.E_GENERIC - , Printf.sprintf "Failed to start %s" lwsmd_service - ) - ) - -module AuthADlw : Auth_signature.AUTH_MODULE = struct - (* - * External Authentication Plugin component - * using AD/Pbis as a backend - * v1 14Nov14 phus.lu@citrix.com - * - *) - - let user_friendly_error_msg = - "The Active Directory Plug-in could not complete the command. Additional \ - information in the logs." - - let mutex_check_availability = - Locking_helpers.Named_mutex.create "IS_SERVER_AVAILABLE" - - let splitlines s = - String.split_f (fun c -> c = '\n') (String.replace "#012" "\n" s) - - let pbis_common_with_password (password : string) (pbis_cmd : string) - (pbis_args : string list) = - let debug_cmd = - pbis_cmd ^ " " ^ List.fold_left (fun p pp -> p ^ " " ^ pp) " " pbis_args - in - try - debug "execute %s" debug_cmd ; - let env = [|"PASSWORD=" ^ password|] in - let _ = Forkhelpers.execute_command_get_output ~env pbis_cmd pbis_args in - [] - with - | Forkhelpers.Spawn_internal_error (stderr, stdout, Unix.WEXITED n) -> - error "execute %s exited with code %d [stdout = '%s'; stderr = '%s']" - debug_cmd n stdout stderr ; - let lines = - List.filter - (fun l -> String.length l > 0) - (splitlines (stdout ^ stderr)) - in - let errmsg = List.hd (List.rev lines) in - let errtag = match_error_tag lines in - raise (Auth_signature.Auth_service_error (errtag, errmsg)) - | e -> - error "execute %s exited: %s" debug_cmd (ExnHelper.string_of_exn e) ; - raise - (Auth_signature.Auth_service_error - (Auth_signature.E_GENERIC, user_friendly_error_msg) - ) - - let pbis_config (name : string) (value : string) = - let pbis_cmd = "/opt/pbis/bin/config" in - let pbis_args = [name; value] in - let debug_cmd = pbis_cmd ^ " " ^ name ^ " " ^ value in - try - debug "execute %s" debug_cmd ; - let _ = Forkhelpers.execute_command_get_output pbis_cmd pbis_args in - () - with - | Forkhelpers.Spawn_internal_error (stderr, stdout, Unix.WEXITED n) -> - error "execute %s exited with code %d [stdout = '%s'; stderr = '%s']" - debug_cmd n stdout stderr ; - let lines = - List.filter - (fun l -> String.length l > 0) - (splitlines (stdout ^ stderr)) - in - let errmsg = List.hd (List.rev lines) in - raise - (Auth_signature.Auth_service_error (Auth_signature.E_GENERIC, errmsg)) - | e -> - error "execute %s exited: %s" debug_cmd (ExnHelper.string_of_exn e) ; - raise - (Auth_signature.Auth_service_error - (Auth_signature.E_GENERIC, user_friendly_error_msg) - ) - - let ensure_pbis_configured () = - pbis_config "SpaceReplacement" "+" ; - pbis_config "CreateHomeDir" "false" ; - pbis_config "SyncSystemTime" "false" ; - pbis_config "LdapSignAndSeal" "true" ; - pbis_config "CacheEntryExpiry" "300" ; - () - - let pbis_common ?(stdin_string = "") (pbis_cmd : string) - (pbis_args : string list) = - let debug_cmd = - pbis_cmd ^ " " ^ List.fold_left (fun p pp -> p ^ " " ^ pp) " " pbis_args - in - let debug_cmd = - if String.has_substr debug_cmd "--password" then - "(omitted for security)" - else - debug_cmd - in - (* stuff to clean up on the way out of the function: *) - let fds_to_close = ref [] in - let files_to_unlink = ref [] in - (* take care to close an fd only once *) - let close_fd fd = - if List.mem fd !fds_to_close then ( - Unix.close fd ; - fds_to_close := List.filter (fun x -> x <> fd) !fds_to_close - ) - in - (* take care to unlink a file only once *) - let unlink_file filename = - if List.mem filename !files_to_unlink then ( - Unix.unlink filename ; - files_to_unlink := List.filter (fun x -> x <> filename) !files_to_unlink - ) - in - (* guarantee to release all resources (files, fds) *) - let finalize () = - List.iter close_fd !fds_to_close ; - List.iter unlink_file !files_to_unlink - in - let finally_finalize f = finally f finalize in - let exited_code = ref 0 in - let output = ref "" in - finally_finalize (fun () -> - let _ = - try - debug "execute %s" debug_cmd ; - (* creates pipes between xapi and pbis process *) - let in_readme, in_writeme = Unix.pipe () in - fds_to_close := in_readme :: in_writeme :: !fds_to_close ; - let out_tmpfile = Filename.temp_file "pbis" ".out" in - files_to_unlink := out_tmpfile :: !files_to_unlink ; - let err_tmpfile = Filename.temp_file "pbis" ".err" in - files_to_unlink := err_tmpfile :: !files_to_unlink ; - let out_writeme = Unix.openfile out_tmpfile [Unix.O_WRONLY] 0o0 in - fds_to_close := out_writeme :: !fds_to_close ; - let err_writeme = Unix.openfile err_tmpfile [Unix.O_WRONLY] 0o0 in - fds_to_close := err_writeme :: !fds_to_close ; - let pid = - Forkhelpers.safe_close_and_exec (Some in_readme) - (Some out_writeme) (Some err_writeme) [] pbis_cmd pbis_args - in - finally - (fun () -> - debug "Created process pid %s for cmd %s" - (Forkhelpers.string_of_pidty pid) - debug_cmd ; - - (* Insert this delay to reproduce the cannot write to stdin bug: - Thread.delay 5.; *) - (* WARNING: we don't close the in_readme because otherwise in the case where the pbis - binary doesn't expect any input there is a race between it finishing (and closing the last - reference to the in_readme) and us attempting to write to in_writeme. If pbis wins the - race then our write will fail with EPIPE (Unix.error 31 in ocamlese). If we keep a reference - to in_readme then our write of "\n" will succeed. - - An alternative fix would be to not write anything when stdin_string = "" *) - - (* push stdin_string to recently created process' STDIN *) - try - (* usually, STDIN contains some sensitive data such as passwords that we do not want showing up in ps *) - (* or in the debug log via debug_cmd *) - let stdin_string = stdin_string ^ "\n" in - (*HACK:without \n, the pbis scripts don't return!*) - let (_ : int) = - Unix.write_substring in_writeme stdin_string 0 - (String.length stdin_string) - in - close_fd in_writeme - (* we need to close stdin, otherwise the unix cmd waits forever *) - with e -> - (* in_string is usually the password or other sensitive param, so never write it to debug or exn *) - debug "Error writing to stdin for cmd %s: %s" debug_cmd - (ExnHelper.string_of_exn e) ; - raise - (Auth_signature.Auth_service_error - (Auth_signature.E_GENERIC, ExnHelper.string_of_exn e) - ) - ) - (fun () -> - match Forkhelpers.waitpid pid with - | _, Unix.WEXITED n -> - exited_code := n ; - output := - Xapi_stdext_unix.Unixext.string_of_file out_tmpfile - ^ Xapi_stdext_unix.Unixext.string_of_file err_tmpfile - | _ -> - error "PBIS %s exit with WSTOPPED or WSIGNALED" debug_cmd ; - raise - (Auth_signature.Auth_service_error - (Auth_signature.E_GENERIC, user_friendly_error_msg) - ) - ) - with e -> - error "execute %s exited: %s" debug_cmd (ExnHelper.string_of_exn e) ; - raise - (Auth_signature.Auth_service_error - (Auth_signature.E_GENERIC, user_friendly_error_msg) - ) - in - if !exited_code <> 0 then ( - error "execute '%s': exit_code=[%d] output=[%s]" debug_cmd - !exited_code - (String.replace "\n" ";" !output) ; - let split_to_words s = - String.split_f (fun c -> c = '(' || c = ')' || c = '.' || c = ' ') s - in - let revlines = - List.rev - (List.filter (fun l -> String.length l > 0) (splitlines !output)) - in - let errmsg = List.hd revlines in - let errcodeline = - if List.length revlines > 1 then - List.nth revlines 1 - else - errmsg - in - let errcode = - List.hd - (List.filter - (fun w -> String.starts_with ~prefix:"LW_ERROR_" w) - (split_to_words errcodeline) - ) - in - debug "Pbis raised an error for cmd %s: (%s) %s" debug_cmd errcode - errmsg ; - match errcode with - | "LW_ERROR_INVALID_GROUP_INFO_LEVEL" -> - raise - (Auth_signature.Auth_service_error - (Auth_signature.E_GENERIC, errcode) - ) - (* For pbis_get_all_byid *) - | "LW_ERROR_NO_SUCH_USER" - | "LW_ERROR_NO_SUCH_GROUP" - | "LW_ERROR_NO_SUCH_OBJECT" -> - raise Not_found (* Subject_cannot_be_resolved *) - | "LW_ERROR_KRB5_CALL_FAILED" - | "LW_ERROR_PASSWORD_MISMATCH" - | "LW_ERROR_ACCOUNT_DISABLED" - | "LW_ERROR_NOT_HANDLED" -> - raise (Auth_signature.Auth_failure errmsg) - | "LW_ERROR_INVALID_OU" -> - raise - (Auth_signature.Auth_service_error - (Auth_signature.E_INVALID_OU, errmsg) - ) - | "LW_ERROR_INVALID_DOMAIN" -> - raise - (Auth_signature.Auth_service_error - (Auth_signature.E_GENERIC, errmsg) - ) - | "LW_ERROR_ERRNO_ECONNREFUSED" -> - (* CA-368806: Restart service to workaround pbis wedged *) - Lwsmd.restart_on_error () ; - raise - (Auth_signature.Auth_service_error - (Auth_signature.E_GENERIC, errmsg) - ) - | "LW_ERROR_LSA_SERVER_UNREACHABLE" | _ -> - raise - (Auth_signature.Auth_service_error - ( Auth_signature.E_GENERIC - , Printf.sprintf "(%s) %s" errcode errmsg - ) - ) - ) else - debug "execute %s: output length=[%d]" debug_cmd - (String.length !output) ; - let lines = - List.filter (fun l -> String.length l > 0) (splitlines !output) - in - let parse_line (acc, currkey) line = - let slices = String.split ~limit:2 ':' line in - debug "parse %s: currkey=[%s] line=[%s]" debug_cmd currkey line ; - if List.length slices > 1 then ( - let key = String.trim (List.hd slices) in - let value = String.trim (List.nth slices 1) in - debug "parse %s: key=[%s] value=[%s] currkey=[%s]" debug_cmd key - value currkey ; - if String.length value > 0 then - (acc @ [(key, value)], "") - else - (acc, key) - ) else - let key = currkey in - let value = String.trim line in - debug "parse %s: key=[%s] value=[%s] currkey=[%s]" debug_cmd key - value currkey ; - (acc @ [(key, value)], currkey) - in - let attrs, _ = List.fold_left parse_line ([], "") lines in - attrs - ) - - (* assoc list for caching pbis_common results, - item value is ((stdin_string, pbis_cmd, pbis_args), (unix_time, pbis_common_result)) - *) - let cache_of_pbis_common : - ((string * string * string list) * (float * (string * string) list)) list - ref = - ref [] - - let cache_of_pbis_common_m = Mutex.create () - - let pbis_common_with_cache ?(stdin_string = "") (pbis_cmd : string) - (pbis_args : string list) = - let expired = 120.0 in - let now = Unix.time () in - let cache_key = (stdin_string, pbis_cmd, pbis_args) in - let f () = - cache_of_pbis_common := - List.filter - (fun (_, (ts, _)) -> now -. ts < expired) - !cache_of_pbis_common ; - try - let _, result = List.assoc cache_key !cache_of_pbis_common in - debug "pbis_common_with_cache hit \"%s\" cache." pbis_cmd ; - result - with Not_found -> - let result = pbis_common ~stdin_string pbis_cmd pbis_args in - cache_of_pbis_common := - !cache_of_pbis_common @ [(cache_key, (Unix.time (), result))] ; - result - in - with_lock cache_of_pbis_common_m f - - let get_joined_domain_name () = - Server_helpers.exec_with_new_task "obtaining joined-domain name" - (fun __context -> - let host = Helpers.get_localhost ~__context in - (* the service_name always contains the domain name provided during domain-join *) - Db.Host.get_external_auth_service_name ~__context ~self:host - ) - - (* CP-842: when resolving AD usernames, make joined-domain prefix optional *) - let get_full_subject_name ?(use_nt_format = true) subject_name = - (* CA-27744: always use NT-style names by default *) - try - (* tests if the UPN account name separator @ is present in subject name *) - ignore (String.index subject_name '@') ; - (* we only reach this point if the separator @ is present in subject_name *) - (* nothing to do, we assume that subject_name already contains the domain name after @ *) - subject_name - with Not_found -> ( - try - (* if no UPN username separator @ was found *) - - (* tests if the NT account name separator \ is present in subject name *) - ignore (String.index subject_name '\\') ; - (* we only reach this point if the separator \ is present in subject_name *) - (* nothing to do, we assume that subject_name already contains the domain name before \ *) - subject_name - with Not_found -> - if - (* if neither the UPN separator @ nor the NT username separator \ was found *) - use_nt_format - then - (* the default: NT names is unique, whereas UPN ones are not (CA-27744) *) - (* we prepend the joined-domain name to the subjectname as an NT name: \ *) - get_joined_domain_name () ^ "\\" ^ subject_name - (* obs: (1) pbis accepts a fully qualified domain name with both formats and *) - (* (2) some pbis commands accept only the NT-format, such as find-group-by-name *) - else - (* UPN format not the default format (CA-27744) *) - (* we append the joined-domain name to the subjectname as a UPN name: @ *) - subject_name ^ "@" ^ get_joined_domain_name () - ) - - (* Converts from UPN format (user@domain.com) to legacy NT format (domain.com\user) *) - (* This function is a workaround to use find-group-by-name, which requires nt-format names) *) - (* For anything else, use the original UPN name *) - let convert_upn_to_nt_username subject_name = - try - (* test if the UPN account name separator @ is present in subject name *) - let i = String.index subject_name '@' in - (* we only reach this point if the separator @ is present in subject_name *) - (* when @ is present, we need to convert the UPN name to NT format *) - let user = String.sub subject_name 0 i in - let domain = - String.sub subject_name (i + 1) (String.length subject_name - i - 1) - in - domain ^ "\\" ^ user - with Not_found -> - (* if no UPN username separator @ was found *) - (* nothing to do in this case *) - subject_name - - let pbis_get_all_byid subject_id = - try - pbis_common_with_cache "/opt/pbis/bin/find-by-sid" - ["--level"; "2"; subject_id] - with - | Auth_signature.Auth_service_error - (Auth_signature.E_GENERIC, "LW_ERROR_INVALID_GROUP_INFO_LEVEL") - -> - pbis_common_with_cache "/opt/pbis/bin/find-by-sid" - ["--level"; "1"; subject_id] - - let pbis_get_group_sids_byname _subject_name = - let subject_name = get_full_subject_name _subject_name in - (* append domain if necessary *) - let subject_attrs = - pbis_common_with_cache "/opt/pbis/bin/list-groups-for-user" - ["--show-sid"; subject_name] - in - (* PBIS list-groups-for-user raw output like - Number of groups found for user 'test@testdomain' : 2 - Group[1 of 2] name = testdomain\dnsadmins (gid = 580912206, sid = S-1-5-21-791009147-1041474540-2433379237-1102) - Group[2 of 2] name = testdomain\domain+users (gid = 580911617, sid = S-1-5-21-791009147-1041474540-2433379237-513) - And pbis_common will return subject_attrs as - [("Number of groups found for user 'test@testdomain'", "2"), ("", line1), ("", line2) ... ("", lineN)] - *) - extract_sid_from_group_list subject_attrs - - (* general Pbis error *) - - let pbis_get_sid_byname _subject_name cmd = - let subject_name = get_full_subject_name _subject_name in - (* append domain if necessary *) - let subject_attrs = pbis_common cmd ["--level"; "1"; subject_name] in - (* find-user-by-name returns several lines. We ony need the SID *) - if List.mem_assoc "SID" subject_attrs then - List.assoc "SID" subject_attrs (* OK, return SID *) - else - (*no SID value returned*) - (* this should not have happend, pbis didn't return an SID field!! *) - let msg = - Printf.sprintf "Pbis didn't return an SID field for user %s" - subject_name - in - debug "Error pbis_get_sid_byname for subject name %s: %s" subject_name msg ; - raise (Auth_signature.Auth_service_error (Auth_signature.E_GENERIC, msg)) - - (* general Pbis error *) - - (* subject_id get_subject_identifier(string subject_name) - - Takes a subject_name (as may be entered into the XenCenter UI when defining subjects -- - see Access Control wiki page); and resolves it to a subject_id against the external - auth/directory service. - Raises Not_found (*Subject_cannot_be_resolved*) if authentication is not succesful. - *) - let get_subject_identifier ~__context _subject_name = - let@ __context = Context.with_tracing ~__context __FUNCTION__ in - - try - (* looks up list of users*) - let subject_name = get_full_subject_name _subject_name in - (* append domain if necessary *) - pbis_get_sid_byname subject_name "/opt/pbis/bin/find-user-by-name" - with _ -> - (* append domain if necessary, find-group-by-name only accepts nt-format names *) - let subject_name = - get_full_subject_name ~use_nt_format:true - (convert_upn_to_nt_username _subject_name) - in - (* looks up list of groups*) - pbis_get_sid_byname subject_name "/opt/pbis/bin/find-group-by-name" - - (* subject_id Authenticate_username_password(string username, string password) - - Takes a username and password, and tries to authenticate against an already configured - auth service (see XenAPI requirements Wiki page for details of how auth service configuration - takes place and the appropriate values are stored within the XenServer Metadata). - If authentication is successful then a subject_id is returned representing the account - corresponding to the supplied credentials (where the subject_id is in a namespace managed by - the auth module/service itself -- e.g. maybe a SID or something in the AD case). - Raises auth_failure if authentication is not successful - *) - - let authenticate_username_password ~__context username password = - let@ __context = Context.with_tracing ~__context __FUNCTION__ in - (* first, we try to authenticated user against our external user database *) - (* pbis_common will raise an Auth_failure if external authentication fails *) - let domain, user = - match String.split_f (fun c -> c = '\\') username with - | [domain; user] -> - (domain, user) - | [user] -> - (get_joined_domain_name (), user) - | _ -> - raise - (Auth_signature.Auth_service_error - (Auth_signature.E_GENERIC, "Invalid username " ^ username) - ) - in - let (_ : (string * string) list) = - pbis_common "/opt/pbis/bin/lsa" - [ - "authenticate-user" - ; "--user" - ; user - ; "--domain" - ; domain - ; "--password" - ; password - ] - in - (* no exception raised, then authentication succeeded, *) - (* now we return the authenticated user's id *) - get_subject_identifier ~__context (get_full_subject_name username) - - (* subject_id Authenticate_ticket(string ticket) - - As above but uses a ticket as credentials (i.e. for single sign-on) - *) - (* not implemented now, not needed for our tests, only for a *) - (* future single sign-on feature *) - let authenticate_ticket ~__context:_ _tgt = - failwith "extauth_plugin authenticate_ticket not implemented" - - (* ((string*string) list) query_subject_information(string subject_identifier) - - Takes a subject_identifier and returns the user record from the directory service as - key/value pairs. In the returned string*string map, there _must_ be a key called - subject_name that refers to the name of the account (e.g. the user or group name as may - be displayed in XenCenter). There is no other requirements to include fields from the user - record -- initially qI'd imagine that we wouldn't bother adding anything else here, but - it's a string*string list anyway for possible future expansion. - Raises Not_found (*Subject_cannot_be_resolved*) if subject_id cannot be resolved by external auth service - *) - let query_subject_information ~__context subject_identifier = - let@ __context = Context.with_tracing ~__context __FUNCTION__ in - let unmap_lw_space_chars lwname = - let defensive_copy = Bytes.of_string lwname in - (* CA-29006: map chars in names back to original space chars in windows-names *) - (* we use + as the pbis space-replacement because it's an invalid NT-username char in windows *) - (* the space-replacement char used by pbis is defined at /etc/pbis/lsassd.conf *) - let current_lw_space_replacement = '+' in - String.iteri - (fun i c -> - if c = current_lw_space_replacement then - Bytes.set defensive_copy i ' ' - else - () - ) - lwname ; - Bytes.unsafe_to_string defensive_copy - in - let get_value name ls = - if List.mem_assoc name ls then - List.assoc name ls - else - "" - in - let infolist = pbis_get_all_byid subject_identifier in - let subject_is_group = get_value "Uid" infolist = "" in - if subject_is_group then - (* subject is group *) - (* in this case, a few info fields are not available: UPN, Uid, Gecos, Account {disabled,expired,locked}, Password expired *) - [ - ("subject-name", unmap_lw_space_chars (get_value "Name" infolist)) - ; ("subject-gid", get_value "Gid" infolist) - ; ("subject-sid", get_value "SID" infolist) - ; ("subject-is-group", "true") - (*(* comma-separated list of subjects that are contained in this subject *) - ("contains-byname", List.fold_left (fun (n,v) m ->m^","^v) "" (List.filter (fun (n,v)->n="Members") infolist));*) - ] - else (* subject is user *) - let subject_name = unmap_lw_space_chars (get_value "Name" infolist) in - let subject_gecos = get_value "Gecos" infolist in - [ - ("subject-name", subject_name) - ; ("subject-upn", get_value "UPN" infolist) - ; ("subject-uid", get_value "Uid" infolist) - ; ("subject-gid", get_value "Gid" infolist) - ; ("subject-sid", get_value "SID" infolist) - ; ("subject-gecos", subject_gecos) - ; ( "subject-displayname" - , if subject_gecos = "" || subject_gecos = "" then - subject_name - else - subject_gecos - ) - ; (*("subject-homedir", get_value "Home dir" infolist);*) - (*("subject-shell", get_value "Shell" infolist);*) - ("subject-is-group", "false") - ; ( "subject-account-disabled" - , get_value "Account disabled (or locked)" infolist - ) - ; ("subject-account-expired", get_value "Account Expired" infolist) - ; ( "subject-account-locked" - , get_value "Account disabled (or locked)" infolist - ) - ; ("subject-password-expired", get_value "Password Expired" infolist) - ] - - (* (string list) query_group_membership(string subject_identifier) - - Takes a subject_identifier and returns its group membership (i.e. a list of subject - identifiers of the groups that the subject passed in belongs to). The set of groups returned - _must_ be transitively closed wrt the is_member_of relation if the external directory service - supports nested groups (as AD does for example) - *) - let query_group_membership ~__context subject_identifier = - let@ __context = Context.with_tracing ~__context __FUNCTION__ in - - let subject_info = - query_subject_information ~__context subject_identifier - in - if - List.assoc "subject-is-group" subject_info = "true" - (* this field is always present *) - then - (* subject is a group, so get_group_sids_byname will not work because pbis's list-groups *) - (* doesnt work if a group name is given as input *) - (* FIXME: default action for groups until workaround is found: return an empty list of membership groups *) - [] - else - (* subject is a user, list-groups and therefore get_group_sids_byname work fine *) - let subject_name = List.assoc "subject-name" subject_info in - (* CA-27744: always use NT-style names *) - let subject_sid_membership_list = - pbis_get_group_sids_byname subject_name - in - debug "Resolved %i group sids for subject %s (%s): %s" - (List.length subject_sid_membership_list) - subject_name subject_identifier - (List.fold_left - (fun p pp -> - if p = "" then - pp - else - p ^ "," ^ pp - ) - "" subject_sid_membership_list - ) ; - subject_sid_membership_list - - (* - In addition, there are some event hooks that auth modules implement as follows: -*) - - let _is_pbis_server_available ~__context max_tries = - (* we _need_ to use a username contained in our domain, otherwise the following tests won't work. - Microsoft KB/Q243330 article provides the KRBTGT account as a well-known built-in SID in AD - Microsoft KB/Q229909 article says that KRBTGT account cannot be renamed or enabled, making - it the perfect target for such a test using a username (Administrator account can be renamed) *) - let krbtgt = "KRBTGT" in - let try_clear_cache () = - (* the primary purpose of this function is to clear the cache so that - [ try_fetch_sid ] is forced to perform an end to end query to the - AD server. as such, we don't care if krbtgt was not originally in - the cache *) - match get_full_subject_name krbtgt with - | exception _ -> - info - "_is_pbis_server_available: failed to get full subject name for %s" - krbtgt ; - Error () - | full_username -> ( - match - ignore - (pbis_common "/opt/pbis/bin/ad-cache" - ["--delete-user"; "--name"; full_username] - ) - with - | () | (exception Not_found) -> - Ok () - | exception e -> - debug "Failed to remove user %s from cache: %s" full_username - (ExnHelper.string_of_exn e) ; - Error () - ) - in - let try_fetch_sid () = - try - let sid = get_subject_identifier ~__context krbtgt in - debug - "Request to external authentication server successful: user %s was \ - found" - krbtgt ; - let (_ : (string * string) list) = - query_subject_information ~__context sid - in - debug - "Request to external authentication server successful: sid %s was \ - found" - sid ; - Ok () - with - | Not_found -> - (* that means that pbis is responding to at least cached subject queries. - in this case, KRBTGT wasn't found in the AD domain. this usually indicates that the - AD domain is offline/inaccessible to pbis, which will cause problems, specially - to the ssh python hook-script, so we need to try again until KRBTGT is found, indicating - that the domain is online and accessible to pbis queries *) - debug - "Request to external authentication server returned KRBTGT \ - Not_found" ; - Error () - | e -> - debug - "Request to external authentication server failed for reason: %s" - (ExnHelper.string_of_exn e) ; - Error () - in - let rec go i = - if i > max_tries then ( - info - "Testing external authentication server failed after %i tries, \ - giving up!" - max_tries ; - false - ) else ( - debug - "Testing if external authentication server is accepting requests... \ - attempt %i of %i" - i max_tries ; - let ( >>= ) = Rresult.( >>= ) in - (* if we don't remove krbtgt from the cache before - query subject information about krbtgt, then - [ try_fetch_sid ] would erroneously return success - in the case that PBIS is running locally, but the - AD domain is offline *) - match try_clear_cache () >>= try_fetch_sid with - | Error () -> - Thread.delay 5.0 ; - (go [@tailcall]) (i + 1) - | Ok () -> - true - ) - in - go 0 - - let is_pbis_server_available ~__context max = - Locking_helpers.Named_mutex.execute mutex_check_availability (fun () -> - _is_pbis_server_available ~__context max - ) - - (* converts from domain.com\user to user@domain.com, in case domain.com is present in the subject_name *) - let convert_nt_to_upn_username subject_name = - try - (* test if the NT account name separator \ is present in subject name *) - let i = String.index subject_name '\\' in - (* we only reach this point if the separator \ is present in subject_name *) - (* when \ is present, we need to convert the NT name to UPN format *) - let domain = String.sub subject_name 0 i in - let user = - String.sub subject_name (i + 1) (String.length subject_name - i - 1) - in - user ^ "@" ^ domain - with Not_found -> - (* if no NT username separator \ was found *) - (* nothing to do in this case *) - subject_name - - (* unit on_enable(((string*string) list) config_params) - - Called internally by xapi _on each host_ when a client enables an external auth service for the - pool via the XenAPI [see AD integration wiki page]. The config_params here are the ones passed - by the client as part of the corresponding XenAPI call. - On receiving this hook, the auth module should: - (i) do whatever it needs to do (if anything) to register with the external auth/directory - service [using the config params supplied to get access] - (ii) Write the config_params that it needs to store persistently in the XenServer metadata - into the Pool.external_auth_configuration field. [Note - the rationale for making the plugin - write the config params it needs long-term into the XenServer metadata itself is so it can - explicitly filter any one-time credentials [like AD username/password for example] that it - does not need long-term.] - *) - let on_enable ~__context config_params = - let@ __context = Context.with_tracing ~__context __FUNCTION__ in - (* but in the ldap plugin, we should 'join the AD/kerberos domain', i.e. we should*) - (* basically: (1) create a machine account in the kerberos realm,*) - (* (2) store the machine account password somewhere locally (in a keytab) *) - start_damon () ; - if - not - (List.mem_assoc "user" config_params - && List.mem_assoc "pass" config_params - ) - then - raise - (Auth_signature.Auth_service_error - ( Auth_signature.E_GENERIC - , "enable requires two config params: user and pass." - ) - ) - else (* we have all the required parameters *) - let hostname = - Server_helpers.exec_with_new_task "retrieving hostname" - (fun __context -> - let host = Helpers.get_localhost ~__context in - Db.Host.get_hostname ~__context ~self:host - ) - in - if - String.fold_left (fun b ch -> b && ch >= '0' && ch <= '9') true hostname - then - raise - (Auth_signature.Auth_service_error - ( Auth_signature.E_GENERIC - , Printf.sprintf "hostname '%s' cannot contain only digits." - hostname - ) - ) - else - let domain = - let service_name = - Server_helpers.exec_with_new_task - "retrieving external_auth_service_name" (fun __context -> - let host = Helpers.get_localhost ~__context in - Db.Host.get_external_auth_service_name ~__context ~self:host - ) - in - (* legacy test: do we have domain name in config? - then config:domain must match service-name *) - if List.mem_assoc "domain" config_params then - let _domain = List.assoc "domain" config_params in - if service_name <> _domain then - raise - (Auth_signature.Auth_service_error - ( Auth_signature.E_GENERIC - , "if present, config:domain must match service-name." - ) - ) - else - service_name - else - (* if no config:domain provided, we simply use the string in service_name for the domain name *) - service_name - in - let _user = List.assoc "user" config_params in - let pass = List.assoc "pass" config_params in - let ou_conf, ou_params = - if List.mem_assoc "ou" config_params then - let ou = List.assoc "ou" config_params in - ([("ou", ou)], ["--ou"; ou]) - else - ([], []) - in - (* Adding the config parameter "config:disable_modules=X,Y,Z" - * will disable the modules X, Y and Z in domainjoin-cli. *) - let disabled_modules = - try - match List.assoc "disable_modules" config_params with - | "" -> - [] - | disabled_modules_string -> - String.split_f (fun c -> c = ',') disabled_modules_string - with Not_found -> [] - in - let disabled_module_params = - List.concat_map - (fun disabled_module -> ["--disable"; disabled_module]) - disabled_modules - in - (* we need to make sure that the user passed to domaijoin-cli command is in the UPN syntax (user@domain.com) *) - let user = convert_nt_to_upn_username _user in - (* execute the pbis domain join cmd *) - try - let (_ : (string * string) list) = - [ - ["join"] - ; ou_params - ; disabled_module_params - ; ["--ignore-pam"; "--notimesync"; domain; user] - ] - |> List.concat - |> pbis_common_with_password pass !Xapi_globs.domain_join_cli_cmd - in - let max_tries = 60 in - (* tests 60 x 5.0 seconds = 300 seconds = 5minutes trying *) - if not (is_pbis_server_available ~__context max_tries) then ( - let errmsg = - Printf.sprintf - "External authentication server not available after %i query \ - tests" - max_tries - in - debug "%s" errmsg ; - raise - (Auth_signature.Auth_service_error - (Auth_signature.E_UNAVAILABLE, errmsg) - ) - ) ; - (* OK SUCCESS, pbis has joined the AD domain successfully *) - (* write persistently the relevant config_params in the host.external_auth_configuration field *) - (* we should not store the user's (admin's) password !! *) - let extauthconf = [("domain", domain); ("user", user)] @ ou_conf in - Server_helpers.exec_with_new_task - "storing external_auth_configuration" (fun __context -> - let host = Helpers.get_localhost ~__context in - Db.Host.set_external_auth_configuration ~__context ~self:host - ~value:extauthconf ; - debug "added external_auth_configuration for host %s" - (Db.Host.get_name_label ~__context ~self:host) - ) ; - with_lock cache_of_pbis_common_m (fun _ -> cache_of_pbis_common := []) ; - ensure_pbis_configured () - with e -> - (*ERROR, we didn't join the AD domain*) - debug - "Error enabling external authentication for domain %s and user %s: \ - %s" - domain user - (ExnHelper.string_of_exn e) ; - raise e - - (* unit on_disable() - - Called internally by xapi _on each host_ when a client disables an auth service via the XenAPI. - The hook will be called _before_ the Pool configuration fields relating to the external-auth - service are cleared (i.e. so you can access the config params you need from the pool metadata - within the body of the on_disable method) - *) - let on_disable ~__context config_params = - let@ __context = Context.with_tracing ~__context __FUNCTION__ in - (* but in the ldap plugin, we should 'leave the AD/kerberos domain', i.e. we should *) - (* (1) remove the machine account from the kerberos realm, (2) remove the keytab locally *) - let pbis_failure = - try - ( if - not - (List.mem_assoc "user" config_params - && List.mem_assoc "pass" config_params - ) - then - (* no windows admin+pass have been provided: leave the pbis host in the AD database *) - (* execute the pbis domain-leave cmd *) - (* this function will raise an exception if something goes wrong *) - let (_ : (string * string) list) = - pbis_common !Xapi_globs.domain_join_cli_cmd ["leave"] - in - () - else - (* windows admin+pass have been provided: ask pbis to remove host from AD database *) - let _user = List.assoc "user" config_params in - let pass = List.assoc "pass" config_params in - (* we need to make sure that the user passed to domaijoin-cli command is in the UPN syntax (user@domain.com) *) - let user = - convert_nt_to_upn_username - (get_full_subject_name ~use_nt_format:false _user) - in - (* execute the pbis domain-leave cmd *) - (* this function will raise an exception if something goes wrong *) - let (_ : (string * string) list) = - pbis_common_with_password pass - !Xapi_globs.domain_join_cli_cmd - ["leave"; user] - in - () - ) ; - None (* no failure observed in pbis *) - with e -> - (* unexpected error disabling pbis *) - debug "Internal Pbis error when disabling external authentication: %s" - (ExnHelper.string_of_exn e) ; - (* CA-27627: we should NOT re-raise the exception e here, because otherwise we might get stuck, *) - (* without being able to disable an external authentication configuration, since the Pbis *) - (* behavior is outside our control. For instance, Pbis raises an exception during domain-leave *) - (* when the domain controller is offline, so it would be impossible to leave a domain that *) - (* has already been deleted. *) - (* Not re-raising an exception here is not too bad, because both ssh and xapi access to the AD/Pbis *) - (* commands will be disabled anyway by host.disable_external_auth. So, even though access to the external *) - (* authentication service might still be possible from Dom0 shell, it will not be possible *) - (* to login as an external user via ssh or to call external-authentication services via xapi/xe. *) - Some e - (* CA-28942: stores exception returned by pbis for later *) - in - (* We always do a manual clean-up of pbis, in order to restore Dom0 to its pre-pbis state *) - (* It doesn't matter if pbis succeeded or not *) - (* This disables Pbis even from Dom0 shell *) - debug "Doing a manual Pbis domain-leave cleanup..." ; - (* When pbis raises an exception during domain-leave, we try again, using *) - (* some of the command-line workarounds that Kyle describes in CA-27627: *) - let pbis_force_domain_leave_script = - "/opt/xensource/libexec/pbis-force-domain-leave" - in - ( try - let output, stderr = - Forkhelpers.execute_command_get_output pbis_force_domain_leave_script - [] - in - debug "execute %s: stdout=[%s],stderr=[%s]" - pbis_force_domain_leave_script - (String.replace "\n" ";" output) - (String.replace "\n" ";" stderr) - with e -> - debug "exception executing %s: %s" pbis_force_domain_leave_script - (ExnHelper.string_of_exn e) - ) ; - (* OK SUCCESS, pbis has left the AD domain successfully *) - (* remove persistently the relevant config_params in the host.external_auth_configuration field *) - Server_helpers.exec_with_new_task "removing external_auth_configuration" - (fun __context -> - let host = Helpers.get_localhost ~__context in - Db.Host.set_external_auth_configuration ~__context ~self:host ~value:[] ; - debug "removed external_auth_configuration for host %s" - (Db.Host.get_name_label ~__context ~self:host) - ) ; - match pbis_failure with - | None -> - () (* OK, return unit*) - | Some e -> - raise e - - (* bubble up pbis failure *) - - (* unit on_xapi_initialize(bool system_boot) - - Called internally by xapi whenever it starts up. The system_boot flag is true iff xapi is - starting for the first time after a host boot - *) - let on_xapi_initialize ~__context _system_boot = - let@ __context = Context.with_tracing ~__context __FUNCTION__ in - - (* the AD server is initialized outside xapi, by init.d scripts *) - - (* this function is called during xapi initialization in xapi.ml *) - - (* make sure that the AD/LSASS server is responding before returning *) - let max_tries = 12 in - (* tests 12 x 5.0 seconds = 60 seconds = up to 1 minute trying *) - if not (is_pbis_server_available ~__context max_tries) then ( - let errmsg = - Printf.sprintf - "External authentication server not available after %i query tests" - max_tries - in - debug "%s" errmsg ; - raise - (Auth_signature.Auth_service_error (Auth_signature.E_GENERIC, errmsg)) - ) ; - () - - (* unit on_xapi_exit() - - Called internally when xapi is doing a clean exit. - *) - let on_xapi_exit ~__context:_ () = - (* nothing to do here in this unix plugin *) - - (* in the ldap plugin, we should remove the tgt ticket in /tmp/krb5cc_0 *) - () - - (* Implement the single value required for the module signature *) - let methods = - Auth_signature. - { - authenticate_username_password - ; authenticate_ticket - ; get_subject_identifier - ; query_subject_information - ; query_group_membership - ; on_enable - ; on_disable - ; on_xapi_initialize - ; on_xapi_exit - } -end diff --git a/ocaml/xapi/extauth_plugin_ADwinbind.ml b/ocaml/xapi/extauth_plugin_ADwinbind.ml index 7e77f3a8502..523708526ff 100644 --- a/ocaml/xapi/extauth_plugin_ADwinbind.ml +++ b/ocaml/xapi/extauth_plugin_ADwinbind.ml @@ -780,6 +780,7 @@ let query_domain_workgroup ~domain = let config_winbind_daemon ~workgroup ~netbios_name ~domain = let smb_config = "/etc/samba/smb.conf" in + let extra_conf = "/etc/samba/smb.extra.conf" in let string_of_bool = function true -> "yes" | false -> "no" in let scan_trusted_domains = @@ -788,7 +789,8 @@ let config_winbind_daemon ~workgroup ~netbios_name ~domain = ( match (workgroup, netbios_name, domain) with | Some wkgroup, Some netbios, Some dom -> [ - "# autogenerated by xapi" + Printf.sprintf "# This file is managed by xapi, update %s instead" + extra_conf ; "[global]" ; "client use kerberos = required" ; "sync machine password to keytab = \ @@ -815,6 +817,7 @@ let config_winbind_daemon ~workgroup ~netbios_name ~domain = ; "idmap config * : backend = autorid" ; Printf.sprintf "idmap config * : range = %d-%d" 2_000_000 99_999_999 ; Printf.sprintf "log level = %s" (debug_level ()) + ; Printf.sprintf "include = %s" extra_conf ; "" (* Empty line at the end *) ] | _ -> diff --git a/ocaml/xapi/helpers.ml b/ocaml/xapi/helpers.ml index e571b962272..e791dc72c3a 100644 --- a/ocaml/xapi/helpers.ml +++ b/ocaml/xapi/helpers.ml @@ -1134,7 +1134,7 @@ let get_sr_free_space ~__context ~sr = Int64.sub size utilisation (* Returns an SR suitable for suspending this VM *) -let choose_suspend_sr ~__context ~vm ~required_space = +let choose_suspend_sr ~__context ~vm = (* If the VM.suspend_SR exists, use that. If it fails, try the Pool.suspend_image_SR. *) (* If that fails, try the Host.suspend_image_SR. *) let vm_sr = Db.VM.get_suspend_SR ~__context ~self:vm in @@ -1142,36 +1142,23 @@ let choose_suspend_sr ~__context ~vm ~required_space = let pool_sr = Db.Pool.get_suspend_image_SR ~__context ~self:pool in let resident_on = Db.VM.get_resident_on ~__context ~self:vm in let host_sr = Db.Host.get_suspend_image_sr ~__context ~self:resident_on in - let sr = - match - ( check_sr_exists_for_host ~__context ~self:vm_sr ~host:resident_on - , check_sr_exists_for_host ~__context ~self:pool_sr ~host:resident_on - , check_sr_exists_for_host ~__context ~self:host_sr ~host:resident_on - ) - with - | Some x, _, _ -> - x - | _, Some x, _ -> - x - | _, _, Some x -> - x - | None, None, None -> - raise - (Api_errors.Server_error - (Api_errors.vm_no_suspend_sr, [Ref.string_of vm]) - ) - in - let free_space = get_sr_free_space ~__context ~sr in - if free_space < required_space then ( - let sr_str = Ref.string_of sr in - error "%s: SR %s free=%Ld needed=%Ld" __FUNCTION__ sr_str free_space - required_space ; - raise - (Api_errors.Server_error - (Api_errors.sr_suspend_space_insufficient, [sr_str]) - ) - ) else - sr + match + ( check_sr_exists_for_host ~__context ~self:vm_sr ~host:resident_on + , check_sr_exists_for_host ~__context ~self:pool_sr ~host:resident_on + , check_sr_exists_for_host ~__context ~self:host_sr ~host:resident_on + ) + with + | Some x, _, _ -> + x + | _, Some x, _ -> + x + | _, _, Some x -> + x + | None, None, None -> + raise + (Api_errors.Server_error + (Api_errors.vm_no_suspend_sr, [Ref.string_of vm]) + ) (* return the operations filtered for cancels functions *) let cancel_tasks ~__context ~ops ~all_tasks_in_db @@ -2038,7 +2025,7 @@ let try_internal_async ~__context (marshaller : Rpc.t -> 'b) ) (fun () -> info "try_internal_async: destroying task: t = ( %s )" ref ; - TaskHelper.destroy ~__context t + Db.Task.destroy ~__context ~self:t ) module PoolSecret : sig diff --git a/ocaml/xapi/message_forwarding.ml b/ocaml/xapi/message_forwarding.ml index 1d8d228cc80..83786bd8afd 100644 --- a/ocaml/xapi/message_forwarding.ml +++ b/ocaml/xapi/message_forwarding.ml @@ -3730,11 +3730,13 @@ functor in do_op_on ~local_fn ~__context ~host ~remote_fn - let disable_external_auth ~__context ~host ~config = + let disable_external_auth ~__context ~host ~config ~force = info "Host.disable_external_auth: host = '%s'" (host_uuid ~__context host) ; - let local_fn = Local.Host.disable_external_auth ~host ~config in - let remote_fn = Client.Host.disable_external_auth ~host ~config in + let local_fn = Local.Host.disable_external_auth ~host ~config ~force in + let remote_fn = + Client.Host.disable_external_auth ~host ~config ~force + in do_op_on ~local_fn ~__context ~host ~remote_fn let install_ca_certificate ~__context ~host ~name ~cert = diff --git a/ocaml/xapi/pkg_mgr.ml b/ocaml/xapi/pkg_mgr.ml index ed2550ffe2a..af140d5a48c 100644 --- a/ocaml/xapi/pkg_mgr.ml +++ b/ocaml/xapi/pkg_mgr.ml @@ -37,6 +37,9 @@ module type S = sig val get_updates_from_upgrade_dry_run : repositories:string list -> cmd_line + val get_updates_from_group_upgrade_dry_run : + repositories:string list -> cmd_line + val is_obsoleted : pkg_name:string -> repositories:string list -> cmd_line val repoquery_updates : repositories:string list -> cmd_line @@ -52,6 +55,8 @@ module type S = sig val sync_repo : repo_name:string -> cmd_line val apply_upgrade : repositories:string list -> cmd_line + + val apply_group_upgrade : repositories:string list -> cmd_line end module type Args = sig @@ -71,6 +76,8 @@ module type Args = sig val get_updates_from_upgrade_dry_run : string list -> string list + val get_updates_from_group_upgrade_dry_run : string list -> string list + val is_obsoleted : string -> string list -> string list val repoquery_updates : string list -> string list @@ -86,6 +93,8 @@ module type Args = sig val sync_repo : string -> string list val apply_upgrade : string list -> string list + + val apply_group_upgrade : string list -> string list end let repoquery_sep = ":|" @@ -125,6 +134,16 @@ module Common_args = struct ; "upgrade" ] + let get_updates_from_group_upgrade_dry_run repositories = + [ + "--disablerepo=*" + ; Printf.sprintf "--enablerepo=%s" (String.concat "," repositories) + ; "--assumeno" + ; "group" + ; "upgrade" + ; "*" + ] + let repoquery repositories = [ "--disablerepo=*" @@ -161,6 +180,16 @@ module Common_args = struct ; Printf.sprintf "--enablerepo=%s" (String.concat "," repositories) ; "upgrade" ] + + let apply_group_upgrade repositories = + [ + "-y" + ; "--disablerepo=*" + ; Printf.sprintf "--enablerepo=%s" (String.concat "," repositories) + ; "group" + ; "upgrade" + ; "*" + ] end module Yum_args : Args = struct @@ -179,6 +208,10 @@ module Yum_args : Args = struct let get_updates_from_upgrade_dry_run repositories = ["--quiet"] @ Common_args.get_updates_from_upgrade_dry_run repositories + let get_updates_from_group_upgrade_dry_run repositories = + ["--quiet"] + @ Common_args.get_updates_from_group_upgrade_dry_run repositories + let is_obsoleted pkg_name repositories = ["--all"] @ Common_args.is_obsoleted pkg_name repositories @ ["--plugins"] @@ -283,6 +316,12 @@ module Cmd_line (M : Args) : S = struct let get_updates_from_upgrade_dry_run ~repositories = {cmd= M.pkg_cmd; params= M.get_updates_from_upgrade_dry_run repositories} + let get_updates_from_group_upgrade_dry_run ~repositories = + { + cmd= M.pkg_cmd + ; params= M.get_updates_from_group_upgrade_dry_run repositories + } + let is_obsoleted ~pkg_name ~repositories = {cmd= M.repoquery_cmd; params= M.is_obsoleted pkg_name repositories} @@ -305,6 +344,9 @@ module Cmd_line (M : Args) : S = struct let apply_upgrade ~repositories = {cmd= M.pkg_cmd; params= M.apply_upgrade repositories} + + let apply_group_upgrade ~repositories = + {cmd= M.pkg_cmd; params= M.apply_group_upgrade repositories} end module Yum_cmd = Cmd_line (Yum_args) diff --git a/ocaml/xapi/pkg_mgr.mli b/ocaml/xapi/pkg_mgr.mli index a2849fb4ce3..059943c1f10 100644 --- a/ocaml/xapi/pkg_mgr.mli +++ b/ocaml/xapi/pkg_mgr.mli @@ -41,6 +41,10 @@ module type S = sig val get_updates_from_upgrade_dry_run : repositories:string list -> cmd_line (** Command line and arguments to dry run an upgrade, with repositories enabled *) + val get_updates_from_group_upgrade_dry_run : + repositories:string list -> cmd_line + (** Command line and arguments to dry run a group upgrade, with repositories enabled *) + val is_obsoleted : pkg_name:string -> repositories:string list -> cmd_line (** Command line and arguments to check whether a package is obsoleted by any other * package in given repositories *) @@ -67,6 +71,9 @@ module type S = sig val apply_upgrade : repositories:string list -> cmd_line (** Command line and arguments to apply upgrades from repos *) + + val apply_group_upgrade : repositories:string list -> cmd_line + (** Command line and arguments to apply group upgrades from repos *) end (** Exposed only for unittest, do not use the modules directly diff --git a/ocaml/xapi/repository.ml b/ocaml/xapi/repository.ml index 9fb284fcc55..8ee982ef02b 100644 --- a/ocaml/xapi/repository.ml +++ b/ocaml/xapi/repository.ml @@ -169,7 +169,14 @@ let sync ~__context ~self ~token ~token_id ~username ~password = ) | `bundle -> let uri = - Uri.make ~scheme:"file" ~path:!Xapi_globs.bundle_repository_dir () + (* dnf requires a URI that is file:///absolute/path, using the + * file:/absolute/path variant will result in a failure to locate + * the signature file, thus the host parameter to Uri.make is + * required + *) + Uri.make ~scheme:"file" ~host:"" + ~path:!Xapi_globs.bundle_repository_dir + () in (Uri.to_string uri, None, false, NoAuth, NoAuth) | `remote_pool -> @@ -558,6 +565,20 @@ let get_host_updates_in_json ~__context ~installed = let latest_updates' = get_updates_from_yum_upgrade_dry_run repositories in + let latest_updates_group' = + get_updates_from_yum_group_upgrade_dry_run repositories + in + let latest_updates_combined' = + match (latest_updates', latest_updates_group') with + | Some pkgs', Some group_pkgs' -> + Some (List.sort_uniq compare (pkgs' @ group_pkgs')) + | Some pkgs', None -> + Some pkgs' + | None, Some group_pkgs' -> + Some group_pkgs' + | None, None -> + None + in let latest_updates'' = get_updates_from_repoquery repositories in (* To ensure the updating function will not strand, use redundant * functions to get the update/installation list. @@ -569,7 +590,7 @@ let get_host_updates_in_json ~__context ~installed = let fail_on_error = Xapi_fist.fail_on_error_in_yum_upgrade_dry_run () in let latest_updates = get_latest_updates_from_redundancy ~fail_on_error - ~pkgs:latest_updates' ~fallback_pkgs:latest_updates'' + ~pkgs:latest_updates_combined' ~fallback_pkgs:latest_updates'' in List.iter (fun r -> clean_yum_cache r) repositories ; let latest_updates_in_json = @@ -679,8 +700,15 @@ let get_pool_updates_in_json ~__context ~hosts = let apply ~__context ~host = (* This function runs on member host *) with_local_repositories ~__context (fun repositories -> - let Pkg_mgr.{cmd; params} = Pkgs.apply_upgrade ~repositories in - try ignore (Helpers.call_script cmd params) + let upgrade () = + let Pkg_mgr.{cmd; params} = Pkgs.apply_upgrade ~repositories in + ignore (Helpers.call_script cmd params) + in + let group_upgrade () = + let Pkg_mgr.{cmd; params} = Pkgs.apply_group_upgrade ~repositories in + ignore (Helpers.call_script cmd params) + in + try upgrade () ; group_upgrade () with e -> let host' = Ref.string_of host in error "Failed to apply updates on host ref='%s': %s" host' diff --git a/ocaml/xapi/repository_helpers.ml b/ocaml/xapi/repository_helpers.ml index 487e8e30f17..1afa819a437 100644 --- a/ocaml/xapi/repository_helpers.ml +++ b/ocaml/xapi/repository_helpers.ml @@ -771,8 +771,10 @@ module YumUpgradeOutput = struct | false -> ( take_till is_eol <* end_of_line >>= function | ( "Installing:" + | "Installing group/module packages:" | "Updating:" | "Upgrading:" + | "Upgrading groups:" | "Removing:" | "Reinstalling:" | "Downgrading:" @@ -890,6 +892,7 @@ module YumUpgradeOutput = struct |> List.filter (fun (section, _) -> match section with | "Installing:" + | "Installing group/module packages:" | "Updating:" | "Upgrading:" | "Installing for dependencies:" @@ -958,6 +961,31 @@ let get_updates_from_yum_upgrade_dry_run repositories = error "%s" (ExnHelper.string_of_exn e) ; None +let get_updates_from_yum_group_upgrade_dry_run repositories = + let Pkg_mgr.{cmd; params} = + Pkgs.get_updates_from_group_upgrade_dry_run ~repositories + in + match Forkhelpers.execute_command_get_output cmd params with + | _, _ -> + Some [] + | exception Forkhelpers.Spawn_internal_error (stderr, stdout, Unix.WEXITED 1) + -> ( + (*Yum put the details to stderr while dnf to stdout*) + (match Pkgs.manager with Yum -> stderr | Dnf -> stdout) + |> YumUpgradeOutput.parse_output_of_dry_run + |> function + | Ok (pkgs, Some txn_file) -> + Unixext.unlink_safe txn_file ; + Some pkgs + | Ok (pkgs, None) -> + Some pkgs + | Error msg -> + error "%s" msg ; None + ) + | exception e -> + error "%s" (ExnHelper.string_of_exn e) ; + None + let get_latest_updates_from_redundancy ~fail_on_error ~pkgs ~fallback_pkgs = let err = "Failed to parse output of 'yum upgrade (dry run)' correctly" in let get_latest_updates_from_redundancy' ~fail_on_error ~pkgs ~fallback_pkgs = diff --git a/ocaml/xapi/storage_access.ml b/ocaml/xapi/storage_access.ml index cd9b0aee515..87e0b266e0f 100644 --- a/ocaml/xapi/storage_access.ml +++ b/ocaml/xapi/storage_access.ml @@ -21,7 +21,8 @@ let finally = Xapi_stdext_pervasives.Pervasiveext.finally module XenAPI = Client.Client open Storage_interface -open Storage_utils + +let transform_storage_exn = Storage_utils.transform_storage_exn module D = Debug.Make (struct let name = "storage_access" end) @@ -103,12 +104,72 @@ let external_rpc queue_name uri = (* Internal exception, never escapes the module *) exception Message_switch_failure -(* We have to be careful in this function, because an exception raised from - here will cause the startup sequence to fail *) +(* We have to be careful in the call tree of on_xapi_start, because an + exception raised in it will cause the startup sequence to fail *) + +let get_smapiv2_drivers_from_switch () = + let module Client = Message_switch_unix.Protocol_unix.Client in + try + let ( >>| ) result f = + match Client.error_to_msg result with + | Error (`Msg x) -> + error "%s: Error %s while querying message switch queues" __FUNCTION__ + x ; + raise Message_switch_failure + | Ok x -> + f x + in + Client.connect ~switch:!Xcp_client.switch_path () >>| fun t -> + Client.list ~t ~prefix:!Storage_interface.queue_name ~filter:`Alive () + >>| fun running_smapiv2_driver_queues -> + running_smapiv2_driver_queues + (* The results include the prefix itself, but that is the main storage + queue, we don't need it *) + |> List.filter (( <> ) !Storage_interface.queue_name) + |> Listext.List.try_map (fun driver -> + (* Get the last component of the queue name: + org.xen.xapi.storage.sr_type -> sr_type *) + driver + |> String.split_on_char '.' + |> Listext.List.last + |> Option.to_result ~none:(Invalid_argument driver) + ) + |> function + | Ok drivers -> + drivers + | Error exn -> + raise exn + with + | Message_switch_failure -> + [] (* no more logging *) + | e -> + Backtrace.is_important e ; + error "Unexpected error querying the message switch: %s" + (Printexc.to_string e) ; + Debug.log_backtrace e (Backtrace.get e) ; + [] + +let log_and_unregister ~__context ~reason __FUN (self, rc) = + info "%s: unregistering SM plugin %s (%s) since %s" __FUN rc.API.sM_name_label + rc.API.sM_uuid reason ; + try Db.SM.destroy ~__context ~self with _ -> () + +module StringSet = Set.Make (String) + +let list_assoc_all a = + List.filter_map (fun (k, v) -> + if String.equal k a then + Some v + else + None + ) + +let ( let@ ) f x = f x (** Synchronise the SM table with the SMAPIv1 plugins on the disk and the SMAPIv2 plugins mentioned in the configuration file whitelist. *) let on_xapi_start ~__context = + let __FUN = __FUNCTION__ in (* An SM is either implemented as a plugin - for which we check its presence, or via an API *) let is_available rc = @@ -120,107 +181,69 @@ let on_xapi_start ~__context = |> List.map (fun (rf, rc) -> (rc.API.sM_type, (rf, rc))) in let explicitly_configured_drivers = - List.filter_map - (function `Sm x -> Some x | _ -> None) - !Xapi_globs.sm_plugins + !Xapi_globs.sm_plugins + |> List.filter_map (function `Sm x -> Some x | _ -> None) + |> StringSet.of_list + in + let smapiv1_drivers = Sm.supported_drivers () |> StringSet.of_list in + let configured_drivers = + StringSet.union explicitly_configured_drivers smapiv1_drivers in - let smapiv1_drivers = Sm.supported_drivers () in - let configured_drivers = explicitly_configured_drivers @ smapiv1_drivers in let in_use_drivers = List.map (fun (_, rc) -> rc.API.sR_type) (Db.SR.get_all_records ~__context) + |> StringSet.of_list in - let to_keep = configured_drivers @ in_use_drivers in - (* The SMAPIv2 drivers we know about *) - let smapiv2_drivers = Listext.List.set_difference to_keep smapiv1_drivers in + let to_keep = StringSet.union configured_drivers in_use_drivers in (* Query the message switch to detect running SMAPIv2 plugins. *) let running_smapiv2_drivers = - if !Xcp_client.use_switch then ( - try - let open Message_switch_unix.Protocol_unix in - let ( >>| ) result f = - match Client.error_to_msg result with - | Error (`Msg x) -> - error "Error %s while querying message switch queues" x ; - raise Message_switch_failure - | Ok x -> - f x - in - Client.connect ~switch:!Xcp_client.switch_path () >>| fun t -> - Client.list ~t ~prefix:!Storage_interface.queue_name ~filter:`Alive () - >>| fun running_smapiv2_driver_queues -> - running_smapiv2_driver_queues - (* The results include the prefix itself, but that is the main storage - queue, we don't need it *) - |> List.filter (( <> ) !Storage_interface.queue_name) - |> Listext.List.try_map (fun driver -> - (* Get the last component of the queue name: - org.xen.xapi.storage.sr_type -> sr_type *) - driver - |> String.split_on_char '.' - |> Listext.List.last - |> Option.to_result ~none:(Invalid_argument driver) - ) - |> function - | Ok drivers -> - drivers - | Error exn -> - raise exn - with - | Message_switch_failure -> - [] (* no more logging *) - | e -> - Backtrace.is_important e ; - error "Unexpected error querying the message switch: %s" - (Printexc.to_string e) ; - Debug.log_backtrace e (Backtrace.get e) ; - [] - ) else - smapiv2_drivers + if !Xcp_client.use_switch then + get_smapiv2_drivers_from_switch () |> StringSet.of_list + else (* The SMAPIv2 drivers we know about *) + StringSet.diff to_keep smapiv1_drivers in (* Add all the running SMAPIv2 drivers *) - let to_keep = to_keep @ running_smapiv2_drivers in + let to_keep = StringSet.union to_keep running_smapiv2_drivers in + let existing_types = List.map fst existing |> StringSet.of_list in + let unused = StringSet.diff existing_types to_keep in let unavailable = List.filter (fun (_, (_, rc)) -> not (is_available rc)) existing in (* Delete all records which aren't configured or in-use *) - List.iter - (fun ty -> - info - "Unregistering SM plugin %s since not in the whitelist and not in-use" - ty ; - let self, _ = List.assoc ty existing in - try Db.SM.destroy ~__context ~self with _ -> () - ) - (Listext.List.set_difference (List.map fst existing) to_keep) ; - List.iter - (fun (name, (self, rc)) -> - info "%s: unregistering SM plugin %s (%s) since it is unavailable" - __FUNCTION__ name rc.API.sM_uuid ; - try Db.SM.destroy ~__context ~self with _ -> () - ) - unavailable ; + let unregister_unused ty = + let sms = list_assoc_all ty existing in + let reason = "it's not in the allowed list and not in-use" in + List.iter (log_and_unregister ~__context ~reason __FUNCTION__) sms + in + let unregister_unavailable (_, sm) = + let reason = "it's unavailable" in + log_and_unregister ~__context ~reason __FUNCTION__ sm + in + StringSet.iter unregister_unused unused ; + List.iter unregister_unavailable unavailable ; (* Synchronize SMAPIv1 plugins *) (* Create all missing SMAPIv1 plugins *) - List.iter + StringSet.iter (fun ty -> let query_result = Sm.info_of_driver ty |> Smint.query_result_of_sr_driver_info in Xapi_sm.create_from_query_result ~__context query_result ) - (Listext.List.set_difference smapiv1_drivers (List.map fst existing)) ; + (StringSet.diff smapiv1_drivers existing_types) ; (* Update all existing SMAPIv1 plugins *) - List.iter + StringSet.iter (fun ty -> let query_result = Sm.info_of_driver ty |> Smint.query_result_of_sr_driver_info in - Xapi_sm.update_from_query_result ~__context (List.assoc ty existing) - query_result + list_assoc_all ty existing + |> List.iter (fun sm -> + Xapi_sm.update_from_query_result ~__context sm query_result + ) ) - (Listext.List.intersect smapiv1_drivers (List.map fst existing)) ; + (StringSet.inter smapiv1_drivers existing_types) ; (* Synchronize SMAPIv2 plugins *) @@ -241,18 +264,62 @@ let on_xapi_start ~__context = f query_result ) in - List.iter + StringSet.iter (fun ty -> with_query_result ty (Xapi_sm.create_from_query_result ~__context) ) - (Listext.List.set_difference running_smapiv2_drivers (List.map fst existing)) ; + (StringSet.diff running_smapiv2_drivers existing_types) ; (* Update all existing SMAPIv2 plugins *) - List.iter + StringSet.iter (fun ty -> - with_query_result ty - (Xapi_sm.update_from_query_result ~__context (List.assoc ty existing)) + let@ qr = with_query_result ty in + list_assoc_all ty existing + |> List.iter (fun sm -> Xapi_sm.update_from_query_result ~__context sm qr) ) - (Listext.List.intersect running_smapiv2_drivers (List.map fst existing)) + (StringSet.inter running_smapiv2_drivers existing_types) ; + + (* Warn in logs when there are still duplicates *) + let add_to_dups (last, dups) (_, curr) = + match (last.API.sM_type = curr.API.sM_type, dups) with + | false, _ -> + (curr, dups) + | true, x :: _ when x = last -> + (curr, curr :: dups) + | true, _ -> + (curr, curr :: last :: dups) + in + let find_all_duplicates lst = + lst + |> List.sort (fun (_, a_rc) (_, b_rc) -> + Stdlib.compare a_rc.API.sM_type b_rc.API.sM_type + ) + |> function + | [] -> + [] + | head :: rest -> + List.fold_left add_to_dups (snd head, []) rest |> snd + in + + let features_to_string feats = + Fmt.(to_to_string (Dump.list (Dump.pair string int64)) feats) + in + let plugin_to_string plugin = + Printf.sprintf "{ type:%s; name:%s; UUID:%s; features:%s; }" + plugin.API.sM_type plugin.API.sM_name_label plugin.API.sM_uuid + (features_to_string plugin.API.sM_features) + in + let log_plugins = function + | [] -> + () + | duplicates -> + let duplicates = + String.concat "\n; " (List.map plugin_to_string duplicates) + in + warn "%s: found duplicate SM plugins for the same type: [\n %s\n]" + __FUN duplicates + in + + Db.SM.get_all_records ~__context |> find_all_duplicates |> log_plugins let bind ~__context ~pbd = let dbg = Context.string_of_task __context in diff --git a/ocaml/xapi/xapi_host.ml b/ocaml/xapi/xapi_host.ml index 08ded94e240..7b9243d54fd 100644 --- a/ocaml/xapi/xapi_host.ml +++ b/ocaml/xapi/xapi_host.ml @@ -1889,13 +1889,13 @@ let enable_external_auth ~__context ~host ~config ~service_name ~auth_type = (* CP-718: Disables external auth/directory service for host *) let disable_external_auth_common ?(during_pool_eject = false) ~__context ~host - ~config () = + ~config ~force () = (* CP-825: Serialize execution of host-enable-extauth and host-disable-extauth *) (* we need to protect against concurrent access to the host.external_auth_type variable *) with_lock serialize_host_enable_disable_extauth (fun () -> let host_name_label = Db.Host.get_name_label ~__context ~self:host in let auth_type = Db.Host.get_external_auth_type ~__context ~self:host in - if auth_type = "" then + if auth_type = "" && not force then (* nothing to do, external authentication is already disabled *) let msg = "external authentication service is already disabled" in debug "Failed to disable external authentication in host %s: %s" @@ -1936,6 +1936,8 @@ let disable_external_auth_common ?(during_pool_eject = false) ~__context ~host , [msg] ) ) + | Extauth_is_disabled -> + Some Extauth_is_disabled | e -> (*absorb any exception*) debug @@ -1957,19 +1959,6 @@ let disable_external_auth_common ?(during_pool_eject = false) ~__context ~host Xapi_globs.event_hook_auth_on_xapi_initialize_succeeded := true ; (* succeeds because there's no need to initialize anymore *) - - (* If any cache is present, clear it in order to ensure cached - logins don't persist after disabling external - authentication. *) - Xapi_session.clear_external_auth_cache () ; - - (* 3. CP-703: we always revalidate all sessions after the external authentication has been disabled *) - (* so that all sessions that were externally authenticated will be destroyed *) - debug - "calling revalidate_all_sessions after disabling external auth for \ - host %s" - host_name_label ; - Xapi_session.revalidate_all_sessions ~__context ; if not during_pool_eject then (* CA-28168 *) (* CA-24856: detect non-homogeneous external-authentication config in this host *) @@ -1978,19 +1967,18 @@ let disable_external_auth_common ?(during_pool_eject = false) ~__context ~host if auth_type = Xapi_globs.auth_type_AD then Extauth_ad.stop_backend_daemon ~wait_until_success:false ; match plugin_disable_failure with - | None -> + (* we do not want to stop pool_eject and permit Extauth_is_disabled during force *) + | Some e when during_pool_eject || (e = Extauth_is_disabled && force) -> () | Some e -> - if not during_pool_eject then - raise e (* bubble up plugin's on_disable exception *) - else - () - (* we do not want to stop pool_eject *) + raise e + | None -> + () ) -let disable_external_auth ~__context ~host ~config = +let disable_external_auth ~__context ~host ~config ~force = disable_external_auth_common ~during_pool_eject:false ~__context ~host ~config - () + ~force () module Static_vdis_list = Xapi_database.Static_vdis_list @@ -2161,21 +2149,13 @@ let apply_edition_internal ~__context ~host ~edition ~additional = ~additional:new_ed.additional_params let apply_edition ~__context ~host ~edition ~force = - (* if HA is enabled do not allow the edition to be changed *) - let pool = Helpers.get_pool ~__context in - if - Db.Pool.get_ha_enabled ~__context ~self:pool - && edition <> Db.Host.get_edition ~__context ~self:host - then - raise (Api_errors.Server_error (Api_errors.ha_is_enabled, [])) - else - let additional = - if force then - [("force", "true")] - else - [] - in - apply_edition_internal ~__context ~host ~edition ~additional + let additional = + if force then + [("force", "true")] + else + [] + in + apply_edition_internal ~__context ~host ~edition ~additional let license_add ~__context ~host ~contents = let license = diff --git a/ocaml/xapi/xapi_host.mli b/ocaml/xapi/xapi_host.mli index d01bc221305..f153d37422a 100644 --- a/ocaml/xapi/xapi_host.mli +++ b/ocaml/xapi/xapi_host.mli @@ -361,6 +361,7 @@ val disable_external_auth_common : -> __context:Context.t -> host:API.ref_host -> config:(string * string) list + -> force:bool -> unit -> unit @@ -368,6 +369,7 @@ val disable_external_auth : __context:Context.t -> host:API.ref_host -> config:(string * string) list + -> force:bool -> unit (** {2 Static VDIs} *) diff --git a/ocaml/xapi/xapi_pool.ml b/ocaml/xapi/xapi_pool.ml index ef4abe5cd07..5214fb3998a 100644 --- a/ocaml/xapi/xapi_pool.ml +++ b/ocaml/xapi/xapi_pool.ml @@ -2037,7 +2037,7 @@ let eject_self ~__context ~host = (* disable the external authentication of this slave being ejected *) (* this call will return an exception if something goes wrong *) Xapi_host.disable_external_auth_common ~during_pool_eject:true ~__context - ~host ~config:[] () ; + ~host ~config:[] ~force:false () ; (* FIXME: in the future, we should send the windows AD admin/pass here *) (* in order to remove the slave from the AD database during pool-eject *) @@ -2973,7 +2973,7 @@ let enable_external_auth ~__context ~pool:_ ~config ~service_name ~auth_type = (* best-effort attempt to disable all enabled hosts, swallowing any exceptions *) try call_fn_on_host ~__context - (Client.Host.disable_external_auth ~config) + (Client.Host.disable_external_auth ~config ~force:false) host with e -> debug @@ -3041,7 +3041,7 @@ let disable_external_auth ~__context ~pool:_ ~config = (* forward the call to the host in the pool *) try call_fn_on_host ~__context - (Client.Host.disable_external_auth ~config) + (Client.Host.disable_external_auth ~config ~force:false) host ; (* no failed host to add to the filtered list, just visit next host *) (host, "", "") @@ -3100,9 +3100,21 @@ let disable_external_auth ~__context ~pool:_ ~config = ) ) ) else (* OK *) + ( + (* If any cache is present, clear it in order to ensure cached + logins don't persist after disabling external + authentication. *) + Xapi_session.clear_external_auth_cache () ; + + (* CP-703: we always revalidate all sessions after the external authentication has been disabled *) + (* so that all sessions that were externally authenticated will be destroyed *) + debug "calling revalidate_all_sessions after disabling external auth" ; + Xapi_session.revalidate_all_sessions ~__context ; + debug "The external authentication of all hosts in the pool was disabled \ successfully" + ) ) (* CA-24856: detect non-homogeneous external-authentication config in pool *) diff --git a/ocaml/xapi/xapi_sm.ml b/ocaml/xapi/xapi_sm.ml index 19257cf8929..dd8dc754858 100644 --- a/ocaml/xapi/xapi_sm.ml +++ b/ocaml/xapi/xapi_sm.ml @@ -20,12 +20,8 @@ module Listext = Xapi_stdext_std.Listext -let finally = Xapi_stdext_pervasives.Pervasiveext.finally - (* We treat versions as '.'-separated integer lists under the usual lexicographic ordering. *) -type version = int list - let version_of_string s = List.map int_of_string (String.split_on_char '.' s) module D = Debug.Make (struct let name = "xapi_sm" end) @@ -99,6 +95,12 @@ let remove_valid_features_from_pending ~__context ~self valid_features = (h, Listext.List.set_difference pending_features valid_features) ) in + let new_pending_feature = + if List.for_all (fun (_, v) -> v = []) new_pending_feature then + [] + else + new_pending_feature + in Db.SM.set_host_pending_features ~__context ~self ~value:new_pending_feature let update_from_query_result ~__context (self, r) q_result = @@ -148,51 +150,43 @@ let update_from_query_result ~__context (self, r) q_result = let is_v1 x = version_of_string x < [2; 0] -let _serialize_reg = +let with_lock = let lock = Mutex.create () in - let holder = ref None in - fun f -> - match !holder with - | Some t when t = Thread.self () -> - (* inside a nested layer where the lock is held by myself *) - f () - | _ -> - Xapi_stdext_threads.Threadext.Mutex.execute lock (fun () -> - holder := Some (Thread.self ()) ; - finally f (fun () -> holder := None) - ) - -let unregister_plugin ~__context q_result = - _serialize_reg (fun () -> - let open Storage_interface in - let driver = String.lowercase_ascii q_result.driver in - if is_v1 q_result.required_api_version then - info "Not unregistering SM plugin %s (required_api_version %s < 2.0)" - driver q_result.required_api_version - else - List.iter - (fun (rf, rc) -> - if rc.API.sM_type = driver then - try - info "Unregistering SM plugin %s (version %s)" driver - q_result.version ; - Db.SM.destroy ~__context ~self:rf - with e -> - warn "Ignore unregistering SM plugin failure: %s" - (Printexc.to_string e) - ) - (Db.SM.get_all_records ~__context) - ) + Xapi_stdext_threads.Threadext.Mutex.execute lock -let register_plugin ~__context q_result = - _serialize_reg (fun () -> - let open Storage_interface in - let driver = String.lowercase_ascii q_result.driver in - if is_v1 q_result.required_api_version then - info "Not registering SM plugin %s (required_api_version %s < 2.0)" - driver q_result.required_api_version - else ( - unregister_plugin ~__context q_result ; - create_from_query_result ~__context q_result +let _unregister_plugin ~__context q_result () = + let open Storage_interface in + let driver = String.lowercase_ascii q_result.driver in + if is_v1 q_result.required_api_version then + info "Not unregistering SM plugin %s (required_api_version %s < 2.0)" driver + q_result.required_api_version + else + List.iter + (fun (rf, rc) -> + if rc.API.sM_type = driver then + try + info "Unregistering SM plugin %s (version %s)" driver + q_result.version ; + Db.SM.destroy ~__context ~self:rf + with e -> + warn "Ignore unregistering SM plugin failure: %s" + (Printexc.to_string e) ) + (Db.SM.get_all_records ~__context) + +let _register_plugin ~__context q_result () = + let open Storage_interface in + let driver = String.lowercase_ascii q_result.driver in + if is_v1 q_result.required_api_version then + info "Not registering SM plugin %s (required_api_version %s < 2.0)" driver + q_result.required_api_version + else ( + _unregister_plugin ~__context q_result () ; + create_from_query_result ~__context q_result ) + +let unregister_plugin ~__context q_result () = + with_lock (_unregister_plugin ~__context q_result) + +let register_plugin ~__context q_result = + with_lock (_register_plugin ~__context q_result) diff --git a/ocaml/xapi/xapi_sm.mli b/ocaml/xapi/xapi_sm.mli new file mode 100644 index 00000000000..f32847097e0 --- /dev/null +++ b/ocaml/xapi/xapi_sm.mli @@ -0,0 +1,27 @@ +(* Copyright (C) 2026 Vates. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; version 2.1 only. with the special + exception on linking described in file LICENSE. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. +*) + +val create_from_query_result : + __context:Context.t -> Storage_interface.query_result -> unit + +val update_from_query_result : + __context:Context.t + -> [`SM] API.Ref.t * API.sM_t + -> Storage_interface.query_result + -> unit + +val unregister_plugin : + __context:Context.t -> Storage_interface.query_result -> unit -> unit + +val register_plugin : + __context:Context.t -> Storage_interface.query_result -> unit diff --git a/ocaml/xapi/xapi_vdi.ml b/ocaml/xapi/xapi_vdi.ml index be82fc7450f..1d4b881c418 100644 --- a/ocaml/xapi/xapi_vdi.ml +++ b/ocaml/xapi/xapi_vdi.ml @@ -739,6 +739,18 @@ let db_forget ~__context ~vdi = debug "db_forget uuid=%s ref=%s" (Db.VDI.get_uuid ~__context ~self:vdi) (Ref.string_of vdi) ; + (* CA-419840 mark VBD as empty when it is a CDR *) + ( Db.VDI.get_VBDs ~__context ~self:vdi |> function + | [] -> + debug "%s: no VBD for VDI %s" __FUNCTION__ (Ref.string_of vdi) + | self :: _ when self = Ref.null -> + warn "%s: NULL VBD for VDI %s" __FUNCTION__ (Ref.string_of vdi) + | self :: _ when Db.VBD.get_type ~__context ~self = `CD -> + Db.VBD.set_VDI ~__context ~self ~value:Ref.null ; + Db.VBD.set_empty ~__context ~self ~value:true + | _ -> + () (* not a CDR *) + ) ; Db.VDI.destroy ~__context ~self:vdi let introduce ~__context ~uuid ~name_label ~name_description ~sR ~_type diff --git a/ocaml/xapi/xapi_xenops.ml b/ocaml/xapi/xapi_xenops.ml index f63e3ccf7d9..fe861ab6c43 100644 --- a/ocaml/xapi/xapi_xenops.ml +++ b/ocaml/xapi/xapi_xenops.ml @@ -4028,10 +4028,7 @@ let suspend ~__context ~self = in Int64.(ram |> add vgpu |> add 104857600L) in - let suspend_SR = - Helpers.choose_suspend_sr ~__context ~vm:self - ~required_space:space_needed - in + let suspend_SR = Helpers.choose_suspend_sr ~__context ~vm:self in let sm_config = [ (Constants._sm_vm_hint, id) @@ -4041,11 +4038,22 @@ let suspend ~__context ~self = in Helpers.call_api_functions ~__context (fun rpc session_id -> let vdi = - XenAPI.VDI.create ~rpc ~session_id ~name_label:"Suspend image" - ~name_description:"Suspend image" ~sR:suspend_SR - ~virtual_size:space_needed ~sharable:false ~read_only:false - ~_type:`suspend ~other_config:[] ~xenstore_data:[] ~sm_config - ~tags:[] + try + XenAPI.VDI.create ~rpc ~session_id ~name_label:"Suspend image" + ~name_description:"Suspend image" ~sR:suspend_SR + ~virtual_size:space_needed ~sharable:false ~read_only:false + ~_type:`suspend ~other_config:[] ~xenstore_data:[] ~sm_config + ~tags:[] + with Api_errors.Server_error ("SR_BACKEND_FAILURE_44", _) -> + let sr_uuid = Db.SR.get_uuid ~__context ~self:suspend_SR in + error "Not enough free space on suspend SR %s; need %Ld B" sr_uuid + space_needed ; + raise + (Api_errors.Server_error + ( Api_errors.sr_suspend_space_insufficient + , [Ref.string_of suspend_SR] + ) + ) in let d = disk_of_vdi ~__context ~self:vdi |> Option.get in Db.VM.set_suspend_VDI ~__context ~self ~value:vdi ; diff --git a/ocaml/xcp-rrdd/bin/rrdp-cpu/rrdp_cpu.ml b/ocaml/xcp-rrdd/bin/rrdp-cpu/rrdp_cpu.ml index 0cea2e90295..763aaaed5d8 100644 --- a/ocaml/xcp-rrdd/bin/rrdp-cpu/rrdp_cpu.ml +++ b/ocaml/xcp-rrdd/bin/rrdp-cpu/rrdp_cpu.ml @@ -80,6 +80,28 @@ let dss_vcpus xc doms = ) ] in + let nonaffine_vcpus_ds = + match ri.Xenctrl.Runstateinfo.V2.running with + | 0L -> + [] + | _ -> + [ + ( Rrd.VM uuid + , Ds.ds_make ~name:"numa_node_nonaffine_vcpus" + ~units:"(fraction)" + ~value: + (Rrd.VT_Float + (Int64.to_float ri.Xenctrl.Runstateinfo.V2.nonaffine + /. 1.0e9 + ) + ) + ~description: + "Fraction of vCPU time running outside of vCPU \ + soft-affinity" + ~ty:Rrd.Derive ~default:false ~min:0.0 ~max:1.0 () + ) + ] + in ( Rrd.VM uuid , Ds.ds_make ~name:"runstate_fullrun" ~units:"(fraction)" ~value: @@ -172,6 +194,7 @@ let dss_vcpus xc doms = ) :: dss @ runnable_vcpus_ds + @ nonaffine_vcpus_ds with _ -> dss in try cpus 0 dss with _ -> dss diff --git a/ocaml/xcp-rrdd/bin/rrdp-squeezed/dune b/ocaml/xcp-rrdd/bin/rrdp-squeezed/dune index 75c8e1f5ab5..ee5f217f137 100644 --- a/ocaml/xcp-rrdd/bin/rrdp-squeezed/dune +++ b/ocaml/xcp-rrdd/bin/rrdp-squeezed/dune @@ -13,6 +13,7 @@ xapi-log xapi-rrd xenctrl + xenctrl_ext xenstore xenstore.unix xenstore_transport diff --git a/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml b/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml index df49dca259f..951b7a8b1d5 100644 --- a/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml +++ b/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml @@ -252,6 +252,62 @@ let free_other uuid free = ~value:(Rrd.VT_Int64 free) ~ty:Rrd.Gauge ~min:0.0 ~default:true () ) +let dss_numa_info uuid domid = + try + let handle = Xenctrlext.get_handle () in + let host_nr_nodes = Xenctrlext.get_nr_nodes handle in + let vm_nodes = + Xenctrlext.DomainNuma.domain_get_numa_info_node_pages handle domid + in + let dss_memory_numa_nodes_of_vm (dss, nr_nodes_used_by_vm) + (node_id, tot_pages_per_node) = + (* + for each numa node used by the host, show the + corresponding amount of memory used by the vm + *) + let is_node_used_by_vm = tot_pages_per_node > 4096L in + let is_node_used_by_host = node_id < host_nr_nodes in + if is_node_used_by_host then + ( ( Rrd.VM uuid + , Ds.ds_make + ~name:(Printf.sprintf "memory_numa_node_%d" node_id) + ~units:"B" + ~description: + (Printf.sprintf "Memory from NUMA node %d used by VM" node_id) + ~value:(Rrd.VT_Int64 (Int64.mul tot_pages_per_node 4096L)) + ~min:0.0 ~ty:Rrd.Gauge ~default:false () + ) + :: dss + (* remember the number of nodes used by vm *) + , nr_nodes_used_by_vm + + + if is_node_used_by_vm then + 1 + else + 0 + ) + else + (dss, nr_nodes_used_by_vm) + in + let dss_numa_nodes_of_vm (dss, nr_nodes_used_by_vm) = + ( Rrd.VM uuid + , Ds.ds_make + ~name:(Printf.sprintf "numa_nodes") + ~units:"count" + ~description:(Printf.sprintf "Number of NUMA nodes used by VM") + ~value:(Rrd.VT_Int64 (Int64.of_int nr_nodes_used_by_vm)) + ~min:0.0 ~ty:Rrd.Gauge ~default:false () + ) + :: List.rev dss + in + vm_nodes.Xenctrlext.DomainNuma.tot_pages_per_node + |> Array.mapi (fun i x -> (i, x)) + |> Array.fold_left dss_memory_numa_nodes_of_vm ([], 0) + |> dss_numa_nodes_of_vm + with e -> + D.debug "dss_numa_info: %s" (Printexc.to_string e) ; + [] + let get_list f = Option.to_list (f ()) let generate_vm_sources domains = @@ -288,6 +344,7 @@ let generate_vm_sources domains = ~value:(Rrd.VT_Int64 memory) ~ty:Rrd.Gauge ~min:0.0 ~default:true () ) in + let get_list_numa_info = dss_numa_info uuid domid in (* CA-34383: Memory updates from paused domains serve no useful purpose. During a migrate such updates can also cause undesirable discontinuities in the observed value of memory_actual. Hence, we @@ -295,7 +352,7 @@ let generate_vm_sources domains = if dom.Xenctrl.paused then [] else - get_list target @ get_list free @ get_list total + get_list target @ get_list free @ get_list total @ get_list_numa_info in List.concat_map metrics_of domains diff --git a/ocaml/xenopsd/lib/xenops_server.ml b/ocaml/xenopsd/lib/xenops_server.ml index 55e5844ee29..7e7951d5cb2 100644 --- a/ocaml/xenopsd/lib/xenops_server.ml +++ b/ocaml/xenopsd/lib/xenops_server.ml @@ -155,7 +155,8 @@ type atomic = | VM_create_device_model of (Vm.id * bool) | VM_destroy_device_model of Vm.id | VM_destroy of Vm.id - | VM_create of (Vm.id * int64 option * Vm.id option * bool) (*no_sharept*) + | VM_create of (Vm.id * (int64 * int64 option) option * Vm.id option * bool) + (*no_sharept*) | VM_build of (Vm.id * bool) | VM_shutdown_domain of (Vm.id * shutdown_request * float) | VM_s3suspend of Vm.id @@ -330,6 +331,7 @@ type vm_receive_op = { ; vmr_socket: Unix.file_descr ; vmr_handshake: string option (** handshake protocol *) ; vmr_compressed: bool + ; vmr_memory_total_source: int64 option [@default None] } [@@deriving rpcty] @@ -2317,19 +2319,26 @@ let rec perform_atomic ~progress_callback ?result (op : atomic) | VM_destroy id -> debug "VM.destroy %s" id ; B.VM.destroy t (VM_DB.read_exn id) - | VM_create (id, memory_upper_bound, final_id, no_sharept) -> + | VM_create (id, memory_upper_bound_and_source, final_id, no_sharept) -> let num_of_vbds = List.length (VBD_DB.vbds id) in let num_of_vifs = List.length (VIF_DB.vifs id) in + let memory_upper_bound = Option.map fst memory_upper_bound_and_source + and memory_total_source = + Option.map snd memory_upper_bound_and_source |> Option.join + in debug - "VM.create %s memory_upper_bound = %s, num_of_vbds = %d, num_of_vifs = \ - %d" + "VM.create %s memory_upper_bound = %s, memory_total_source = %s, \ + num_of_vbds = %d, num_of_vifs = %d" id (Option.value ~default:"None" (Option.map Int64.to_string memory_upper_bound) ) + (Option.value ~default:"None" + (Option.map Int64.to_string memory_total_source) + ) num_of_vbds num_of_vifs ; - B.VM.create t memory_upper_bound (VM_DB.read_exn id) final_id no_sharept - num_of_vbds num_of_vifs + B.VM.create t memory_upper_bound memory_total_source (VM_DB.read_exn id) + final_id no_sharept num_of_vbds num_of_vifs | VM_build (id, force) -> debug "VM.build %s" id ; let vbds : Vbd.t list = VBD_DB.vbds id |> vbd_plug_order in @@ -2897,7 +2906,10 @@ and perform_exn ?result (op : operation) (t : Xenops_task.task_handle) : unit = Request.write (fun _ -> ()) request fd in do_request vm_fd - [("memory_limit", Int64.to_string state.Vm.memory_limit)] + [ + ("memory_limit", Int64.to_string state.Vm.memory_limit) + ; ("memory_total_source", Int64.to_string state.Vm.memory_actual) + ] url ; let first_handshake () = ( match Handshake.recv vm_fd with @@ -3004,6 +3016,7 @@ and perform_exn ?result (op : operation) (t : Xenops_task.task_handle) : unit = vmr_id= id ; vmr_final_id= final_id ; vmr_memory_limit= memory_limit + ; vmr_memory_total_source= memory_total_source ; vmr_socket= s ; vmr_handshake= handshake ; vmr_compressed @@ -3084,7 +3097,14 @@ and perform_exn ?result (op : operation) (t : Xenops_task.task_handle) : unit = ) in perform_atomics - ([VM_create (id, Some memory_limit, Some final_id, no_sharept)] + ([ + VM_create + ( id + , Some (memory_limit, memory_total_source) + , Some final_id + , no_sharept + ) + ] (* Perform as many operations as possible on the destination domain before pausing the original domain *) @ atomics_of_operation (VM_restore_vifs id) @@ -3902,6 +3922,9 @@ module VM = struct let module Response = Cohttp.Response.Make (Cohttp_posix_io.Unbuffered_IO) in let dbg = List.assoc "dbg" cookies in let memory_limit = List.assoc "memory_limit" cookies |> Int64.of_string in + let memory_total_source = + List.assoc_opt "memory_total_source" cookies |> Option.map Int64.of_string + in let handshake = List.assoc_opt cookie_mem_migration cookies in let compressed_memory = get_compression cookies in Debug.with_thread_associated dbg @@ -3932,6 +3955,7 @@ module VM = struct ; vmr_socket= transferred_fd ; vmr_handshake= handshake ; vmr_compressed= compressed_memory + ; vmr_memory_total_source= memory_total_source } in let task = diff --git a/ocaml/xenopsd/lib/xenops_server_plugin.ml b/ocaml/xenopsd/lib/xenops_server_plugin.ml index 9c741786678..ed2eaf2817f 100644 --- a/ocaml/xenopsd/lib/xenops_server_plugin.ml +++ b/ocaml/xenopsd/lib/xenops_server_plugin.ml @@ -81,6 +81,7 @@ module type S = sig val create : Xenops_task.task_handle -> int64 option + -> int64 option -> Vm.t -> Vm.id option -> bool (* no_sharept*) diff --git a/ocaml/xenopsd/lib/xenops_server_simulator.ml b/ocaml/xenopsd/lib/xenops_server_simulator.ml index 8b7450f03e8..5256f1a91d1 100644 --- a/ocaml/xenopsd/lib/xenops_server_simulator.ml +++ b/ocaml/xenopsd/lib/xenops_server_simulator.ml @@ -566,7 +566,7 @@ module VM = struct let remove _vm = () - let create _ memory_limit vm _ _ _ _ = + let create _ memory_limit _ vm _ _ _ _ = with_lock m (create_nolock memory_limit vm) let destroy _ vm = with_lock m (destroy_nolock vm) diff --git a/ocaml/xenopsd/lib/xenops_task.ml b/ocaml/xenopsd/lib/xenops_task.ml index f2c3993cf39..b6aad308536 100644 --- a/ocaml/xenopsd/lib/xenops_task.ml +++ b/ocaml/xenopsd/lib/xenops_task.ml @@ -106,9 +106,10 @@ let with_tracing ~name ~task f = result with exn -> let backtrace = Printexc.get_raw_backtrace () in + Backtrace.is_important exn ; let error = (exn, backtrace) in let _ : (Span.t option, exn) result = Tracer.finish span ~error in - raise exn + Printexc.raise_with_backtrace exn backtrace ) | Error e -> D.warn "Failed to start tracing: %s" (Printexc.to_string e) ; diff --git a/ocaml/xenopsd/lib/xenopsd.ml b/ocaml/xenopsd/lib/xenopsd.ml index 226a686c283..8245dacddfa 100644 --- a/ocaml/xenopsd/lib/xenopsd.ml +++ b/ocaml/xenopsd/lib/xenopsd.ml @@ -77,6 +77,12 @@ let vm_guest_agent_xenstore_quota = ref 128 let vm_guest_agent_xenstore_quota_warn_interval = ref 3600 +let vm_suspend_timeout = ref 1200. + +let vm_suspend_ack_timeout = ref 30. + +let linux_assume_ctrl_features = ref false + let oxenstored_conf = ref "/etc/xen/oxenstored.conf" let for_each_line path f = @@ -320,6 +326,24 @@ let options = , (fun () -> string_of_bool !Xenops_server.xenopsd_vbd_plug_unplug_legacy) , "False if we want to split the plug atomic into attach/activate" ) + ; ( "linux-assume-control-features" + , Arg.Bool (fun b -> linux_assume_ctrl_features := b) + , (fun () -> string_of_bool !linux_assume_ctrl_features) + , "To support old Linux guest kernels, assume the kernel supports certain \ + control features without announcing them. Linux kernels 4.9 and later \ + announce support in xenstore." + ) + ; ( "vm-suspend-timeout" + , Arg.Set_float vm_suspend_timeout + , (fun () -> string_of_float !vm_suspend_timeout) + , "Timeout in seconds for a VM to suspend after acknowledging a suspend \ + request" + ) + ; ( "vm-suspend-ack-timeout" + , Arg.Set_float vm_suspend_ack_timeout + , (fun () -> string_of_float !vm_suspend_ack_timeout) + , "Timeout in seconds for a VM to acknowledge a suspend request" + ) ] let path () = Filename.concat !sockets_path "xenopsd" diff --git a/ocaml/xenopsd/xc/device.ml b/ocaml/xenopsd/xc/device.ml index adedf798d6f..25671be5a49 100644 --- a/ocaml/xenopsd/xc/device.ml +++ b/ocaml/xenopsd/xc/device.ml @@ -2980,6 +2980,16 @@ module Backend = struct ) in let xen_platform_pv_driver_info pv_info = + (* we would like to keep the assumptions about supported + features small and rather have the Linux kernel or a guest agent + announce the actually supported featuresi. See commit message *) + let features = + match !Xenopsd.linux_assume_ctrl_features with + | true -> + ["suspend"; "poweroff"; "reboot"; "vcpu-hotplug"] + | false -> + ["vcpu-hotplug"] + in with_xs (fun xs -> let is_hvm_linux {product_num; build_num} = let _XEN_IOPORT_LINUX_PRODNUM = 3 in @@ -2992,9 +3002,7 @@ module Backend = struct (Printf.sprintf "/local/domain/%d/%s%s" domid prefix x) "1" in - List.iter - (write_local_domain "control/feature-") - ["suspend"; "poweroff"; "reboot"; "vcpu-hotplug"] ; + List.iter (write_local_domain "control/feature-") features ; List.iter (write_local_domain "data/") ["updated"] ) ) diff --git a/ocaml/xenopsd/xc/domain.ml b/ocaml/xenopsd/xc/domain.ml index 1cdc9a87785..63f8cdd9fcc 100644 --- a/ocaml/xenopsd/xc/domain.ml +++ b/ocaml/xenopsd/xc/domain.ml @@ -152,6 +152,8 @@ type builder_spec_info = type build_info = { memory_max: int64 (** memory max in kilobytes *) ; memory_target: int64 (** memory target in kilobytes *) + ; memory_total_source: int64 option + (** amount of memory to claim (during migration) *) ; kernel: string (** in hvm case, point to hvmloader *) ; vcpus: int (** vcpus max *) ; priv: builder_spec_info @@ -246,7 +248,10 @@ let wait_xen_free_mem ~xc ?(maximum_wait_time_seconds = 64) required_memory_kib in (* At exponentially increasing intervals, write *) (* a debug message saying how long we've waited: *) - if is_power_of_2 accumulated_wait_time_seconds then + if + accumulated_wait_time_seconds = 0 + || is_power_of_2 accumulated_wait_time_seconds + then debug "Waited %i second(s) for memory to become available: %Ld KiB free, %Ld \ KiB scrub, %Ld KiB required" @@ -778,8 +783,11 @@ let shutdown_wait_for_ack (t : Xenops_task.task_handle) ~timeout ~xc ~xs domid Xenctrl.domain_shutdown xc domid (shutdown_to_xc_shutdown req) ) else ( debug - "VM = %s; domid = %d; Waiting for domain to acknowledge shutdown request" - uuid domid ; + "VM = %s; domid = %d; Waiting for domain to acknowledge %s request \ + (timeout = %.0fs)" + uuid domid + (string_of_shutdown_reason req) + timeout ; let path = control_shutdown ~xs domid in let cancel = Domain domid in if @@ -788,8 +796,8 @@ let shutdown_wait_for_ack (t : Xenops_task.task_handle) ~timeout ~xc ~xs domid [Watch.key_to_disappear path] t ~xs ~timeout () then - info "VM = %s; domid = %d; Domain acknowledged shutdown request" uuid - domid + info "VM = %s; domid = %d; Domain acknowledged %s request" uuid domid + (string_of_shutdown_reason req) else debug "VM = %s; domid = %d; Domain disappeared" uuid domid ) @@ -987,18 +995,26 @@ let numa_hierarchy = let numa_mutex = Mutex.create () -let numa_resources = ref None +let node_mem_claimable_for_new_vm ~node ~domid m = + let open Xenctrlext.HostNuma in + let nodeid = Fmt.str "%a" Topology.NUMA.pp_dump_node node in + let available = Int64.sub m.free m.claimed in + D.debug + "mem_claimable_for_new_vm: NUMA nodeid=%s, domid=%d: memfree=%Ld \ + memsize=%Ld claimed=%Ld: available=%Ld" + nodeid domid m.free m.size m.claimed available ; + available let numa_init () = let xcext = Xenctrlext.get_handle () in let host = Lazy.force numa_hierarchy in - let mem = (Xenctrlext.numainfo xcext).memory in + let mem = Xenctrlext.HostNuma.numa_get_meminfo xcext in D.debug "Host NUMA information: %s" (Fmt.to_to_string (Fmt.Dump.option Topology.NUMA.pp_dump) host) ; Array.iteri (fun i m -> - let open Xenctrlext in - D.debug "NUMA node %d: %Ld/%Ld memory free" i m.memfree m.memsize + let open Xenctrlext.HostNuma in + D.debug "NUMA node %d: %Ld/%Ld/%Ld memory free" i m.free m.size m.claimed ) mem @@ -1015,21 +1031,17 @@ let numa_placement domid ~vcpus ~cores ~memory affinity = let ( let* ) = Option.bind in let xcext = get_handle () in let* host = Lazy.force numa_hierarchy in - let numa_meminfo = (numainfo xcext).memory |> Array.to_seq in + let numa_meminfo = HostNuma.numa_get_meminfo xcext |> Array.to_seq in let nodes = Seq.map2 - (fun node m -> NUMA.resource host node ~memory:m.memfree) + (fun node m -> + NUMA.resource host node + ~memory:(node_mem_claimable_for_new_vm ~node ~domid m) + ) (NUMA.nodes host) numa_meminfo in let vm = NUMARequest.make ~memory ~vcpus ~cores in - let nodea = - match !numa_resources with - | None -> - Array.of_seq nodes - | Some a -> - Array.map2 NUMAResource.min_memory (Array.of_seq nodes) a - in - numa_resources := Some nodea ; + let nodea = Array.of_seq nodes in let cpu_affinity, memory_plan = match Softaffinity.plan ~vm host nodea with | None -> @@ -1063,7 +1075,7 @@ let numa_placement domid ~vcpus ~cores ~memory affinity = __FUNCTION__ domid ; None in - let nr_pages = Int64.div memory 4096L |> Int64.to_int in + let nr_pages = Memory.pages_of_bytes_used memory |> Int64.to_int in try D.debug "NUMAClaim domid %d: local claim on node %d: %d pages" domid node nr_pages ; @@ -1077,7 +1089,7 @@ let numa_placement domid ~vcpus ~cores ~memory affinity = D.debug "NUMAClaim domid %d: local claim not available" domid ; set_vcpu_affinity cpu_affinity ; None - | Xenctrlext.Unix_error (errno, _) -> + | Xenctrlext.Unix_error ((Unix.ENOMEM as errno), _) -> D.info "%s: unable to claim enough memory, domain %d won't be hosted in a \ single NUMA node. (error %s)" @@ -1089,6 +1101,13 @@ let numa_placement domid ~vcpus ~cores ~memory affinity = let build_pre ~xc ~xs ~vcpus ~memory ~hard_affinity domid = let open Memory in let uuid = get_uuid ~xc domid in + debug + "VM = %s, build_max_mib = %Ld, build_start_mib = %Ld, xen_max_mib =\n\ + \ %Ld, shadow_mib = %Ld, required_host_free_mib = %Ld, overhead_mib = \ + %Ld" + (Uuidx.to_string uuid) memory.build_max_mib memory.build_start_mib + memory.xen_max_mib memory.shadow_mib memory.required_host_free_mib + memory.overhead_mib ; debug "VM = %s; domid = %d; waiting for %Ld MiB of free host memory" (Uuidx.to_string uuid) domid memory.required_host_free_mib ; (* CA-39743: Wait, if necessary, for the Xen scrubber to catch up. *) @@ -1176,10 +1195,38 @@ let build_pre ~xc ~xs ~vcpus ~memory ~hard_affinity domid = and cores = Xenops_server.cores_of_numa_affinity_policy pin ~vcpus in - numa_placement domid ~vcpus ~cores - ~memory:(Int64.mul memory.xen_max_mib 1048576L) - affinity - |> Option.map fst + + let build_claim_bytes = + Memory.bytes_of_pages memory.build_claim_pages + in + D.debug "VM = %s; domid = %d; will claim %Ld bytes = %Ld pages" + (Uuidx.to_string uuid) domid build_claim_bytes + memory.build_claim_pages ; + let memory = build_claim_bytes in + match numa_placement domid ~vcpus ~cores ~memory affinity with + | None -> + (* Always perform a global claim when NUMA placement is + enabled, and single node claims failed or were + unavailable: + This tries to ensures that memory allocated for this + domain won't use up memory claimed by other domains. + If claims are mixed with non-claims then Xen can't + currently guarantee that it would honour the existing + claims. + A failure here is a hard failure: we'd fail allocating + memory later anyway + *) + let nr_pages = + Memory.pages_of_bytes_used memory |> Int64.to_int + in + let xcext = Xenctrlext.get_handle () in + D.debug "NUMAClaim domid %d: global claim: %d pages" domid + nr_pages ; + Xenctrlext.domain_claim_pages xcext domid + ~numa_node:Xenctrlext.NumaNode.none nr_pages ; + None + | Some (plan, _) -> + Some plan ) in let store_chan, console_chan = create_channels ~xc uuid domid in @@ -1871,6 +1918,17 @@ let restore (task : Xenops_task.task_handle) ~xc ~xs ~dm ~timeoffset ~extras maybe_ca_140252_workaround ~xc ~vcpus domid ; (memory, vm_stuff, `pvh) in + let memory = + match info.memory_total_source with + | None -> + memory + | Some kib -> + let build_claim_pages = Memory.pages_of_kib_used kib in + let bytes = Memory.bytes_of_kib kib in + debug "Domid %d: memory_total_source = %Ld bytes = %Ld KiB = %Ld pages" + domid bytes kib build_claim_pages ; + Memory.{memory with build_claim_pages} + in let store_port, console_port, numa_placements = build_pre ~xc ~xs ~memory ~vcpus ~hard_affinity:info.hard_affinity domid in diff --git a/ocaml/xenopsd/xc/domain.mli b/ocaml/xenopsd/xc/domain.mli index 574782fdcec..9c71f78fdea 100644 --- a/ocaml/xenopsd/xc/domain.mli +++ b/ocaml/xenopsd/xc/domain.mli @@ -130,6 +130,8 @@ val builder_spec_info : builder_spec_info Rpc.Types.def type build_info = { memory_max: int64 (** memory max in kilobytes *) ; memory_target: int64 (** memory target in kilobytes *) + ; memory_total_source: int64 option + (** memory used on source during migration/resume in kilobytes *) ; kernel: string (** image to load. In HVM case, point to hvmloader *) ; vcpus: int (** vcpus max *) ; priv: builder_spec_info diff --git a/ocaml/xenopsd/xc/numa.ml b/ocaml/xenopsd/xc/numa.ml index bb42101cc53..f05c2765eed 100644 --- a/ocaml/xenopsd/xc/numa.ml +++ b/ocaml/xenopsd/xc/numa.ml @@ -49,17 +49,17 @@ let human_readable_bytes quantity = else loop [] quantity binary_prefixes |> String.concat ", " -let get_memory () = - let {memory; _} = numainfo xc in - memory +let get_memory () = HostNuma.numa_get_meminfo xc let print_mem c mem = for i = 0 to Array.length mem - 1 do - let {memfree; memsize} = mem.(i) in - let memfree = human_readable_bytes memfree in - let memsize = human_readable_bytes memsize in + let {HostNuma.size; free; claimed} = mem.(i) in + let memfree = human_readable_bytes free in + let memsize = human_readable_bytes size in + let memclaimed = human_readable_bytes claimed in Logs.app (fun m -> - m "\t%d: %s free out of %s" i memfree memsize ~tags:(stamp c) + m "\t%d: %s free / %s claimed out of %s" i memfree memclaimed memsize + ~tags:(stamp c) ) done @@ -72,7 +72,9 @@ let print_diff_mem before after = let diff c old cur = let changed_yet = ref false in for i = 0 to Int.min (Array.length old) (Array.length cur) - 1 do - let {memfree= a_free; _}, {memfree= b_free; _} = (old.(i), cur.(i)) in + let {HostNuma.free= a_free; _}, {HostNuma.free= b_free; _} = + (old.(i), cur.(i)) + in if a_free <> b_free then ( if not !changed_yet then changed_yet := true ; let free = human_readable_bytes b_free in diff --git a/ocaml/xenopsd/xc/xenops_server_xen.ml b/ocaml/xenopsd/xc/xenops_server_xen.ml index 03ce150ae26..f9eae92e8de 100644 --- a/ocaml/xenopsd/xc/xenops_server_xen.ml +++ b/ocaml/xenopsd/xc/xenops_server_xen.ml @@ -31,14 +31,17 @@ let finally = Xapi_stdext_pervasives.Pervasiveext.finally let with_lock = Xapi_stdext_threads.Threadext.Mutex.execute -let internal_error fmt = +let internal_error' e fmt = Printf.ksprintf (fun str -> error "%s" str ; - raise (Xenopsd_error (Internal_error str)) + let err = Xenopsd_error (Internal_error str) in + match e with None -> raise err | Some e -> Backtrace.reraise e err ) fmt +let internal_error fmt = internal_error' None fmt + (* libxl_internal.h:DISABLE_UDEV_PATH *) let disable_udev_path = "libxl/disable_udev" @@ -269,8 +272,9 @@ let di_of_uuid ~xc uuid = let domid_list = String.concat ", " (List.map domid_of_di possible) in let uuid' = Uuidx.to_string uuid in internal_error "More than one domain with uuid %s: (%s)" uuid' domid_list - | exception Failure r -> - internal_error "%s" r + | exception (Failure r as e) -> + Backtrace.is_important e ; + internal_error' (Some e) "%s" r let domid_of_uuid ~xs uuid = (* We don't fully control the domain lifecycle because libxenguest will @@ -658,7 +662,11 @@ module Mem = struct ~default:(min, ("none", min)) (reserve_memory_range dbg min max) in - try f amount id with e -> delete_reservation dbg id ; raise e + try f amount id + with e -> + let bt = Printexc.get_raw_backtrace () in + delete_reservation dbg id ; + Printexc.raise_with_backtrace e bt (** Transfer this 'reservation' to the given domain id *) let transfer_reservation_to_domain_exn dbg domid (reservation_id, amount) = @@ -1637,6 +1645,7 @@ module VM = struct { Domain.memory_max= vm.memory_static_max /// 1024L ; memory_target= vm.memory_dynamic_min /// 1024L + ; memory_total_source= None ; kernel= "" ; vcpus= vm.vcpu_max ; priv= builder_spec_info @@ -1747,8 +1756,8 @@ module VM = struct in (device_id, revision) - let create_exn task memory_upper_bound vm final_id no_sharept num_of_vbds - num_of_vifs = + let create_exn task memory_upper_bound memory_total_source vm final_id + no_sharept num_of_vbds num_of_vifs = let k = vm.Vm.id in with_xc_and_xs (fun xc xs -> (* Ensure the DB contains something for this VM - this is to avoid a @@ -1821,33 +1830,40 @@ module VM = struct needed. If we are live migrating then we will only know an upper bound. If we are starting from scratch then we have a free choice. *) - let min_bytes, max_bytes = + let min_bytes, max_bytes, memory_total_source_bytes = match memory_upper_bound with | Some x -> debug "VM = %s; using memory_upper_bound = %Ld" vm.Vm.id x ; - (x, x) + (x, x, memory_total_source) | None -> if resuming then ( debug "VM = %s; using stored suspend_memory_bytes = %Ld" vm.Vm.id persistent.VmExtra.suspend_memory_bytes ; ( persistent.VmExtra.suspend_memory_bytes , persistent.VmExtra.suspend_memory_bytes + , Some persistent.VmExtra.suspend_memory_bytes ) ) else ( debug "VM = %s; using memory_dynamic_min = %Ld and \ memory_dynamic_max = %Ld" vm.Vm.id vm.memory_dynamic_min vm.memory_dynamic_max ; - (vm.memory_dynamic_min, vm.memory_dynamic_max) + (vm.memory_dynamic_min, vm.memory_dynamic_max, None) ) in let min_kib = kib_of_bytes_used (min_bytes +++ overhead_bytes) + and memory_total_source_kib = + Option.map kib_of_bytes_used memory_total_source_bytes and max_kib = kib_of_bytes_used (max_bytes +++ overhead_bytes) in (* XXX: we would like to be able to cancel an in-progress with_reservation *) let dbg = Xenops_task.get_dbg task in Mem.with_reservation dbg min_kib max_kib (fun target_plus_overhead_kib reservation_id -> + debug + "VM = %s, memory [%Ld KiB, %Ld KiB], \ + target_plus_overhead=%Ld KiB" + vm.Vm.id min_kib max_kib target_plus_overhead_kib ; let domain_config, persistent = match persistent.VmExtra.domain_config with | Some dc -> @@ -1888,8 +1904,26 @@ module VM = struct let target_bytes = target_plus_overhead_bytes --- overhead_bytes in + debug + "VM = %s, memory target_bytes = %Ld, dynamic max = %Ld" + vm.Vm.id target_bytes vm.memory_dynamic_max ; min vm.memory_dynamic_max target_bytes in + let persistent = + match persistent with + | {VmExtra.build_info= Some x; _} as t -> + { + t with + build_info= + Some + { + x with + memory_total_source= memory_total_source_kib + } + } + | _ -> + persistent + in set_initial_target ~xs domid (Int64.div initial_target 1024L) ; (* Log uses of obsolete option *) if vm.suppress_spurious_page_faults then @@ -2367,6 +2401,7 @@ module VM = struct { Domain.memory_max= static_max_kib ; memory_target= initial_target + ; memory_total_source= None ; kernel ; vcpus= vm.vcpu_max ; priv @@ -2590,36 +2625,42 @@ module VM = struct try build_domain_exn xc xs domid task vm vbds vifs vgpus vusbs extras force - with - | Bootloader.Bad_sexpr x -> - internal_error "VM = %s; domid = %d; Bootloader.Bad_sexpr %s" - vm.Vm.id domid x - | Bootloader.Bad_error x -> - internal_error "VM = %s; domid = %d; Bootloader.Bad_error %s" - vm.Vm.id domid x - | Bootloader.Unknown_bootloader x -> - internal_error - "VM = %s; domid = %d; Bootloader.Unknown_bootloader %s" vm.Vm.id - domid x - | Bootloader.Error_from_bootloader x -> - let m = - Printf.sprintf - "VM = %s; domid = %d; Bootloader.Error_from_bootloader %s" + with e -> ( + Backtrace.is_important e ; + let internal_error fmt = internal_error' (Some e) fmt in + match e with + | Bootloader.Bad_sexpr x -> + internal_error "VM = %s; domid = %d; Bootloader.Bad_sexpr %s" vm.Vm.id domid x - in - debug "%s" m ; - raise (Xenopsd_error (Bootloader_error (vm.Vm.id, x))) - | Domain.Not_enough_memory m -> - debug - "VM = %s; domid = %d; Domain.Not_enough_memory. Needed: %Ld bytes" - vm.Vm.id domid m ; - raise (Xenopsd_error (Not_enough_memory m)) - | e -> - let m = - Printf.sprintf "VM = %s; domid = %d; Error: %s" vm.Vm.id domid - (Printexc.to_string e) - in - debug "%s" m ; raise e + | Bootloader.Bad_error x -> + internal_error "VM = %s; domid = %d; Bootloader.Bad_error %s" + vm.Vm.id domid x + | Bootloader.Unknown_bootloader x -> + internal_error + "VM = %s; domid = %d; Bootloader.Unknown_bootloader %s" vm.Vm.id + domid x + | Bootloader.Error_from_bootloader x -> + let m = + Printf.sprintf + "VM = %s; domid = %d; Bootloader.Error_from_bootloader %s" + vm.Vm.id domid x + in + debug "%s" m ; + Backtrace.reraise e + (Xenopsd_error (Bootloader_error (vm.Vm.id, x))) + | Domain.Not_enough_memory m -> + debug + "VM = %s; domid = %d; Domain.Not_enough_memory. Needed: %Ld \ + bytes" + vm.Vm.id domid m ; + Backtrace.reraise e (Xenopsd_error (Not_enough_memory m)) + | e -> + let m = + Printf.sprintf "VM = %s; domid = %d; Error: %s" vm.Vm.id domid + (Printexc.to_string e) + in + debug "%s" m ; Backtrace.reraise e e + ) ) (fun () -> clean_memory_reservation task di.Xenctrl.domid) @@ -2859,7 +2900,9 @@ module VM = struct not ( with_tracing ~task ~name:"VM_save_domain_suspend_callback_request_shutdown" - @@ fun () -> request_shutdown task vm Suspend 30. + @@ fun () -> + request_shutdown task vm Suspend + !Xenopsd.vm_suspend_ack_timeout ) then raise (Xenopsd_error Failed_to_acknowledge_suspend_request) ; @@ -2870,14 +2913,25 @@ module VM = struct | _ -> () ) ; + let suspend_timeout = !Xenopsd.vm_suspend_timeout in if not ( with_tracing ~task ~name:"VM_save_domain_suspend_callback_wait_shutdown" - @@ fun () -> wait_shutdown task vm Suspend 1200. + @@ fun () -> + debug + "VM = %s; domid = %d; Waiting for domain to suspend \ + (timeout = %.0fs)" + vm.Vm.id domid suspend_timeout ; + wait_shutdown task vm Suspend suspend_timeout ) then - raise (Xenopsd_error (Failed_to_suspend (vm.Vm.id, 1200.))) + raise + (Xenopsd_error + (Failed_to_suspend (vm.Vm.id, suspend_timeout)) + ) + else + debug "VM = %s; domid = %d; Domain suspended" vm.Vm.id domid ) ; (* Record the final memory usage of the domain so we know how much to allocate for the resume *) @@ -2992,6 +3046,7 @@ module VM = struct | _ -> "" in + debug "VM = %s, initial_target = %Ld" vm.Vm.id initial_target ; ({x with Domain.memory_target= initial_target}, timeoffset) in let vtpm = vtpm_of ~vm in @@ -3131,7 +3186,10 @@ module VM = struct let memory_actual = let pages = Int64.of_nativeint di.Xenctrl.total_memory_pages in let kib = Xenctrl.pages_to_kib pages in - Memory.bytes_of_kib kib + let bytes = Memory.bytes_of_kib kib in + D.debug "VM %s memory actual: %Ld pages = %Ld KiB = %Ld bytes" + (Uuidm.to_string uuid) pages kib bytes ; + bytes in let memory_limit = (* The maximum amount of memory the domain can consume is the max @@ -3154,7 +3212,10 @@ module VM = struct in (* CA-31764: may be larger than static_max if maxmem has been increased to initial-reservation. *) - max memory_actual max_memory_bytes + let result = max memory_actual max_memory_bytes in + D.debug "VM %s memory limit = %Ld bytes" (Uuidm.to_string uuid) + result ; + result in let rtc = try diff --git a/ocaml/xenopsd/xenopsd.conf b/ocaml/xenopsd/xenopsd.conf index e1c3c87c7cb..99669c84558 100644 --- a/ocaml/xenopsd/xenopsd.conf +++ b/ocaml/xenopsd/xenopsd.conf @@ -116,3 +116,17 @@ disable-logging-for=http tracing tracing_export # On Intel a similar effect is already achieved with iPAT in Xen, # but setting this to 0 works on Intel too. # xen-platform-pci-bar-uc=false + +# Timeout in seconds for a VM to suspend after acknowledging a suspend request +# vm-suspend-timeout=1200 + +# Timeout in seconds for a VM to acknowledge a suspend request +# vm-suspend-ack-timeout=30 +# + +# A Linux guest kernel 4.9+ announces the control features it supports in +# xenstore. If it does not, xenopsd has to assume it supports a range of +# operations. The default "false" is to rely on guest kernels to announce +# support. Use "true" to assume support even when unannounced. +# linux-assume-control-features = false + diff --git a/quality-gate.sh b/quality-gate.sh index c82a98ea57f..8778b15a03f 100755 --- a/quality-gate.sh +++ b/quality-gate.sh @@ -3,7 +3,7 @@ set -e list-hd () { - N=244 + N=239 LIST_HD=$(git grep -r --count 'List.hd' -- **/*.ml | cut -d ':' -f 2 | paste -sd+ - | bc) if [ "$LIST_HD" -eq "$N" ]; then echo "OK counted $LIST_HD List.hd usages" @@ -25,7 +25,7 @@ verify-cert () { } mli-files () { - N=457 + N=455 X="ocaml/tests" X+="|ocaml/quicktest" X+="|ocaml/message-switch/core_test" diff --git a/scripts/xapi.conf b/scripts/xapi.conf index 8736fed6c0d..e8df3cf739b 100644 --- a/scripts/xapi.conf +++ b/scripts/xapi.conf @@ -389,6 +389,9 @@ sm-plugins=ext nfs iscsi lvmoiscsi dummy file hba rawhba udev iso lvm lvmohba lv # How often tracing will export spans to endpoints # export-interval = 30. +# maximium number of messages kept before oldest are removed +# message-limit = 10000 + # The file to check if host reboot required reboot_required_hfxs = /run/reboot-required.hfxs