diff --git a/CMakeLists.txt b/CMakeLists.txt index 18a5005514..adb5603b68 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -88,6 +88,7 @@ option(WITH_LCOV "Enable code coverage analysis with lcov" OFF) option(WITH_PYTHON "Enable Python external bindings" OFF) option(WITH_BOOST "Enable Boost framework" OFF) option(WITH_MUNGE "Enable Munge authentication" ON) +option(WITH_SYSTEMD "Enable systemd support" OFF) # private extensions set(PROJECT_EXTENSIONS "None" CACHE STRING "directory of private extensions") @@ -188,6 +189,10 @@ if (WITH_MUNGE) add_compile_definitions("OCS_WITH_MUNGE") endif() +if (WITH_SYSTEMD) + add_compile_definitions("OCS_WITH_SYSTEMD") +endif() + #if (SGE_ARCH MATCHES "darwin-arm64" OR SGE_ARCH MATCHES "fbsd-amd64") if (NOT WITH_SPOOL_BERKELEYDB AND NOT WITH_SPOOL_DYNAMIC) set(SPOOLING_LIBS spoolloader spoolc_static spool) diff --git a/cmake/ArchitectureSpecificSettings.cmake b/cmake/ArchitectureSpecificSettings.cmake index 790a15e38e..27cde3ecca 100644 --- a/cmake/ArchitectureSpecificSettings.cmake +++ b/cmake/ArchitectureSpecificSettings.cmake @@ -61,25 +61,7 @@ function(architecture_specific_settings) message("Build with extensions is enabled") endif() - if (SGE_ARCH MATCHES "lx-riscv64") - # Linux RiscV - message(STATUS "We are on Linux: ${SGE_ARCH}") - set(CMAKE_C_FLAGS "-Wall -Werror -pedantic" CACHE STRING "" FORCE) - set(CMAKE_CXX_FLAGS "-Wall -Werror -pedantic" CACHE STRING "" FORCE) - - add_compile_definitions(LINUX _GNU_SOURCE GETHOSTBYNAME_R6 GETHOSTBYADDR_R8 HAS_IN_PORT_T SPOOLING_dynamic __SGE_COMPILE_WITH_GETTEXT__) - add_compile_options(-fPIC) - add_compile_options(-pthread) - add_link_options(-pthread -rdynamic) - - set(TIRPC_INCLUDES /usr/include/tirpc PARENT_SCOPE) - set(TIRPC_LIB tirpc PARENT_SCOPE) - message(STATUS "using libtirpc") - - set(WITH_JEMALLOC OFF PARENT_SCOPE) - set(WITH_MTMALLOC OFF PARENT_SCOPE) - set(JNI_ARCH "linux" PARENT_SCOPE) - elseif (SGE_ARCH MATCHES "lx-.*" OR SGE_ARCH MATCHES "ulx-.*" OR SGE_ARCH 
MATCHES "xlx-.*") + if (SGE_ARCH MATCHES "lx-.*" OR SGE_ARCH MATCHES "ulx-.*" OR SGE_ARCH MATCHES "xlx-.*") # master is not supported on CentOS 6. Execd is deprecated and will be removed in the future. if (SGE_ARCH STREQUAL "xlx-.*") set(INSTALL_SGE_BIN_MASTER OFF CACHE BOOL "Install master daemon binaries" FORCE) @@ -166,6 +148,23 @@ function(architecture_specific_settings) message(STATUS "no libtirpc or libntirpc found") endif () + # build with systemd? auto-enable the WITH_SYSTEMD cache option when the sd-bus header is installed + # @todo we might want to check the api version, we need at least + # - 235: here FreezeUnit and ThawUnit were added (not required, we work around this not being available) + # - 231: 240? here sd_bus_process() was added (not required, we work around this) + # - 221: here StopUnit was added + # Our build hosts are OK as it is (RHEL-8 compatible for lx-* has a recent enough version, + # RHEL-7 compatible for ulx-* does not have it at all) + if (EXISTS /usr/include/systemd/sd-bus.h) + set(WITH_SYSTEMD ON CACHE BOOL "Enable systemd support" FORCE) + message(STATUS "systemd development files found") + endif() + + if (SGE_ARCH MATCHES "lx-riscv64") + # Linux RiscV + add_compile_options(-fPIC) + set(WITH_JEMALLOC OFF PARENT_SCOPE) + endif() if (SGE_ARCH STREQUAL "lx-x86" OR SGE_ARCH STREQUAL "ulx-x86" OR SGE_ARCH STREQUAL "xlx-x86") # we need patchelf for setting the run path in the db_* tools # but patchelf is not available on CentOS 7 x86 diff --git a/doc/markdown/man/man5/sge_conf.md b/doc/markdown/man/man5/sge_conf.md index 8b5698476c..bcebaba281 100644 --- a/doc/markdown/man/man5/sge_conf.md +++ b/doc/markdown/man/man5/sge_conf.md @@ -1044,11 +1044,19 @@ completely. ***ENABLE_BINDING*** -If this parameter is set then xxQS_NAMExx enables the core binding module within the execution daemon to apply -binding parameters that are specified during submission time of a job. 
This parameter is not set per default and +If this parameter is set, then xxQS_NAMExx enables the core binding module within the execution daemon to apply +binding parameters that are specified during submission time of a job. This parameter is not set per default, and therefore all binding related information will be ignored. Find more information for job to core binding in the section `-binding` of qsub(1). +***ENABLE_SYSTEMD*** + +If this parameter is set, +and an execution host supports systemd, then jobs will be started in a systemd scope. This allows the execution daemon to +manage the job's processes as a group, which is useful for resource management and job control. + +This parameter is set to true by default, meaning that on hosts that support systemd, jobs will be started in a systemd scope. If a host does not support systemd, then this parameter will be ignored. + ***SCRIPT_TIMEOUT*** This parameter allows to configure the allowed runtime of execution side scripts like prolog, epilog, and the PE @@ -1060,6 +1068,15 @@ in one load report interval. The default for *execd_params* is none. The global configuration entry for this value may be overwritten by the execution host local configuration. +***USAGE_COLLECTION*** + +This parameter controls how xxqs_name_sxx_execd collects the online usage information of jobs. The following values are recognized: + +- *FALSE* : No online usage information is collected. Use with care, this also disables limit enforcement for *s_cpu*, *h_cpu*, *s_rss*, *h_rss*, *s_vmem*, and *h_vmem*. +- *PDC* : Online usage information is collected by the PDC (Portable Data Collector) mode, even if Systemd is available. +- *HYBRID* : Hybrid mode, where online usage information is both gathered via Systemd (if available) and the PDC. Use this mode when your jobs are controlled by systemd, but you also want to collect usage values that are not available via Systemd, e.g., vmem, maxvmem, io, and iow. 
+- *TRUE* : This is the default mode. Online usage information is collected via Systemd if the host supports Systemd and *ENABLE_SYSTEMD* is set to *TRUE* (which is the default). It is collected by the PDC (Portable Data Collector) if the host does not support Systemd or if *ENABLE_SYSTEMD* is set to *FALSE*. + ## gdi_request_limits This value is a global configuration parameter only, and is used to prevent denial-of-service attacks on the xxqs_name_sxx_qmaster(8) process. @@ -1375,3 +1392,4 @@ xxqs_name_sxx_shepherd*(8), cron(8), # COPYRIGHT See xxqs_name_sxx_intro(1) for a full statement of rights and permissions. + diff --git a/doc/markdown/manual/development-guide/00_overview.md b/doc/markdown/manual/development-guide/00_overview.md index 240d09b3dd..420554a466 100644 --- a/doc/markdown/manual/development-guide/00_overview.md +++ b/doc/markdown/manual/development-guide/00_overview.md @@ -34,7 +34,12 @@ Tags and branches before `V9` will also not be described here. | | V900p1\_TAG | patch to the 9.0.0 making it work on GCP (CS-663) | | | | | | V90\_BRANCH | | maintenance of 9.0 | +| | V903\_TAG | third 9.0 patch | +| | V904\_TAG | fourth 9.0 patch | +| | V905\_TAG | fifth 9.0 patch | +| | V906\_TAG | sixth 9.0 patch | +| | V907\_TAG | seventh 9.0 patch | | | | | -[//]: # (Each file has to end with two emty lines) +[//]: # (Each file has to end with two empty lines) diff --git a/doc/markdown/manual/development-guide/01_prepare_dev_env.md b/doc/markdown/manual/development-guide/01_prepare_dev_env.md index 89d670777a..ddfb5b21a6 100644 --- a/doc/markdown/manual/development-guide/01_prepare_dev_env.md +++ b/doc/markdown/manual/development-guide/01_prepare_dev_env.md @@ -267,5 +267,5 @@ git clone https://github.com/hpc-gridware/gcs-extensions git clone https://github.com/hpc-gridware/gcs-testsuite ``` -[//]: # (Each file has to end with two emty lines) +[//]: # (Each file has to end with two empty lines) diff --git 
a/doc/markdown/manual/development-guide/02_build_configuration.md b/doc/markdown/manual/development-guide/02_build_configuration.md index 766225b522..76ba701e21 100644 --- a/doc/markdown/manual/development-guide/02_build_configuration.md +++ b/doc/markdown/manual/development-guide/02_build_configuration.md @@ -188,5 +188,5 @@ Here we use *CLion* as example because it provides full integration with CMake t Next step is to build and install xxQS_NAMExx. -[//]: # (Eeach file has to end with two emty lines) +[//]: # (Eeach file has to end with two empty lines) diff --git a/doc/markdown/manual/development-guide/03_build_and_installation.md b/doc/markdown/manual/development-guide/03_build_and_installation.md index 20d1a188ab..ab0a2dc5cb 100644 --- a/doc/markdown/manual/development-guide/03_build_and_installation.md +++ b/doc/markdown/manual/development-guide/03_build_and_installation.md @@ -21,5 +21,5 @@ make install You can now either install the product (follow the instructions in the *Installation Guide*) or you can continue to setup the automated test environment as described in the next chapter. -[//]: # (Eeach file has to end with two emty lines) +[//]: # (Eeach file has to end with two empty lines) diff --git a/doc/markdown/manual/development-guide/05_simulating_hosts_and_execution.md b/doc/markdown/manual/development-guide/05_simulating_hosts_and_execution.md index fb3dd31ac6..cb512ccf1b 100644 --- a/doc/markdown/manual/development-guide/05_simulating_hosts_and_execution.md +++ b/doc/markdown/manual/development-guide/05_simulating_hosts_and_execution.md @@ -101,5 +101,5 @@ Instead, the job execution is just simulated. 
@todo add details -[//]: # (Eeach file has to end with two emty lines) +[//]: # (Eeach file has to end with two empty lines) diff --git a/doc/markdown/manual/development-guide/25_scheduler_thread.md b/doc/markdown/manual/development-guide/25_scheduler_thread.md index 48773c0892..1a40e99567 100644 --- a/doc/markdown/manual/development-guide/25_scheduler_thread.md +++ b/doc/markdown/manual/development-guide/25_scheduler_thread.md @@ -296,5 +296,5 @@ we can switch off a few potentially expensive features and just rely on scheduli * do not configure queue load_thresholds and suspend_thresholds * do not use load adjustments (in the scheduler config) -[//]: # (Eeach file has to end with two emty lines) +[//]: # (Eeach file has to end with two empty lines) diff --git a/doc/markdown/manual/installation-guide/01_planning_the_installation.md b/doc/markdown/manual/installation-guide/01_planning_the_installation.md index e341f58feb..8483f3643e 100644 --- a/doc/markdown/manual/installation-guide/01_planning_the_installation.md +++ b/doc/markdown/manual/installation-guide/01_planning_the_installation.md @@ -280,7 +280,7 @@ Please refer to the next question for more information. ### Where is the spooling area for the master service located? -For HA-setups, it must be a shared network location; otherwise, it can be the local filesystem of the host +For HA setups, it must be a shared network location; otherwise, it can be the local filesystem of the host running the master service. Ensure that the spooling location meets the requirements of the spooling mechanism. Classic spooling can be done on @@ -359,4 +359,5 @@ If this is your first time installing xxQS_NAMExx, we suggest a manual installat Automatic installation is recommended if you need to install or reinstall a cluster multiple times or if you plan to install multiple clusters with slightly different settings. 
-[//]: # (Eeach file has to end with two emty lines) +[//]: # (Eeach file has to end with two empty lines) + diff --git a/doc/markdown/manual/installation-guide/02_download.md b/doc/markdown/manual/installation-guide/02_download.md index 6bf080004b..53f312a09e 100644 --- a/doc/markdown/manual/installation-guide/02_download.md +++ b/doc/markdown/manual/installation-guide/02_download.md @@ -54,5 +54,5 @@ Once you have downloaded all packages, you can test and install them at the desi 4. If your `` is located on a shared filesystem available on all hosts in the cluster then you can start the installation process. -[//]: # (Eeach file has to end with two emty lines) +[//]: # (Eeach file has to end with two empty lines) diff --git a/doc/markdown/manual/installation-guide/03_installation.md b/doc/markdown/manual/installation-guide/03_installation.md index 5be3fbeaa5..a7af643d01 100644 --- a/doc/markdown/manual/installation-guide/03_installation.md +++ b/doc/markdown/manual/installation-guide/03_installation.md @@ -5,17 +5,17 @@ installation process for xxQS_NAMExx. ## Manual Installation -This section covers the manual installation process on the command line. Note the prerequisites are required as outlined in previous chapters. If the hostname setup, usernames and service configuration are correct for all hosts that you intend to include in you cluster, then you can continue with the installation the master service. +This section covers the manual installation process on the command line. Note the prerequisites are required as outlined in previous chapters. If the hostname setup, usernames and service configuration are correct for all hosts that you intend to include in your cluster, then you can continue with the installation of the master service. ### Installation of the Master Service -During the execution the master's installation procedure following steps are processed. +During the execution of the master's installation procedure, the following steps are processed. 
* a cell directory will be created that will contain files that are read by all service components as well as client applications communicating with the service within that cell. -* next installation steps for other services are prepared (e.g. admin hosts are defined that will later on be allowed to run execution services). +* next installation steps for other services are prepared (e.g., admin hosts are defined that will later on be allowed to run execution services). -* a default configuration is created and user specific changes are applied. +* a default configuration is created and user-specific changes are applied. * the master service is started and basic tests of its functionality are executed. @@ -37,7 +37,7 @@ Here are the steps required to complete the installation. ``` # ./install_qmaster ``` - In order to install with Munge authentication pass the `-munge` option to the installation script. + To install with Munge authentication pass the `-munge` option to the installation script. ``` # ./install_qmaster -munge @@ -65,7 +65,7 @@ Here are the steps required to complete the installation. Hit to continue >> ``` -4. Admin User: Either accept the suggested admin user or reject it. If you reject the suggestion then you can select a different one. +4. Admin User: Either accept the suggested admin user or reject it. If you reject the suggestion, then you can select a different one. ``` Cluster Scheduler admin user account @@ -105,7 +105,7 @@ Here are the steps required to complete the installation. to use default [] >> ``` -6. Master Service Port: Specify which service port should be used for the master service. If you have an entry in */etc/services* or if a directory service is available that provides that information for `sge_qmaster` then the installer will show you the configured port number and use that as service port. Alternatively you can specify a different port number via shell environment. +6. 
Master Service Port: Specify which service port should be used for the master service. If you have an entry in */etc/services* or if a directory service is available that provides that information for `sge_qmaster` then the installer will show you the configured port number and use that as service port. Alternatively, you can specify a different port number via shell environment. ``` Cluster Scheduler TCP/IP communication service @@ -132,7 +132,7 @@ Here are the steps required to complete the installation. (default: 2) >> ``` -7. Execution Service Port: Specify which service port should be used for the execution service. If you have an entry in */etc/services* or if a directory service is available that provides that information for `sge_execd` then the installer will show you the configured port number and use that as service port. Alternatively you can specify a different port number via shell environment. +7. Execution Service Port: Specify which service port should be used for the execution service. If you have an entry in */etc/services* or if a directory service is available that provides that information for `sge_execd` then the installer will show you the configured port number and use that as service port. Alternatively, you can specify a different port number via shell environment. ``` Cluster Scheduler TCP/IP communication service @@ -248,7 +248,7 @@ Here are the steps required to complete the installation. Hit to continue >> ``` -13. Choose Spooling Method: Select classic or BDB spooling. As part of this step the spooling file will be created. +13. Choose Spooling Method: Select classic or BDB spooling. As part of this step, the spooling file will be created. ``` Setup spooling @@ -339,7 +339,24 @@ Here are the steps required to complete the installation. Hit to continue >> ``` -18. Autostart: Select if the master service should be integrated into the launch environment of the OS. +18. 
Systemd slice: If you are using a systemd based operating system, then the installer will create a toplevel systemd slice under which the xxQS_NAMExx services and the jobs will be run. +It is recommended to specify a unique slice name to make sure that the slice does not collide with the setup of other clusters that you might run in parallel. The suggested default is `ocs` + the qmaster service port number, e.g. `ocs6444.slice`. +The slice name is stored in `$SGE_ROOT/$SGE_CELL/default/slice_name`. +This dialog will only be shown once during the installation of the first service running on a systemd based operating system, usually the master service. + + ``` + Setting up systemd toplevel slice name + -------------------------------------- + qmaster will be running withing a top level systemd/cgroups slice, + default is "ocs6444.slice". + If you are running multiple clusters on the same host, + please use a unique slice name. + + Please enter the slice name (without the trailing .slice) or + hit to use [ocs6444] >> + ``` + +19. Autostart: Select if the master service should be integrated into the launch environment of the OS. ``` qmaster startup script @@ -349,7 +366,7 @@ Here are the steps required to complete the installation. start qmaster at machine boot (y/n) [y] >> ``` -19. Service Start: Now the master service is started. +20. Service Start: Now the master service is started. ``` Cluster Scheduler qmaster startup @@ -360,7 +377,7 @@ Here are the steps required to complete the installation. Hit to continue >> ``` -20. Host Permissions: Select the hosts that should later on run the execution service. Those hosts will be administration hosts and submit hosts automatically. +21. Host Permissions: Select the hosts that should later run the execution service. Those hosts will be administration hosts and submit hosts automatically. ``` Adding Cluster Scheduler hosts @@ -395,7 +412,7 @@ Here are the steps required to complete the installation. 
Host(s): ``` - Optionally you also add your shadow hosts now as administrative hosts: + Optionally, you also add your shadow hosts now as administrative hosts: ``` Adding Cluster Scheduler shadow hosts @@ -430,7 +447,7 @@ Here are the steps required to complete the installation. Host(s): ``` -21. Default Configuration Steps: Depending on you host setup and configuration steps some default configuration objects will be created for you cluster. +22. Default Configuration Steps: Depending on your host setup and configuration steps, some default configuration objects will be created for your cluster. ``` Creating the default queue and hostgroup @@ -442,7 +459,7 @@ Here are the steps required to complete the installation. Hit to continue >> ``` -22. Installation Summary +23. Installation Summary ``` Using Cluster Scheduler @@ -499,7 +516,7 @@ Here are the steps required to complete the installation. Do you want to see previous screen about using Cluster Scheduler again (y/n) [n] >> ``` - Should you have seen error messages during the installation then the mentioned message files will contain more details about them. + Should you have seen error messages during the installation, then the mentioned message files will contain more details about them. ``` Your Cluster Scheduler qmaster installation is now completed @@ -528,7 +545,7 @@ Here are the steps required to complete the installation. ### Installation of the Execution Service -During the execution host installation procedure following steps are processed: +During the execution host installation procedure, the following steps are processed: * It is tested that the master service is running and that the execution host is able to communicate with the master service. @@ -620,7 +637,7 @@ Here are the steps required to complete the installation. Hit to continue >> ``` -8. The installer does verify the local hostname resolution and if the current host is an administrative host. +8. 
The installer verifies the local hostname resolution and whether the current host is an administrative host. ``` Checking hostname resolving @@ -711,7 +728,7 @@ The auto installation is also able to install services on remote hosts if either 1. Login as root on the system where you intend to install a service. -2. Make of copy of a configuration template file and prepare it with the answers to the questions that are usually asked during the manual installation process. If the root user has no write permissions in $SGE_ROOT then choose a different path but make sure that you preserve the file for the uninstallation process. +2. Make a copy of the configuration template file and prepare it with the answers to the questions that are usually asked during the manual installation process. If the root user has no write permissions in $SGE_ROOT, then choose a different path but make sure that you preserve the file for the uninstallation process. ``` $ cp $SGE_ROOT/util/install_modules/inst_template.conf $SGE_ROOT/my_template.conf @@ -726,7 +743,7 @@ The auto installation is also able to install services on remote hosts if either ./inst_sge -m -auto $SGE_ROOT/my_template.conf ``` - In order to install with Munge authentication pass the `-munge` option to the installation script. + To install with Munge authentication pass the `-munge` option to the installation script. ``` cd $SGE_ROOT @@ -751,11 +768,11 @@ The auto installation is also able to install services on remote hosts if either ## Uninstallation -The uninstallation of the xxQS_NAMExx software can be done manually or automatically using the configuration template created during the auto installation. If you uninstall an execution host then make sure that there are no running jobs on that host. If you uninstall manually then make sure that all execution hosts are uninstalled first before you uninstall the master host or other services. 
+The uninstallation of the xxQS_NAMExx software can be done manually or automatically using the configuration template created during the auto installation. If you uninstall an execution host, then make sure that there are no running jobs on that host. If you uninstall manually, then make sure that all execution hosts are uninstalled first before you uninstall the master host or other services. 1. Login as root on the system where you installed a service. -2. Automatic uninstall the execution service on execution hosts. +2. Automatic uninstallation of the execution service on execution hosts. ``` cd $SGE_ROOT @@ -776,4 +793,5 @@ The uninstallation of the xxQS_NAMExx software can be done manually or automatic ./inst_sge ... ``` -[//]: # (Eeach file has to end with two emty lines) +[//]: # (Eeach file has to end with two empty lines) + diff --git a/doc/markdown/manual/installation-guide/04_backup_and_restore.md b/doc/markdown/manual/installation-guide/04_backup_and_restore.md index 8655d9ec15..566e9f91f1 100644 --- a/doc/markdown/manual/installation-guide/04_backup_and_restore.md +++ b/doc/markdown/manual/installation-guide/04_backup_and_restore.md @@ -31,5 +31,5 @@ To restore a backup you have to follow these steps: You will be asked several questions during the restore process (e.g. location of the SGE_ROOT, name of the default cell directory, location of the backup files, etc.). -[//]: # (Eeach file has to end with two emty lines) +[//]: # (Eeach file has to end with two empty lines) diff --git a/doc/markdown/manual/installation-guide/05_upgrade.md b/doc/markdown/manual/installation-guide/05_upgrade.md index 98fc735caa..e7e14dd2df 100644 --- a/doc/markdown/manual/installation-guide/05_upgrade.md +++ b/doc/markdown/manual/installation-guide/05_upgrade.md @@ -180,5 +180,5 @@ Here are the steps required to complete the in-place upgrade: 11. Continue with post installation steps. 
-[//]: # (Eeach file has to end with two emty lines) +[//]: # (Eeach file has to end with two empty lines) diff --git a/doc/markdown/manual/installation-guide/06_troubleshooting.md b/doc/markdown/manual/installation-guide/06_troubleshooting.md index fb1642eae0..c87ad95938 100644 --- a/doc/markdown/manual/installation-guide/06_troubleshooting.md +++ b/doc/markdown/manual/installation-guide/06_troubleshooting.md @@ -70,5 +70,5 @@ As second step you have to define the `host_aliases` file to tell xxQS_NAMExx th master_host master_host_eth1 master_host_eth2 ``` -[//]: # (Eeach file has to end with two emty lines) +[//]: # (Eeach file has to end with two empty lines) diff --git a/doc/markdown/manual/platform-guide/01_introduction.md b/doc/markdown/manual/platform-guide/01_introduction.md index e8ab598f45..d38447fd35 100644 --- a/doc/markdown/manual/platform-guide/01_introduction.md +++ b/doc/markdown/manual/platform-guide/01_introduction.md @@ -9,5 +9,5 @@ Changes in the *major* version may introduce incompatibilities. Variations in th > > This document is specific to version v9.0.x of HPC-Gridware's product suite and describes the functionality of its components. If you are using or planning to use a different major or minor version, please refer to the corresponding `Platforms Guide` or contact our sales or support team for assistance. -[//]: # (Eeach file has to end with two emty lines) +[//]: # (Eeach file has to end with two empty lines) diff --git a/doc/markdown/manual/platform-guide/02_supported_platform.md b/doc/markdown/manual/platform-guide/02_supported_platform.md index b0aac8d9b6..29740d55d7 100644 --- a/doc/markdown/manual/platform-guide/02_supported_platform.md +++ b/doc/markdown/manual/platform-guide/02_supported_platform.md @@ -71,5 +71,5 @@ The table below outlines the supported operating systems, versions, and architec > > Is your operating system missing from the table above? 
This simply indicates that we have not completed all the necessary QA checks, not that Gridware Cluster Scheduler or Open Cluster Scheduler cannot be installed. If you want support coverage also for older operating systems then please reach out to our sales and support team for further assistance. -[//]: # (Eeach file has to end with two emty lines) +[//]: # (Eeach file has to end with two empty lines) diff --git a/doc/markdown/manual/platform-guide/03_roadmap.md b/doc/markdown/manual/platform-guide/03_roadmap.md index 30ca333103..6f95976c51 100644 --- a/doc/markdown/manual/platform-guide/03_roadmap.md +++ b/doc/markdown/manual/platform-guide/03_roadmap.md @@ -33,5 +33,5 @@ The following table outlines the maintenance schedule for major and minor releas * **(3)**: Optional extension of Phase 3 for up to three years (two additional years) subject to agreement in the support contract. -[//]: # (Eeach file has to end with two emty lines) +[//]: # (Eeach file has to end with two empty lines) diff --git a/doc/markdown/manual/platform-guide/04_comparison.md b/doc/markdown/manual/platform-guide/04_comparison.md index d5408c226e..468d5db45b 100644 --- a/doc/markdown/manual/platform-guide/04_comparison.md +++ b/doc/markdown/manual/platform-guide/04_comparison.md @@ -24,5 +24,5 @@ The table below outlines the differences between the Gridware Cluster Scheduler | Support Offering | v9.0.0 | - | -[//]: # (Eeach file has to end with two emty lines) +[//]: # (Eeach file has to end with two empty lines) diff --git a/doc/markdown/manual/release-notes/01_license.md b/doc/markdown/manual/release-notes/01_license.md index 149fd83ca2..5aca335f48 100644 --- a/doc/markdown/manual/release-notes/01_license.md +++ b/doc/markdown/manual/release-notes/01_license.md @@ -1,4 +1,4 @@ # License -[//]: # (Eeach file has to end with two emty lines) +[//]: # (Eeach file has to end with two empty lines) diff --git a/doc/markdown/manual/release-notes/03_major_enhancements.md 
b/doc/markdown/manual/release-notes/03_major_enhancements.md index 5af83cb010..8e2aad8f35 100644 --- a/doc/markdown/manual/release-notes/03_major_enhancements.md +++ b/doc/markdown/manual/release-notes/03_major_enhancements.md @@ -63,6 +63,18 @@ See details in the Installation Guide, chapters "Planning the Installation" and (Available in Gridware Cluster Scheduler and Open Cluster Scheduler) +### Systemd Integration + +xxQS_NAMExx is now integrated with systemd, the system and service manager for Linux operating systems. This integration allows xxQS_NAMExx to be managed as a systemd service, providing better control over the startup, shutdown, and management of the xxQS_NAMExx daemons. + +In addition, xxQS_NAMExx jobs can optionally be run under systemd control, which allows for better management of job execution and resource allocation, including core binding and device isolation through systemd's cgroup management. This behavior can be configured; by default jobs are run under systemd control, if available, but this can be changed by setting the `ENABLE_SYSTEMD` `execd_params` parameter. + +Collection of job resource usage statistics can be done via systemd. + +Given that usage reporting of systemd is not as detailed as the one provided by the xxQS_NAMExx builtin data collector, it is possible to fall back to the xxQS_NAMExx data collector. There is also a hybrid usage collection mode in which the builtin data collector supplements the values systemd does not provide. Which mode of the usage collection is used can be configured via the `USAGE_COLLECTION` `execd_params` parameter. + +For further details on how to configure and use systemd integration, please refer to the xxQS_NAMExx Installation Guide, the xxQS_NAMExx Administration Guide, and the man pages. + ### qsub -sync r Additionally to the existing `qsub -sync y` option, new options have been introduced that allow to wait for certain job states. 
@@ -71,5 +83,5 @@ Additionally to the existing `qsub -sync y` option, new options have been introd (Available in Gridware Cluster Scheduler and Open Cluster Scheduler) -[//]: # (Eeach file has to end with two emty lines) +[//]: # (Each file has to end with two empty lines) diff --git a/doc/markdown/manual/release-notes/04_full_list_of_fixes.md b/doc/markdown/manual/release-notes/04_full_list_of_fixes.md index 730c0b1738..21dc190f9b 100644 --- a/doc/markdown/manual/release-notes/04_full_list_of_fixes.md +++ b/doc/markdown/manual/release-notes/04_full_list_of_fixes.md @@ -4,5 +4,5 @@ ## v9.1.0prealpha -[//]: # (Eeach file has to end with two emty lines) +[//]: # (Each file has to end with two empty lines) diff --git a/doc/markdown/manual/release-notes/05_issues_and_limitations.md b/doc/markdown/manual/release-notes/05_issues_and_limitations.md index 8fce2251d8..e121151d95 100644 --- a/doc/markdown/manual/release-notes/05_issues_and_limitations.md +++ b/doc/markdown/manual/release-notes/05_issues_and_limitations.md @@ -1,4 +1,4 @@ # Known Issues and Limitations -[//]: # (Eeach file has to end with two emty lines) +[//]: # (Each file has to end with two empty lines) diff --git a/doc/markdown/manual/release-notes/06_upgrade_notes.md b/doc/markdown/manual/release-notes/06_upgrade_notes.md index b5e7aa39f3..dbda57758c 100644 --- a/doc/markdown/manual/release-notes/06_upgrade_notes.md +++ b/doc/markdown/manual/release-notes/06_upgrade_notes.md @@ -1,4 +1,4 @@ # Upgrade Notes -[//]: # (Eeach file has to end with two emty lines) +[//]: # (Each file has to end with two empty lines) diff --git a/doc/markdown/manual/release-notes/07_compatibility_notes.md b/doc/markdown/manual/release-notes/07_compatibility_notes.md index dcd25121a5..bb4337f5c8 100644 --- a/doc/markdown/manual/release-notes/07_compatibility_notes.md +++ b/doc/markdown/manual/release-notes/07_compatibility_notes.md @@ -1,4 +1,4 @@ # Compatibility Notes -[//]: # (Eeach file has to end with two emty lines)
+[//]: # (Each file has to end with two empty lines) diff --git a/source/common/basis_types.h b/source/common/basis_types.h index 8f1d96eb71..124a4367c4 100644 --- a/source/common/basis_types.h +++ b/source/common/basis_types.h @@ -161,6 +161,7 @@ typedef char stringT[MAX_STRING_SIZE]; /* save string format non-quoted */ #define SFN "%-.100s" #define SFN2 "%-.200s" +#define SFN4 "%-.400s" #define SFNMAX "%-.2047s" /* write to buffer of size MAX_STRING_SIZE */ #define PFNMAX "%-.1023s" /* write to buffer of size SGE_PATH_MAX */ /* non-quoted string not limited intentionally */ diff --git a/source/common/msg_common.h b/source/common/msg_common.h index dfb75231fb..8a11539bc8 100644 --- a/source/common/msg_common.h +++ b/source/common/msg_common.h @@ -123,6 +123,8 @@ #define MSG_NONE _MESSAGE(23004, _("none")) #define MSG_NULL _MESSAGE(23005, _("(nullptr)")) #define MSG_SMALLNULL _MESSAGE(23006, _("(null)")) +#define MSG_YES _MESSAGE(23007, _("yes")) +#define MSG_NO _MESSAGE(23008, _("no")) #define MSG_FILE_CHDIR_SS _MESSAGE(23011, _("can't change dir to " SFN ": " SFN)) #define MSG_FILE_NOCHOWN_SS _MESSAGE(23012, _("can't chown file or directory " SFN ": " SFN)) diff --git a/source/daemons/common/CMakeLists.txt b/source/daemons/common/CMakeLists.txt index 1067c64723..46ee8c4b2a 100644 --- a/source/daemons/common/CMakeLists.txt +++ b/source/daemons/common/CMakeLists.txt @@ -25,6 +25,7 @@ set(LIBRARY_SOURCES err_trace.cc lock.cc mail.cc + ocs_common_systemd.cc pdc.cc procfs.cc qlogin_starter.cc diff --git a/source/daemons/common/msg_daemons_common.h b/source/daemons/common/msg_daemons_common.h index 28035cd0dd..7039f799ab 100644 --- a/source/daemons/common/msg_daemons_common.h +++ b/source/daemons/common/msg_daemons_common.h @@ -149,4 +149,6 @@ #define MSG_STARTUP_USING_MUNGE _MESSAGE(27102, _("using Munge authentication")) #define MSG_MESSAGE_FROM_DAEMON_WRONG_UID_SSUU _MESSAGE(27103, _("message from " SFN "/" SFN " with wrong uid: " uid_t_fmt " (expected " uid_t_fmt
")")) +#define MSG_STARTUP_IN_CGROUP_S _MESSAGE(27104, _("running in cgroup " SFN)) + // clang-format on diff --git a/source/daemons/common/ocs_common_systemd.cc b/source/daemons/common/ocs_common_systemd.cc new file mode 100644 index 0000000000..f166798795 --- /dev/null +++ b/source/daemons/common/ocs_common_systemd.cc @@ -0,0 +1,59 @@ +/*___INFO__MARK_BEGIN_NEW__*/ +/*************************************************************************** + * + * Copyright 2025 HPC-Gridware GmbH + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************/ +/*___INFO__MARK_END_NEW__*/ + +#include "uti/ocs_Systemd.h" + +#include "ocs_common_systemd.h" + +namespace ocs::common { + /*! + * @brief Check if execd should use PDC for usage collection based on the given usage_collection parameter. + * + * This function checks the configuration to determine if PDC should be used for usage collection + * based on the provided usage_collection parameter. It returns false for USAGE_COLLECTION_NONE. When systemd is enabled and + * available it returns true only for USAGE_COLLECTION_PDC or USAGE_COLLECTION_HYBRID; without systemd it returns true. + * + * @param usage_collection The usage collection mode to check against. + * @return true if PDC is used for usage collection, false otherwise.
+ */ + bool + use_pdc_for_usage_collection(usage_collection_t usage_collection) { + bool ret = true; + + if (usage_collection == USAGE_COLLECTION_NONE) { + ret = false; // we do not use PDC for usage collection + } else { + // When we are using systemd we usually do not use PDC for usage collection, + // except when we configured in execd_params USAGE_COLLECTION to use PDC or HYBRID. +#if defined(OCS_WITH_SYSTEMD) + if (mconf_get_enable_systemd() && + ocs::uti::Systemd::is_systemd_available()) { + ret = false; + if (usage_collection == USAGE_COLLECTION_PDC || usage_collection == USAGE_COLLECTION_HYBRID) { + ret = true; + } + } +#endif + } + + return ret; + } + +} // namespace ocs::common diff --git a/source/daemons/common/ocs_common_systemd.h b/source/daemons/common/ocs_common_systemd.h new file mode 100644 index 0000000000..547d2b7697 --- /dev/null +++ b/source/daemons/common/ocs_common_systemd.h @@ -0,0 +1,27 @@ +#pragma once +/*___INFO__MARK_BEGIN_NEW__*/ +/*************************************************************************** + * + * Copyright 2025 HPC-Gridware GmbH + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ***************************************************************************/ +/*___INFO__MARK_END_NEW__*/ + +#include "sgeobj/sge_conf.h" + +namespace ocs::common { + bool + use_pdc_for_usage_collection(usage_collection_t usage_collection); +} diff --git a/source/daemons/common/pdc.cc b/source/daemons/common/pdc.cc index ebf20591e9..a82ea4d6a5 100644 --- a/source/daemons/common/pdc.cc +++ b/source/daemons/common/pdc.cc @@ -112,13 +112,11 @@ int main(int argc,char *argv[]) #include "cull/cull.h" -#include "sgeobj/sge_feature.h" +#include "ocs_common_systemd.h" -#include "msg_execd.h" #include "sgedefs.h" #include "exec_ifm.h" #include "pdc.h" -#include "ptf.h" #include "procfs.h" #include "basis_types.h" @@ -348,8 +346,13 @@ static int psRetrieveOSJobData() { table to decide whether a process is needed for a job or not. */ pt_open(); - while (!pt_dispatch_proc_to_job(&job_list, time_stamp, last_time)) - ; + // pt_dispatch_proc_to_job will read data of a single process from /proc + // it will return 0 if there are still processes to read + // it will return 1 if there are no more processes to read + // we want to read them all + while (pt_dispatch_proc_to_job(&job_list, time_stamp, last_time) == 0) { + ; + } last_time = time_stamp; pt_close(); } @@ -555,9 +558,7 @@ static int psRetrieveOSJobData() { job_elem = LNK_DATA(curr, job_elem_t, link); job = &job_elem->job; - /* if job has not been watched within 30 seconds of being pre-added - to job list, delete it */ - + // if job has not been watched within 30 seconds of being pre-added to job list, delete it if (job_elem->precreated) { if ((job_elem->precreated + 30) < time_stamp) { @@ -583,14 +584,13 @@ static int psRetrieveOSJobData() { //int proccount; lnk_link_t *currp, *nextp; - /* sum up usage of each processes for this job */ + /* sum up usage of each process for this job */ //proccount = job->jd_proccount; job->jd_utime_a = job->jd_stime_a = 0; job->jd_vmem = 0; job->jd_rss = 0; - 
for(currp=job_elem->procs.next; currp != &job_elem->procs; - currp=nextp) { + for (currp=job_elem->procs.next; currp != &job_elem->procs; currp=nextp) { proc_elem_t *proc_elem = LNK_DATA(currp, proc_elem_t, link); psProc_t *proc = &proc_elem->proc; @@ -598,19 +598,25 @@ static int psRetrieveOSJobData() { nextp = currp->next; /* in case currp is deleted */ if (time_stamp == proc->pd_tstamp) { - /* maybe still living */ - job->jd_utime_a += proc->pd_utime; - job->jd_stime_a += proc->pd_stime; - job->jd_vmem += proc_elem->vmem; - job->jd_rss += proc_elem->rss; - job->jd_mem += (proc_elem->mem/1024.0); + // maybe still living + // in hybrid mode, we are not interested in cpu and rss / maxrss + if (ocs::common::use_pdc_for_usage_collection(job_elem->usage_collection)) { + job->jd_utime_a += proc->pd_utime; + job->jd_stime_a += proc->pd_stime; + job->jd_rss += proc_elem->rss; + } + job->jd_vmem += proc_elem->vmem; + job->jd_mem += (proc_elem->mem/1024.0); #if defined(LINUX) job->jd_chars += proc_elem->delta_chars; #endif } else { - /* most likely exited */ - job->jd_utime_c += proc->pd_utime; - job->jd_stime_c += proc->pd_stime; + // most likely exited + // we do not sum up memory usage (@todo should we?) 
+ if (ocs::common::use_pdc_for_usage_collection(job_elem->usage_collection)) { + job->jd_utime_c += proc->pd_utime; + job->jd_stime_c += proc->pd_stime; + } job->jd_proccount--; /* remove process entry from list */ @@ -625,8 +631,10 @@ static int psRetrieveOSJobData() { if (job->jd_vmem > job->jd_himem) { job->jd_himem = job->jd_vmem; } - if (job->jd_rss > job->jd_maxrss) { - job->jd_maxrss = job->jd_rss; + if (ocs::common::use_pdc_for_usage_collection(job_elem->usage_collection)) { + if (job->jd_rss > job->jd_maxrss) { + job->jd_maxrss = job->jd_rss; + } } } @@ -691,9 +699,10 @@ int psStopCollector() } -int psWatchJob(JobID_t JobID) +int psWatchJob(JobID_t JobID, usage_collection_t usage_collection) { - lnk_link_t *curr; + if (JobID != 0) { + lnk_link_t *curr; # if DEBUG @@ -704,20 +713,22 @@ int psWatchJob(JobID_t JobID) # endif - /* if job to watch is not already in the list then add it */ - - if ((curr=find_job(JobID))) { - LNK_DATA(curr, job_elem_t, link)->precreated = 0; - } else { - job_elem_t *job_elem = (job_elem_t *)sge_malloc(sizeof(job_elem_t)); - memset(job_elem, 0, sizeof(job_elem_t)); - job_elem->starttime = get_gmt(); - job_elem->job.jd_jid = JobID; - job_elem->job.jd_length = sizeof(psJob_t); - LNK_INIT(&job_elem->procs); - LNK_INIT(&job_elem->arses); - /* add to job list */ - LNK_ADD(job_list.prev, &job_elem->link); + /* if job to watch is not already in the list then add it */ + curr = find_job(JobID); + if (curr != nullptr) { + LNK_DATA(curr, job_elem_t, link)->precreated = 0; + } else { + job_elem_t *job_elem = (job_elem_t *)sge_malloc(sizeof(job_elem_t)); + memset(job_elem, 0, sizeof(job_elem_t)); + job_elem->starttime = get_gmt(); + job_elem->job.jd_jid = JobID; + job_elem->usage_collection = usage_collection; + job_elem->job.jd_length = sizeof(psJob_t); + LNK_INIT(&job_elem->procs); + LNK_INIT(&job_elem->arses); + /* add to job list */ + LNK_ADD(job_list.prev, &job_elem->link); + } } return 0; @@ -725,13 +736,15 @@ int psWatchJob(JobID_t 
JobID) int psIgnoreJob(JobID_t JobID) { - lnk_link_t *curr; + if (JobID != 0) { + lnk_link_t *curr; - /* if job is in the list, remove it */ + /* if job is in the list, remove it */ - if ((curr = find_job(JobID))) { - LNK_DELETE(curr); - free_job(LNK_DATA(curr, job_elem_t, link)); + if ((curr = find_job(JobID))) { + LNK_DELETE(curr); + free_job(LNK_DATA(curr, job_elem_t, link)); + } } return 0; @@ -847,11 +860,7 @@ struct psJob_s *psGetAllJobs() psRetrieveOSJobData(); /* calculate size of return data */ -#ifndef SOLARIS rsize = sizeof(uint64); -#else - rsize = 8; -#endif for (curr=job_list.next; curr != &job_list; curr=curr->next) { job_elem_t *job_elem = LNK_DATA(curr, job_elem_t, link); @@ -870,11 +879,8 @@ struct psJob_s *psGetAllJobs() /* fill in return data */ jobs = rjob; *(uint64 *)jobs = jobcount; -#ifndef SOLARIS + INCJOBPTR(jobs, sizeof(uint64)); -#else - INCJOBPTR(jobs, 8); -#endif /* copy the job data */ for (curr=job_list.next; curr != &job_list; curr=curr->next) { diff --git a/source/daemons/common/pdc.h b/source/daemons/common/pdc.h index 59faa726c6..ab9171e26a 100644 --- a/source/daemons/common/pdc.h +++ b/source/daemons/common/pdc.h @@ -33,6 +33,8 @@ ************************************************************************/ /*___INFO__MARK_END__*/ +#include "sgeobj/sge_conf.h" + #include "err_trace.h" #include "exec_ifm.h" @@ -154,6 +156,7 @@ typedef struct psSys_s psSys_t; typedef struct { lnk_link_t link; psJob_t job; + usage_collection_t usage_collection; // USAGE_COLLECTION_DEFAULT, ... 
lnk_link_t procs; lnk_link_t arses; time_t precreated; /* set if job element created before psWatchJob */ @@ -170,7 +173,7 @@ typedef struct { typedef struct { lnk_link_t link; - JobID_t jid; + JobID_t jid; psProc_t proc; double bwtime; double rwtime; @@ -198,7 +201,7 @@ extern long pagesize; int psStartCollector(); int psStopCollector(); -int psWatchJob(JobID_t JobID); +int psWatchJob(JobID_t JobID, usage_collection_t usage_collection); int psIgnoreJob(JobID_t JobID); struct psStat_s *psStatus(); struct psJob_s *psGetOneJob(JobID_t JobID); diff --git a/source/daemons/common/procfs.cc b/source/daemons/common/procfs.cc index f20372baa4..e77533b595 100644 --- a/source/daemons/common/procfs.cc +++ b/source/daemons/common/procfs.cc @@ -371,21 +371,18 @@ void procfs_kill_addgrpid(gid_t add_grp_id, int sig, tShepherd_trace shepherd_tr DRETURN_VOID; } -int pt_open() -{ +int pt_open() { cwd = opendir(PROC_DIR); - return !cwd; + return cwd == nullptr; } -void pt_close() -{ + +void pt_close() { closedir(cwd); } -int pt_dispatch_proc_to_job( -lnk_link_t *job_list, -int time_stamp, -time_t last_time -) { +int pt_dispatch_proc_to_job(lnk_link_t *job_list, int time_stamp, time_t last_time) { + DENTER(TOP_LAYER); + char procnam[1024]; int fd = -1; #if defined(LINUX) @@ -426,8 +423,6 @@ time_t last_time double old_time = 0; uint64 old_vmem = 0; - DENTER(TOP_LAYER); - max_groups = sge_sysconf(SGE_SYSCONF_NGROUPS_MAX); if (max_groups <= 0) { ERROR(SFNMAX, MSG_SGE_NGROUPS_MAXOSRECONFIGURATIONNECESSARY); @@ -440,30 +435,41 @@ time_t last_time DRETURN(1); } - /* find next valid entry in procfs */ + // loop over all processes in /proc while ((dent = readdir(cwd))) { char *pidname; #ifndef LINUX - if (!dent->d_name) + // on Linux d_name is a char array, it cannot be nullptr + // on other systems it is a pointer to a string, which can be nullptr + if (dent->d_name == nullptr) { continue; + } #endif - if (!dent->d_name[0]) + // skip empty string + if (dent->d_name[0] == '\0') { continue; + 
} - if (!strcmp(dent->d_name, "..") || !strcmp(dent->d_name, ".")) + // skip directory entries for current and parent directory + if (strcmp(dent->d_name, "..") == 0 || strcmp(dent->d_name, ".") == 0) { continue; + } - if (dent->d_name[0] == '.') - pidname = &dent->d_name[1]; - else - pidname = dent->d_name; + // handle process names starting with a dot (e.g. ".1234") - @todo why would this be the case? + if (dent->d_name[0] == '.') { + pidname = &dent->d_name[1]; + } else { + pidname = dent->d_name; + } - if (atoi(pidname) == 0) + // skip entries which are not numeric (e.g. "self", "thread-self") + if (atoi(pidname) == 0) { continue; + } #if defined(LINUX) - /* check only processes which belongs to a GE job */ + // check only processes that belong to a job if ((pr = get_pr(atoi(pidname))) != nullptr) { /* set process as still running */ lSetPosBool(pr, pos_run, true); @@ -472,6 +478,8 @@ time_t last_time } } + // read the stat file + // we get stime and utime, vmem and rss from it sprintf(procnam, PROC_DIR "/%s/stat", dent->d_name); if (SGE_STAT(procnam, &fst)) { if (errno != ENOENT) { @@ -675,14 +683,15 @@ time_t last_time for (group=0; !found_it && groupjob.jd_jid == list[group]) { // @todo: is this correct? 
jd_jid is a pid_t, list[group] gid_t #if defined(LINUX) - /* mark this process as relevant */ + // mark this process as relevant lSetPosBool(pr, pos_rel, true); #endif found_it = 1; } } - if (found_it) + if (found_it) { break; + } } if (curr == job_list) { /* this is not a traced process */ @@ -690,14 +699,13 @@ time_t last_time continue; } - /* we always read only one entry per function call - the while loop is needed to read next one */ + // we always read only one entry per function call the while loop is needed to read next one break; } /* while */ sge_free(&list); - if (!dent) {/* visited all files in procfs */ + if (dent == nullptr) { // visited all files in procfs #if defined(LINUX) clean_procList(); #endif diff --git a/source/daemons/common/setosjobid.cc b/source/daemons/common/setosjobid.cc index bc117b7479..78beea7081 100644 --- a/source/daemons/common/setosjobid.cc +++ b/source/daemons/common/setosjobid.cc @@ -44,30 +44,34 @@ #include "err_trace.h" #include "setosjobid.h" +#include + void setosjobid(pid_t sid, gid_t *add_grp_id_ptr, struct passwd *pw) { - FILE *fp=nullptr; - shepherd_trace("setosjobid: uid = " pid_t_fmt ", euid = " pid_t_fmt, getuid(), geteuid()); + FILE *fp = nullptr; + # if defined(SOLARIS) || defined(LINUX) || defined(FREEBSD) || defined(DARWIN) /* Read SgeId from config-File and create Addgrpid-File */ - { - char *cp; - if ((cp = search_conf_val("add_grp_id"))) + { + char *cp = search_conf_val("add_grp_id"); + if (cp != nullptr) { *add_grp_id_ptr = atol(cp); - else + } else { *add_grp_id_ptr = 0; + } } - if ((fp = fopen("addgrpid", "w")) == nullptr) { - shepherd_error(1, "can't open \"addgrpid\" file"); + fp = fopen(ADDGRPID, "w"); + if (fp == nullptr) { + shepherd_error(1, "can't open \"addgrpid\" file"); } fprintf(fp, gid_t_fmt"\n", *add_grp_id_ptr); FCLOSE(fp); # else { char osjobid[100]; - if ((fp = fopen("osjobid", "w")) == nullptr) { + if ((fp = fopen(OSJOBID, "w")) == nullptr) { shepherd_error(1, "can't open \"osjobid\" 
file"); } diff --git a/source/daemons/common/shutdown.cc b/source/daemons/common/shutdown.cc index 57f7dab898..8232db5a49 100644 --- a/source/daemons/common/shutdown.cc +++ b/source/daemons/common/shutdown.cc @@ -32,7 +32,11 @@ ************************************************************************/ /*___INFO__MARK_END__*/ #include +#include +#include +#include +#include "uti/ocs_Systemd.h" #include "uti/sge_arch.h" #include "uti/sge_log.h" #include "uti/sge_rmon_macros.h" @@ -45,32 +49,47 @@ void starting_up() { - u_long32 old_ll = log_state_get_log_level(); - dstring ds; - dstring ds2 = DSTRING_INIT; - dstring ds3 = DSTRING_INIT; - char buffer[256]; - DENTER(TOP_LAYER); - sge_dstring_init(&ds, buffer, sizeof(buffer)); + // switch to the INFO logging level, we want to see the startup messages regardless of the configured log level + u_long32 old_ll = log_state_get_log_level(); log_state_set_log_level(LOG_INFO); + DSTRING_STATIC(ds, 256); + dstring ds2 = DSTRING_INIT; + dstring ds3 = DSTRING_INIT; + if (feature_get_active_featureset_id() == FEATURE_NO_SECURITY) { sge_dstring_copy_string(&ds2, feature_get_product_name(FS_VERSION, &ds)); } else { - sge_dstring_sprintf(&ds2, "%s (%s)", - feature_get_product_name(FS_VERSION, &ds), - feature_get_featureset_name( - feature_get_active_featureset_id())); + sge_dstring_sprintf(&ds2, "%s (%s)", feature_get_product_name(FS_VERSION, &ds), + feature_get_featureset_name( feature_get_active_featureset_id())); } INFO(MSG_STARTUP_STARTINGUP_SSS, feature_get_product_name(FS_SHORT, &ds3), sge_dstring_get_string(&ds2), sge_get_arch()); + sge_dstring_free(&ds2); + sge_dstring_free(&ds3); + + + // log if we are using Munge if (bootstrap_get_use_munge()) { INFO(SFNMAX, MSG_STARTUP_USING_MUNGE); } - sge_dstring_free(&ds2); - sge_dstring_free(&ds3); + // if we are running within a cgroup then output the cgroup path +#if defined (LINUX) + std::string proc_cgroup = "/proc/self/cgroup"; + if (std::filesystem::exists(proc_cgroup)) { + 
std::ifstream cgroup_file(proc_cgroup); + if (cgroup_file.is_open()) { + std::string cgroup; + if (std::getline(cgroup_file, cgroup)) { + INFO(MSG_STARTUP_IN_CGROUP_S, cgroup.c_str()); + } + } + } +#endif + + // reset the log level to the previous value log_state_set_log_level(old_ll); DRETURN_VOID; diff --git a/source/daemons/execd/CMakeLists.txt b/source/daemons/execd/CMakeLists.txt index fa4d658296..3e61eb5018 100644 --- a/source/daemons/execd/CMakeLists.txt +++ b/source/daemons/execd/CMakeLists.txt @@ -28,7 +28,9 @@ add_executable( execd_get_new_conf.cc execd_job_exec.cc execd_kill_execd.cc + execd_profiling.cc execd_signal_queue.cc + ocs_execd_systemd.cc execd_ticket.cc get_path.cc job_report_execd.cc diff --git a/source/daemons/execd/dispatcher.cc b/source/daemons/execd/dispatcher.cc index f1969479ff..83dca17b58 100644 --- a/source/daemons/execd/dispatcher.cc +++ b/source/daemons/execd/dispatcher.cc @@ -38,6 +38,7 @@ #include "uti/sge_bootstrap.h" #include "uti/sge_log.h" #include "uti/sge_monitor.h" +#include "uti/sge_profiling.h" #include "uti/sge_rmon_macros.h" #include "uti/sge_time.h" @@ -54,6 +55,7 @@ #include "msg_execd.h" #include "execd.h" #include "execd_job_exec.h" +#include "execd_profiling.h" #include "execd_ticket.h" #include "job_report_execd.h" #include "execd_signal_queue.h" @@ -100,6 +102,9 @@ int sge_execd_process_messages() { } while (!terminate) { + ocs::execd_profiling_start_stop(); + PROF_START_MEASUREMENT(SGE_PROF_CUSTOM1); + u_long64 now = sge_get_gmt64(); ocs::gdi::ClientServerBase::struct_msg_t msg; char* buffer = nullptr; @@ -161,7 +166,7 @@ int sge_execd_process_messages() { do_ack(&msg); break; case ocs::gdi::ClientServerBase::TAG_SIGQUEUE: - case ocs::gdi::ClientServerBase::TAG_SIGJOB: + case ocs::gdi::ClientServerBase::TAG_SIGJOB: if (init_packbuffer(&apb, 1024) == PACK_SUCCESS) { do_signal_queue(&msg, &apb); is_apb_used = true; @@ -353,7 +358,7 @@ int sge_execd_process_messages() { } if 
(ocs::gdi::ClientBase::sge_get_com_error_flag(EXECD, ocs::gdi::SGE_COM_ACCESS_DENIED, false)) { - /* we have to reconnect, when the problem is fixed */ + /* we have to reconnect when the problem is fixed */ do_reconnect = true; /* we do not expect that the problem is fast to fix */ sleep(EXECD_MAX_RECONNECT_TIMEOUT); @@ -374,6 +379,9 @@ int sge_execd_process_messages() { do_reconnect = true; } } + + PROF_STOP_MEASUREMENT(SGE_PROF_CUSTOM1); + ocs::execd_profiling_output(); } sge_monitor_free(&monitor); diff --git a/source/daemons/execd/exec_job.cc b/source/daemons/execd/exec_job.cc index ced4bb3cc8..8f627df41e 100644 --- a/source/daemons/execd/exec_job.cc +++ b/source/daemons/execd/exec_job.cc @@ -41,6 +41,7 @@ #include #include +#include "uti/ocs_Systemd.h" #include "uti/sge_afsutil.h" #include "uti/sge_arch.h" #include "uti/sge_binding_hlp.h" @@ -52,6 +53,7 @@ #include "uti/sge_log.h" #include "uti/sge_os.h" #include "uti/sge_parse_num_par.h" +#include "uti/sge_profiling.h" #include "uti/sge_rmon_macros.h" #include "uti/sge_stdio.h" #include "uti/sge_stdlib.h" @@ -64,6 +66,7 @@ #include "sgeobj/ocs_BindingExecd2Shepherd.h" #include "sgeobj/ocs_DataStore.h" +#include "sgeobj/ocs_Job.h" #include "sgeobj/sge_conf.h" #include "sgeobj/sge_pe.h" #include "sgeobj/sge_ja_task.h" @@ -89,6 +92,7 @@ #include "sge_job_qmaster.h" #include "tmpdir.h" #include "exec_job.h" +#include "ocs_execd_systemd.h" #include "mail.h" #include "basis_types.h" #include "pdc.h" @@ -299,6 +303,7 @@ int sge_exec_job(lListElem *jep, lListElem *jatep, lListElem *petep, char *err_s const lList *path_aliases = nullptr; char dce_wrapper_cmd[128]; + bool starting_shepherd_ok = true; #if COMPILE_DC #if defined(SOLARIS) || defined(LINUX) || defined(FREEBSD) || defined(DARWIN) @@ -800,10 +805,9 @@ int sge_exec_job(lListElem *jep, lListElem *jatep, lListElem *petep, char *err_s petep == nullptr ? 
lGetString(jep, JB_job_name) : lGetString(petep, PET_name)); var_list_set_string(&environmentList, "HOSTNAME", lGetHost(master_q, QU_qhostname)); var_list_set_string(&environmentList, "QUEUE", lGetString(master_q, QU_qname)); - /* JB: TODO (ENV): shouldn't we better have a SGE_JOB_ID? */ + var_list_set_uint32t(&environmentList, "JOB_ID", job_id); - /* JG: TODO (ENV): shouldn't we better use SGE_JATASK_ID and have an additional SGE_PETASK_ID? */ if (job_is_array(jep)) { u_long32 start, end, step; @@ -993,27 +997,38 @@ int sge_exec_job(lListElem *jep, lListElem *jatep, lListElem *petep, char *err_s #ifdef COMPILE_DC -# if defined(SOLARIS) || defined(LINUX) || defined(FREEBSD) || defined(DARWIN) +#if defined(SOLARIS) || defined(LINUX) || defined(FREEBSD) || defined(DARWIN) + +#if defined(LINUX) + if (!sup_groups_in_proc()) { + lFreeList(&environmentList); + snprintf(err_str, err_length, SFNMAX, MSG_EXECD_NOSGID); + FCLOSE(fp); + DRETURN(-2); + } +#endif + // The usage_collection mode. { + usage_collection_t usage_collection = mconf_get_usage_collection(); + fprintf(fp, "usage_collection= " sge_u32 "\n", usage_collection); + if (petep == nullptr) { + lSetUlong(jatep, JAT_usage_collection, usage_collection); + } else { + lSetUlong(petep, PET_usage_collection, usage_collection); + } + } + + // Set the additional group id. + // When we are using systemd we do not need to set an additional group id - we use value 0. + // Unless in hybrid usage collection mode where we get usage from both Systemd and via PDC. + // And when we enabled killing by add_grp_id, we also need to set it. 
+ if (ocs::execd::execd_use_pdc_for_usage_collection() || mconf_get_enable_addgrp_kill()) { + // parse range and create list lList *rlp = nullptr; lList *alp = nullptr; gid_t temp_id; - char str_id[256]; - char *gid_range = nullptr; -# if defined(LINUX) - - if (!sup_groups_in_proc()) { - lFreeList(&environmentList); - snprintf(err_str, err_length, SFNMAX, MSG_EXECD_NOSGID); - FCLOSE(fp); - DRETURN(-2); - } - -# endif - - /* parse range add create list */ - gid_range = mconf_get_gid_range(); + char *gid_range = mconf_get_gid_range(); DPRINTF("gid_range = %s\n", gid_range); range_list_parse_from_string(&rlp, &alp, gid_range, 0, 0, INF_NOT_ALLOWED); @@ -1038,23 +1053,28 @@ int sge_exec_job(lListElem *jep, lListElem *jatep, lListElem *petep, char *err_s DRETURN(-1); } } + lFreeList(&rlp); + lFreeList(&alp); + } else { + // We do not use PDC for usage collection nor do we kill by addgrp, + // no need to set an additional group id. + last_addgrpid = 0; + } - /* write add_grp_id to job-structure and file */ - snprintf(str_id, sizeof(str_id), "%ld", (long) last_addgrpid); - fprintf(fp, "add_grp_id=" gid_t_fmt "\n", last_addgrpid); + // write add_grp_id to config file and to the job-structure + fprintf(fp, "add_grp_id=" gid_t_fmt "\n", last_addgrpid); + { + char str_id[256]; + snprintf(str_id, sizeof(str_id), gid_t_fmt, last_addgrpid); if (petep == nullptr) { lSetString(jatep, JAT_osjobid, str_id); } else { lSetString(petep, PET_osjobid, str_id); } + } - if (mconf_get_ignore_ngroups_max_limit()) { - fprintf(fp, "skip_ngroups_max_silently=yes\n"); - } - - lFreeList(&rlp); - lFreeList(&alp); - + if (mconf_get_ignore_ngroups_max_limit()) { + fprintf(fp, "skip_ngroups_max_silently=yes\n"); } #endif @@ -1347,15 +1367,60 @@ int sge_exec_job(lListElem *jep, lListElem *jatep, lListElem *petep, char *err_s fprintf(fp, "shell_start_mode=%s\n", job_get_shell_start_mode(jep, master_q, shell_start_mode)); sge_free(&shell_start_mode); + /* we need the basename for loginshell test */
shell = strrchr(shell_path, '/'); - if (!shell) + if (shell == nullptr) { shell = shell_path; - else + } else { shell++; - + } fprintf(fp, "use_login_shell=%d\n", ck_login_sh(shell) ? 1 : 0); + // systemd specific options: + // - enable_systemd + // - slice and scope + // - devices_allow +#ifdef OCS_WITH_SYSTEMD + { + bool enable_systemd = mconf_get_enable_systemd() && ocs::uti::Systemd::is_systemd_available(); + fprintf(fp, "enable_systemd=%d\n", enable_systemd ? 1 : 0); + + if (enable_systemd) { + // slice and scope + std::string systemd_slice; + std::string systemd_scope; + DSTRING_STATIC(error_dstr, MAX_STRING_SIZE); + if (ocs::Job::job_get_systemd_slice_and_scope(jep, jatep, petep, systemd_slice, systemd_scope, &error_dstr)) { + fprintf(fp, "systemd_slice=%s\n", systemd_slice.c_str()); + fprintf(fp, "systemd_scope=%s\n", systemd_scope.c_str()); + } + + // device isolation + // for testing purposes until we have device isolation via RSMAPs + env = lGetElemStr(lGetList(jep, JB_env_list), VA_variable, "SGE_DEBUG_DEVICES_ALLOW"); + if (env != nullptr) { + const char *devices_allow = lGetString(env, VA_value); + fprintf(fp, "devices_allow=%s\n", devices_allow != nullptr ? devices_allow : ""); + } else { + fprintf(fp, "devices_allow=\n"); + } + if (petep == nullptr) { + lSetString(jatep, JAT_systemd_scope, systemd_scope.c_str()); + } else { + lSetString(petep, PET_systemd_scope, systemd_scope.c_str()); + } + + // in case of tightly integrated parallel jobs, we need to store the systemd slice. + // this is one place (where the master task is started), + // the other one is when the slave container is started. 
+ if (petep == nullptr) { + ocs::execd::execd_store_tight_pe_slice(jep, jatep, systemd_slice.c_str()); + } + } + } +#endif + /* the following values are needed by the reaper */ if (mailrec_unparse(lGetList(jep, JB_mail_list), mail_str, sizeof(mail_str))) { ERROR(MSG_MAIL_MAILLISTTOOLONG_U, job_id); @@ -1587,6 +1652,8 @@ int sge_exec_job(lListElem *jep, lListElem *jatep, lListElem *petep, char *err_s fprintf(fp, "qlogin_daemon=%s\n", qlogin_daemon); sge_free(&qlogin_daemon); } else { + // @todo CS-1262 we do no longer deliver a rshd or rlogind + // no longer need the write_osjob_id config value, remove it from here and from shepherd if (JOB_TYPE_IS_QRSH(jb_now)) { char *rsh_daemon = mconf_get_rsh_daemon(); strcat(daemon, "rshd"); @@ -1844,7 +1911,7 @@ int sge_exec_job(lListElem *jep, lListElem *jatep, lListElem *petep, char *err_s } if (chdir(execd_spool_dir)) /* go back */ - /* if this happens (dont know how) we have a real problem */ + /* if this happens (don't know how) we have a real problem */ ERROR(MSG_FILE_CHDIR_SS, execd_spool_dir, strerror(errno)); if (i == -1) { if (getenv("SGE_FAILURE_BEFORE_FORK")) { @@ -1857,7 +1924,21 @@ int sge_exec_job(lListElem *jep, lListElem *jatep, lListElem *petep, char *err_s DRETURN(i); } - { /* close all fd's except 0,1,2 */ + // If we are running under systemd control, + // we need to move the shepherd out of the execd scope. + // We create a new scope for the shepherd, e.g. "ocs-shepherd.scope". + // The shepherd process may *not* run within the scope of running a job - it might have limits which would lead + // to sge_shepherd being killed. + // Moving the shepherd pid to a new scope must be done here (in the child process), otherwise the shepherd + // might already have forked and have started job processes within the execd scope. + // The shepherd scope is automatically deleted again when the last shepherd process exits. 
+ // Therefore, we need to create a new scope when the first shepherd process is started, + // and just attach the following shepherd processes to this scope. +#if defined (OCS_WITH_SYSTEMD) + starting_shepherd_ok = ocs::execd::execd_move_shepherd_to_scope(); +#endif + + if (starting_shepherd_ok) { /* close all fd's except 0,1,2 */ int keep_open[3]; keep_open[0] = 0; @@ -1870,62 +1951,66 @@ int sge_exec_job(lListElem *jep, lListElem *jatep, lListElem *petep, char *err_s * set KRB5CCNAME so shepherd assumes user's identify for * access to DFS or AFS file systems */ - if ((feature_is_enabled(FEATURE_DCE_SECURITY) || - feature_is_enabled(FEATURE_KERBEROS_SECURITY)) && - lGetString(jep, JB_cred)) { - - char ccname[1024]; - snprintf(ccname, sizeof(ccname), "KRB5CCNAME=FILE:/tmp/krb5cc_%s_" sge_u32, "sge", job_id); - putenv(ccname); + if (starting_shepherd_ok) { + if ((feature_is_enabled(FEATURE_DCE_SECURITY) || + feature_is_enabled(FEATURE_KERBEROS_SECURITY)) && + lGetString(jep, JB_cred)) { + + char ccname[1024]; + snprintf(ccname, sizeof(ccname), "KRB5CCNAME=FILE:/tmp/krb5cc_%s_" sge_u32, "sge", job_id); + putenv(ccname); + } } - DPRINTF("**********************CHILD*********************\n"); - shepherd_name = SGE_SHEPHERD; - snprintf(ps_name, sizeof(ps_name), "%s-" sge_u32, shepherd_name, job_id); + if (starting_shepherd_ok) { + DPRINTF("**********************CHILD*********************\n"); + shepherd_name = SGE_SHEPHERD; + snprintf(ps_name, sizeof(ps_name), "%s-" sge_u32, shepherd_name, job_id); + + pag_cmd = mconf_get_pag_cmd(); + shepherd_cmd = mconf_get_shepherd_cmd(); + if (shepherd_cmd && strlen(shepherd_cmd) && + strcasecmp(shepherd_cmd, "none")) { + DPRINTF("CHILD - About to exec shepherd wrapper job ->%s< under queue -<%s<\n", + lGetString(jep, JB_job_name), + lGetString(master_q, QU_full_name)); + execlp(shepherd_cmd, ps_name, nullptr); + } else if (mconf_get_do_credentials() && feature_is_enabled(FEATURE_DCE_SECURITY)) { + DPRINTF("CHILD - About to exec DCE 
shepherd wrapper job ->%s< under queue -<%s<\n", + lGetString(jep, JB_job_name), + lGetString(master_q, QU_full_name)); + execlp(dce_wrapper_cmd, ps_name, nullptr); + } else if (!feature_is_enabled(FEATURE_AFS_SECURITY) || !pag_cmd || + !strlen(pag_cmd) || !strcasecmp(pag_cmd, "none")) { + DPRINTF("CHILD - About to exec ->%s< under queue -<%s<\n", + lGetString(jep, JB_job_name), + lGetString(master_q, QU_full_name)); + + if (ISTRACE) + execlp(shepherd_path, ps_name, nullptr); + else + execlp(shepherd_path, ps_name, "-bg", nullptr); + } else { + char commandline[2048]; - pag_cmd = mconf_get_pag_cmd(); - shepherd_cmd = mconf_get_shepherd_cmd(); - if (shepherd_cmd && strlen(shepherd_cmd) && - strcasecmp(shepherd_cmd, "none")) { - DPRINTF("CHILD - About to exec shepherd wrapper job ->%s< under queue -<%s<\n", - lGetString(jep, JB_job_name), - lGetString(master_q, QU_full_name)); - execlp(shepherd_cmd, ps_name, nullptr); - } else if (mconf_get_do_credentials() && feature_is_enabled(FEATURE_DCE_SECURITY)) { - DPRINTF("CHILD - About to exec DCE shepherd wrapper job ->%s< under queue -<%s<\n", - lGetString(jep, JB_job_name), - lGetString(master_q, QU_full_name)); - execlp(dce_wrapper_cmd, ps_name, nullptr); - } else if (!feature_is_enabled(FEATURE_AFS_SECURITY) || !pag_cmd || - !strlen(pag_cmd) || !strcasecmp(pag_cmd, "none")) { - DPRINTF("CHILD - About to exec ->%s< under queue -<%s<\n", - lGetString(jep, JB_job_name), - lGetString(master_q, QU_full_name)); - - if (ISTRACE) - execlp(shepherd_path, ps_name, nullptr); - else - execlp(shepherd_path, ps_name, "-bg", nullptr); - } else { - char commandline[2048]; + DPRINTF("CHILD - About to exec PAG command job ->%s< under queue -<%s<\n", + lGetString(jep, JB_job_name), lGetString(master_q, QU_full_name)); + if (ISTRACE) { + snprintf(commandline, sizeof(commandline), "exec %s", shepherd_path); + } else { + snprintf(commandline, sizeof(commandline), "exec %s -bg", shepherd_path); + } - DPRINTF("CHILD - About to exec PAG command 
job ->%s< under queue -<%s<\n", - lGetString(jep, JB_job_name), lGetString(master_q, QU_full_name)); - if (ISTRACE) { - snprintf(commandline, sizeof(commandline), "exec %s", shepherd_path); - } else { - snprintf(commandline, sizeof(commandline), "exec %s -bg", shepherd_path); + execlp(pag_cmd, pag_cmd, "-c", commandline, nullptr); } - - execlp(pag_cmd, pag_cmd, "-c", commandline, nullptr); + sge_free(&pag_cmd); + sge_free(&shepherd_cmd); } - sge_free(&pag_cmd); - sge_free(&shepherd_cmd); - /*---------------------------------------------------*/ /* exec() failed - do what shepherd does if it fails */ - + // @todo can we set the host in error state (all queues) when starting_shepherd_ok is false? + // Otherwise the job will get scheduled to this host over and over again. fp = fopen("error", "w"); if (fp) { fprintf(fp, "failed to exec shepherd for job" sge_u32"\n", job_id); @@ -2010,6 +2095,3 @@ get_nhosts(const lList *gdil_orig) { DRETURN(nhosts); } - - - diff --git a/source/daemons/execd/execd.cc b/source/daemons/execd/execd.cc index e9d7f02a4f..7389e0759d 100644 --- a/source/daemons/execd/execd.cc +++ b/source/daemons/execd/execd.cc @@ -36,6 +36,7 @@ #include #include +#include "uti/ocs_Systemd.h" #include "uti/ocs_TerminationManager.h" #include "uti/sge_log.h" #include "uti/sge_monitor.h" @@ -61,6 +62,8 @@ #include "sge_load_sensor.h" #include "dispatcher.h" +#include "ocs_execd_systemd.h" +#include "execd_profiling.h" #include "load_avg.h" #include "reaper_execd.h" #include "setup_execd.h" @@ -68,6 +71,8 @@ #include "sig_handlers.h" #include "usage.h" #include "execd.h" + + #include "sge.h" #include "msg_common.h" #include "msg_execd.h" @@ -104,8 +109,6 @@ static lList *sge_parse_cmdline_execd(char **argv, lList **ppcmdline); static lList *sge_parse_execd(lList **ppcmdline, lList **ppreflist, u_long32 *help); static u_long64 last_qmaster_registration_time = 0; - - u_long64 get_last_qmaster_register_time() { return last_qmaster_registration_time; } @@ -161,7 
+164,6 @@ int main(int argc, char **argv) int printed_points = 0; int max_enroll_tries; static char tmp_err_file_name[SGE_PATH_MAX]; - u_long64 next_prof_output = 0; int execd_exit_state = 0; lList **master_job_list = nullptr; lList *alp = nullptr; @@ -174,8 +176,7 @@ int main(int argc, char **argv) set_thread_name(pthread_self(),"Execd Thread"); - prof_set_level_name(SGE_PROF_CUSTOM1, "Execd Thread", nullptr); - prof_set_level_name(SGE_PROF_CUSTOM2, "Execd Dispatch", nullptr); + ocs::execd_profiling_initialize(); #ifdef __SGE_COMPILE_WITH_GETTEXT__ /* init language output for gettext() , it will use the right language */ @@ -328,18 +329,22 @@ int main(int argc, char **argv) } /* - * We write pid file when we are connected to qmaster. Otherwise, an old + * We write a pid file when we are connected to qmaster. Otherwise, an old * execd might overwrite our pidfile. */ sge_write_pid(EXECD_PID_FILE); /* - * At this point we are sure we are the only sge_execd and we are connected - * to the current qmaster. First we have to report any reaped children + * At this point, we are sure we are the only sge_execd, and we are connected + * to the current qmaster. First, we have to report any reaped children * that might exist. 
*/ starting_up(); +#if defined (OCS_WITH_SYSTEMD) + ocs::execd::execd_systemd_init(); +#endif + /* * Log a warning message if execd hasn't been started by a superuser */ @@ -366,18 +371,6 @@ int main(int argc, char **argv) sge_sig_handler_in_main_loop = 1; - if (thread_prof_active_by_id(pthread_self())) { - prof_start(SGE_PROF_CUSTOM1, nullptr); - prof_start(SGE_PROF_CUSTOM2, nullptr); - prof_start(SGE_PROF_GDI_REQUEST, nullptr); - } else { - prof_stop(SGE_PROF_CUSTOM1, nullptr); - prof_stop(SGE_PROF_CUSTOM2, nullptr); - prof_stop(SGE_PROF_GDI_REQUEST, nullptr); - } - - PROF_START_MEASUREMENT(SGE_PROF_CUSTOM1); - /* Start dispatching */ execd_exit_state = sge_execd_process_messages(); @@ -396,17 +389,7 @@ int main(int argc, char **argv) #endif lFreeList(master_job_list); - PROF_STOP_MEASUREMENT(SGE_PROF_CUSTOM1); - if (prof_is_active(SGE_PROF_ALL)) { - u_long64 now = sge_get_gmt64(); - - if (now > next_prof_output) { - prof_output_info(SGE_PROF_ALL, false, "profiling summary:\n"); - prof_reset(SGE_PROF_ALL,nullptr); - next_prof_output = now + sge_gmt32_to_gmt64(60); - } - } - sge_prof_cleanup(); + ocs::execd_profiling_cleanup(); sge_shutdown(execd_exit_state); DRETURN(execd_exit_state); diff --git a/source/daemons/execd/execd_ck_to_do.cc b/source/daemons/execd/execd_ck_to_do.cc index 7846f25b9a..20a9844f89 100644 --- a/source/daemons/execd/execd_ck_to_do.cc +++ b/source/daemons/execd/execd_ck_to_do.cc @@ -48,6 +48,7 @@ #include "uti/sge_unistd.h" #include "sgeobj/ocs_DataStore.h" +#include "sgeobj/ocs_Job.h" #include "sgeobj/sge_conf.h" #include "sgeobj/sge_ja_task.h" #include "sgeobj/sge_job.h" @@ -101,15 +102,21 @@ extern volatile int jobs_to_start; extern lList *jr_list; #ifdef COMPILE_DC -static void notify_ptf(); -static void notify_ptf() -{ - lListElem *jep; - lListElem *tep; - int write_job = -1; +/** + * @brief Notify PTF about newly started jobs + * + * This function checks the job list for jobs that are in the JWAITING4OSJID state + * and attempts to 
register them at PTF. If successful, it changes their state to JRUNNING. + * If still waiting for osjobid, it sets the (global) waiting4osjid flag to true, + * which will trigger a re-check in the next call to this function. + */ +static void notify_ptf() { DENTER(TOP_LAYER); + lListElem *jep; + int write_job = -1; + #ifdef DEBUG_DC ptf_show_registered_jobs(); #endif @@ -143,12 +150,13 @@ static void notify_ptf() } } - for_each_rw (tep, lGetList(jatep, JAT_task_list)) { - if (lGetUlong(tep, PET_status) == JWAITING4OSJID) { - switch (register_at_ptf(jep, jatep, tep)) { + lListElem *petep; + for_each_rw (petep, lGetList(jatep, JAT_task_list)) { + if (lGetUlong(petep, PET_status) == JWAITING4OSJID) { + switch (register_at_ptf(jep, jatep, petep)) { case 0: /* succeeded */ - lSetUlong(tep, PET_status, JRUNNING); + lSetUlong(petep, PET_status, JRUNNING); /* spool state transition */ write_job = 1; @@ -429,6 +437,7 @@ update_wallclock_usage(u_long64 now, const lListElem *job, const lListElem *ja_t int do_ck_to_do(bool is_qmaster_down) { DENTER(TOP_LAYER); + u_long64 now = sge_get_gmt64(); static u_long64 next_pdc = 0; static u_long64 next_signal = 0; @@ -439,13 +448,14 @@ int do_ck_to_do(bool is_qmaster_down) { int return_value = 0; const char *qualified_hostname = component_get_qualified_hostname(); - - #ifdef KERBEROS krb_renew_tgts(Master_Job_List); #endif - /* start jobs if present */ + // start jobs if present + // when job start orders are received, they are not started immediately, + // but are only added to job list and spooled + // status JIDLE tells us that the job is ready to be started if (jobs_to_start) { /* reset jobs_to_start before starting jobs. 
We may loose * a job start if we reset jobs_to_start after sge_start_jobs() @@ -473,13 +483,13 @@ int do_ck_to_do(bool is_qmaster_down) { // PDC trigger can be ignored if there are no jobs to observe if (lGetNumberOfElem(*ocs::DataStore::get_master_list(SGE_TYPE_JOB)) > 0 && do_pdc) { + // register newly submitted jobs at PTF notify_ptf(); - sge_switch2start_user(); + // get online usage of running jobs ptf_update_job_usage(); - sge_switch2admin_user(); - /* check for job limits */ + // check for job limits if (check_for_queue_limits()) { force_job_rlimit(qualified_hostname); } @@ -501,7 +511,7 @@ int do_ck_to_do(bool is_qmaster_down) { for_each_rw (jep, *ocs::DataStore::get_master_list_rw(SGE_TYPE_JOB)) { for_each_rw (jatep, lGetList(jep, JB_ja_tasks)) { - // don't update wallclock before job actually started or after it ended */ + // don't update wallclock before a job actually started or after it ended */ u_long32 status = lGetUlong(jatep, JAT_status); if (status == JWAITING4OSJID || status == JEXITING) { continue; @@ -874,32 +884,16 @@ static int exec_job_or_task(lListElem *jep, lListElem *jatep, lListElem *petep) } #ifdef COMPILE_DC -int register_at_ptf( -const lListElem *job, -const lListElem *ja_task, -const lListElem *pe_task -) { - u_long32 job_id; +int register_at_ptf(const lListElem *job, const lListElem *ja_task, const lListElem *pe_task) { + DENTER(TOP_LAYER); + + u_long32 job_id; u_long32 ja_task_id; const char *pe_task_id = nullptr; - int success; - FILE *fp; SGE_STRUCT_STAT sb; - char id_buffer[MAX_STRING_SIZE]; /* static dstring for job id string */ - dstring id_dstring; - -#if defined(SOLARIS) || defined(LINUX) || defined(FREEBSD) || defined(DARWIN) - gid_t addgrpid; - dstring addgrpid_path = DSTRING_INIT; -#else - dstring osjobid_path = DSTRING_INIT; - osjobid_t osjobid; -#endif - DENTER(TOP_LAYER); - - sge_dstring_init(&id_dstring, id_buffer, MAX_STRING_SIZE); + DSTRING_STATIC(id_dstring, MAX_STRING_SIZE); job_id = lGetUlong(job, JB_job_number); 
ja_task_id = lGetUlong(ja_task, JAT_task_number); @@ -908,81 +902,136 @@ const lListElem *pe_task } #if defined(SOLARIS) || defined(LINUX) || defined(FREEBSD) || defined(DARWIN) - /** - ** read additional group id and use it as osjobid - **/ - - /* open addgrpid file */ - sge_get_active_job_file_path(&addgrpid_path, - job_id, ja_task_id, pe_task_id, ADDGRPID); - DPRINTF("Registering job %s with PTF\n", - job_get_id_string(job_id, ja_task_id, pe_task_id, &id_dstring)); + // Check if the addgrpid file exists, + // this means that the shepherd has started the job. + DSTRING_STATIC(addgrpid_path, SGE_PATH_MAX); + sge_get_active_job_file_path(&addgrpid_path, job_id, ja_task_id, pe_task_id, ADDGRPID); + DPRINTF("Registering job %s with PTF\n", job_get_id_string(job_id, ja_task_id, pe_task_id, &id_dstring)); if (SGE_STAT(sge_dstring_get_string(&addgrpid_path), &sb) && errno == ENOENT) { - DPRINTF("still waiting for addgrpid of job %s\n", - job_get_id_string(job_id, ja_task_id, pe_task_id, &id_dstring)); - sge_dstring_free(&addgrpid_path); + DPRINTF("still waiting for addgrpid of job %s\n", job_get_id_string(job_id, ja_task_id, pe_task_id, &id_dstring)); DRETURN(1); } + gid_t addgrpid; +#if 1 + // We store the addgrpid in the ja_task/pe_task when writing the config file, + // and the job is spooled immediately after forking the shepherd, + // so we can use it here, even when sge_execd gets restarted. + const char *addgrpid_str; + if (pe_task != nullptr) { + addgrpid_str = lGetString(pe_task, PET_osjobid); + } else { + addgrpid_str = lGetString(ja_task, JAT_osjobid); + } + addgrpid = static_cast(std::stoul(addgrpid_str)); +#else + // We need to read the addgrpid file. 
+ FILE *fp; if (!(fp = fopen(sge_dstring_get_string(&addgrpid_path), "r"))) { ERROR(MSG_EXECD_NOADDGIDOPEN_SSS, sge_dstring_get_string(&addgrpid_path), job_get_id_string(job_id, ja_task_id, pe_task_id, &id_dstring), strerror(errno)); - sge_dstring_free(&addgrpid_path); DRETURN(-1); } - - sge_dstring_free(&addgrpid_path); /* read addgrpid */ - success = (fscanf(fp, gid_t_fmt, &addgrpid)==1); + int success = fscanf(fp, gid_t_fmt, &addgrpid) == 1; FCLOSE(fp); if (!success) { - /* can happen that shepherd has opend the file but not written */ - DRETURN((1)); + // can happen that shepherd has opened the file but not yet written + DRETURN(1); } +#endif + { int ptf_error; - DPRINTF("Register job with AddGrpId at " pid_t_fmt " PTF\n", addgrpid); - if ((ptf_error = ptf_job_started(addgrpid, pe_task_id, job, ja_task_id))) { + // when running jobs in systemd scopes store the scope id + // we need it to retrieve usage information + // it is already stored and spooled in the ja_task/pe_task + // similar to the osjobid/addgrpid + const char *scope_str = nullptr; +#if defined(OCS_WITH_SYSTEMD) + if (pe_task != nullptr) { + scope_str = lGetString(pe_task, PET_systemd_scope); + } else { + scope_str = lGetString(ja_task, JAT_systemd_scope); + } +#endif + + // We need the usage collection mode to register the job, + // and we stored it in the ja_task/pe_task when starting the job. 
+ usage_collection_t usage_collection; + if (pe_task != nullptr) { + usage_collection = static_cast(lGetUlong(pe_task, PET_usage_collection)); + } else { + usage_collection = static_cast(lGetUlong(ja_task, JAT_usage_collection)); + } + + // If we do not want to get usage via Systemd, then pass nullptr as scope_str + if (scope_str != nullptr) { + if (usage_collection != USAGE_COLLECTION_DEFAULT && usage_collection != USAGE_COLLECTION_HYBRID) { + scope_str = nullptr; + } + } + + // If we do not want to get usage via PTF, then pass 0 as addgrpid + if (addgrpid != 0) { + // if we have a systemd scope and it is not hybrid, we use only systemd + if (scope_str != nullptr && + usage_collection != USAGE_COLLECTION_HYBRID && usage_collection != USAGE_COLLECTION_PDC) { + addgrpid = 0; + } else if (usage_collection == USAGE_COLLECTION_NONE) { + addgrpid = 0; + } + } + + DPRINTF("Register job with AddGrpId " gid_t_fmt " and systemd scope " SFN " at PTF\n", addgrpid, scope_str != nullptr ? scope_str : "null"); + if ((ptf_error = ptf_job_started(addgrpid, pe_task_id, job, ja_task_id, scope_str, usage_collection))) { ERROR(MSG_JOB_NOREGISTERPTF_SS, job_get_id_string(job_id, ja_task_id, pe_task_id, &id_dstring), ptf_errstr(ptf_error)); - DRETURN((1)); + DRETURN(1); } } /* store addgrpid in job report to be sent to qmaster later on */ - { - char addgrpid_str[64]; - lListElem *jr; - - snprintf(addgrpid_str, sizeof(addgrpid_str), pid_t_fmt, addgrpid); - if ((jr=get_job_report(job_id, ja_task_id, pe_task_id))) { - lSetString(jr, JR_osjobid, addgrpid_str); - } - DPRINTF("job %s: addgrpid = %s\n", job_get_id_string(job_id, ja_task_id, pe_task_id, &id_dstring), addgrpid_str); +{ + // @todo CS-1409 - JR_osjobid is probably not needed at all + lListElem *jr; + if ((jr=get_job_report(job_id, ja_task_id, pe_task_id))) { + lSetString(jr, JR_osjobid, addgrpid_str); } + DPRINTF("job %s: addgrpid = %s\n", job_get_id_string(job_id, ja_task_id, pe_task_id, &id_dstring), addgrpid_str); +} #else /* 
read osjobid if possible */ - sge_get_active_job_file_path(&osjobid_path, - job_id, ja_task_id, pe_task_id, OSJOBID); - - DPRINTF(("Registering job %s with PTF\n", - job_get_id_string(job_id, ja_task_id, pe_task_id, &id_dstring))); + DSTRING_STATIC(osjobid_path, SGE_PATH_MAX); + sge_get_active_job_file_path(&osjobid_path, job_id, ja_task_id, pe_task_id, OSJOBID); + + DPRINTF(("Registering job %s with PTF\n", job_get_id_string(job_id, ja_task_id, pe_task_id, &id_dstring))); if (SGE_STAT(sge_dstring_get_string(&osjobid_path), &sb) && errno == ENOENT) { - DPRINTF(("still waiting for osjobid of job %s\n", + DPRINTF(("still waiting for osjobid of job %s\n", job_get_id_string(job_id, ja_task_id, pe_task_id, &id_dstring))); - sge_dstring_free(&osjobid_path); + sge_dstring_free(&osjobid_path); DRETURN(1); - } + } + osjobid_t osjobid; +#if 1 + const char *osjobid_str; + if (pe_task != nullptr) { + osjobid_str = lGetString(pe_task, PET_osjobid); + } else { + osjobid_str = lGetString(ja_task, JAT_osjobid); + } + osjobid = static_cast(std::stoul(osjobid_str)); +#else if (!(fp=fopen(sge_dstring_get_string(&osjobid_path), "r"))) { ERROR(MSG_EXECD_NOOSJOBIDOPEN_SSS, sge_dstring_get_string(&osjobid_path), job_get_id_string(job_id, ja_task_id, pe_task_id, &id_dstring), strerror(errno)); - sge_dstring_free(&osjobid_path); + sge_dstring_free(&osjobid_path); DRETURN(-1); } - sge_dstring_free(&osjobid_path); + sge_dstring_free(&osjobid_path); success = (fscanf(fp, OSJOBID_FMT, &osjobid)==1); FCLOSE(fp); @@ -990,7 +1039,7 @@ const lListElem *pe_task /* can happen that shepherd has opend the file but not written */ DRETURN(1); } - +#endif { int ptf_error; if ((ptf_error = ptf_job_started(osjobid, pe_task_id, job, ja_task_id))) { @@ -1001,20 +1050,20 @@ const lListElem *pe_task /* store osjobid in job report to be sent to qmaster later on */ { - char osjobid_str[64]; lListElem *jr; - - sprintf(osjobid_str, OSJOBID_FMT, osjobid); if ((jr=get_job_report(job_id, ja_task_id, pe_task_id))) - 
lSetString(jr, JR_osjobid, osjobid_str); - DPRINTF(("job %s: osjobid = %s\n", + lSetString(jr, JR_osjobid, osjobid_str); + DPRINTF(("job %s: osjobid = %s\n", job_get_id_string(job_id, ja_task_id, pe_task_id, &id_dstring), osjobid_str)); } #endif DRETURN(0); -FCLOSE_ERROR: - DRETURN(1); +#if 0 + FCLOSE_ERROR: + DRETURN(1); +#endif } + #endif diff --git a/source/daemons/execd/execd_get_new_conf.cc b/source/daemons/execd/execd_get_new_conf.cc index 50a5432c1d..0d0fc4343d 100644 --- a/source/daemons/execd/execd_get_new_conf.cc +++ b/source/daemons/execd/execd_get_new_conf.cc @@ -84,10 +84,11 @@ int do_get_new_conf(ocs::gdi::ClientServerBase::struct_msg_t *aMsg) { WARNING(MSG_WARN_CHANGENOTEFFECTEDUNTILRESTARTOFEXECHOSTS, "execd_spool_dir"); } - // if the keep_active flag has changed, we need to enforce cleanup of old jobs - // to get rid of old active jobs directories + // if the keep_active flag has changed, + // and when keep_active has been switched off, + // we need to enforce cleanup of old jobs to get rid of old active jobs directories keep_active_t keep_active = mconf_get_keep_active(); - if (old_keep_active != keep_active) { + if (old_keep_active != keep_active && keep_active == KEEP_ACTIVE_FALSE) { set_enforce_cleanup_old_jobs(); } diff --git a/source/daemons/execd/execd_job_exec.cc b/source/daemons/execd/execd_job_exec.cc index f9f50bf893..ce38c8c3dd 100644 --- a/source/daemons/execd/execd_job_exec.cc +++ b/source/daemons/execd/execd_job_exec.cc @@ -69,6 +69,7 @@ #include "spool/classic/read_write_job.h" #include "execd.h" +#include "ocs_execd_systemd.h" #include "reaper_execd.h" #include "job_report_execd.h" #include "execd_job_exec.h" @@ -201,7 +202,7 @@ int do_job_slave(ocs::gdi::ClientServerBase::struct_msg_t *aMsg) lFreeList(&answer_list); for_each_rw(ja_task, lGetList(jelem, JB_ja_tasks)) { - DPRINTF("Job: %ld Task: %ld\n", (long) lGetUlong(jelem, JB_job_number), (long) lGetUlong(ja_task, JAT_task_number)); + DPRINTF("Job: " sge_u32 " Task: " sge_u32 
"\n", lGetUlong(jelem, JB_job_number), lGetUlong(ja_task, JAT_task_number)); ret = handle_job(jelem, ja_task, 1); } @@ -389,6 +390,15 @@ static int handle_job(lListElem *jelem, lListElem *jatep, int slave) { /* check if job has queue limits and increase global flag if necessary */ modify_queue_limits_flag_for_job(component_get_qualified_hostname(), jelem, true); +#if defined (OCS_WITH_SYSTEMD) + if (slave) { + bool enable_systemd = mconf_get_enable_systemd(); + if (enable_systemd) { + ocs::execd::execd_store_tight_pe_slice(jelem, jatep); + } + } +#endif + /* put into job list */ lAppendElem(*ocs::DataStore::get_master_list_rw(SGE_TYPE_JOB), jelem); diff --git a/source/daemons/execd/execd_profiling.cc b/source/daemons/execd/execd_profiling.cc new file mode 100644 index 0000000000..5566002277 --- /dev/null +++ b/source/daemons/execd/execd_profiling.cc @@ -0,0 +1,69 @@ +/*___INFO__MARK_BEGIN_NEW__*/ +/*************************************************************************** + * + * Copyright 2025 HPC-Gridware GmbH + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ***************************************************************************/ +/*___INFO__MARK_END_NEW__*/ + +#include + +#include "uti/sge_log.h" +#include "uti/sge_profiling.h" + +#include "execd_profiling.h" + +namespace ocs { + void execd_profiling_initialize() { + prof_set_level_name(SGE_PROF_CUSTOM1, "dispatcher", nullptr); + prof_set_level_name(SGE_PROF_CUSTOM2, "systemd", nullptr); + prof_set_level_name(SGE_PROF_CUSTOM3, "ptf/pdc", nullptr); + } + + void execd_profiling_start_stop() { + static bool profiling_started = false; + // start / stop profiling depending on configuration (execd_params PROF_EXECD=true) + if (profiling_started) { + if (!thread_prof_active_by_id(pthread_self())) { + prof_stop(SGE_PROF_CUSTOM1, nullptr); + prof_stop(SGE_PROF_CUSTOM2, nullptr); + prof_stop(SGE_PROF_CUSTOM3, nullptr); + prof_stop(SGE_PROF_GDI_REQUEST, nullptr); + profiling_started = false; + DEBUG("profiling disabled"); + } + } else { + if (thread_prof_active_by_id(pthread_self())) { + prof_start(SGE_PROF_CUSTOM1, nullptr); + prof_start(SGE_PROF_CUSTOM2, nullptr); + prof_start(SGE_PROF_CUSTOM3, nullptr); + prof_start(SGE_PROF_GDI_REQUEST, nullptr); + profiling_started = true; + DEBUG("profiling enabled"); + } + } + } + + void execd_profiling_output() { + static u_long64 next_prof_output = 0; + if (thread_prof_active_by_id(pthread_self())) { + thread_output_profiling("execd profiling summary:", &next_prof_output); + } + } + + void execd_profiling_cleanup() { + sge_prof_cleanup(); + } +} diff --git a/source/daemons/execd/execd_profiling.h b/source/daemons/execd/execd_profiling.h new file mode 100644 index 0000000000..81595e8fd4 --- /dev/null +++ b/source/daemons/execd/execd_profiling.h @@ -0,0 +1,27 @@ +#pragma once +/*___INFO__MARK_BEGIN_NEW__*/ +/*************************************************************************** + * + * Copyright 2025 HPC-Gridware GmbH + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file 
except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************/ +/*___INFO__MARK_END_NEW__*/ + +namespace ocs { + void execd_profiling_initialize(); + void execd_profiling_start_stop(); + void execd_profiling_output(); + void execd_profiling_cleanup(); +} diff --git a/source/daemons/execd/execd_ticket.cc b/source/daemons/execd/execd_ticket.cc index f3154c9b79..815fae5f67 100644 --- a/source/daemons/execd/execd_ticket.cc +++ b/source/daemons/execd/execd_ticket.cc @@ -63,8 +63,6 @@ do_ticket(ocs::gdi::ClientServerBase::struct_msg_t *aMsg) { while (pb_unused(&(aMsg->buf))>0) { - lList *jatasks = nullptr; - if (unpackint(&(aMsg->buf), &jobid) || unpackint(&(aMsg->buf), &jataskid) || unpackdouble(&(aMsg->buf), &ticket)) { ERROR(SFNMAX, MSG_JOB_TICKETFORMAT); DRETURN(0); @@ -72,13 +70,13 @@ do_ticket(ocs::gdi::ClientServerBase::struct_msg_t *aMsg) { DPRINTF("got %lf new tickets for job " sge_u32 "." 
sge_u32 "\n", ticket, jobid, jataskid); + // @todo we use only very few attributes of JB_Type and JAT_type here - use reduced elements job_ticket = lAddElemUlong(&ticket_modifier, JB_job_number, jobid, JB_Type); - if (job_ticket) { - task_ticket = lAddElemUlong(&jatasks, JAT_task_number, jataskid, JAT_Type); + if (job_ticket != nullptr) { + task_ticket = lAddSubUlong(job_ticket, JAT_task_number, jataskid, JB_ja_tasks, JAT_Type); if (task_ticket) { lSetDouble(task_ticket, JAT_tix, ticket); } - lSetList(job_ticket, JB_ja_tasks, jatasks); } } diff --git a/source/daemons/execd/job_report_execd.cc b/source/daemons/execd/job_report_execd.cc index 962cfb2ebc..07d82d1e3d 100644 --- a/source/daemons/execd/job_report_execd.cc +++ b/source/daemons/execd/job_report_execd.cc @@ -52,6 +52,7 @@ #include "sgeobj/sge_qinstance.h" #include "job_report_execd.h" +#include "ocs_execd_systemd.h" #include "reaper_execd.h" #include "execd_signal_queue.h" #include "load_avg.h" @@ -219,7 +220,7 @@ int add_usage(lListElem *jr, const char *name, const char *val_as_str, double va DRETURN(-1); } - /* check if we already have an usage value with this name */ + /* check if we already have a usage value with this name */ usage = lGetSubStrRW(jr, UA_name, name, JR_usage); if (!usage) { if (!(usage = lAddSubStr(jr, UA_name, name, JR_usage, UA_Type))) { @@ -287,7 +288,7 @@ int do_ack(ocs::gdi::ClientServerBase::struct_msg_t *aMsg) /* ** This is the answer of qmaster if we report a job as exiting ** - job gets removed from job report list and from job list -** - job gets cleaned from file system +** - job gets cleaned from file system ** - retry is triggered by next job report sent to qmaster ** containing this job as "exiting" */ @@ -295,9 +296,16 @@ int do_ack(ocs::gdi::ClientServerBase::struct_msg_t *aMsg) jataskid = lGetUlong(ack, ACK_id2); pe_task_id_str = lGetString(ack, ACK_str); - DPRINTF("remove exiting job " sge_u32 "/" sge_u32 "/%s\n", + DPRINTF("remove exiting job " sge_u32 "." 
sge_u32 " %s\n", jobid, jataskid, pe_task_id_str?pe_task_id_str:""); +#if defined(OCS_WITH_SYSTEMD) + if (pe_task_id_str == nullptr) { + // remove the slave container of the tightly integrated pe job (if any) from systemd + ocs::execd::execd_delete_tight_pe_slice(jobid, jataskid, pe_task_id_str); + } +#endif + if ((jr = get_job_report(jobid, jataskid, pe_task_id_str))) { remove_acked_job_exit(jobid, jataskid, pe_task_id_str, jr); } else { diff --git a/source/daemons/execd/msg_execd.h b/source/daemons/execd/msg_execd.h index 1b3dfb6c67..e53ef26f6a 100644 --- a/source/daemons/execd/msg_execd.h +++ b/source/daemons/execd/msg_execd.h @@ -63,7 +63,7 @@ #define MSG_STATUS_LOADSENSOREXITEDWITHEXITSTATUS_I _MESSAGE(29015, _("load sensor exited with exit status = %d")) #define MSG_STATUS_MAILERDIEDTHROUGHSIGNALXY_SI _MESSAGE(29016, _("mailer died through signal " SFN " = %d")) #define MSG_STATUS_MAILEREXITEDWITHEXITSTATUS_I _MESSAGE(29017, _("mailer exited with exit status = %d")) -#define MSG_JOB_REAPINGJOBXPTFCOMPLAINSY_US _MESSAGE(29018, _("reaping job \"" sge_u32 "\" ptf complains: " SFN)) +#define MSG_JOB_REAPINGJOBXPTFCOMPLAINSY_SS _MESSAGE(29018, _("reaping job " SFN ": ptf complains: " SFN)) #define MSG_JOB_CLEANUPJOBCALLEDWITHINVALIDPARAMETERS _MESSAGE(29019, _("clean_up_job() called with invalid parameters")) #define MSG_JOB_CANTFINDDIRXFORREAPINGJOBYZ_SS _MESSAGE(29020, _("can't find directory " SFN " for reaping job " SFN)) #define MSG_JOB_CANTREADCONFIGFILEFORJOBXY_S _MESSAGE(29021, _("can't read config file for job " SFN)) @@ -247,7 +247,12 @@ #define MSG_REJECT_TASK_SLOTS_IN_USE_UUIUS _MESSAGE(29224, _("rejecting pe task request for job " sge_u32 "." sge_u32 ": %d out of " sge_u32 " slots in queue " SFN " are in use")) #define MSG_REJECT_TASK_NO_FREE_SLOT_FOUND_UU _MESSAGE(29225, _("rejecting pe task request for job " sge_u32 "." 
sge_u32 ": no free slot found")) +#define MSG_SYSTEMD_INITIALIZED_II _MESSAGE(29230, _("systemd integration initialized, systemd version %d, cgroups version %d")) +#define MSG_SYSTEMD_RUNNING_AS_SERVICE_S _MESSAGE(29231, _("running as service " SFN)) +#define MSG_EXECD_SYSTEMD_MOVE_SHEPHERD_TO_SCOPE_S _MESSAGE(29232, _("systemd: moving shepherd to scope failed: " SFN4)) +#define MSG_EXECD_CANNOT_CONNECT_TO_SYSTEMD_S _MESSAGE(29233, _("cannot connect to systemd: " SFN4)) +#define MSG_CANNOT_TO_GET_PROPERTY_SSS _MESSAGE(29234, _("cannot to get property " SFN " from systemd scope " SFN ": " SFN4)) + // clang-format on #endif /* __MSG_EXECD_H */ - diff --git a/source/daemons/execd/ocs_execd_systemd.cc b/source/daemons/execd/ocs_execd_systemd.cc new file mode 100644 index 0000000000..d18c626862 --- /dev/null +++ b/source/daemons/execd/ocs_execd_systemd.cc @@ -0,0 +1,461 @@ +/*___INFO__MARK_BEGIN_NEW__*/ +/*************************************************************************** + * + * Copyright 2025 HPC-Gridware GmbH + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ***************************************************************************/ +/*___INFO__MARK_END_NEW__*/ + +#include "cull/cull.h" + +#include "ocs_common_systemd.h" + +#include "sgeobj/cull/sge_ptf_JL_L.h" +#include "sgeobj/cull/sge_ptf_JO_L.h" +#include "sgeobj/ocs_Job.h" +#include "sgeobj/sge_conf.h" +#include "sgeobj/sge_ja_task.h" +#include "sgeobj/sge_pe.h" +#include "sgeobj/sge_usage.h" + +#include "uti/ocs_Systemd.h" +#include "uti/sge_log.h" +#include "uti/sge_profiling.h" +#include "uti/sge_rmon_macros.h" + +#include "execd.h" +#include "ocs_execd_systemd.h" + +#include "msg_execd.h" +#include "ptf.h" + +// from ptf.cc +extern lList *ptf_jobs; + +namespace ocs::execd { +#if defined(OCS_WITH_SYSTEMD) + + /*! + * @brief Initialize the Systemd integration for execd. + * This function initializes the Systemd library for the execd service name. On success it temporarily sets the log level to LOG_INFO and logs + * the Systemd version and cgroup version and, if running as a Systemd service, the service name; on failure a non-empty error message is logged as a warning. + */ + void execd_systemd_init() { + // try to initialize the Systemd integration, + // create an instance of Systemd and try to connect to the system bus, + // figure out if we are running as Systemd service + DSTRING_STATIC(error_dstr, MAX_STRING_SIZE); + if (ocs::uti::Systemd::initialize(ocs::uti::Systemd::execd_service_name, &error_dstr)) { + u_long32 old_ll = log_state_get_log_level(); + log_state_set_log_level(LOG_INFO); + INFO(MSG_SYSTEMD_INITIALIZED_II, ocs::uti::Systemd::get_systemd_version(), + ocs::uti::Systemd::get_cgroup_version()); + if (ocs::uti::Systemd::is_running_as_service()) { + INFO(MSG_SYSTEMD_RUNNING_AS_SERVICE_S, ocs::uti::Systemd::execd_service_name.c_str()); + } + log_state_set_log_level(old_ll); + } else if (sge_dstring_strlen(&error_dstr) > 0) { + WARNING(SFNMAX, sge_dstring_get_string(&error_dstr)); + } + } + + /*! + * @brief Move the shepherd process to the Systemd scope.
+ * + * This function moves the shepherd process to the Systemd scope if sge_execd is running as a service. + * Reason: When sge_execd is running as a service, shutting down the service would also kill all the + * sge_shepherd processes, which is not desired. E.g., we want to be able to update the sge_execd + * while we have running jobs (with their shepherds also staying alive). + * + * It connects to Systemd as start user and asks it to move the calling process (its pid is taken from getpid()) to the appropriate scope. + * Returns true on success or if sge_execd is not running as a service (nothing to do), false if connecting or moving fails. + */ + bool + execd_move_shepherd_to_scope() { + bool ret = true; + + if (ocs::uti::Systemd::is_running_as_service()) { + PROF_START_MEASUREMENT(SGE_PROF_CUSTOM2); + DSTRING_STATIC(err_dstr, MAX_STRING_SIZE); + ocs::uti::Systemd systemd; + // connect as root, we want to have write access + sge_switch2start_user(); + bool connected = systemd.connect(&err_dstr); + sge_switch2admin_user(); + if (connected) { + pid_t pid = getpid(); + bool success = systemd.move_shepherd_to_scope(pid, &err_dstr); + if (!success) { + WARNING(MSG_EXECD_SYSTEMD_MOVE_SHEPHERD_TO_SCOPE_S, sge_dstring_get_string(&err_dstr)); + ret = false; + } + } else { + // connect failed + WARNING(SFNMAX, sge_dstring_get_string(&err_dstr)); + ret = false; + } + PROF_STOP_MEASUREMENT(SGE_PROF_CUSTOM2); + if (prof_is_active(SGE_PROF_CUSTOM2)) { + double prof_systemd = prof_get_measurement_wallclock(SGE_PROF_CUSTOM2, true, nullptr); + PROFILING("PROF: moving shepherd to systemd scope took %.6f seconds", prof_systemd); + } + } + + return ret; + } + + /*! + * @brief Store the Systemd slice name for a tightly integrated parallel job. + * + * This function stores the Systemd slice name in the JA task element if the job is a tightly integrated + * parallel job (i.e., it has control slaves enabled). + * If no slice name is provided, it builds one based on the job and task information. + * This slice name is used at job end to delete the slice if no more tasks are running.
+ * + * This function is called + * - when a job is made available on a slave host of a tightly integrated parallel job + * - when the master task of a tightly integrated parallel job is started + * + * @param job The job element containing job information. + * @param ja_task The JA task element where the slice name will be stored (JAT_systemd_slice); its JAT_pe_object is used to detect tight integration. + * @param slice_name The name of the Systemd slice to store, or nullptr to build it automatically. + */ + void + execd_store_tight_pe_slice(const lListElem *job, lListElem *ja_task, const char *slice_name) { + const lListElem *pe = lGetObject(ja_task, JAT_pe_object); + if (pe != nullptr && lGetBool(pe, PE_control_slaves)) { + // tightly integrated parallel job, store the slice name + if (slice_name != nullptr) { + lSetString(ja_task, JAT_systemd_slice, slice_name); + } else { + // no slice name given, build it ourselves; if that fails nothing is stored and error_dstr is not reported + std::string systemd_slice; + std::string systemd_scope; + DSTRING_STATIC(error_dstr, MAX_STRING_SIZE); + if (ocs::Job::job_get_systemd_slice_and_scope(job, ja_task, nullptr, systemd_slice, systemd_scope, &error_dstr)) { + lSetString(ja_task, JAT_systemd_slice, systemd_slice.c_str()); + } + } + } + } + + /*! + * @brief Delete the Systemd slice for a tightly integrated parallel job. + * + * This function deletes the Systemd slice for a tightly integrated parallel job. + * It connects to Systemd and attempts to stop the specified slice. + * If the slice is successfully stopped, it is removed from the system by systemd. + * + * @param slice The name of the Systemd slice to delete.
+ */ + static void + execd_delete_tight_pe_slice(const char *slice) { + if (slice != nullptr) { + ocs::uti::Systemd systemd; + DSTRING_STATIC(err_dstr, MAX_STRING_SIZE); + // connect as root, we want to have write access + sge_switch2start_user(); + bool connected = systemd.connect(&err_dstr); + sge_switch2admin_user(); + if (connected) { + bool success = systemd.stop_unit(slice, &err_dstr); + if (!success) { + WARNING(SFNMAX, sge_dstring_get_string(&err_dstr)); + } + } else { + // connect failed + WARNING(SFNMAX, sge_dstring_get_string(&err_dstr)); + } + } + } + + /*! + * @brief Delete the Systemd slice for a tightly integrated parallel job. + * + * This function is called when a tightly integrated parallel job is finished, + * and there are no more pe tasks left in the job. + * If systemd usage is enabled in the configuration and the JA task still has a Systemd slice stored, the slice is deleted. + * It clears the slice name in the JA task element to ensure that deletion is not repeated. + * + * @param job_id The job ID of the job to check. + * @param ja_task_id The JA task ID of the job to check. + * @param pe_task_id The PE task ID, or nullptr if this is the master task; for a non-null value nothing is done. + */ + void + execd_delete_tight_pe_slice(u_long32 job_id, u_long32 ja_task_id, const char *pe_task_id) { + // We might need to remove a systemd slice (in case this is the master task of a tightly integrated pe job). + // Only if there are no more pe tasks left in the job.
+ if (pe_task_id == nullptr) { + bool enable_systemd = mconf_get_enable_systemd(); + if (enable_systemd) { + lListElem *job = nullptr; + lListElem *ja_task = nullptr; + if (execd_get_job_ja_task(job_id, ja_task_id, &job, &ja_task, false)) { + const char *slice = lGetString(ja_task, JAT_systemd_slice); + // only if there is a job slice and it has not yet been deleted + if (slice != nullptr) { + const lListElem *pe = lGetObject(ja_task, JAT_pe_object); + // if it is tight integration and there are no running pe tasks left, + // we can delete the systemd slice + if (pe != nullptr && lGetBool(pe, PE_control_slaves) && + lGetNumberOfElem(lGetList(ja_task, JAT_task_list)) == 0) { + // tightly integrated parallel job, no more pe tasks left + execd_delete_tight_pe_slice(slice); + // make sure to delete the slice only once + lSetString(ja_task, JAT_systemd_slice, nullptr); + } + } + } + } + } + } + + /*! + * @brief Get one value from Systemd for a specific scope. + * + * This function retrieves one usage value from Systemd for a given scope and updates the usage_list. + * A scaling factor can be applied to convert the value to the desired unit + * (e.g., converting CPUUsageNSec from nanoseconds to seconds). + * + * @param systemd The Systemd object used to interact with the Systemd bus. + * @param scope The scope for which the usage values are retrieved. + * @param usage_list The list where the usage values will be stored. + * @param property_str The property to retrieve from Systemd (e.g., "CPUUsageNSec"). + * @param usage_attr_str The attribute name in the usage_list (e.g., USAGE_ATTR_CPU). + * @param factor The factor to convert the value (e.g., 1.0 / 1000000000.0 for nanoseconds to seconds).
+ */ + static void + ptf_get_usage_value_from_systemd(ocs::uti::Systemd &systemd, std::string &scope, lList *usage_list, const char *property_str, const char *usage_attr_str, double factor) { + DENTER(TOP_LAYER); + + lListElem *usage_elem = lGetElemStrRW(usage_list, UA_name, usage_attr_str); + // the usage element must already exist in the usage_list, don't create a new one + if (usage_elem != nullptr) { + // Get the usage value from systemd + DSTRING_STATIC(error_dstr, MAX_STRING_SIZE); + uint64_t value{}; + std::string property{property_str}; + if (systemd.sd_bus_get_property("Scope", scope, property, value, &error_dstr)) { + double usage_value = value * factor; + lSetDouble(usage_elem, UA_value, usage_value); + DPRINTF("==> Updated usage %s for scope '%s': %f", usage_attr_str, scope.c_str(), usage_value); + } else { + // keep the old usage value and log an I18N warning; @todo the systemd error might already contain all necessary information (?) + WARNING(MSG_CANNOT_TO_GET_PROPERTY_SSS, property.c_str(), scope.c_str(), sge_dstring_get_string(&error_dstr)); + } + } + + DRETURN_VOID; + } + +#if 0 + /*! + * @brief Get two usage values from Systemd for a specific scope. + * + * This function retrieves and sums up two usage values from Systemd for a given scope and updates the usage_list. + * For example, it can be used to get both IOReadBytes and IOWriteBytes for a job scope and combine them + * into the one USAGE_ATTR_IO value. + * A scaling factor can be applied to convert the values to the desired unit. + * + * @param systemd The Systemd object used to interact with the Systemd bus. + * @param scope The scope for which the usage values are retrieved. + * @param usage_list The list where the usage values will be stored. + * @param property1_str The first property to retrieve from Systemd (e.g., "IOReadBytes"). + * @param property2_str The second property to retrieve from Systemd (e.g., "IOWriteBytes"). + * @param usage_attr_str The attribute name in the usage_list (e.g., USAGE_ATTR_IO).
+ * @param factor The factor to convert the value (e.g., 1.0 for bytes). + */ + static void + ptf_get_usage_value_from_systemd2(ocs::uti::Systemd &systemd, std::string &scope, lList *usage_list, const char *property1_str, const char *property2_str, const char *usage_attr_str, double factor) { + DENTER(TOP_LAYER); + + lListElem *usage_elem = lGetElemStrRW(usage_list, UA_name, usage_attr_str); + // the usage element must already exist in the usage_list, don't create a new one + if (usage_elem != nullptr) { + // Get the usage value from systemd + DSTRING_STATIC(error_dstr, MAX_STRING_SIZE); + uint64_t value1{}, value2{}; + std::string property = property1_str; + if (!systemd.sd_bus_get_property("Scope", scope, property, value1, &error_dstr)) { + // I18N; NOTE(review): on failure value1 stays 0 and the sum below is still stored + WARNING(MSG_CANNOT_TO_GET_PROPERTY_SSS, property.c_str(), scope.c_str(), sge_dstring_get_string(&error_dstr)); + } + property = property2_str; + if (!systemd.sd_bus_get_property("Scope", scope, property, value2, &error_dstr)) { + // I18N; NOTE(review): on failure value2 stays 0 and the sum below is still stored + WARNING(MSG_CANNOT_TO_GET_PROPERTY_SSS, property.c_str(), scope.c_str(), sge_dstring_get_string(&error_dstr)); + } + double usage_value = (value1 + value2) * factor; + lSetDouble(usage_elem, UA_value, usage_value); + DPRINTF("Updated usage %s: %llu + %s: %llu = %f for scope '%s'", property1_str, static_cast<unsigned long long>(value1), property2_str, static_cast<unsigned long long>(value2), usage_value, scope.c_str()); + } + + DRETURN_VOID; + } +#endif + + /*! + * @brief Get usage information from Systemd for all jobs, ja_tasks, and pe_tasks. + * + * This function retrieves usage information from Systemd for all jobs, ja_tasks, and pe_tasks + * that are currently active. It updates the usage_list for each job with + * CPU, memory, and optionally IO usage values.
+ * @note There are a number of open issues here: + * - CS-1389: IO needs to be re-considered, as the values delivered by Systemd looked incorrect, at least + * on certain OSes, possibly depending on the Systemd or the Cgroup version. + * - CS-1398: Re-evaluate how we handle MemoryPeak, as it is not available on all OSes and Systemd versions. + * @note Need to be root for calling this function - call sge_switch2start_user(). + */ + void + ptf_get_usage_from_systemd() { + DENTER(TOP_LAYER); + + DSTRING_STATIC(error_dstr, MAX_STRING_SIZE); + + // Initialize the systemd connection and retrieve usage information + ocs::uti::Systemd systemd; + bool success = systemd.connect(&error_dstr); + if (!success) { + WARNING(MSG_EXECD_CANNOT_CONNECT_TO_SYSTEMD_S, sge_dstring_get_string(&error_dstr)); + } else { + // Loop over all jobs, ja_tasks, and pe_tasks + lListElem *ptf_job; + for_each_rw (ptf_job, ptf_jobs) { + lListElem *os_job; + for_each_rw (os_job, lGetList(ptf_job, JL_OS_job_list)) { + usage_collection_t usage_collection = static_cast<usage_collection_t>(lGetUlong(os_job, JO_usage_collection)); + const char *scope_str = lGetString(os_job, JO_systemd_scope); + if (scope_str != nullptr) { + std::string scope{scope_str}; + std::string state; + success = systemd.sd_bus_get_property("Unit", scope, "ActiveState", state, &error_dstr); + if (success) { + if (state.compare("active") == 0) { + DPRINTF("==> Job is active in systemd scope %s", scope.c_str()); + lList *usage_list = lGetListRW(os_job, JO_usage_list); + if (usage_list == nullptr) { + usage_list = ptf_build_usage_list("usagelist", usage_collection); + lSetList(os_job, JO_usage_list, usage_list); + } + // from systemd we do *not* get vmem / maxvmem + ptf_get_usage_value_from_systemd(systemd, scope, usage_list, "CPUUsageNSec", USAGE_ATTR_CPU, 1.0 / 1000000000.0); // convert nanoseconds to seconds + ptf_get_usage_value_from_systemd(systemd, scope, usage_list, "MemoryCurrent", USAGE_ATTR_RSS, 1.0); + // With cgroup v2 we can get MemoryPeak, with
cgroup v1 we need to calculate it ourselves. + // But only with Systemd version >= 247. + // And on Ubuntu-22.04 we have cgroup v2, Systemd version 249, but no MemoryPeak property. + // @todo How to handle this? + if (0 && ocs::uti::Systemd::get_cgroup_version() == 2 && ocs::uti::Systemd::get_systemd_version() >= 247) { + // cgroup v2 + ptf_get_usage_value_from_systemd(systemd, scope, usage_list, "MemoryPeak", USAGE_ATTR_MAXRSS, 1.0); + // IO usage + // @todo I don't really find information about it. + // IOReadBytes and IOWriteBytes are not shown by introspection, + // but exist e.g. on Ubuntu 24.04 according to `systemctl show <scope>`. + // The values delivered seem not to make sense. + // ptf_get_usage_value_from_systemd2(systemd, scope, usage_list, "IOReadBytes", "IOWriteBytes", USAGE_ATTR_IO, 1.0); + } else { + // cgroup v1 or too old systemd version (currently always taken, see the disabled condition above): track the peak of RSS ourselves + lListElem *usage_elem = lGetElemStrRW(usage_list, UA_name, USAGE_ATTR_RSS); + if (usage_elem != nullptr) { + double rss = lGetDouble(usage_elem, UA_value); + usage_elem = lGetElemStrRW(usage_list, UA_name, USAGE_ATTR_MAXRSS); + if (usage_elem != nullptr && rss > lGetDouble(usage_elem, UA_value)) { + lSetDouble(usage_elem, UA_value, rss); + DPRINTF("==> Updated MAXRSS for scope '%s': %f", scope.c_str(), rss); + } + } + } + } else { + DPRINTF("==> Job is not active in systemd scope %s, state: %s", scope.c_str(), state.c_str()); + } + } else { + WARNING("Failed to get property '%s' from systemd scope '%s': %s", "ActiveState", scope.c_str(), sge_dstring_get_string(&error_dstr)); + } + } + } + } + + // get slice and scope for the job/ja_task/pe_task + // and retrieve the usage information + //systemd.get_usage_from_systemd(slice, scope, &error_dstr); + } + + DRETURN_VOID; + } + +#endif + + /*! + * @brief Check if execd should use PDC for usage collection. + * + * This function checks the configuration to determine if PDC should be used for usage collection. + * It returns true if PDC is enabled or hybrid mode is configured, otherwise false.
+ * + * @return true if PDC is used for usage collection, false otherwise. + */ + bool + execd_use_pdc_for_usage_collection() { + return ocs::common::use_pdc_for_usage_collection(mconf_get_usage_collection()); + } + + /*! + * @brief Check if execd should use Systemd for usage collection. + * + * This function checks the configuration to determine if Systemd should be used for usage collection. + * It returns true if Systemd is enabled and available, and usage collection is not set to NONE or PDC. + * + * @return true if Systemd is used for usage collection, false otherwise (always false when built without OCS_WITH_SYSTEMD). + */ + bool + execd_use_systemd_for_usage_collection() { + bool ret = false; + + // When using systemd is enabled and the binaries are compiled with systemd support, + // we use systemd for usage collection. + // Except when we explicitly disabled it in execd_params USAGE_COLLECTION. +#if defined(OCS_WITH_SYSTEMD) + if (mconf_get_enable_systemd() && + ocs::uti::Systemd::is_systemd_available()) { + ret = true; + usage_collection_t uc = mconf_get_usage_collection(); + if (uc == USAGE_COLLECTION_NONE || uc == USAGE_COLLECTION_PDC) { + ret = false; // we do not use systemd for usage collection + } + } +#endif + + return ret; + } + + /*! + * @brief Check if execd is in hybrid usage collection mode. + * + * This function checks the configuration to determine if execd is in hybrid usage collection mode. + * Hybrid mode means that both PDC and Systemd are used for usage collection. + * When built with OCS_WITH_SYSTEMD, Systemd must additionally be available; otherwise only the configured mode is checked (NOTE(review): confirm this is intended). + * + * @return true if execd is in hybrid usage collection mode, false otherwise.
+ */ + bool + execd_is_hybrid_usage_collection() { + bool hybrid_mode = mconf_get_usage_collection() == USAGE_COLLECTION_HYBRID; + + #if defined(OCS_WITH_SYSTEMD) + hybrid_mode &= ocs::uti::Systemd::is_systemd_available(); + #endif + + return hybrid_mode; + } +} diff --git a/source/daemons/execd/ocs_execd_systemd.h b/source/daemons/execd/ocs_execd_systemd.h new file mode 100644 index 0000000000..d248130b19 --- /dev/null +++ b/source/daemons/execd/ocs_execd_systemd.h @@ -0,0 +1,49 @@ +#pragma once +/*___INFO__MARK_BEGIN_NEW__*/ +/*************************************************************************** + * + * Copyright 2025 HPC-Gridware GmbH + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ * + ***************************************************************************/ +/*___INFO__MARK_END_NEW__*/ + +namespace ocs::execd { + + bool + execd_use_pdc_for_usage_collection(); + bool + execd_use_systemd_for_usage_collection(); + bool + execd_is_hybrid_usage_collection(); + +#if defined(OCS_WITH_SYSTEMD) + + void + execd_systemd_init(); + + bool + execd_move_shepherd_to_scope(); + + void + execd_store_tight_pe_slice(const lListElem *job, lListElem *ja_task, const char *slice_name = nullptr); + + void + execd_delete_tight_pe_slice(u_long32 job_id, u_long32 ja_task_id, const char *pe_task_id); + + void + ptf_get_usage_from_systemd(); + +#endif +} diff --git a/source/daemons/execd/ptf.cc b/source/daemons/execd/ptf.cc index 230ea3020c..85dc638aad 100644 --- a/source/daemons/execd/ptf.cc +++ b/source/daemons/execd/ptf.cc @@ -70,29 +70,32 @@ #include "comm/commlib.h" +#include "uti/ocs_Systemd.h" #include "uti/sge_language.h" #include "uti/sge_log.h" +#include "uti/sge_profiling.h" #include "uti/sge_rmon_macros.h" -#include "uti/sge_stdio.h" #include "uti/sge_time.h" #include "uti/sge_uidgid.h" #include "sgeobj/cull/sge_all_listsL.h" #include "sgeobj/sge_conf.h" -#include "sgeobj/sge_feature.h" #include "sgeobj/sge_job.h" #include "sgeobj/sge_pe_task.h" #include "sgeobj/sge_ja_task.h" #include "sgeobj/sge_usage.h" +#include "ocs_common_systemd.h" #include "ptf.h" -#include "sge.h" + +#include "execd.h" + #include "basis_types.h" -#include "msg_common.h" #include "msg_execd.h" #include "sgedefs.h" #include "exec_ifm.h" #include "pdc.h" +#include "ocs_execd_systemd.h" /* * @@ -162,10 +165,12 @@ static void ptf_get_usage_from_data_collector(); static lListElem *ptf_process_job(osjobid_t os_job_id, const char *task_id_str, - const lListElem *new_job, u_long32 jataskid); + const lListElem *new_job, u_long32 jataskid, const char *systemd_scope, usage_collection_t usage_collection); -static lListElem *ptf_get_job_os(const lList *job_list, osjobid_t os_job_id, - 
lListElem **job_elem); +static lListElem *ptf_get_job_osjob_by_osjobid(const lList *job_list, osjobid_t os_job_id, + lListElem **job_elem); + +static lListElem *ptf_get_osjob_by_ids(lList *osjoblist, u_long32 ja_task_id, const char *pe_task_id); static void ptf_set_job_priority(lListElem *job); @@ -189,7 +194,7 @@ static void ptf_setpriority_addgrpid(const lListElem *job, const lListElem *osjo #endif -static lList *ptf_jobs = nullptr; +lList *ptf_jobs = nullptr; static int is_ptf_running = 0; @@ -270,62 +275,29 @@ static void ptf_set_osjobid(lListElem *osjob, osjobid_t osjobid) * char *name - name of the new list * * RESULT -* static lList* - copy of "old_usage_list" or a real new one +* static lList* - the new usage list ******************************************************************************/ -static lList *ptf_build_usage_list(const char *name) +lList *ptf_build_usage_list(const char *name, usage_collection_t usage_collection) { - lList *usage_list; - lListElem *usage; - DENTER(TOP_LAYER); - usage_list = lCreateList(name, UA_Type); - - usage = lCreateElem(UA_Type); - lSetString(usage, UA_name, USAGE_ATTR_IO); - lSetDouble(usage, UA_value, 0); - lAppendElem(usage_list, usage); - - usage = lCreateElem(UA_Type); - lSetString(usage, UA_name, USAGE_ATTR_IOW); - lSetDouble(usage, UA_value, 0); - lAppendElem(usage_list, usage); + lList* usage_list = lCreateList(name, UA_Type); - usage = lCreateElem(UA_Type); - lSetString(usage, UA_name, USAGE_ATTR_MEM); - lSetDouble(usage, UA_value, 0); - lAppendElem(usage_list, usage); + lAddElemStr(&usage_list, UA_name, USAGE_ATTR_WALLCLOCK, UA_Type); - usage = lCreateElem(UA_Type); - lSetString(usage, UA_name, USAGE_ATTR_CPU); - lSetDouble(usage, UA_value, 0); - lAppendElem(usage_list, usage); + if (usage_collection != USAGE_COLLECTION_NONE) { + lAddElemStr(&usage_list, UA_name, USAGE_ATTR_IO, UA_Type); + lAddElemStr(&usage_list, UA_name, USAGE_ATTR_IOW, UA_Type); + lAddElemStr(&usage_list, UA_name, USAGE_ATTR_MEM, UA_Type); + 
lAddElemStr(&usage_list, UA_name, USAGE_ATTR_CPU, UA_Type); #if defined(LINUX) || defined(SOLARIS) || defined(FREEBSD) || defined(DARWIN) - usage = lCreateElem(UA_Type); - lSetString(usage, UA_name, USAGE_ATTR_VMEM); - lSetDouble(usage, UA_value, 0); - DPRINTF("adding usage attribute %s\n", USAGE_ATTR_VMEM); - lAppendElem(usage_list, usage); - - usage = lCreateElem(UA_Type); - lSetString(usage, UA_name, USAGE_ATTR_MAXVMEM); - lSetDouble(usage, UA_value, 0); - DPRINTF("adding usage attribute %s\n", USAGE_ATTR_MAXVMEM); - lAppendElem(usage_list, usage); - - usage = lCreateElem(UA_Type); - lSetString(usage, UA_name, USAGE_ATTR_RSS); - lSetDouble(usage, UA_value, 0); - DPRINTF("adding usage attribute %s\n", USAGE_ATTR_RSS); - lAppendElem(usage_list, usage); - - usage = lCreateElem(UA_Type); - lSetString(usage, UA_name, USAGE_ATTR_MAXRSS); - lSetDouble(usage, UA_value, 0); - DPRINTF("adding usage attribute %s\n", USAGE_ATTR_MAXRSS); - lAppendElem(usage_list, usage); + lAddElemStr(&usage_list, UA_name, USAGE_ATTR_VMEM, UA_Type); + lAddElemStr(&usage_list, UA_name, USAGE_ATTR_MAXVMEM, UA_Type); + lAddElemStr(&usage_list, UA_name, USAGE_ATTR_RSS, UA_Type); + lAddElemStr(&usage_list, UA_name, USAGE_ATTR_MAXRSS, UA_Type); #endif + } DRETURN(usage_list); } @@ -499,12 +471,7 @@ static void ptf_setpriority_addgrpid(const lListElem *job, const lListElem *osjo ******************************************************************************/ static lListElem *ptf_get_job(u_long job_id) { - lListElem *job; - lCondition *where; - - where = lWhere("%T(%I == %u)", JL_Type, JL_job_ID, job_id); - job = lFindFirstRW(ptf_jobs, where); - lFreeWhere(&where); + lListElem *job = lGetElemUlongRW(ptf_jobs, JL_job_ID, job_id); return job; } @@ -536,15 +503,18 @@ static lListElem *ptf_get_job(u_long job_id) * static lListElem* - osjob (JO_Type) * or nullptr if it was not found. 
******************************************************************************/ -static lListElem *ptf_get_job_os(const lList *job_list, osjobid_t os_job_id, - lListElem **job_elem) +static lListElem *ptf_get_job_osjob_by_osjobid(const lList *job_list, osjobid_t os_job_id, + lListElem **job_elem) { + DENTER(TOP_LAYER); + lListElem *job; lListElem *osjob = nullptr; lCondition *where; - DENTER(TOP_LAYER); - + // @todo use lGetElemUlong instead of building a lCondition + // only possible, when we remove the non LINUX, SOLARIS etc. branch + // or make a u_long64 out of the 2 u_long32 values #if defined(LINUX) || defined(SOLARIS) || defined(DARWIN) || defined(FREEBSD) || defined(NETBSD) where = lWhere("%T(%I == %u)", JO_Type, JO_OS_job_ID, (u_long32) os_job_id); #else @@ -576,6 +546,29 @@ static lListElem *ptf_get_job_os(const lList *job_list, osjobid_t os_job_id, DRETURN(osjob); } +static lListElem *ptf_get_osjob_by_ids(lList *osjoblist, u_long32 ja_task_id, const char *pe_task_id) { + lListElem *osjob; + + for_each_rw(osjob, osjoblist) { + if (lGetUlong(osjob, JO_ja_task_ID) == ja_task_id) { + const char *osjob_pe_task_id = lGetString(osjob, JO_task_id_str); + if (pe_task_id == nullptr) { + if (osjob_pe_task_id == nullptr) { + // we found the array task element (and not a possible pe task element with the same job/ja_task_id) + break; + } + } else { + // we have a pe task, check if the task id matches + if (osjob_pe_task_id != nullptr && strcmp(pe_task_id, osjob_pe_task_id) == 0) { + // found the pe task element + break; + } + } + } + } + + return osjob; +} /*-------------------------------------------------------------------- * ptf_process_job - process a job received from the SGE scheduler. 
@@ -586,17 +579,14 @@ static lListElem *ptf_get_job_os(const lList *job_list, osjobid_t os_job_id, *--------------------------------------------------------------------*/ static lListElem *ptf_process_job(osjobid_t os_job_id, const char *task_id_str, - const lListElem *new_job, u_long32 jataskid) + const lListElem *new_job, u_long32 jataskid, const char *systemd_scope, usage_collection_t usage_collection) { - lListElem *job, *osjob; - lList *job_list = ptf_jobs; - u_long job_id = lGetUlong(new_job, JB_job_number); - double job_tickets = - lGetDouble(lFirst(lGetList(new_job, JB_ja_tasks)), JAT_tix); - u_long interactive = (lGetString(new_job, JB_script_file) == nullptr); - DENTER(TOP_LAYER); + u_long job_id = lGetUlong(new_job, JB_job_number); + double job_tickets = lGetDouble(lFirst(lGetList(new_job, JB_ja_tasks)), JAT_tix); + bool interactive = lGetString(new_job, JB_script_file) == nullptr; + /* * Add the job to the job list, if it does not already exist */ @@ -612,48 +602,43 @@ static lListElem *ptf_process_job(osjobid_t os_job_id, const char *task_id_str, * else * add osjob job && osjobid == 0 skip */ - job = ptf_get_job(job_id); - if (os_job_id == 0) { - if (job == nullptr) { - DRETURN(nullptr); - } - } else { - lList *osjoblist; - - if (job == nullptr) { - job = lCreateElem(JL_Type); - lAppendElem(job_list, job); - lSetUlong(job, JL_job_ID, job_id); - } - osjoblist = lGetListRW(job, JL_OS_job_list); - osjob = ptf_get_job_os(osjoblist, os_job_id, &job); - if (!osjob) { - if (!osjoblist) { - osjoblist = lCreateList("osjoblist", JO_Type); - lSetList(job, JL_OS_job_list, osjoblist); + lListElem *job = ptf_get_job(job_id); + if (job == nullptr) { + job = lAddElemUlong(&ptf_jobs, JL_job_ID, job_id, JL_Type); + } + if (job != nullptr) { + lList *osjoblist = lGetListRW(job, JL_OS_job_list); + lListElem *osjob = ptf_get_osjob_by_ids(osjoblist, jataskid, task_id_str); + if (osjob == nullptr) { + osjob = lAddSubUlong(job, JO_ja_task_ID, jataskid, JL_OS_job_list, 
JO_Type); + if (osjob != nullptr) { + if (task_id_str != nullptr) { + lSetString(osjob, JO_task_id_str, task_id_str); + } + lSetList(osjob, JO_usage_list, ptf_build_usage_list("usagelist", usage_collection)); } - osjob = lCreateElem(JO_Type); - lSetUlong(osjob, JO_ja_task_ID, jataskid); - lAppendElem(osjoblist, osjob); - lSetList(osjob, JO_usage_list, - ptf_build_usage_list("usagelist")); - ptf_set_osjobid(osjob, os_job_id); } - if (task_id_str) { - lSetString(osjob, JO_task_id_str, task_id_str); + // this function might get called first before we have the osjobid + // in later calls then make sure to update osjobid and systemd_scope + if (osjob != nullptr) { + ptf_set_osjobid(osjob, os_job_id); + lSetUlong(osjob, JO_usage_collection, usage_collection); + if (systemd_scope != nullptr) { + lSetString(osjob, JO_systemd_scope, systemd_scope); + } } - } - /* - * set number of tickets in job entry - */ - lSetUlong(job, JL_tickets, (u_long32)MAX(job_tickets, 1)); + /* + * set number of tickets in job entry + */ + lSetUlong(job, JL_tickets, static_cast(MAX(job_tickets, 1.0))); - /* - * set interactive job flag - */ - if (interactive) { - lSetUlong(job, JL_interactive, 1); + /* + * set interactive job flag + */ + if (interactive) { + lSetUlong(job, JL_interactive, 1); + } } DRETURN(job); @@ -690,15 +675,13 @@ static void ptf_get_usage_from_data_collector() const char *tid; int i, j; + // in case of hybrid mode, we will not use PDC data for systemd provided usage (cpu, rss, maxrss) + ojobs = jobs = psGetAllJobs(); if (jobs) { jobcount = *(uint64 *) jobs; -#ifndef SOLARIS INCJOBPTR(jobs, sizeof(uint64)); -#else - INCJOBPTR(jobs, 8); -#endif for (i = 0; i < (int)jobcount; i++) { lList *usage_list; @@ -708,106 +691,112 @@ static void ptf_get_usage_from_data_collector() /* look up job in job list */ job = nullptr; - osjob = ptf_get_job_os(ptf_jobs, jobs->jd_jid, &job); - - if (osjob) { - u_long job_state = lGetUlong(osjob, JO_state); - - tmp_jobs = jobs; - - /* fill in job 
completion state */ - lSetUlong(osjob, JO_state, jobs->jd_refcnt ? - (job_state & ~JL_JOB_COMPLETE) : (job_state | JL_JOB_COMPLETE)); - - /* fill in usage for job */ - usage_list = lGetListRW(osjob, JO_usage_list); - if (!usage_list) { - usage_list = ptf_build_usage_list("usagelist"); - lSetList(osjob, JO_usage_list, usage_list); - } + // Passing nullptr as systemd_scope: + // Here we are in the data collector, there must be an add_grp_id / osjobid. + osjob = ptf_get_job_osjob_by_osjobid(ptf_jobs, jobs->jd_jid, &job); + if (osjob != nullptr) { + // If the osjobid / addgrp == 0, we do not want to get usage from PDC. + if (lGetUlong(osjob, JO_OS_job_ID) != 0) { + u_long job_state = lGetUlong(osjob, JO_state); + + tmp_jobs = jobs; + + /* fill in job completion state */ + lSetUlong(osjob, JO_state, jobs->jd_refcnt ? + (job_state & ~JL_JOB_COMPLETE) : (job_state | JL_JOB_COMPLETE)); + + /* fill in usage for job */ + usage_collection_t usage_collection = static_cast(lGetUlong(osjob, JO_usage_collection)); + usage_list = lGetListRW(osjob, JO_usage_list); + if (usage_list == nullptr) { + usage_list = ptf_build_usage_list("usagelist", usage_collection); + lSetList(osjob, JO_usage_list, usage_list); + } - /* set CPU usage */ - cpu_usage_value = jobs->jd_utime_c + jobs->jd_utime_a + - jobs->jd_stime_c + jobs->jd_stime_a; - if ((usage = lGetElemStrRW(usage_list, UA_name, USAGE_ATTR_CPU))) { - lSetDouble(usage, UA_value, MAX(cpu_usage_value, lGetDouble(usage, UA_value))); - } + /* set CPU usage */ + if (ocs::common::use_pdc_for_usage_collection(usage_collection)) { + cpu_usage_value = jobs->jd_utime_c + jobs->jd_utime_a + + jobs->jd_stime_c + jobs->jd_stime_a; + if ((usage = lGetElemStrRW(usage_list, UA_name, USAGE_ATTR_CPU))) { + lSetDouble(usage, UA_value, MAX(cpu_usage_value, lGetDouble(usage, UA_value))); + } - /* set mem usage (in GB seconds) */ - if ((usage = lGetElemStrRW(usage_list, UA_name, USAGE_ATTR_MEM))) { - lSetDouble(usage, UA_value, (double) jobs->jd_mem / 
1048576.0); - } + /* set rss and maxrss usage */ + if ((usage = lGetElemStrRW(usage_list, UA_name, USAGE_ATTR_RSS))) { + lSetDouble(usage, UA_value, jobs->jd_rss); + } + if ((usage = lGetElemStrRW(usage_list, UA_name, USAGE_ATTR_MAXRSS))) { + lSetDouble(usage, UA_value, jobs->jd_maxrss); + } + } - /* set I/O usage (in GB) */ - if ((usage = lGetElemStrRW(usage_list, UA_name, USAGE_ATTR_IO))) { - lSetDouble(usage, UA_value, - (double) jobs->jd_chars / 1073741824.0); - } + /* set mem usage (in GB seconds) */ + if ((usage = lGetElemStrRW(usage_list, UA_name, USAGE_ATTR_MEM))) { + lSetDouble(usage, UA_value, (double) jobs->jd_mem / 1048576.0); + } - /* set I/O wait time */ - if ((usage = lGetElemStrRW(usage_list, UA_name, USAGE_ATTR_IOW))) { - lSetDouble(usage, UA_value, - (double) jobs->jd_bwtime_c + jobs->jd_bwtime_a + - jobs->jd_rwtime_c + jobs->jd_rwtime_a); - } + /* set I/O usage (in GB) */ + if ((usage = lGetElemStrRW(usage_list, UA_name, USAGE_ATTR_IO))) { + lSetDouble(usage, UA_value, (double) jobs->jd_chars / 1073741824.0); + } - /* set vmem and maxvmem usage */ - if ((usage = lGetElemStrRW(usage_list, UA_name, USAGE_ATTR_VMEM))) { - lSetDouble(usage, UA_value, jobs->jd_vmem); - } - if ((usage = lGetElemStrRW(usage_list, UA_name, USAGE_ATTR_MAXVMEM))) { - lSetDouble(usage, UA_value, jobs->jd_himem); - } + /* set I/O wait time */ + if ((usage = lGetElemStrRW(usage_list, UA_name, USAGE_ATTR_IOW))) { + lSetDouble(usage, UA_value, + (double) jobs->jd_bwtime_c + jobs->jd_bwtime_a + + jobs->jd_rwtime_c + jobs->jd_rwtime_a); + } - /* set rss and maxrss usage */ - if ((usage = lGetElemStrRW(usage_list, UA_name, USAGE_ATTR_RSS))) { - lSetDouble(usage, UA_value, jobs->jd_rss); - } - if ((usage = lGetElemStrRW(usage_list, UA_name, USAGE_ATTR_MAXRSS))) { - lSetDouble(usage, UA_value, jobs->jd_maxrss); - } + /* set vmem and maxvmem usage */ + if ((usage = lGetElemStrRW(usage_list, UA_name, USAGE_ATTR_VMEM))) { + lSetDouble(usage, UA_value, jobs->jd_vmem); + } + if ((usage = 
lGetElemStrRW(usage_list, UA_name, USAGE_ATTR_MAXVMEM))) { + lSetDouble(usage, UA_value, jobs->jd_himem); + } - /* build new pid list */ - proccount = jobs->jd_proccount; - INCJOBPTR(jobs, jobs->jd_length); + /* build new pid list */ + proccount = jobs->jd_proccount; + INCJOBPTR(jobs, jobs->jd_length); - if (proccount > 0) { - oldpidlist = lGetListRW(osjob, JO_pid_list); - pidlist = lCreateList("pidlist", JP_Type); + if (proccount > 0) { + oldpidlist = lGetListRW(osjob, JO_pid_list); + pidlist = lCreateList("pidlist", JP_Type); - procs = (struct psProc_s *) jobs; - for (j = 0; j < proccount; j++) { - lListElem *pid; + procs = (struct psProc_s *) jobs; + for (j = 0; j < proccount; j++) { + lListElem *pid; - if (procs->pd_state == 1) { - if ((pid = lGetElemUlongRW(oldpidlist, JP_pid, procs->pd_pid))) { - lAppendElem(pidlist, lCopyElem(pid)); - } else { - pid = lCreateElem(JP_Type); + if (procs->pd_state == 1) { + if ((pid = lGetElemUlongRW(oldpidlist, JP_pid, procs->pd_pid))) { + lAppendElem(pidlist, lCopyElem(pid)); + } else { + pid = lCreateElem(JP_Type); - lSetUlong(pid, JP_pid, procs->pd_pid); - lAppendElem(pidlist, pid); + lSetUlong(pid, JP_pid, procs->pd_pid); + lAppendElem(pidlist, pid); + } } + INCPROCPTR(procs, procs->pd_length); } - INCPROCPTR(procs, procs->pd_length); + + jobs = (struct psJob_s *)procs; + lSetList(osjob, JO_pid_list, pidlist); + } else { + lSetList(osjob, JO_pid_list, nullptr); } - jobs = (struct psJob_s *)procs; - lSetList(osjob, JO_pid_list, pidlist); - } else { - lSetList(osjob, JO_pid_list, nullptr); + tid = lGetString(osjob, JO_task_id_str); + DPRINTF("JOB " sge_u32 "." sge_u32 ": %s: (cpu = %8.3lf / mem = " + UINT64_FMT " / io = " UINT64_FMT " / vmem = " + UINT64_FMT " / himem = " UINT64_FMT ")\n", + lGetUlong(job, JL_job_ID), + lGetUlong(osjob, JO_ja_task_ID), tid ? 
tid : "", + tmp_jobs->jd_utime_c + tmp_jobs->jd_utime_a + + tmp_jobs->jd_stime_c + tmp_jobs->jd_stime_a, + tmp_jobs->jd_mem, tmp_jobs->jd_chars, + tmp_jobs->jd_vmem, tmp_jobs->jd_himem); } - - tid = lGetString(osjob, JO_task_id_str); - DPRINTF("JOB " sge_u32 "." sge_u32 ": %s: (cpu = %8.3lf / mem = " - UINT64_FMT " / io = " UINT64_FMT " / vmem = " - UINT64_FMT " / himem = " UINT64_FMT ")\n", - lGetUlong(job, JL_job_ID), - lGetUlong(osjob, JO_ja_task_ID), tid ? tid : "", - tmp_jobs->jd_utime_c + tmp_jobs->jd_utime_a + - tmp_jobs->jd_stime_c + tmp_jobs->jd_stime_a, - tmp_jobs->jd_mem, tmp_jobs->jd_chars, - tmp_jobs->jd_vmem, tmp_jobs->jd_himem); } else { /* * NOTE: Under what conditions would DC have a job @@ -853,8 +842,7 @@ static void ptf_get_usage_from_data_collector() /* set job state */ if (!active_jobs) { - lSetUlong(job, JL_state, lGetUlong(job, JL_state) - & JL_JOB_COMPLETE); + lSetUlong(job, JL_state, lGetUlong(job, JL_state) & JL_JOB_COMPLETE); } } } @@ -1167,22 +1155,22 @@ static void ptf_set_OS_scheduling_parameters(lList *job_list, double min_share, /*-------------------------------------------------------------------- * ptf_job_started - process new job *--------------------------------------------------------------------*/ -int ptf_job_started(osjobid_t os_job_id, const char *task_id_str, - const lListElem *new_job, u_long32 jataskid) +int ptf_job_started(osjobid_t os_job_id, const char *task_id_str, + const lListElem *new_job, u_long32 jataskid, const char *systemd_scope, usage_collection_t usage_collection) { DENTER(TOP_LAYER); /* * Add new job to job list */ - ptf_process_job(os_job_id, task_id_str, new_job, jataskid); + ptf_process_job(os_job_id, task_id_str, new_job, jataskid, systemd_scope, usage_collection); /* * Tell data collector to start collecting data for this job */ #ifdef USE_DC if (os_job_id > 0) { - psWatchJob(os_job_id); + psWatchJob(os_job_id, usage_collection); } #else @@ -1212,26 +1200,31 @@ int ptf_job_started(osjobid_t 
os_job_id, const char *task_id_str, int ptf_job_complete(u_long32 job_id, u_long32 ja_task_id, const char *pe_task_id, lList **usage) { - lListElem *ptf_job, *osjob; - lList *osjobs; - DENTER(TOP_LAYER); - ptf_job = ptf_get_job(job_id); + lListElem *ptf_job = ptf_get_job(job_id); if (ptf_job == nullptr) { DRETURN(PTF_ERROR_JOB_NOT_FOUND); } - osjobs = lGetListRW(ptf_job, JL_OS_job_list); - /* * if job is not complete, go get latest job usage info */ if (!(lGetUlong(ptf_job, JL_state) & JL_JOB_COMPLETE)) { + // @todo This will get usage for all jobs, not just the one, might be expensive when many jobs finish, + // e.g. with short jobs on a big machine. + // And does it make sense at all? We get here when a job finished - all its processes / its systemd scope + // should have vanished by now. sge_switch2start_user(); ptf_get_usage_from_data_collector(); sge_switch2admin_user(); + +#if defined (OCS_WITH_SYSTEMD) + if (ocs::uti::Systemd::is_systemd_available()) { + ocs::execd::ptf_get_usage_from_systemd(); + } +#endif } /* @@ -1240,21 +1233,9 @@ int ptf_job_complete(u_long32 job_id, u_long32 ja_task_id, const char *pe_task_i */ *usage = _ptf_get_job_usage(ptf_job, ja_task_id, pe_task_id); - /* Search ja/pe ptf task */ - if (pe_task_id == nullptr) { - osjob = lFirstRW(osjobs); - } else { - for_each_rw(osjob, osjobs) { - if (lGetUlong(osjob, JO_ja_task_ID) == ja_task_id) { - const char *osjob_pe_task_id = lGetString(osjob, JO_task_id_str); - - if (osjob_pe_task_id != nullptr && - strcmp(pe_task_id, osjob_pe_task_id) == 0) { - break; - } - } - } - } + // search ptf osjob + lList *osjobs = lGetListRW(ptf_job, JL_OS_job_list); + lListElem *osjob = ptf_get_osjob_by_ids(osjobs, ja_task_id, pe_task_id); if (osjob == nullptr) { DRETURN(PTF_ERROR_JOB_NOT_FOUND); @@ -1305,8 +1286,6 @@ int ptf_job_complete(u_long32 job_id, u_long32 ja_task_id, const char *pe_task_i int ptf_process_job_ticket_list(lList *job_ticket_list) { - lListElem *jte, *job; - DENTER(TOP_LAYER); /* @@ -1314,23 
+1293,40 @@ int ptf_process_job_ticket_list(lList *job_ticket_list) * tickets from the job ticket list. Reset the usage to the * minimum usage value. */ + lListElem *jte; for_each_rw(jte, job_ticket_list) { - - /* + /* * set JB_script_file because we don't know if this is * an interactive job */ + // @todo required? lSetString(jte, JB_script_file, "dummy"); - job = ptf_process_job(0, nullptr, jte, - lGetUlong(lFirst(lGetList(jte, JB_ja_tasks)), - JAT_task_number)); - if (job) { - /* reset temporary usage and priority */ - lSetDouble(job, JL_usage, MAX(PTF_MIN_JOB_USAGE, - lGetDouble(job, JL_usage) * 0.1)); - - lSetDouble(job, JL_curr_pri, 0); + // The job and os job should already exist. + // If not it would not get created in ptf_process_job(), but probably later on + // once the job is started. + // @todo what about tightly integrated PE tasks? The ja_task then only is a SLAVE container, + // it doesn't have osjobid, systemd_scope, usage_collection. + u_long32 job_id = lGetUlong(jte, JB_job_number); + const lListElem *jte_ja_task = lFirst(lGetList(jte, JB_ja_tasks)); + u_long32 ja_task_id = lGetUlong(jte_ja_task, JAT_task_number); + lListElem *job, *ja_task; + if (execd_get_job_ja_task(job_id, ja_task_id, &job, &ja_task, false)) { + osjobid_t osjobid{}; + const char *osjobid_str = lGetString(ja_task, JAT_osjobid); + if (osjobid_str != nullptr) { + osjobid = static_cast(std::stoi(osjobid_str)); + } + lListElem *ptf_job = ptf_process_job(osjobid, nullptr, + jte, ja_task_id, lGetString(ja_task, JAT_systemd_scope), + static_cast(lGetUlong(ja_task, JAT_usage_collection))); + if (ptf_job != nullptr) { + /* reset temporary usage and priority */ + lSetDouble(ptf_job, JL_usage, MAX(PTF_MIN_JOB_USAGE, lGetDouble(ptf_job, JL_usage) * 0.1)); + lSetDouble(ptf_job, JL_curr_pri, 0); + } + } else { + // this might be a valid situation, e.g., immediately after job start } } @@ -1341,7 +1337,28 @@ void ptf_update_job_usage() { DENTER(TOP_LAYER); + // We always call 
ptf_get_usage_from_data_collector() to update the usage + // even if we are running jobs via systemd, because we might have jobs + // which were started with USAGE_COLLECTION being configured to PDC or HYBRID. + // If all jobs have been started with systemd usage collection only, then + // ptf_get_usage_from_data_collector() will not find any jobs to update. + sge_switch2start_user(); + PROF_START_MEASUREMENT(SGE_PROF_CUSTOM3); ptf_get_usage_from_data_collector(); + PROF_STOP_MEASUREMENT(SGE_PROF_CUSTOM3); + sge_switch2admin_user(); + + // Similar for systemd, we always call ptf_get_usage_from_systemd() if systemd is available at all. + // Whether a job gets usage via systemd can be determined from the systemd scope stored in the array task + // or pe task object. +#if defined (OCS_WITH_SYSTEMD) + if (ocs::uti::Systemd::is_systemd_available()) { + PROF_START_MEASUREMENT(SGE_PROF_CUSTOM2); + ocs::execd::ptf_get_usage_from_systemd(); + PROF_STOP_MEASUREMENT(SGE_PROF_CUSTOM2); + } +#endif + DRETURN_VOID; } diff --git a/source/daemons/execd/ptf.h b/source/daemons/execd/ptf.h index fbeb7392d3..42204141f6 100644 --- a/source/daemons/execd/ptf.h +++ b/source/daemons/execd/ptf.h @@ -37,6 +37,8 @@ #include "cull/cull.h" +#include "sgeobj/sge_conf.h" + #define PTF_COMPENSATION_FACTOR 2.0 typedef pid_t osjobid_t; @@ -140,13 +142,13 @@ void ptf_unregister_registered_job(u_long32 job_id, u_long32 ja_task_id ); void ptf_reinit_queue_priority(u_long32 job_id, u_long32 ja_task_idr, const char *pe_task_id_str, int priority); -int ptf_job_started(osjobid_t os_jobid, const char *task_id_str, - const lListElem *job, u_long32 jataskid); +int ptf_job_started(osjobid_t os_jobid, const char *task_id_str, + const lListElem *job, u_long32 jataskid, const char *systemd_scope, usage_collection_t usage_collection); +lList *ptf_build_usage_list(const char *name, usage_collection_t usage_collection); int ptf_get_usage(lList **jobs); -lList *ptf_get_job_usage(u_long job_id, u_long ja_task_id, - 
const char *task_id); +lList *ptf_get_job_usage(u_long job_id, u_long ja_task_id, const char *task_id); int ptf_process_job_ticket_list(lList *jobs); diff --git a/source/daemons/execd/reaper_execd.cc b/source/daemons/execd/reaper_execd.cc index 042ba70917..1bb51ba3cf 100644 --- a/source/daemons/execd/reaper_execd.cc +++ b/source/daemons/execd/reaper_execd.cc @@ -353,6 +353,9 @@ static void unregister_from_ptf(u_long32 job_id, u_long32 ja_task_id, /* if the job was a 'short-runner' omit the warning */ if (execd_get_job_ja_task(job_id, ja_task_id, &job, &ja_task, false)) { + // @todo what about short PE tasks? + // @todo and I doubt that the code is required at all, + // might just have been a workaround for bug CS-1019 /* check if the job was a short-runner */ u_long64 time_since_started = sge_get_gmt64() - lGetUlong64(ja_task, JAT_start_time); if (time_since_started <= sge_gmt32_to_gmt64(2)) { @@ -362,7 +365,9 @@ static void unregister_from_ptf(u_long32 job_id, u_long32 ja_task_id, } } - WARNING(MSG_JOB_REAPINGJOBXPTFCOMPLAINSY_US, job_id, ptf_errstr(ptf_error)); + DSTRING_STATIC(dstr_error, MAX_STRING_SIZE); + WARNING(MSG_JOB_REAPINGJOBXPTFCOMPLAINSY_SS, job_get_id_string(job_id, ja_task_id, pe_task_id, &dstr_error), + ptf_errstr(ptf_error)); } else { if (usage) { lXchgList(jr, JR_usage, &usage); @@ -465,11 +470,10 @@ static int clean_up_job(lListElem *jr, int failed, int shepherd_exit_status, DTRACE; - /* job to core binding: on Solaris the processor set have to be deleted + /* job to core binding: on Solaris the processor set have to be deleted and the cores have to be freed */ binding = get_conf_val("binding"); - clean_up_binding(binding); /* @@ -542,8 +546,7 @@ static int clean_up_job(lListElem *jr, int failed, int shepherd_exit_status, } /* look for error file this overrules errors found yet */ - sge_get_active_job_file_path(&fname, - job_id, ja_task_id, pe_task_id, "error"); + sge_get_active_job_file_path(&fname, job_id, ja_task_id, pe_task_id, "error"); if 
((fp = fopen(sge_dstring_get_string(&fname), "r"))) { int n; char *new_line; @@ -630,8 +633,7 @@ static int clean_up_job(lListElem *jr, int failed, int shepherd_exit_status, * If the job finishes, the shepherd must remove the "checkpointed" file */ - sge_get_active_job_file_path(&fname, - job_id, ja_task_id, pe_task_id, "checkpointed"); + sge_get_active_job_file_path(&fname, job_id, ja_task_id, pe_task_id, "checkpointed"); ckpt_arena = 1; /* 1 job will be restarted in case of failure * * 2 job will be restarted from ckpt arena */ if (!SGE_STAT(sge_dstring_get_string(&fname), &statbuf)) { @@ -647,8 +649,7 @@ static int clean_up_job(lListElem *jr, int failed, int shepherd_exit_status, FCLOSE_IGNORE_ERROR(fp); } - sge_get_active_job_file_path(&fname, job_id, ja_task_id, pe_task_id, - "job_pid"); + sge_get_active_job_file_path(&fname, job_id, ja_task_id, pe_task_id, "job_pid"); if (!SGE_STAT(sge_dstring_get_string(&fname), &statbuf)) { if ((fp = fopen(sge_dstring_get_string(&fname), "r"))) { if (!fscanf(fp, sge_u32 , &job_pid)) @@ -847,7 +848,19 @@ void remove_acked_job_exit(u_long32 job_id, u_long32 ja_task_id, const char *pe_ if (execd_get_job_ja_task(job_id, ja_task_id, &jep, &jatep, false)) { lListElem *master_q; int used_slots; - + + // We get here through an ACK_JOB_EXIT. + // If it is not a pe task we are removing, then it is the master task. + // If there are still pe tasks running, we may not yet remove the master task, + // as we would delete the pe tasks' active_job directories and other data. + // Just return, the ACK_JOB_EXIT will be repeated by sge_qmaster. 
+ if (pe_task_id == nullptr && + lGetNumberOfElem(lGetList(jatep, JAT_task_list)) > 0) { + // flush the job report to speed up repeating of the ACK_JOB_EXIT + flush_job_report(jr); + DRETURN_VOID; + } + DPRINTF("REMOVING WITH jep && jatep\n"); if (pe_task_id_str) { petep = lGetElemStrRW(lGetList(jatep, JAT_task_list), PET_id, pe_task_id_str); @@ -912,7 +925,7 @@ void remove_acked_job_exit(u_long32 job_id, u_long32 ja_task_id, const char *pe_ DPRINTF("%s: used slots decreased to %d\n", lGetString(master_q, QU_full_name), used_slots); } - /* + /* * when the job finished / the last task of a pe job finished * delete the tmpdir for this job * the slave container of a tightly integrated parallel job @@ -1310,6 +1323,9 @@ clean_up_old_jobs(bool startup) { // Do early exit: // - if cleanup was already done // - if there are no jobs to process (0 jobs or only simulated jobs) + // @todo do we actually want to do the cleanup if there are (only) running jobs? + // and even with finished jobs, cleanup could get between the job having finished and the ack from qmaster + // triggering deletion of the active job directory DRETURN(true); } @@ -1689,6 +1705,8 @@ read_dusage(lListElem *jr, const char *jobdir, u_long32 jobid, u_long32 jataskid static void build_derived_final_usage(lListElem *jr, u_long32 job_id, u_long32 ja_task_id, const char *pe_task_id) { + DENTER(TOP_LAYER); + const lList *usage_list; double ru_cpu, pdc_cpu; double cpu, r_cpu, @@ -1701,8 +1719,6 @@ static void build_derived_final_usage(lListElem *jr, u_long32 job_id, u_long32 j bool accounting_summary = false; - DENTER(TOP_LAYER); - usage_list = lGetList(jr, JR_usage); /* cpu = MAX(sum of "ru_utime" and "ru_stime" , PDC "cpu" usage) */ @@ -1819,6 +1835,7 @@ static void build_derived_final_usage(lListElem *jr, u_long32 job_id, u_long32 j } } } else { + // @todo What sense does this code make? We fetched the values from the jr->usage list and here write them back? 
add_usage(jr, USAGE_ATTR_CPU, nullptr, cpu); add_usage(jr, USAGE_ATTR_MEM, nullptr, mem); add_usage(jr, USAGE_ATTR_IO, nullptr, io); @@ -2053,6 +2070,9 @@ reaper_sendmail(lListElem *jep, lListElem *jr) { * This function sets the slave job to status JEXITING, * if all pe tasks already exited, it triggers sending * of the final slave job report. +* The ACK_SIGNAL_SLAVE is repeated by sge_qmaster until all +* slave tasks have finished and sge_execd sent the final +* report for this job. * * INPUTS * u_long32 job_id - job id of the slave job diff --git a/source/daemons/qmaster/sge_follow.cc b/source/daemons/qmaster/sge_follow.cc index 1d20d72aa7..ab06ec0b53 100644 --- a/source/daemons/qmaster/sge_follow.cc +++ b/source/daemons/qmaster/sge_follow.cc @@ -281,6 +281,7 @@ sge_follow_order(lListElem *ep, char *ruser, char *rhost, lList **topp, monitori if (enrolled_task) { job_unenroll(jep, nullptr, &jatp); } + // @todo add more info, e.g. in which state / status the job is ERROR(MSG_ORD_TWICE_UU, job_number, task_number); DRETURN(-1); } @@ -614,7 +615,7 @@ sge_follow_order(lListElem *ep, char *ruser, char *rhost, lList **topp, monitori lSetObject(jatp, JAT_pe_object, lCopyElem(pe)); } - // @todo: can this be summaized with the mod event that will set the job in t-state? + // @todo: can this be summarized with the mod event that will set the job in t-state? sge_add_event(now, sgeE_JATASK_ADD, job_number, task_number, nullptr, nullptr, lGetString(jep, JB_session), jatp, gdi_session); @@ -1209,6 +1210,12 @@ sge_follow_order(lListElem *ep, char *ruser, char *rhost, lList **topp, monitori DPRINTF("%s %s usage updating with %d jobs\n", MSG_OBJ_PRJ, up_name, lGetNumberOfElem(lGetList(up_order, PR_debited_job_usage))); + if (!(up = prj_list_locate(master_project_list, up_name))) { + /* order contains reference to unknown user/prj object */ + continue; + } + + // @todo does the version have a meaning here? 
if ((pos = lGetPosViaElem(up_order, PR_version, SGE_NO_ABORT)) >= 0 && (lGetPosUlong(up_order, pos) != lGetUlong(up, PR_version))) { /* order contains update for outdated user/project usage */ diff --git a/source/daemons/shepherd/CMakeLists.txt b/source/daemons/shepherd/CMakeLists.txt index 908d8c4d48..85ac21e905 100644 --- a/source/daemons/shepherd/CMakeLists.txt +++ b/source/daemons/shepherd/CMakeLists.txt @@ -21,6 +21,7 @@ # source/daemons/shepherd add_executable(sge_shepherd builtin_starter.cc + ocs_shepherd_systemd.cc setjoblimit.cc setrlimits.cc sge_fileio.cc diff --git a/source/daemons/shepherd/builtin_starter.cc b/source/daemons/shepherd/builtin_starter.cc index 65796470aa..3fa1c6dbd1 100644 --- a/source/daemons/shepherd/builtin_starter.cc +++ b/source/daemons/shepherd/builtin_starter.cc @@ -47,6 +47,7 @@ #include "uti/sge_unistd.h" #include "uti/sge_arch.h" #include "uti/config_file.h" +#include "uti/sge_time.h" #include "uti/sge_uidgid.h" #include "setosjobid.h" @@ -55,6 +56,7 @@ #include "builtin_starter.h" #include "err_trace.h" +#include "ocs_shepherd_systemd.h" #include "setrlimits.h" #include "get_path.h" #include "basis_types.h" @@ -65,6 +67,7 @@ #define MAX_NUMBER_OF_ENV_VARS 1023 extern bool g_new_interactive_job_support; + extern int g_noshell; extern int g_newpgrp; @@ -116,9 +119,9 @@ static int count_command(char *command) { } /************************************************************************ - This is the shepherds buitin starter. + This is the shepherds builtin starter. - It is also used to start the external starter command .. + It is also used to start the external starter command. 
************************************************************************/ void son(const char *childname, char *script_file, int truncate_stderr_out) { @@ -160,7 +163,7 @@ void son(const char *childname, char *script_file, int truncate_stderr_out) const char *fs_stdin_file=""; const char *fs_stdout_file=""; const char *fs_stderr_file=""; - pid_t pid, pgrp, newpgrp; + pid_t pgrp, newpgrp; gid_t add_grp_id = 0; gid_t gid; struct passwd *pw=nullptr; @@ -173,11 +176,13 @@ void son(const char *childname, char *script_file, int truncate_stderr_out) foreground = 0; /* VX sends SIGTTOU if trace messages go to foreground */ /* From here only the son --------------------------------------*/ - if (!script_file) { + if (script_file == nullptr) { /* output error and exit */ shepherd_error(1, "received nullptr als script file"); } + pid_t pid = getpid(); + /* ** interactive jobs have script_file name interactive and ** as exec_file the configuration value for xterm @@ -219,7 +224,6 @@ void son(const char *childname, char *script_file, int truncate_stderr_out) } } - pid = getpid(); pgrp = GETPGRP; #ifdef SOLARIS @@ -295,7 +299,7 @@ void son(const char *childname, char *script_file, int truncate_stderr_out) umask(022); - if (!strcmp(childname, "job")) { + if (strcmp(childname, "job") == 0) { char *write_osjob_id = get_conf_val("write_osjob_id"); if(write_osjob_id != nullptr && atoi(write_osjob_id) != 0) { setosjobid(newpgrp, &add_grp_id, pw); @@ -303,7 +307,8 @@ void son(const char *childname, char *script_file, int truncate_stderr_out) } shepherd_trace("setting limits"); - setrlimits(!strcmp(childname, "job")); + setrlimits(strcmp(childname, "job") == 0); + ocs::move_shepherd_child_to_job_scope(pid); shepherd_trace("setting environment"); sge_set_environment(); @@ -340,7 +345,7 @@ void son(const char *childname, char *script_file, int truncate_stderr_out) /* --- switch to intermediate user */ shepherd_trace("switching to intermediate/target user"); - if(is_qlogin_starter && 
!g_new_interactive_job_support) { + if (is_qlogin_starter && !g_new_interactive_job_support) { /* * In the old IJS, we didn't have to set the additional group id, * because our custom rshd did it for us. @@ -630,7 +635,7 @@ void son(const char *childname, char *script_file, int truncate_stderr_out) shepherd_error(1, "error: fd for in is not 0"); } - if(!is_qlogin_starter) { + if (!is_qlogin_starter) { /* -cwd or from pw->pw_dir */ if (sge_chdir(cwd)) { shepherd_state = SSTATE_NO_CWD; @@ -707,10 +712,11 @@ void son(const char *childname, char *script_file, int truncate_stderr_out) /* get basename of shell for building argv[0] */ cp = strrchr(shell_path, '/'); - if (!cp) + if (cp == nullptr) { shell_basename = shell_path; - else + } else { shell_basename = cp+1; + } { SGE_STRUCT_STAT sbuf; @@ -749,7 +755,7 @@ void son(const char *childname, char *script_file, int truncate_stderr_out) snprintf(str_title, sizeof(str_title), "SGE Interactive Job %s on %s in Queue %s", job_id, host, queue); } -/* ---- switch to target user */ + /* ---- switch to target user */ if (intermediate_user) { if (is_qlogin_starter) { ret = sge_set_uid_gid_addgrp(target_user, nullptr, 0, 0, 0, @@ -802,7 +808,7 @@ void son(const char *childname, char *script_file, int truncate_stderr_out) } } } - start_command(childname, shell_path, script_file, argv0, shell_start_mode, + start_command(childname, shell_path, script_file, argv0, shell_start_mode, is_interactive, is_qlogin, is_rsh, is_rlogin, str_title, use_starter_method); @@ -1131,25 +1137,14 @@ static char **read_job_args(char **preargs, int extra_args) return args; } - /*-------------------------------------------------------------------- * set_shepherd_signal_mask * set signal mask that shepherd can handle signals from execd + * If use_starter_method is set, then the shellpath contains the starter_method. 
*--------------------------------------------------------------------*/ -void start_command( -const char *childname, -char *shell_path, -char *script_file, -char *argv0, -const char *shell_start_mode, -int is_interactive, -int is_qlogin, -int is_rsh, -int is_rlogin, -const char *str_title, -int use_starter_method /* If this flag is set the shellpath contains the - * starter_method */ -) { +void start_command(const char *childname, char *shell_path, char *script_file, char *argv0, + const char *shell_start_mode, int is_interactive, int is_qlogin, int is_rsh, int is_rlogin, + const char *str_title, int use_starter_method) { char **args; char **pstr; char *pc; @@ -1157,6 +1152,8 @@ int use_starter_method /* If this flag is set the shellpath contains the char **pre_args_ptr; char err_str[2048]; + bool is_the_job = strcmp(childname, "job") == 0; + pre_args_ptr = &pre_args[0]; #if 0 @@ -1177,7 +1174,7 @@ int use_starter_method /* If this flag is set the shellpath contains the */ if ((atoi(get_conf_val("handle_as_binary")) == 1) && (atoi(get_conf_val("no_shell")) == 0) && - !is_rsh && !is_qlogin && !strcmp(childname, "job") && use_starter_method != 1 ) { + !is_rsh && !is_qlogin && is_the_job && use_starter_method != 1 ) { int arg_id = 0; dstring arguments = DSTRING_INIT; int n_job_args; @@ -1310,8 +1307,7 @@ int use_starter_method /* If this flag is set the shellpath contains the /* ** unix_behaviour/raw_exec */ - if (!strcmp(childname, "job")) { - + if (is_the_job) { int arg_id = 0; #if 0 shepherd_trace("Case 7.1: job" ); diff --git a/source/daemons/shepherd/builtin_starter.h b/source/daemons/shepherd/builtin_starter.h index 9d1e67b14a..dd4b70a05d 100644 --- a/source/daemons/shepherd/builtin_starter.h +++ b/source/daemons/shepherd/builtin_starter.h @@ -36,8 +36,10 @@ void son(const char *childname, char *script_file, int truncate_stderr_out); int sge_set_environment(); char** sge_get_environment(); -int sge_set_env_value(const char *, const char *); -const char 
*sge_get_env_value(const char *); -void start_command(const char *, char *, char *, char *, const char *, int, int, int, int, const char *, int); +int sge_set_env_value(const char *name, const char* value); +const char *sge_get_env_value(const char *name); +void start_command(const char *childname, char *shell_path, char *script_file, char *argv0, + const char *shell_start_mode, int is_interactive, int is_qlogin, int is_rsh, int is_rlogin, + const char *str_title, int use_starter_method); int check_configured_method(const char *method, const char *name, char *err_str, size_t err_str_size); char* build_path(int type); diff --git a/source/daemons/shepherd/ocs_shepherd_systemd.cc b/source/daemons/shepherd/ocs_shepherd_systemd.cc new file mode 100644 index 0000000000..e57a4ccbfb --- /dev/null +++ b/source/daemons/shepherd/ocs_shepherd_systemd.cc @@ -0,0 +1,304 @@ +/*___INFO__MARK_BEGIN_NEW__*/ +/*************************************************************************** + * + * Copyright 2025 HPC-Gridware GmbH + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *
+ ***************************************************************************/
+/*___INFO__MARK_END_NEW__*/
+
+#include <cstring>
+#include <string>
+#include <vector>
+
+#include "sgeobj/sge_conf.h"
+
+#include "uti/sge_signal.h"
+#include "uti/sge_string.h"
+#include "uti/sge_time.h"
+#include "uti/sge_uidgid.h"
+
+#include "config_file.h"
+#include "err_trace.h"
+#include "ocs_shepherd_systemd.h"
+
+
+namespace ocs {
+   // Must default to false: shepherd_systemd_init() only assigns the flag when
+   // OCS_WITH_SYSTEMD is defined. With a default of true, builds without systemd
+   // would take the systemd branches in shepherd_signal_job() and the binding code
+   // and skip the classic signalling / CPU binding paths.
+   bool g_use_systemd = false;
+   ocs::uti::SystemdProperties_t g_systemd_properties;
+
+   /**
+    * @brief Initialize the Systemd integration.
+    *
+    * This function checks if Systemd integration is enabled via the configuration
+    * and initializes the Systemd library if it is enabled. It also checks if the
+    * shepherd is running under Systemd control.
+    *
+    * On return g_use_systemd is true only if the shepherd was built with
+    * OCS_WITH_SYSTEMD, the config value enable_systemd is non-zero and the
+    * Systemd library could be initialized.
+    */
+   void shepherd_systemd_init() {
+#if defined (OCS_WITH_SYSTEMD)
+      // we can enable/disable systemd integration via execd_param ENABLE_SYSTEMD
+      const char *enable_systemd = get_conf_val("enable_systemd");
+      if (enable_systemd == nullptr) {
+         // execd didn't write config?
+         g_use_systemd = false;
+      } else {
+         g_use_systemd = std::stoi(enable_systemd) != 0;
+      }
+      if (g_use_systemd) {
+         // try to initialize the Systemd integration,
+         // create an instance of Systemd and try to connect to the system bus,
+         // figure out if we are running as Systemd service
+         DSTRING_STATIC(error_dstr, MAX_STRING_SIZE);
+         if (ocs::uti::Systemd::initialize(ocs::uti::Systemd::shepherd_scope_name, &error_dstr)) {
+            shepherd_trace("initialized systemd library, systemd version %d, cgroups version %d",
+                           ocs::uti::Systemd::get_systemd_version(), ocs::uti::Systemd::get_cgroup_version());
+            if (ocs::uti::Systemd::is_running_as_service()) {
+               // we are running under systemd control
+               shepherd_trace("shepherd is running under systemd control in scope %s",
+                              ocs::uti::Systemd::shepherd_scope_name.c_str());
+            }
+         } else {
+            // initialization failed: fall back to the non-systemd code paths
+            shepherd_trace("initializing systemd library failed: %s", sge_dstring_get_string(&error_dstr));
+            g_use_systemd = false;
+         }
+      } else {
+         shepherd_trace("systemd integration is disabled");
+      }
+#endif
+   }
+
+#if defined (OCS_WITH_SYSTEMD)
+   /**
+    * @brief Adds accounting settings to the systemd properties.
+    *
+    * This function sets CPU and Memory accounting properties for systemd.
+    * If cgroup version 2 is used, it also enables IO accounting.
+    * @note We enable IOAccounting, but we actually do not use the IO values delivered
+    *       in sge_execd for the `io` usage value, see CS-1389.
+    * @note We might want to enable further accounting settings in the future,
+    *       like IPAccounting, see CS-1390.
+    */
+   static void
+   add_accounting_settings() {
+      if (g_use_systemd) {
+         // get usage collection mode from execd_params
+         const char *usage_collection_str = get_conf_val("usage_collection");
+         usage_collection_t usage_collection = USAGE_COLLECTION_DEFAULT;
+         if (usage_collection_str != nullptr) {
+            // the config file stores the mode as a number, convert it back to the enum
+            usage_collection = static_cast<usage_collection_t>(std::stoi(usage_collection_str));
+         }
+         // we enable accounting settings only if we are using systemd for usage collection
+         if (usage_collection == USAGE_COLLECTION_DEFAULT || usage_collection == USAGE_COLLECTION_HYBRID) {
+            g_systemd_properties["CPUAccounting"] = true;
+            g_systemd_properties["MemoryAccounting"] = true;
+            // IOAccounting is requested on cgroup v2 only
+            if (ocs::uti::Systemd::get_cgroup_version() == 2) {
+               g_systemd_properties["IOAccounting"] = true;
+            }
+         }
+      }
+   }
+
+#define DEVICES_DELIMITOR ";"
+#define DEVICES_DEFAULT_MODE "r"
+   // DeviceAllow, array of structs having two strings: device name and access mode: a(ss)
+   // use config file entry devices_allow to specify devices which are allowed
+   // @todo have an execd_params for devices which shall always be allowed?
+   // DevicePolicy, string:
+   // "strict" - no devices allowed except what is specified in DeviceAllow
+   // "closed" - like strict, but also allows /dev/null, /dev/zero, /dev/full, /dev/random, /dev/urandom
+   // "auto" - allows all devices, unless DeviceAllow is set, then it behaves like closed (?)
+   // @todo have an execd_params for this?
+   /**
+    * @brief Adds the DevicePolicy and DeviceAllow settings to the systemd properties.
+    *
+    * Reads the config file entry devices_allow, a DEVICES_DELIMITOR separated list of
+    * "device_name=access_mode" items. If the list is non-empty, DevicePolicy is set
+    * to "closed" and every item is added to DeviceAllow. Items without an access mode
+    * ("device_name" or "device_name=") get DEVICES_DEFAULT_MODE.
+    */
+   static void
+   add_devices_allow() {
+      if (g_use_systemd) {
+         char *devices_allow = get_conf_val("devices_allow");
+         if (devices_allow != nullptr && strlen(devices_allow) > 0) {
+            // switch to closed device policy
+            g_systemd_properties["DevicePolicy"] = "closed";
+            std::vector<ocs::uti::SystemdDevice_t> devices;
+            saved_vars_s *context = nullptr;
+            char *device = sge_strtok_r(devices_allow, DEVICES_DELIMITOR, &context);
+            ocs::uti::SystemdDevice_t systemd_device{};
+            while (device != nullptr) {
+               // device is a string of the form "device_name=access_mode"
+               // where access_mode can contain "r", "w", "rw"
+               char *access_mode = strchr(device, '=');
+               if (access_mode != nullptr) {
+                  *access_mode = '\0'; // split device name and access mode
+                  access_mode++;
+               }
+               if (access_mode == nullptr || *access_mode == '\0') {
+                  // neither "device_name" nor "device_name=" specify a mode
+                  shepherd_trace("no mode specified for device %s, using \"%s\" as default",
+                                 device, DEVICES_DEFAULT_MODE);
+                  systemd_device.second = DEVICES_DEFAULT_MODE; // default access mode
+               } else {
+                  systemd_device.second = access_mode;
+               }
+               systemd_device.first = device; // device name
+               shepherd_trace("adding device %s with access mode %s to systemd properties DeviceAllow",
+                              systemd_device.first.c_str(), systemd_device.second.c_str());
+               devices.push_back(systemd_device);
+
+               // optionally next device
+               device = sge_strtok_r(nullptr, DEVICES_DELIMITOR, &context);
+            }
+            // the tokens were copied into systemd_device, release the tokenizer context
+            sge_free_saved_vars(context);
+            g_systemd_properties["DeviceAllow"] = devices;
+         }
+      }
+   }
+#endif
+
+   /**
+    * @brief Moves the shepherd child process to the job scope.
+    *
+    * This function is used to move the shepherd child process (the job) to the job scope
+    * in systemd. It should be called before switching to the job user, when we can still become
+    * the start user (root).
+    *
+    * @param pid - The PID of the shepherd child process to move.
+    *
+    * @note This function is only applicable when systemd integration is enabled.
+    */
+   void
+   move_shepherd_child_to_job_scope(pid_t pid) {
+      // move the shepherd child to the job scope
+      // we do this only for the job, not for prolog, epilog, pe_start, pe_stop
+#if defined (OCS_WITH_SYSTEMD)
+      if (g_use_systemd) {
+         DSTRING_STATIC(error_dstr, MAX_STRING_SIZE);
+
+         // complete g_systemd_properties before the scope gets created with them
+         add_accounting_settings();
+         add_devices_allow();
+
+         // the traced duration below covers config lookup, connect and scope creation
+         const u_long64 start_time = sge_get_gmt64();
+         const char *slice = get_conf_val("systemd_slice");
+         const char *scope = get_conf_val("systemd_scope");
+         if (slice != nullptr && scope != nullptr) {
+            ocs::uti::Systemd systemd;
+            // connect as start user and switch back to the admin user right afterwards
+            sge_switch2start_user();
+            bool connected = systemd.connect(&error_dstr);
+            sge_switch2admin_user();
+            if (connected) {
+               shepherd_trace("moving shepherd child " pid_t_fmt " to job scope '%s' in slice '%s'", pid, scope, slice);
+               // output argument of create_scope_with_pid(), not evaluated here
+               bool scope_already_exists = false;
+               bool success = systemd.create_scope_with_pid(scope, slice, g_systemd_properties, pid, scope_already_exists, &error_dstr);
+               if (success) {
+                  shepherd_trace("moving shepherd child took " sge_u64 " µs", sge_get_gmt64() - start_time);
+               } else {
+                  shepherd_error(1, "moving shepherd child to job scope failed: %s", sge_dstring_get_string(&error_dstr));
+               }
+            } else {
+               // we treat a connect-error as fatal, connecting worked before
+               shepherd_error(1, "connecting to systemd failed: %s", sge_dstring_get_string(&error_dstr));
+            }
+         } else {
+            shepherd_error(1, "systemd_slice and/or systemd_scope missing in config file, cannot move shepherd child to job scope");
+         }
+      }
+#endif
+}
+
+
+   /**
+    * @brief Adds the CPU binding to the systemd properties.
+    *
+    * Fills in a vector of uint8_t with the CPU mask as bits and adds it to
+    * the systemd properties under the key "AllowedCPUs".
+    *
+    * @param cpuset - cpuset from hwloc which contains the CPU binding (logical cpus).
+    *
+    * @note An infinitely set cpuset cannot be expressed as a bit mask; it is traced
+    *       and ignored. An empty cpuset results in a mask consisting of one zero byte.
+    */
+#if defined(OCS_HWLOC)
+   void
+   add_binding_to_systemd_properties(const hwloc_const_bitmap_t cpuset) {
+      if (g_use_systemd) {
+         // hwloc_bitmap_weight() reports -1 for an infinitely set bitmap; iterating over
+         // such a bitmap would never terminate and write beyond the end of the mask
+         if (hwloc_bitmap_weight(cpuset) < 0) {
+            shepherd_trace("cpuset is infinitely set, not adding AllowedCPUs");
+            return;
+         }
+
+         // hwloc_bitmap_last() reports -1 for an empty bitmap, we then use a single zero byte
+         const int last_cpu = hwloc_bitmap_last(cpuset);
+
+         // create a vector of uint8_t containing the CPU mask as bits
+         std::vector<uint8_t> cpu_mask((last_cpu < 0 ? 0 : last_cpu) / 8 + 1, 0);
+
+         unsigned i;
+         hwloc_bitmap_foreach_begin(i, cpuset) {
+            shepherd_trace("adding CPU %u to AllowedCPUs", i);
+            // byte i/8 holds CPUs 8*(i/8) .. 8*(i/8)+7, least significant bit first
+            cpu_mask[i / 8] |= static_cast<uint8_t>(1u << (i % 8));
+         }
+         hwloc_bitmap_foreach_end();
+
+         g_systemd_properties["AllowedCPUs"] = cpu_mask;
+      }
+   }
+#endif
+
+   /**
+    * @brief Signals a job in the systemd scope.
+    *
+    * This function sends a signal to the job running in the systemd scope.
+    * It handles different signals like SIGKILL, SIGSTOP, and SIGCONT and uses the appropriate systemd methods.
+    * It differentiates between signaling the entire job scope or just the main process of the job.
+    *
+    * @param signal - The signal to send to the job.
+    * @param only_main - If true, only the main process of the job is signaled.
+    */
+   void shepherd_systemd_signal_job(int signal, bool only_main) {
+#if defined(OCS_WITH_SYSTEMD)
+      // Signaling via systemd
+      // - Need the scope name
+      // - StopUnit, when the job shall be killed
+      //    - Mode should probably be "replace", in case we get multiple kill signals from execd
+      //    - It will return a job! Need to wait for it to finish.
+      // - KillUnit, for arbitrary signals
+      //    - attrib "who" should be "all" to kill all processes in the scope
+      // - FreezeUnit, for suspending the job
+      // - ThawUnit, for resuming the job
+
+      // without the scope name there is nothing we can address, same check as in
+      // move_shepherd_child_to_job_scope(); not fatal as signals are repeated
+      const char *scope = get_conf_val("systemd_scope");
+      if (scope == nullptr) {
+         shepherd_error(0, "systemd_scope missing in config file, cannot signal job via systemd");
+         return;
+      }
+
+      ocs::uti::Systemd systemd;
+      DSTRING_STATIC(error_dstr, MAX_STRING_SIZE);
+      // connect as start user and switch back to the admin user right afterwards
+      sge_switch2start_user();
+      bool connected = systemd.connect(&error_dstr);
+      sge_switch2admin_user();
+      if (connected) {
+         // systemd 250+ supports freeze/thaw for cgroup v2,
+         // otherwise (older versions or cgroup v1) we use KillUnit
+         const bool use_freezer = systemd.get_systemd_version() >= 250 && systemd.get_cgroup_version() == 2;
+         bool success;
+
+         switch (signal) {
+            case SIGKILL:
+               // killing the job stops the whole scope, only_main is not evaluated
+               success = systemd.stop_unit(scope, &error_dstr);
+               break;
+            case SIGSTOP:
+               if (use_freezer) {
+                  success = systemd.freeze_unit(scope, &error_dstr);
+               } else {
+                  success = systemd.signal_unit(scope, signal, only_main, &error_dstr);
+               }
+               break;
+            case SIGCONT:
+               if (use_freezer) {
+                  success = systemd.thaw_unit(scope, &error_dstr);
+               } else {
+                  success = systemd.signal_unit(scope, signal, only_main, &error_dstr);
+               }
+               break;
+            default:
+               success = systemd.signal_unit(scope, signal, only_main, &error_dstr);
+               break;
+         }
+
+         if (!success) {
+            shepherd_trace("signalling job in systemd scope '%s' failed: %s", scope, sge_dstring_get_string(&error_dstr));
+         } else {
+            shepherd_trace("signalled job in systemd scope '%s' with signal %s", scope, sge_sys_sig2str(signal));
+         }
+      } else {
+         // error, but do not exit shepherd - signals are repeated, next time might work
+         shepherd_error(0, "connecting to systemd failed: %s", sge_dstring_get_string(&error_dstr));
+      }
+#endif
+   }
+} // namespace ocs
diff --git a/source/daemons/shepherd/ocs_shepherd_systemd.h b/source/daemons/shepherd/ocs_shepherd_systemd.h
new file mode 100644
index 0000000000..97ff9b7677
--- /dev/null
+++ b/source/daemons/shepherd/ocs_shepherd_systemd.h @@ -0,0 +1,36 @@ +#pragma once +/*___INFO__MARK_BEGIN_NEW__*/ +/*************************************************************************** + * + * Copyright 2025 HPC-Gridware GmbH + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************/ +/*___INFO__MARK_END_NEW__*/ + +#include "uti/ocs_Systemd.h" +#include "uti/ocs_topology.h" + +namespace ocs { + extern bool g_use_systemd; + extern ocs::uti::SystemdProperties_t g_systemd_properties; + + void shepherd_systemd_init(); + void move_shepherd_child_to_job_scope(pid_t pid); + + void shepherd_systemd_signal_job(int signal, bool only_main); +#if defined(OCS_HWLOC) + void add_binding_to_systemd_properties(hwloc_const_bitmap_t cpuset); +#endif +} // namespace ocs diff --git a/source/daemons/shepherd/setrlimits.cc b/source/daemons/shepherd/setrlimits.cc index ad0b62e487..a0349b40c7 100644 --- a/source/daemons/shepherd/setrlimits.cc +++ b/source/daemons/shepherd/setrlimits.cc @@ -50,16 +50,18 @@ #define FORMAT_LIMIT(x) (x==RLIMIT_INFINITY)?0:x, (x==RLIMIT_INFINITY)?"\bINFINITY":"" #include "basis_types.h" +#include "ocs_shepherd_systemd.h" #include "setrlimits.h" #include "err_trace.h" #include "setjoblimit.h" +#include "uti/ocs_Systemd.h" #include "uti/sge_parse_num_par.h" #include "uti/config_file.h" #include "uti/sge_uidgid.h" #include "uti/sge_os.h" #include "sgeobj/sge_conf.h" -static 
void pushlimit(int, struct RLIMIT_STRUCT_TAG *, int trace_rlimit); +static void pushlimit(int, struct RLIMIT_STRUCT_TAG *, bool trace_rlimit); static int get_resource_info(u_long32 resource, const char **name, int *resource_type); @@ -113,7 +115,7 @@ static int sge_parse_limit(sge_rlim_t *rlvalp, char *s, char *error_str, return 1; } -void setrlimits(int trace_rlimit) { +void setrlimits(bool trace_rlimit) { sge_rlim_t s_cpu, s_cpu_is_consumable_job; sge_rlim_t h_cpu, h_cpu_is_consumable_job; @@ -267,7 +269,11 @@ void setrlimits(int trace_rlimit) { h_stack = RL_MIN(h_stack, h_vmem);*/ priority = atoi(get_conf_val("priority")); - /* had problems doing this with admin user priviledges under HPUX */ + // We might need root privileges to set the nice value, depending on the soft RLIMIT_NICE, + // see man page setpriority.2: + // Traditionally, only a privileged process could lower the nice value (i.e., set a higher priority). + // However, since Linux 2.6.12, an unprivileged process can decrease the nice value of a target process + // that has a suitable RLIMIT_NICE soft limit; see getrlimit(2) for details. sge_switch2start_user(); SETPRIORITY(priority); sge_switch2admin_user(); @@ -412,6 +418,27 @@ void setrlimits(int trace_rlimit) { rlp.rlim_max = h_rss; pushlimit(RLIMIT_RSS, &rlp, trace_rlimit); # endif + + // add systemd limits + if (ocs::g_use_systemd) { + // We apply the minimum of rss and vmem limits. + // Systemd doesn't have vmem limits. + // So we either apply the (usually lower) rss limit + // or the (usually higher) vmem limit to ensure the job does not exceed with rss at least + // a given vmem (total memory) limit. 
+ sge_rlim_t h_memory = RL_MIN(h_rss, h_vmem); + sge_rlim_t s_memory = RL_MIN(s_rss, s_vmem); + if (h_memory != RLIM_INFINITY) { + shepherd_trace("SYSTEMD MemoryMax = " sge_u64, h_memory); + // make sure that we have the right datatype for the std::variant + ocs::g_systemd_properties["MemoryMax"] = reinterpret_cast(h_memory); + } + if (s_memory != RLIM_INFINITY) { + // make sure that we have the right datatype for the std::variant + shepherd_trace("SYSTEMD MemoryHigh = " sge_u64, s_memory); + ocs::g_systemd_properties["MemoryHigh"] = reinterpret_cast(s_memory); + } + } } /* *INDENT-OFF* */ @@ -473,7 +500,7 @@ static int get_resource_info(u_long32 resource, const char **name, } static void pushlimit(int resource, struct RLIMIT_STRUCT_TAG *rlp, - int trace_rlimit) + bool trace_rlimit) { const char *limit_str; char trace_str[1024]; diff --git a/source/daemons/shepherd/setrlimits.h b/source/daemons/shepherd/setrlimits.h index 4395399f27..a141973705 100644 --- a/source/daemons/shepherd/setrlimits.h +++ b/source/daemons/shepherd/setrlimits.h @@ -32,6 +32,7 @@ * ************************************************************************/ /*___INFO__MARK_END__*/ + #define RES_PROC 1 #define RES_JOB 2 #define RES_BOTH (RES_PROC|RES_JOB) @@ -52,4 +53,4 @@ struct resource_table_entry { int resource_type[2]; }; -void setrlimits(int trace_limits); +void setrlimits(bool trace_limits); diff --git a/source/daemons/shepherd/sge_fileio.cc b/source/daemons/shepherd/sge_fileio.cc index 6f05e136b8..b43c183749 100644 --- a/source/daemons/shepherd/sge_fileio.cc +++ b/source/daemons/shepherd/sge_fileio.cc @@ -46,6 +46,7 @@ #include "err_trace.h" #include "execution_states.h" #include "msg_common.h" +#include "sge.h" bool shepherd_write_pid_file(pid_t pid, dstring *errmsg) { @@ -216,29 +217,7 @@ shepherd_write_sig_info_file(const char *filename, const char *task_id, u_long32 return false; } - -bool shepherd_write_osjobid_file(const char *osjobid) -{ - bool ret = true; - const char *const 
filename = "osjobid"; - FILE *fp = nullptr; - - fp = fopen(filename, "w"); - if (fp != nullptr) { - FPRINTF((fp, "%s\n", osjobid)); - FCLOSE(fp); - } else { - shepherd_error(1, MSG_FILE_NOOPEN_SS, filename, strerror(errno)); - ret = false; - } - return ret; -FPRINTF_ERROR: -FCLOSE_ERROR: - shepherd_error(1, MSG_FILE_NOCLOSE_SS, filename, strerror(errno)); - return false; -} - -bool +bool shepherd_write_processor_set_number_file(int proc_set) { bool ret = true; @@ -398,6 +377,3 @@ checkpointed_file_exists() SGE_STRUCT_STAT buf; return !SGE_STAT("checkpointed", &buf); } - - - diff --git a/source/daemons/shepherd/sge_fileio.h b/source/daemons/shepherd/sge_fileio.h index f87171a9be..cd9ea56fc4 100644 --- a/source/daemons/shepherd/sge_fileio.h +++ b/source/daemons/shepherd/sge_fileio.h @@ -47,10 +47,7 @@ shepherd_write_usage_file(u_long32 wait_status, int exit_status, bool shepherd_write_job_pid_file(const char *job_pid); -bool -shepherd_write_osjobid_file(const char *osjobid); - -bool +bool shepherd_write_shepherd_about_to_exit_file(); bool diff --git a/source/daemons/shepherd/shepherd.cc b/source/daemons/shepherd/shepherd.cc index 352febdfba..7ba6224a9f 100644 --- a/source/daemons/shepherd/shepherd.cc +++ b/source/daemons/shepherd/shepherd.cc @@ -89,6 +89,7 @@ # include #endif +#include "ocs_shepherd_systemd.h" #include "sge_ijs_threads.h" #include "sge_ijs_comm.h" #include "sge_shepherd_ijs.h" @@ -109,6 +110,8 @@ #include "execution_states.h" #include "msg_common.h" +#include "uti/ocs_Systemd.h" + #if defined(SOLARIS) /* wait3() prototype only available if _XOPEN_SOURCE_EXTENDED is defined */ pid_t wait3(int *, int, struct rusage *); @@ -177,7 +180,6 @@ static pid_t start_token_cmd(int wait_for_finish, const char *cmd, /* overridable control methods */ static void verify_method(const char *method_name); -void shepherd_signal_job(pid_t pid, int sig); /* signal functions */ static void signal_handler(int signal); @@ -695,8 +697,7 @@ int main(int argc, char **argv) int 
ckpt_type; int return_code = 0; int run_epilog, run_pe_stop; - dstring ds; - char buffer[256]; + DSTRING_STATIC(ds, 256); if (argc >= 2) { if ( strcmp(argv[1],"-help") == 0) { @@ -706,8 +707,6 @@ int main(int argc, char **argv) } shepherd_trace_init( ); - sge_dstring_init(&ds, buffer, sizeof(buffer)); - shepherd_trace("shepherd called with uid = " uid_t_fmt ", euid = " uid_t_fmt, getuid(), geteuid()); @@ -853,6 +852,8 @@ int main(int argc, char **argv) script_timeout = atoi(get_conf_val("script_timeout")); notify = atoi(get_conf_val("notify")); + ocs::shepherd_systemd_init(); + /* * Create processor set */ @@ -1181,10 +1182,11 @@ int ckpt_type /* Write pid to job_pid file and set ckpt_pid to original job pid * Kill job if we can't write job_pid file and exit with error - * sets ckpt_pid to 0 for non kernel level checkpointing jobs + * sets ckpt_pid to 0 for non-kernel level checkpointing jobs */ - if (!strcmp(childname, "job")) + if (strcmp(childname, "job") == 0) { handle_job_pid(ckpt_info.type, pid, &(ckpt_info.pid)); + } /* Does not affect pe/prolog/epilog etc. since ckpt_type is set to 0 */ set_ckpt_params(ckpt_info.type, @@ -2755,7 +2757,7 @@ static void handle_job_pid(int ckpt_type, int pid, int *ckpt_pid) shepherd_signal_job(pid, SIGKILL); shepherd_error(1, "can't write \"job_pid\" file"); } -} +} /*-------------------------------------------------------------------------*/ static int start_async_command(const char *descr, char *cmd) @@ -2792,8 +2794,13 @@ static int start_async_command(const char *descr, char *cmd) shepherd_trace("starting %s command: %s", descr, cmd); pid = getpid(); - setpgid(pid, pid); + setpgid(pid, pid); + setrlimits(0); + + // @todo if we want to account prolog etc. 
to the job, then we need to move the child process into the job scope + // move_shepherd_child_to_job_scope(pid); + sge_set_environment(); umask(022); tmp_str = search_conf_val("qsub_gid"); @@ -2835,7 +2842,7 @@ static int start_async_command(const char *descr, char *cmd) sge_set_def_sig_mask(nullptr, nullptr); start_command(descr, get_conf_val("shell_path"), - cmd, cmd, "start_as_command", 0, 0, 0, 0, "", 0); + cmd, cmd, "start_as_command", 0, 0, 0, 0, "", 0); return 0; } @@ -2881,82 +2888,94 @@ void shepherd_signal_job(pid_t pid, int sig) { /* * Normal signaling for OSes without reliable grouping mechanisms and if - * special signaling fails (e.g. not running as root) + * special signaling fails (e.g., not running as root) */ /* - * if child is a qrsh job (config rsh_daemon exists), get pid of started command - * and pass signal to that one + * if the child is a qrsh job (config rsh_daemon exists), get pid of started command + * and pass signal to that one, * if the signal is the kill signal, we first kill the pid of the started command. * subsequent kills are passed to the shepherd's child. + * In this case (first_kill && sig == SIGKILL) we do not kill via systemd. 
*/ - { - static int first_kill = 1; - static time_t first_kill_ts = 0; - static bool is_qrsh = false; - - if (first_kill == 1 || sig != SIGKILL) { - if (search_conf_val("qrsh_pid_file") != nullptr) { - char *pid_file_name = nullptr; - pid_t qrsh_pid = 0; + static int first_kill = 1; // first time we signal with SIGKILL + static time_t first_kill_ts = 0; + static bool is_qrsh = false; - pid_file_name = get_conf_val("qrsh_pid_file"); + if (first_kill == 1 || sig != SIGKILL) { + if (search_conf_val("qrsh_pid_file") != nullptr) { + char *pid_file_name = nullptr; + pid_t qrsh_pid = 0; - sge_switch2start_user(); + pid_file_name = get_conf_val("qrsh_pid_file"); - if (shepherd_read_qrsh_file(pid_file_name, &qrsh_pid)) { - is_qrsh = true; - pid = -qrsh_pid; - shepherd_trace("found pid of qrsh client command: " pid_t_fmt, pid); - } - sge_switch2admin_user(); + sge_switch2start_user(); + bool read_qrsh_file = shepherd_read_qrsh_file(pid_file_name, &qrsh_pid); + sge_switch2admin_user(); + + if (read_qrsh_file) { + is_qrsh = true; + pid = -qrsh_pid; + shepherd_trace("found pid of qrsh client command: " pid_t_fmt, pid); } } + } - /* - * It is possible that one signal requests from qmaster contains several - * kills for the same process. If this process is a tight integrated job - * the master task can be killed twice. For the slave tasks this means the - * qrsh -d is killed in the same time as the qrsh_starter child and so no - * qrsh_exit_code file is written (see Issue: 1679) - */ - if ((first_kill == 1) || (time(nullptr) - first_kill_ts > 10) || (sig != SIGKILL)) { - shepherd_trace("now sending signal %s to pid " pid_t_fmt, sge_sys_sig2str(sig), pid); - sge_switch2start_user(); - kill(pid, sig); - sge_switch2admin_user(); + /* + * It is possible that one signal requests from qmaster contains several + * kills for the same process. If this process is a tightly integrated job, + * the master task can be killed twice. 
For the slave tasks, this means the + * qrsh -inherit is killed at the same time as the qrsh_starter child and so no + * qrsh_exit_code file is written (see Issue: 1679) + */ + if (first_kill == 1 || time(nullptr) - first_kill_ts > 10 || sig != SIGKILL) { + // We always signal the pid / process group first via the kill method. + // In addition, we might signal with systemd or via additional group id. + shepherd_trace("now sending signal %s to pid " pid_t_fmt, sge_sys_sig2str(sig), pid); + sge_switch2start_user(); + kill(pid, sig); + sge_switch2admin_user(); #if defined(SOLARIS) || defined(LINUX) || defined(FREEBSD) || defined(DARWIN) - if (first_kill == 0 || sig != SIGKILL || !is_qrsh) { -# if defined(SOLARIS) || defined(LINUX) || defined(FREEBSD) || defined(DARWIN) -# ifdef COMPILE_DC - if (atoi(get_conf_val("enable_addgrp_kill")) == 1) { - gid_t add_grp_id; - char *cp = search_conf_val("add_grp_id"); - - if (cp) { - add_grp_id = atol(cp); - } else { - add_grp_id = 0; - } - - shepherd_trace("pdc_kill_addgrpid: %d %d", (int) add_grp_id , sig); - sge_switch2start_user(); - pdc_kill_addgrpid(add_grp_id, sig, shepherd_trace); - sge_switch2admin_user(); - } -# endif -# endif + if (first_kill == 0 || sig != SIGKILL || !is_qrsh) { + if (ocs::g_use_systemd) { + pid_t systemd_pid = pid; + // for all the nofify signals we only signal the main process (usually: the job script) + // it will handle e.g., a SIGUSR1, child processes usually not + if (sig != SIGKILL && sig != SIGSTOP && sig != SIGCONT) { + systemd_pid = abs(systemd_pid); + } + ocs::shepherd_systemd_signal_job(sig, systemd_pid > 0); + } else { +# ifdef COMPILE_DC + if (atoi(get_conf_val("enable_addgrp_kill")) == 1) { + gid_t add_grp_id; + char *cp = search_conf_val("add_grp_id"); + if (cp != nullptr) { + add_grp_id = atol(cp); + } else { + add_grp_id = 0; + } + if (add_grp_id == 0) { + shepherd_trace("add_grp_id is 0, not calling pdc_kill_addgrpid"); + } else { + shepherd_trace("pdc_kill_addgrpid: %d %d", (int) 
add_grp_id, sig); + sge_switch2start_user(); + pdc_kill_addgrpid(add_grp_id, sig, shepherd_trace); + sge_switch2admin_user(); + } + } +# endif } + } # endif - } else { - shepherd_trace("ignored signal %s to pid " pid_t_fmt, sge_sys_sig2str(sig), pid); - } + } else { + shepherd_trace("ignored signal %s to pid " pid_t_fmt, sge_sys_sig2str(sig), pid); + } - if (sig == SIGKILL) { - first_kill = 0; - first_kill_ts = time(nullptr); - } + if (sig == SIGKILL) { + first_kill = 0; + first_kill_ts = time(nullptr); } } diff --git a/source/daemons/shepherd/shepherd_binding.cc b/source/daemons/shepherd/shepherd_binding.cc index 4d8dc478d7..13cf7d2c22 100644 --- a/source/daemons/shepherd/shepherd_binding.cc +++ b/source/daemons/shepherd/shepherd_binding.cc @@ -43,12 +43,13 @@ #include "basis_types.h" #include "uti/config_file.h" - +#include "uti/ocs_Systemd.h" #include "uti/sge_binding_hlp.h" #include "uti/sge_dstring.h" #include "uti/sge_string.h" #include "uti/ocs_topology.h" +#include "ocs_shepherd_systemd.h" #include "shepherd_binding.h" #include "err_trace.h" @@ -551,9 +552,10 @@ bool binding_add_core_to_cpuset(hwloc_bitmap_t cpuset, int socket, int core) { } else { shepherd_trace("binding_set_linear_linux: problems while creating SGE_BINDING env"); } - + } else if (g_use_systemd) { + // when we are using Systemd then we set the binding via systemd property AllowCPUs + add_binding_to_systemd_properties(cpuset); } else { - /* bind SET process to mask */ if (!bind_process_to_mask(cpuset)) { /* there was an error while binding */ @@ -712,6 +714,9 @@ bool binding_add_core_to_cpuset(hwloc_bitmap_t cpuset, int socket, int core) { } else { shepherd_trace("binding_set_striding_linux: problems while creating SGE_BINDING env"); } + } else if (g_use_systemd) { + // when we are using Systemd then we set the binding via systemd property AllowCPUs + add_binding_to_systemd_properties(cpuset); } else { /* bind process to mask */ if (bind_process_to_mask(cpuset)) { @@ -860,6 +865,9 @@ bool 
binding_add_core_to_cpuset(hwloc_bitmap_t cpuset, int socket, int core) { } else { shepherd_trace("binding_explicit: problems while creating SGE_BINDING env"); } + } else if (g_use_systemd) { + // when we are using Systemd then we set the binding via systemd property AllowCPUs + add_binding_to_systemd_properties(cpuset); } else { /* do the core binding for the current process with the mask */ if (bind_process_to_mask(cpuset)) { @@ -934,5 +942,5 @@ bool binding_add_core_to_cpuset(hwloc_bitmap_t cpuset, int socket, int core) { } #endif -} +} diff --git a/source/dist/inst_sge b/source/dist/inst_sge index 7c8884175d..8aa6a9047e 100755 --- a/source/dist/inst_sge +++ b/source/dist/inst_sge @@ -518,11 +518,12 @@ if [ "$UPDATE" = true ]; then CFG_GID_RANGE="" fi RestoreSequenceNumberFiles $QMDIR #Restore jobseqnum and arseqnum - CreateSGEStartUpScripts $euid true master + CreateSGEStartUpScripts $euid true qmaster CreateSGEStartUpScripts $euid true execd CreateSettingsFile InitCA - SetupRcScriptNames master #New qmaster RC script/SMF + SetupSystemdSliceName + SetupRcScriptNames qmaster #New qmaster RC script/SMF InstallRcScript #TODO: Need to ensure there is no qmaster on the same port running # When doing an update from <= 6.1 to >= 6.2 AND with classic spooling @@ -640,11 +641,12 @@ if [ "$QMASTER" = "install" -a "$UPDATE" != "true" ]; then AddPEFiles AddDefaultUsersets AddCommonFiles - CreateSGEStartUpScripts $euid true master + CreateSGEStartUpScripts $euid true qmaster CreateSGEStartUpScripts $euid true execd CreateSettingsFile InitCA - AddSGEStartUpScript $euid master + SetupSystemdSliceName + AddSGEStartUpScript $euid qmaster StartQmaster AddHosts CopyCA execd @@ -1362,7 +1364,7 @@ if [ $MAKE_RC = "true" ]; then COMMONDIR=$SGE_CELL/common - CreateSGEStartUpScripts 0 true master + CreateSGEStartUpScripts 0 true qmaster CreateSGEStartUpScripts 0 true execd $INFOTEXT "\nYour new startup scripts are created. 
You will find them in the\n" \ diff --git a/source/dist/util/arch_variables b/source/dist/util/arch_variables index 148b05bc68..e225da2ba2 100755 --- a/source/dist/util/arch_variables +++ b/source/dist/util/arch_variables @@ -79,9 +79,17 @@ case $ARCH in LOADSENSOR_COMMAND=undef ;; lx*|ulx*|xlx*) + first_two=`echo $ARCH | cut -c 1-2` AWK=/usr/bin/gawk ECHO="/bin/echo -e" - if [ -x /usr/lib/lsb/install_initd ]; then + if [ `ps -p 1 -o comm=` = "systemd" -a "$first_two" = "lx" ]; then + # nearly all newer Linux distributions use systemd + # but we support it only on lx-* + # ulx-* (CentOS 7) is too old + RC_FILE=systemd + RC_PREFIX=/etc/systemd/system + RC_DIR=unused + elif [ -x /usr/lib/lsb/install_initd ]; then # RedHat, CentOS etc. RC_FILE=lsb RC_PREFIX=/etc/init.d @@ -180,11 +188,6 @@ CHOWN=chown CP=cp MV=mv -# Some of the following shell function require this variable set to false or -# true -# -autoinst=false - #-------------------------------------------------------------------------- # Translate # create localized version of arguments using "gettext" and "printf" @@ -243,9 +246,8 @@ Translate() # WaitClear # Let user press the return key and print message # -# USES: variables "$autoinst" -# "$CLEAR" -# $1 = clear clear screen +# USES: variables "$AUTO" +# $1 = clear clear screen # $1 = # $2 = no string - print default messages # $4 = arg - print argument @@ -261,7 +263,7 @@ WaitClear() Translate 2 "Hit to continue >> " fi - if [ "$autoinst" = true ]; then + if [ "$AUTO" = true ]; then echo else read INP_WaitClear @@ -275,11 +277,11 @@ WaitClear() #------------------------------------------------------------------------- # Enter: input is read and returned to stdout. 
If input is empty echo $1 # -# USES: variable "$autoinst" +# USES: variable "$AUTO" # Enter() { - if [ "$autoinst" = true ]; then + if [ "$AUTO" = true ]; then $ECHO $1 else read INP @@ -308,7 +310,7 @@ YesNo() text="$text $transout [$default] >> \c" fi - if [ "$autoinst" = true -a $default != x -a $default != X ]; then + if [ "$AUTO" = true -a $default != x -a $default != X ]; then $ECHO "$text" $ECHO $default if [ $default = y -o $default = Y ]; then diff --git a/source/dist/util/install_modules/inst_common.sh b/source/dist/util/install_modules/inst_common.sh index e12d9242b5..c47383c41e 100644 --- a/source/dist/util/install_modules/inst_common.sh +++ b/source/dist/util/install_modules/inst_common.sh @@ -709,7 +709,7 @@ AddChangedHost() CheckConfigFile() { CONFIG_FILE=$1 - KNOWN_CONFIG_FILE_ENTRIES_INSTALL="SGE_ROOT SGE_QMASTER_PORT SGE_EXECD_PORT CELL_NAME ADMIN_USER QMASTER_SPOOL_DIR EXECD_SPOOL_DIR GID_RANGE SPOOLING_METHOD DB_SPOOLING_DIR PAR_EXECD_INST_COUNT ADMIN_HOST_LIST SUBMIT_HOST_LIST EXEC_HOST_LIST EXECD_SPOOL_DIR_LOCAL HOSTNAME_RESOLVING SHELL_NAME COPY_COMMAND DEFAULT_DOMAIN ADMIN_MAIL ADD_TO_RC SET_FILE_PERMS RESCHEDULE_JOBS SCHEDD_CONF SHADOW_HOST EXEC_HOST_LIST_RM REMOVE_RC CSP_RECREATE CSP_COPY_CERTS CSP_COUNTRY_CODE CSP_STATE CSP_LOCATION CSP_ORGA CSP_ORGA_UNIT CSP_MAIL_ADDRESS SGE_ENABLE_SMF SGE_CLUSTER_NAME" + KNOWN_CONFIG_FILE_ENTRIES_INSTALL="SGE_ROOT SGE_QMASTER_PORT SGE_EXECD_PORT CELL_NAME ADMIN_USER QMASTER_SPOOL_DIR EXECD_SPOOL_DIR GID_RANGE SPOOLING_METHOD DB_SPOOLING_DIR PAR_EXECD_INST_COUNT ADMIN_HOST_LIST SUBMIT_HOST_LIST EXEC_HOST_LIST EXECD_SPOOL_DIR_LOCAL HOSTNAME_RESOLVING SHELL_NAME COPY_COMMAND DEFAULT_DOMAIN ADMIN_MAIL ADD_TO_RC SLICE_NAME SET_FILE_PERMS RESCHEDULE_JOBS SCHEDD_CONF SHADOW_HOST EXEC_HOST_LIST_RM REMOVE_RC CSP_RECREATE CSP_COPY_CERTS CSP_COUNTRY_CODE CSP_STATE CSP_LOCATION CSP_ORGA CSP_ORGA_UNIT CSP_MAIL_ADDRESS SGE_ENABLE_SMF SGE_CLUSTER_NAME" KNOWN_CONFIG_FILE_ENTRIES_BACKUP="SGE_ROOT SGE_CELL BACKUP_DIR TAR 
BACKUP_FILE" MAX_GID=2147483647 #unsigned int = 32bit - 1 MIN_GID=100 #from 0 - 100 may be reserved GIDs @@ -1475,14 +1475,12 @@ GetDefaultClusterName() { SetupRcScriptNames61() { case $1 in - qmaster) - hosttype="master";; shadow) return;; *) hosttype=$1;; esac - if [ $hosttype = "master" ]; then + if [ $hosttype = "qmaster" ]; then TMP_SGE_STARTUP_FILE=/tmp/sgemaster.$$ STARTUP_FILE_NAME=sgemaster S95NAME=S95sgemaster @@ -1528,15 +1526,13 @@ SetupRcScriptNames() fi case $1 in - qmaster) - hosttype="master";; shadow) DAEMON_NAME="shadow" return;; *) hosttype=$1;; esac - if [ $hosttype = "master" ]; then + if [ $hosttype = "qmaster" ]; then script_name=sgemaster TMP_SGE_STARTUP_FILE=/tmp/sgemaster.$$ STARTUP_FILE_NAME=sgemaster.$SGE_CLUSTER_NAME @@ -1576,14 +1572,18 @@ SetupRcScriptNames() #------------------------------------------------------------------------- # CheckRCfiles: Check for presence RC scripts # Requires SGE_ROOT and SGE_CELL to be set -# $1 ... can be empty or "61" to detect darwin RC script on 61 +# $1 host type (qmaster, execd, ...) # CheckRCfiles() { + hosttype=$1 rc_ret=0 rc_path="" + if [ "$RC_FILE" = "systemd" -a `IsSystemdServiceInstalled $hosttype` = "true" ]; then + SERVICE_NAME=`GetServiceName $hosttype` + rc_path="$RC_PREFIX/$SERVICE_NAME" # LSB, etc. - if [ "$RC_FILE" = "lsb" -o "$RC_FILE" = "insserv-linux" -o "$RC_FILE" = "update-rc.d" -o "$RC_FILE" = "rc-update" ]; then + elif [ "$RC_FILE" = "lsb" -o "$RC_FILE" = "insserv-linux" -o "$RC_FILE" = "update-rc.d" -o "$RC_FILE" = "rc-update" ]; then rc_path="$RC_PREFIX/$STARTUP_FILE_NAME" # System V elif [ "$RC_FILE" = "sysv_rc" ]; then @@ -1615,7 +1615,7 @@ CheckRCfiles() #------------------------------------------------------------------------- # CheckIfClusterNameAlreadyExists: Check for presence of SMF service and RC script # Requires SGE_ROOT and SGE_CELL to be set -# $1 ... compoment we are installing +# $1 ... 
component we are installing # CheckIfClusterNameAlreadyExists() { @@ -1648,7 +1648,7 @@ CheckIfClusterNameAlreadyExists() #Check for RCscript SetupRcScriptNames $hosttype - CheckRCfiles + CheckRCfiles $hosttype rc_res=$? #Prepare correct return value and message @@ -1696,7 +1696,7 @@ RemoveRC_SMF() esac if [ $rem_res -ne 0 ]; then $INFOTEXT "Removal not successful!" - if [ "$AUTO"=true ]; then + if [ "$AUTO" = "true" ]; then $INFOTEXT -log "Removal not successful!" MoveLog fi @@ -1755,7 +1755,7 @@ SearchForExistingInstallations() # ProcessSGEClusterName: Ask for cluster name # Requires SGE_ROOT and SGE_CELL to be set # $1 ... compoment we are installing -# valid values are: bdb, master, shadowd, execd, dbwriter +# valid values are: bdb, qmaster, shadowd, execd, dbwriter # "" - no service checking # ProcessSGEClusterName() @@ -1771,9 +1771,8 @@ ProcessSGEClusterName() SGE_QMASTER_PORT=`./utilbin/$SGE_ARCH/getservbyname -number sge_qmaster` fi - done=false - - while [ $done = false ]; do + done="false" + while [ "$done" = "false" ]; do GetDefaultClusterName $CLEAR $INFOTEXT -u "\nUnique cluster name" @@ -1819,16 +1818,16 @@ ProcessSGEClusterName() if [ $? -eq 0 ]; then $CLEAR RemoveRC_SMF $1 $validation_res - done=true + done="true" fi else - done=true + done="true" fi else - done=true + done="true" fi done - + #Only BDB or qmaster installation can create cluster_name file if [ \( "$1" = "bdb" -o "$1" = "qmaster" -o "$UPDATE" = "true" \) -a ! 
-f $SGE_ROOT/$SGE_CELL/common/cluster_name ]; then Makedir "$SGE_ROOT/$SGE_CELL/common" @@ -2026,7 +2025,7 @@ CreateSGEStartUpScripts() create=$2 hosttype=$3 - if [ $hosttype = "master" ]; then + if [ $hosttype = "qmaster" ]; then TMP_SGE_STARTUP_FILE=/tmp/sgemaster.$$ STARTUP_FILE_NAME=sgemaster else @@ -2050,7 +2049,7 @@ CreateSGEStartUpScripts() if [ $create = true ]; then - if [ $hosttype = "master" ]; then + if [ $hosttype = "qmaster" ]; then template="util/rctemplates/sgemaster_template" svc_name="sgemaster.${SGE_CLUSTER_NAME}" else @@ -2087,10 +2086,10 @@ CreateSGEStartUpScripts() rm -f $TMP_SGE_STARTUP_FILE ${TMP_SGE_STARTUP_FILE}.0 ${TMP_SGE_STARTUP_FILE}.1 - if [ $euid = 0 -a "$ADMINUSER" != default -a $QMASTER = "install" -a $hosttype = "master" ]; then + if [ $euid = 0 -a "$ADMINUSER" != default -a $QMASTER = "install" -a $hosttype = "qmaster" ]; then AddDefaultManager root $ADMINUSER AddDefaultOperator $ADMINUSER - elif [ $euid != 0 -a $hosttype = "master" ]; then + elif [ $euid != 0 -a $hosttype = "qmaster" ]; then AddDefaultManager $USER AddDefaultOperator $USER fi @@ -2124,6 +2123,236 @@ AddSGEStartUpScript() $CLEAR } +#------------------------------------------------------------------------- +# CheckSliceName: Check if a slice name is correct +# It may only contain lower case characters and numbers. +# It must start with a character. +# It may not contain the trailing ".slice". +# +CheckSliceName() +{ + slice_name=$1 + + # slice name may only contain lowercase letters and numbers + echo $slice_name | grep -qE '^[a-z][a-z0-9]+$' + if [ $? -ne 0 ]; then + $INFOTEXT "Slice name may only contain lowercase letters and numbers." + $INFOTEXT -log "Slice name may only contain lowercase letters and numbers." + WaitClear "noclear" + return 1 + fi + + # it may not contain the trailing ".slice" + echo $slice_name | grep -qE '^[a-z][a-z0-9]+.slice$' + if [ $? -eq 0 ]; then + $INFOTEXT "Slice name may not contain the trailing \".slice\"." 
+ $INFOTEXT -log "Slice name may not contain the trailing \".slice\"." + WaitClear "noclear" + return 1 + fi + + return 0 +} + +#------------------------------------------------------------------------- +# IsSystemdServiceInstalled: Check if a service is installed +# $1 - the service type (qmaster, shadow, execd, ...) +# +IsSystemdServiceInstalled() +{ + # Check if the systemd service is installed + SERVICE_NAME=`GetServiceName $1 "false"` + if [ $? -ne 0 ]; then + $ECHO "false" + return 1 + fi + + # Check if the service file exists + if [ ! -f "$RC_PREFIX/$SERVICE_NAME" ]; then + $ECHO "false" + return 1 + fi + + $ECHO "true" + return 0 +} + +#------------------------------------------------------------------------- +# GetServiceName: Return the name of the systemd service +# $1 - the service type (qmaster, shadow, execd, ...) +# $2 - optional: boolean log_error, "true" or "false" +# +GetServiceName() +{ + log_error="true" + UNIT_NAME=$1 + if [ $# -gt 1 ]; then + log_error=$2 + fi + + if [ ! -f "$SGE_ROOT/$SGE_CELL/common/slice_name" ]; then + if [ "$log_error" = "true" ]; then + $INFOTEXT "$SGE_ROOT/$SGE_CELL/common/slice_name does not exist" + $INFOTEXT -log "$SGE_ROOT/$SGE_CELL/common/slice_name does not exist" + WaitClear "noclear" + fi + $ECHO "no_known_${UNIT_NAME}_service_yet.service" + return 1 + fi + + SLICE_NAME=`cat $SGE_ROOT/$SGE_CELL/common/slice_name` + if [ $? 
-ne 0 ]; then + if [ "$log_error" = "true" ]; then + $INFOTEXT "Could not read $SGE_ROOT/$SGE_CELL/common/slice_name" + $INFOTEXT -log "Could not read $SGE_ROOT/$SGE_CELL/common/slice_name" + WaitClear "noclear" + fi + $ECHO "no_known_${UNIT_NAME}_service_yet.service" + return 1 + fi + + if [ "$UNIT_NAME" = "shadow" ]; then + UNIT_NAME="qmaster" + fi + SERVICE_NAME="$SLICE_NAME-$UNIT_NAME.service" + $ECHO $SERVICE_NAME + return 0 +} + +#------------------------------------------------------------------------- +# GetDefaultSliceName: Get the name of the top level slice +# +GetDefaultSliceName() +{ + DEFAULT_SLICE_NAME="ocs" + # @todo do we always want to append the SGE_QMASTER_PORT? + # simply "ocs-qmaster.service" is shorter and more readable than "ocs1234-qmaster.service" + # and it is what you are looking for when you are looking for the qmaster service + # BUT: when doing a side by side update the new cluster must not use the same slice name - handle it there? + if [ "$SGE_QMASTER_PORT" != "" ]; then + DEFAULT_SLICE_NAME="${DEFAULT_SLICE_NAME}${SGE_QMASTER_PORT}" + fi + + $ECHO "$DEFAULT_SLICE_NAME" + return 0 +} + +GetSystemdSliceName() +{ + not_exists_error=$1 + if [ -f "$SGE_ROOT/$SGE_CELL/common/slice_name" ]; then + # read the slice name from $SGE_ROOT/$SGE_CELL/common/slice_name + SLICE_NAME=`cat $SGE_ROOT/$SGE_CELL/common/slice_name` + if [ $? 
-ne 0 ]; then + $INFOTEXT "Could not read $SGE_ROOT/$SGE_CELL/common/slice_name" + $INFOTEXT -log "Could not read $SGE_ROOT/$SGE_CELL/common/slice_name" + exit 1 + fi + $ECHO "$SLICE_NAME" + else + if [ "$not_exists_error" = "true" ]; then + $INFOTEXT "No slice name defined, file %s does not exist" "$SGE_ROOT/$SGE_CELL/common/slice_name" + $INFOTEXT -log "No slice name defined, file %s does not exist" "$SGE_ROOT/$SGE_CELL/common/slice_name" + exit 1 + fi + return 1 + fi + + return 0 +} + +SetupSystemdSliceName() +{ + # if we are on a systemd system, we need to define the slice name + if [ "$RC_FILE" = "systemd" ]; then + SLICE_NAME=`GetSystemdSliceName "false"` + # if slice name is not yet defined, we ask the user for it + if [ $? -ne 0 ]; then + # query toplevel slice name + slice_name_is_ok="false" + while [ "$slice_name_is_ok" = "false" ]; do + $CLEAR + if [ "$AUTO" = "false" ]; then + # set the default + SLICE_NAME=`GetDefaultSliceName` + fi + $INFOTEXT -u "\nSetting up systemd toplevel slice name" + $INFOTEXT "%s will be running withing a top level systemd/cgroups slice,\ndefault is \"%s.slice\"." $DAEMON_NAME $SLICE_NAME + $INFOTEXT "If you are running multiple clusters on the same host,\nplease use a unique slice name.\n" + $INFOTEXT -n "Please enter the slice name (without the trailing .slice) or\nhit to use [%s] >> " $SLICE_NAME + SLICE_NAME=`Enter $SLICE_NAME` + CheckSliceName $SLICE_NAME + if [ $? 
-eq 0 ]; then + slice_name_is_ok="true" + elif [ "$AUTO" = "true" ]; then + # avoid endless loop when slice is incorrectly defined + return 1 + fi + done + + # store the slice name in $SGE_ROOT/$SGE_CELL/common/slice_name + SafelyCreateFile "$SGE_ROOT/$SGE_CELL/common/slice_name" 644 "$SLICE_NAME" + fi + fi +} + +#------------------------------------------------------------------------- +# InstallSystemdUnitFile: Install the systemd unit file from template +# +InstallSystemdUnitFile() +{ + # source template and destination file names + case $DAEMON_NAME in + qmaster|shadow) + template_file="$SGE_ROOT/util/rctemplates/sgemaster_systemd_template" + ;; + execd) + template_file="$SGE_ROOT/util/rctemplates/sgeexecd_systemd_template" + ;; + *) + $INFOTEXT "\nStarting up %s via systemd is not yet supported!\n" $DAEMON_NAME + $INFOTEXT -log "\nStarting up %s via systemd is not yet supported!\n" $DAEMON_NAME + return 1 + ;; + esac + + SLICE_NAME=`GetSystemdSliceName "true"` + $INFOTEXT "Using slice name from $SGE_ROOT/$SGE_CELL/common/slice_name: %s" $SLICE_NAME + $INFOTEXT -log "Using slice name from $SGE_ROOT/$SGE_CELL/common/slice_name: %s" $SLICE_NAME + + # replace variables in the template, store the result in a tmp file + TMP_UNIT_FILE="/tmp/sge_${DAEMON_NAME}_unit_file.$$" + sed -e "s%GENROOT%${SGE_ROOT_VAL}%g" \ + -e "s%GENCELL%${SGE_CELL_VAL}%g" \ + -e "s%GENPRODUCT%Cluster Scheduler%g" \ + -e "s%GENSLICE%${SLICE_NAME}%" \ + $template_file > $TMP_UNIT_FILE + + SERVICE_NAME=`GetServiceName $DAEMON_NAME` + if [ $? 
-ne 0 ]; then + return 1 + fi + + # copy the tmp file to the systemd directory ($RC_PREFIX) + # and make sure systemd is aware of the new unit file + unit_file="$RC_PREFIX/$SERVICE_NAME" + Execute cp $TMP_UNIT_FILE $unit_file + Execute rm -f $TMP_UNIT_FILE + Execute systemctl daemon-reload + + $INFOTEXT "Installed %s systemd unit file to %s" $DAEMON_NAME $unit_file + $INFOTEXT -log "Installed %s systemd unit file to %s" $DAEMON_NAME $unit_file + + return 0 +} + +#------------------------------------------------------------------------- +# EnableSystemdService: Enable a systemd service given by SERVICE_NAME +# +EnableSystemdService() +{ + Execute systemctl enable $SERVICE_NAME +} InstallRcScript() { @@ -2189,8 +2418,14 @@ InstallRcScript() return fi + # Linux host with systemd + if [ "$RC_FILE" = "systemd" ]; then + InstallSystemdUnitFile + if [ $? -eq 0 ]; then + EnableSystemdService + fi # If system is Linux Standard Base (LSB) compliant, use the install_initd utility - if [ "$RC_FILE" = lsb ]; then + elif [ "$RC_FILE" = "lsb" ]; then echo cp $SGE_STARTUP_FILE $RC_PREFIX/$STARTUP_FILE_NAME echo /usr/lib/lsb/install_initd $RC_PREFIX/$STARTUP_FILE_NAME Execute cp $SGE_STARTUP_FILE $RC_PREFIX/$STARTUP_FILE_NAME @@ -2521,10 +2756,10 @@ CheckRunningDaemon() BackupConfig() { DATE=`date '+%Y-%m-%d_%H_%M_%S'` - BUP_BDB_COMMON_FILE_LIST_TMP="accounting bootstrap qtask settings.sh st.enabled act_qmaster sgemaster host_aliases settings.csh sgeexecd sgebdb shadow_masters cluster_name" + BUP_BDB_COMMON_FILE_LIST_TMP="accounting bootstrap qtask settings.sh st.enabled act_qmaster sgemaster host_aliases settings.csh sgeexecd sgebdb shadow_masters cluster_name slice_name" BUP_BDB_COMMON_DIR_LIST_TMP="sgeCA" BUP_BDB_SPOOL_FILE_LIST_TMP="jobseqnum" - BUP_CLASSIC_COMMON_FILE_LIST_TMP="configuration sched_configuration accounting bootstrap qtask settings.sh st.enabled act_qmaster sgemaster host_aliases settings.csh sgeexecd shadow_masters cluster_name" + 
BUP_CLASSIC_COMMON_FILE_LIST_TMP="configuration sched_configuration accounting bootstrap qtask settings.sh st.enabled act_qmaster sgemaster host_aliases settings.csh sgeexecd shadow_masters cluster_name slice_name" BUP_CLASSIC_DIR_LIST_TMP="sgeCA local_conf" BUP_CLASSIC_SPOOL_FILE_LIST_TMP="jobseqnum advance_reservations admin_hosts calendars centry ckpt cqueues exec_hosts hostgroups resource_quotas managers operators pe projects qinstances schedd submit_hosts usermapping users usersets zombies" BUP_COMMON_FILE_LIST="" @@ -2588,10 +2823,10 @@ BackupConfig() RestoreConfig() { DATE=`date '+%H_%M_%S'` - BUP_COMMON_FILE_LIST="accounting bootstrap qtask settings.sh act_qmaster sgemaster host_aliases settings.csh sgeexecd sgebdb shadow_masters st.enabled cluster_name" + BUP_COMMON_FILE_LIST="accounting bootstrap qtask settings.sh act_qmaster sgemaster host_aliases settings.csh sgeexecd sgebdb shadow_masters st.enabled cluster_name slice_name" BUP_COMMON_DIR_LIST="sgeCA" BUP_SPOOL_FILE_LIST="jobseqnum" - BUP_CLASSIC_COMMON_FILE_LIST="configuration sched_configuration accounting bootstrap qtask settings.sh act_qmaster sgemaster host_aliases settings.csh sgeexecd shadow_masters st.enabled cluster_name" + BUP_CLASSIC_COMMON_FILE_LIST="configuration sched_configuration accounting bootstrap qtask settings.sh act_qmaster sgemaster host_aliases settings.csh sgeexecd shadow_masters st.enabled cluster_name slice_name" BUP_CLASSIC_DIR_LIST="sgeCA local_conf" BUP_CLASSIC_SPOOL_FILE_LIST="jobseqnum admin_hosts advance_reservations calendars centry ckpt cqueues exec_hosts hostgroups managers operators pe projects qinstances resource_quotas schedd submit_hosts usermapping users usersets zombies" @@ -2968,7 +3203,7 @@ RemoveRcScript() hosttype=$2 euid=$3 upgrade=$4 - + # --- from here only if root installs --- if [ $euid != 0 ]; then return 0 @@ -3023,9 +3258,30 @@ RemoveRcScript() #This might happen when we do a reinstall and have RC scripts, but now #want to use SMF. 
fi - + + # If system has systemd + if [ "$RC_FILE" = "systemd" ]; then + SERVICE_NAME=`GetServiceName $hosttype "false"` + if [ $? -eq 0 ]; then + if [ -f "$RC_PREFIX/$SERVICE_NAME" ]; then + $INFOTEXT "Removing %s service" "$SERVICE_NAME" + $INFOTEXT -log "Removing %s service" "$SERVICE_NAME" + systemctl is-active "$SERVICE_NAME" + if [ $? -eq 0 ]; then + systemctl stop "$SERVICE_NAME" + fi + systemctl is-enabled "$SERVICE_NAME" + if [ $? -eq 0 ]; then + systemctl disable "$SERVICE_NAME" + fi + rm -f "$RC_PREFIX/$SERVICE_NAME" + systemctl daemon-reload + else + $INFOTEXT "Service %s not found, skipping removal" "$SERVICE_NAME" + fi + fi # If system is Linux Standard Base (LSB) compliant, use the install_initd utility - if [ "$RC_FILE" = lsb ]; then + elif [ "$RC_FILE" = "lsb" ]; then echo /usr/lib/lsb/remove_initd $RC_PREFIX/$STARTUP_FILE_NAME Execute /usr/lib/lsb/remove_initd $RC_PREFIX/$STARTUP_FILE_NAME # Several old Red Hat releases do not create/remove startup links from LSB conform @@ -3085,7 +3341,7 @@ RemoveRcScript() if [ -z "$v61" ]; then RC_DIR="$RC_DIR.$SGE_CLUSTER_NAME" fi - if [ $hosttype = "master" ]; then + if [ $hosttype = "qmaster" ]; then DARWIN_GEN_REPLACE="#GENMASTERRC" elif [ $hosttype = "bdb" ]; then DARWIN_GEN_REPLACE="#GENBDBRC" diff --git a/source/dist/util/install_modules/inst_execd.sh b/source/dist/util/install_modules/inst_execd.sh index f02e4b33c7..beb4082ecd 100644 --- a/source/dist/util/install_modules/inst_execd.sh +++ b/source/dist/util/install_modules/inst_execd.sh @@ -446,15 +446,24 @@ StartExecd() fi $SVCADM enable -s "svc:/application/sge/execd:$SGE_CLUSTER_NAME" if [ $? 
-ne 0 ]; then - $INFOTEXT "\nFailed to start execution deamon over SMF.\n" \ + $INFOTEXT "\nFailed to start execution daemon over SMF.\n" \ "Check service by issuing svcs -l svc:/application/sge/execd:%s" $SGE_CLUSTER_NAME - $INFOTEXT -log "\nFailed to start execution deamon over SMF.\n" \ + $INFOTEXT -log "\nFailed to start execution daemon over SMF.\n" \ "Check service by issuing svcs -l svc:/application/sge/execd:%s" $SGE_CLUSTER_NAME if [ $AUTO = true ]; then MoveLog fi exit 1 fi + elif [ "$RC_FILE" = "systemd" -a `IsSystemdServiceInstalled "execd"` = "true" ]; then + SERVICE_NAME=`GetServiceName "execd"` + systemctl start "$SERVICE_NAME" + if [ $? -ne 0 ]; then + $INFOTEXT "sge_execd start problem" + $INFOTEXT -log "sge_execd start problem" + MoveLog + exit 1 + fi else $SGE_STARTUP_FILE fi diff --git a/source/dist/util/install_modules/inst_execd_uninst.sh b/source/dist/util/install_modules/inst_execd_uninst.sh index cb7ad67aab..f2b2868714 100644 --- a/source/dist/util/install_modules/inst_execd_uninst.sh +++ b/source/dist/util/install_modules/inst_execd_uninst.sh @@ -304,10 +304,8 @@ RemoveReferences() for q in `qconf -sql`; do qconf -purge queue "*" "$q@$exechost" done - } - RemoveSpoolDir() { exechost=$1 diff --git a/source/dist/util/install_modules/inst_qmaster.sh b/source/dist/util/install_modules/inst_qmaster.sh index 66b3405fdc..0349e8d48c 100644 --- a/source/dist/util/install_modules/inst_qmaster.sh +++ b/source/dist/util/install_modules/inst_qmaster.sh @@ -1042,13 +1042,22 @@ StartQmaster() if [ "$SGE_ENABLE_SMF" = "true" ]; then $SVCADM enable -s "svc:/application/sge/qmaster:$SGE_CLUSTER_NAME" if [ $? -ne 0 ]; then - $INFOTEXT "\nFailed to start qmaster deamon over SMF. Check service by issuing "\ + $INFOTEXT "\nFailed to start qmaster daemon over SMF. Check service by issuing "\ "svcs -l svc:/application/sge/qmaster:%s" $SGE_CLUSTER_NAME - $INFOTEXT -log "\nFailed to start qmaster deamon over SMF. 
Check service by issuing "\ + $INFOTEXT -log "\nFailed to start qmaster daemon over SMF. Check service by issuing "\ "svcs -l svc:/application/sge/qmaster:%s" $SGE_CLUSTER_NAME MoveLog exit 1 fi + elif [ $RC_FILE = "systemd" -a `IsSystemdServiceInstalled "qmaster"` = "true" ]; then + SERVICE_NAME=`GetServiceName "qmaster"` + systemctl start "$SERVICE_NAME" + if [ $? -ne 0 ]; then + $INFOTEXT "sge_qmaster start problem" + $INFOTEXT -log "sge_qmaster start problem" + MoveLog + exit 1 + fi else $SGE_STARTUP_FILE -qmaster if [ $? -ne 0 ]; then @@ -1297,7 +1306,7 @@ AddHostsFromFile() file=`Enter none` if [ "$file" = "none" -o ! -f "$file" ]; then $INFOTEXT "\nYou entered an invalid file name or the file does not exist." - $INFOTEXT -auto $autoinst -ask "y" "n" -def "y" -n \ + $INFOTEXT -auto $AUTO -ask "y" "n" -def "y" -n \ "Do you want to enter a new file name (y/n) [y] >> " if [ $? = 1 ]; then return 1 diff --git a/source/dist/util/install_modules/inst_qmaster_uninst.sh b/source/dist/util/install_modules/inst_qmaster_uninst.sh index 1f474dd933..6173633606 100644 --- a/source/dist/util/install_modules/inst_qmaster_uninst.sh +++ b/source/dist/util/install_modules/inst_qmaster_uninst.sh @@ -48,10 +48,9 @@ RemoveQmaster() $INFOTEXT -n -ask "y" "n" -def "n" "Do you want to uninstall the master host? [n] >> " fi - if [ $? = 0 ]; then + if [ $? 
-eq 0 ]; then $INFOTEXT -n "We're going to uninstall the master host now!\n" CheckRegisteredExecd - else MoveLog exit 0 @@ -77,8 +76,6 @@ CheckRegisteredExecd() $INFOTEXT "There are no running execution host registered!\n" $INFOTEXT -log "There are no running execution host registered!\n" ShutdownMaster - - } ShutdownMaster() @@ -115,7 +112,7 @@ ShutdownMaster() toDelete="accounting act_qmaster bootstrap cluster_name configuration jmx local_conf qtask sched_configuration sgeCA sge_request sgemaster" - RemoveRcScript $HOST master $euid + RemoveRcScript $HOST "qmaster" $euid if [ -f $SGE_ROOT/$SGE_CELL/common/sgebdb ]; then $INFOTEXT "Berkeley db server is being used with this installation" diff --git a/source/dist/util/install_modules/inst_template.conf b/source/dist/util/install_modules/inst_template.conf index 0e1db027b0..655dde487f 100644 --- a/source/dist/util/install_modules/inst_template.conf +++ b/source/dist/util/install_modules/inst_template.conf @@ -129,6 +129,15 @@ ADMIN_MAIL="none" # to start automatically during boottime ADD_TO_RC="false" +# When Cluster Scheduler is running on Linux under Systemd control then +# all services (sge_qmaster, sge_shadowd, sge_execd) as well as the jobs +# will run under cgroups control. The top level cgroup name can be configured +# in SLICE_NAME. The default is "ocs". +# Keep the default unless you want to run multiple clusters on the same host. +# In this case make the name unique, e.g. by appending the sge_qmaster port number, +# e.g. "ocs6444". SLICE_NAME may contain only lowercase letters and numbers. +SLICE_NAME="ocs" + #If this is "true" the file permissions of executables will be set to 755 #and of ordenary file to 644. 
SET_FILE_PERMS="true" diff --git a/source/dist/util/rctemplates/sgeexecd_systemd_template b/source/dist/util/rctemplates/sgeexecd_systemd_template new file mode 100644 index 0000000000..e9183c7fa9 --- /dev/null +++ b/source/dist/util/rctemplates/sgeexecd_systemd_template @@ -0,0 +1,23 @@ +[Unit] +Description=GENPRODUCT sge_execd service +Documentation=man:sge_execd(8) +After=network-online.target remote-fs.target autofs.service +#RequiresMountsFor=GENROOT + +[Service] +Type=forking +Slice=GENSLICE.slice +ExecStart=GENROOT/GENCELL/common/sgeexecd start +ExecStop=GENROOT/GENCELL/common/sgeexecd softstop +Restart=no +#We can set (cgroups / systemd-run) options +#MemoryAccounting=true +#IOAccounting=true +#CPUAccounting=true +#MemorySwapMax=1 +#CPUShares=1000 +#CPUWeight=1000 +#... + +[Install] +WantedBy=multi-user.target diff --git a/source/dist/util/rctemplates/sgemaster_systemd_template b/source/dist/util/rctemplates/sgemaster_systemd_template new file mode 100644 index 0000000000..71d0552708 --- /dev/null +++ b/source/dist/util/rctemplates/sgemaster_systemd_template @@ -0,0 +1,24 @@ +[Unit] +Description=GENPRODUCT sge_qmaster and optionally sge_shadowd services +Documentation=man:sge_qmaster(8) man:sge_shadowd(8) +#Wants=network.target +After=network-online.target remote-fs.target autofs.service +#RequiresMountsFor=GENROOT + +[Service] +Type=forking +Slice=GENSLICE.slice +ExecStart=GENROOT/GENCELL/common/sgemaster start +ExecStop=GENROOT/GENCELL/common/sgemaster stop +Restart=no +#We can set (cgroups / systemd-run) options +#MemoryAccounting=true +#IOAccounting=true +#CPUAccounting=true +#MemorySwapMax=1 +#CPUShares=1000 +#CPUWeight=1000 +#... 
+ +[Install] +WantedBy=multi-user.target diff --git a/source/libs/cull/cull_list.cc b/source/libs/cull/cull_list.cc index 9c4026f1a0..61703585f5 100644 --- a/source/libs/cull/cull_list.cc +++ b/source/libs/cull/cull_list.cc @@ -666,6 +666,22 @@ void lWriteElemToStr(const lListElem *ep, dstring *buffer) { DRETURN_VOID; } +void lWriteElemToMessagesFile(const lListElem *ep, u_long32 log_level) { + DENTER(CULL_LAYER); + if (log_level >= log_state_get_log_level()) { + dstring buffer = DSTRING_INIT; + lWriteElem_(ep, &buffer, 0); + saved_vars_s *context = nullptr; + const char *token = sge_strtok_r(sge_dstring_get_string(&buffer), "\n", &context); + while (token != nullptr) { + sge_log(log_level, token, __FILE__, __LINE__); + token = sge_strtok_r(nullptr, "\n", &context); + } + sge_free_saved_vars(context); + } + DRETURN_VOID; +} + static void lWriteElem_(const lListElem *ep, dstring *buffer, int nesting_level) { int i; char space[128]; @@ -821,6 +837,22 @@ void lWriteListToStr(const lList *lp, dstring *buffer) { DRETURN_VOID; } +void lWriteListToMessagesFile(const lList *lp, u_long32 log_level) { + DENTER(CULL_LAYER); + if (log_level >= log_state_get_log_level()) { + dstring buffer = DSTRING_INIT; + lWriteList_(lp, &buffer, 0); + saved_vars_s *context = nullptr; + const char *token = sge_strtok_r(sge_dstring_get_string(&buffer), "\n", &context); + while (token != nullptr) { + sge_log(log_level, token, __FILE__, __LINE__); + token = sge_strtok_r(nullptr, "\n", &context); + } + sge_free_saved_vars(context); + } + DRETURN_VOID; +} + static void lWriteList_(const lList *lp, dstring *buffer, int nesting_level) { const lListElem *ep; char indent[128]; diff --git a/source/libs/cull/cull_list.h b/source/libs/cull/cull_list.h index ca8b066c9f..c55d31b795 100644 --- a/source/libs/cull/cull_list.h +++ b/source/libs/cull/cull_list.h @@ -236,12 +236,16 @@ void lWriteElemTo(const lListElem *ep, FILE *fp); void lWriteElemToStr(const lListElem *ep, dstring *buffer); +void 
lWriteElemToMessagesFile(const lListElem *ep, u_long32 log_level); + void lWriteList(const lList *lp); void lWriteListTo(const lList *lp, FILE *fp); void lWriteListToStr(const lList *lp, dstring *buffer); +void lWriteListToMessagesFile(const lList *lp, u_long32 log_level); + lListElem *lCreateElem(const lDescr *dp); lList *lCreateList(const char *listname, const lDescr *descr); diff --git a/source/libs/sgeobj/cull/sge_ja_task_JAT_L.h b/source/libs/sgeobj/cull/sge_ja_task_JAT_L.h index 5200de0cf1..79fdcab2dd 100644 --- a/source/libs/sgeobj/cull/sge_ja_task_JAT_L.h +++ b/source/libs/sgeobj/cull/sge_ja_task_JAT_L.h @@ -118,6 +118,23 @@ * Could be / was a OS supported job id on no longer supported platforms * like Cray, Nec, Irix. * +* SGE_STRING(JAT_systemd_scope) - Systemd Scope +* In case a job is started in a systemd scope, this is the +* scope unit name. +* Only used in sge_execd, it is set during the startup phase of the +* job. +* +* SGE_STRING(JAT_systemd_slice) - Systemd Slice +* In case of tightly integrated parallel jobs, the job's tasks (master and slave) +* are started within a slice. We only store it if we have a tightly integrated parallel job, +* and this slice needs to be deleted after the job is finished. +* Only used in sge_execd, it is set during the startup phase of the +* job. +* +* SGE_ULONG(JAT_usage_collection) - Usage Collection Mode +* Usage collection mode of a pe task. +* One of 0 (FALSE), 1 (PDC), 2 (HYBRID), 3 (TRUE). +* * SGE_LIST(JAT_usage_list) - Usage List * Raw usage from data collector. Scaled by sge_qmaster to JAT_scaled_usage_list. * Scheduling is using the scaled usage. 
@@ -211,6 +228,9 @@ enum { JAT_pending_signal_delivery_time, JAT_pid, JAT_osjobid, + JAT_systemd_scope, + JAT_systemd_slice, + JAT_usage_collection, JAT_usage_list, JAT_scaled_usage_list, JAT_reported_usage_list, @@ -251,6 +271,9 @@ LISTDEF(JAT_Type) SGE_ULONG64(JAT_pending_signal_delivery_time, CULL_SUBLIST) SGE_ULONG(JAT_pid, CULL_SUBLIST) SGE_STRING(JAT_osjobid, CULL_SUBLIST) + SGE_STRING(JAT_systemd_scope, CULL_SUBLIST) + SGE_STRING(JAT_systemd_slice, CULL_SUBLIST) + SGE_ULONG(JAT_usage_collection, CULL_DEFAULT) SGE_LIST(JAT_usage_list, UA_Type, CULL_SUBLIST) SGE_LIST(JAT_scaled_usage_list, UA_Type, CULL_SUBLIST) SGE_LIST(JAT_reported_usage_list, UA_Type, CULL_SUBLIST) @@ -291,6 +314,9 @@ NAMEDEF(JATN) NAME("JAT_pending_signal_delivery_time") NAME("JAT_pid") NAME("JAT_osjobid") + NAME("JAT_systemd_scope") + NAME("JAT_systemd_slice") + NAME("JAT_usage_collection") NAME("JAT_usage_list") NAME("JAT_scaled_usage_list") NAME("JAT_reported_usage_list") diff --git a/source/libs/sgeobj/cull/sge_job_JB_L.h b/source/libs/sgeobj/cull/sge_job_JB_L.h index 00913f5d74..687e0ff345 100644 --- a/source/libs/sgeobj/cull/sge_job_JB_L.h +++ b/source/libs/sgeobj/cull/sge_job_JB_L.h @@ -561,7 +561,7 @@ LISTDEF(JB_Type) SGE_LIST(JB_grp_list, ST_Type, CULL_SPOOL) SGE_LIST(JB_joker, VA_Type, CULL_SPOOL) SGE_ULONG(JB_sync_options, CULL_SPOOL) - SGE_ULONG(JB_category_id, CULL_HASH | CULL_SPOOL) + SGE_ULONG(JB_category_id, CULL_HASH) LISTEND NAMEDEF(JBN) diff --git a/source/libs/sgeobj/cull/sge_pe_task_PET_L.h b/source/libs/sgeobj/cull/sge_pe_task_PET_L.h index 6a365db633..337f161a25 100644 --- a/source/libs/sgeobj/cull/sge_pe_task_PET_L.h +++ b/source/libs/sgeobj/cull/sge_pe_task_PET_L.h @@ -51,6 +51,16 @@ * SGE_STRING(PET_osjobid) - OS JobId * OS jobid / additional group id of a running pe task. * +* SGE_STRING(PET_systemd_scope) - Systemd Scope +* In case a pe task is started in a systemd scope, this is the +* scope unit name. 
+* Only used in sge_execd, it is set during the startup phase of the +* pe task. +* +* SGE_ULONG(PET_usage_collection) - Usage Collection Mode +* Usage collection mode of a pe task. +* One of 0 (FALSE), 1 (PDC), 2 (HYBRID), 3 (TRUE). +* * SGE_LIST(PET_usage) - Usage * Usage list of a running/just finished pe task. * @@ -103,6 +113,8 @@ enum { PET_granted_destin_identifier_list, PET_pid, PET_osjobid, + PET_systemd_scope, + PET_usage_collection, PET_usage, PET_scaled_usage, PET_reported_usage, @@ -124,6 +136,8 @@ LISTDEF(PET_Type) SGE_LIST(PET_granted_destin_identifier_list, JG_Type, CULL_SUBLIST) SGE_ULONG(PET_pid, CULL_DEFAULT) SGE_STRING(PET_osjobid, CULL_DEFAULT) + SGE_STRING(PET_systemd_scope, CULL_DEFAULT) + SGE_ULONG(PET_usage_collection, CULL_DEFAULT) SGE_LIST(PET_usage, UA_Type, CULL_SUBLIST) SGE_LIST(PET_scaled_usage, UA_Type, CULL_SUBLIST) SGE_LIST(PET_reported_usage, UA_Type, CULL_SUBLIST) @@ -145,6 +159,8 @@ NAMEDEF(PETN) NAME("PET_granted_destin_identifier_list") NAME("PET_pid") NAME("PET_osjobid") + NAME("PET_systemd_scope") + NAME("PET_usage_collection") NAME("PET_usage") NAME("PET_scaled_usage") NAME("PET_reported_usage") diff --git a/source/libs/sgeobj/cull/sge_ptf_JL_L.h b/source/libs/sgeobj/cull/sge_ptf_JL_L.h index 98fde6df0e..b95bba5ba8 100644 --- a/source/libs/sgeobj/cull/sge_ptf_JL_L.h +++ b/source/libs/sgeobj/cull/sge_ptf_JL_L.h @@ -127,7 +127,7 @@ enum { }; LISTDEF(JL_Type) - SGE_ULONG(JL_job_ID, CULL_DEFAULT) + SGE_ULONG(JL_job_ID, CULL_PRIMARY_KEY | CULL_UNIQUE | CULL_HASH) SGE_LIST(JL_OS_job_list, JO_Type, CULL_DEFAULT) SGE_ULONG(JL_state, CULL_DEFAULT) SGE_ULONG(JL_tickets, CULL_DEFAULT) diff --git a/source/libs/sgeobj/cull/sge_ptf_JO_L.h b/source/libs/sgeobj/cull/sge_ptf_JO_L.h index 0151f9700e..0378c9e659 100644 --- a/source/libs/sgeobj/cull/sge_ptf_JO_L.h +++ b/source/libs/sgeobj/cull/sge_ptf_JO_L.h @@ -41,6 +41,13 @@ * SGE_ULONG(JO_OS_job_ID2) - OS Job Id 2 * OS job id (upper 32 bits). @todo Replace the two id fields by one u_long64? 
* +* SGE_STRING(JO_systemd_scope) - Systemd Scope +* When the job is running in a systemd scope, this field contains the name of the scope. +* +* SGE_ULONG(JO_usage_collection) - Usage Collection Mode +* Usage collection mode used for the os job. +* One of 0 (FALSE), 1 (PDC), 2 (HYBRID), 3 (TRUE). +* * SGE_ULONG(JO_ja_task_ID) - Array Task Id * In case of an array job: Task number of an array task. * @@ -61,6 +68,8 @@ enum { JO_OS_job_ID = JO_LOWERBOUND, JO_OS_job_ID2, + JO_systemd_scope, + JO_usage_collection, JO_ja_task_ID, JO_task_id_str, JO_state, @@ -71,6 +80,8 @@ enum { LISTDEF(JO_Type) SGE_ULONG(JO_OS_job_ID, CULL_DEFAULT) SGE_ULONG(JO_OS_job_ID2, CULL_DEFAULT) + SGE_STRING(JO_systemd_scope, CULL_DEFAULT) + SGE_ULONG(JO_usage_collection, CULL_DEFAULT) SGE_ULONG(JO_ja_task_ID, CULL_DEFAULT) SGE_STRING(JO_task_id_str, CULL_DEFAULT) SGE_ULONG(JO_state, CULL_DEFAULT) @@ -81,6 +92,8 @@ LISTEND NAMEDEF(JON) NAME("JO_OS_job_ID") NAME("JO_OS_job_ID2") + NAME("JO_systemd_scope") + NAME("JO_usage_collection") NAME("JO_ja_task_ID") NAME("JO_task_id_str") NAME("JO_state") diff --git a/source/libs/sgeobj/json/JAT.json b/source/libs/sgeobj/json/JAT.json index a7e15644e0..adff0586d8 100644 --- a/source/libs/sgeobj/json/JAT.json +++ b/source/libs/sgeobj/json/JAT.json @@ -233,24 +233,68 @@ "name": "osjobid", "summary": "OS Job Id", "description": [{ - "line": "Unique id which applies to all os processes started" - }, { - "line": "on behalf of this task. Set during the startup phase of the" - }, { - "line": "job. Meaning depends on the architecture of the" - }, { - "line": "host were the task is started." + "line": "Unique id which applies to all os processes started" + }, { + "line": "on behalf of this task. Set during the startup phase of the" + }, { + "line": "job. Meaning depends on the architecture of the" + }, { + "line": "host were the task is started." 
+ }, { + "line": "SOLARIS/LINUX: additional group id" + }, { + "line": "Could be / was a OS supported job id on no longer supported platforms" + }, { + "line": "like Cray, Nec, Irix." + }], + "type": "lStringT", + "flags": [{ + "name": "SPOOL_SUBLIST" + }] + }, { + "name": "systemd_scope", + "summary": "Systemd Scope", + "description": [{ + "line": "In case a job is started in a systemd scope, this is the" + }, { + "line": "scope unit name." + }, { + "line": "Only used in sge_execd, it is set during the startup phase of the" + }, { + "line": "job." + }], + "type": "lStringT", + "flags": [{ + "name": "SPOOL_SUBLIST" + }] + }, { + "name": "systemd_slice", + "summary": "Systemd Slice", + "description": [{ + "line": "In case of tightly integrated parallel jobs, the job's tasks (master and slave)" + }, { + "line": "are started within a slice. We only store it if we have a tightly integrated parallel job," }, { - "line": "SOLARIS/LINUX: additional group id" + "line": "and this slice needs to be deleted after the job is finished." }, { - "line": "Could be / was a OS supported job id on no longer supported platforms" + "line": "Only used in sge_execd, it is set during the startup phase of the" }, { - "line": "like Cray, Nec, Irix." - }], + "line": "job." + }], "type": "lStringT", "flags": [{ "name": "SPOOL_SUBLIST" }] + }, { + "name": "usage_collection", + "summary": "Usage Collection Mode", + "description": [{ + "line": "Usage collection mode of the array task." + }, { + "line": "One of 0 (FALSE), 1 (PDC), 2 (HYBRID), 3 (TRUE)." + }], + "type": "lUlongT", + "flags": [] }, { "name": "usage_list", "summary": "Usage List", diff --git a/source/libs/sgeobj/json/JL.json b/source/libs/sgeobj/json/JL.json index 07754aae90..41f38e6ce5 100644 --- a/source/libs/sgeobj/json/JL.json +++ b/source/libs/sgeobj/json/JL.json @@ -14,7 +14,17 @@ "line": "The Job Id."
}], "type": "lUlongT", - "flags": [] + "flags": [ + { + "name": "PRIMARY_KEY" + }, + { + "name": "UNIQUE" + }, + { + "name": "HASH" + } + ] }, { "name": "OS_job_list", "summary": "OS Job List", diff --git a/source/libs/sgeobj/json/JO.json b/source/libs/sgeobj/json/JO.json index 6667ad7e11..339c5d75bb 100644 --- a/source/libs/sgeobj/json/JO.json +++ b/source/libs/sgeobj/json/JO.json @@ -27,6 +27,24 @@ }], "type": "lUlongT", "flags": [] + }, { + "name": "systemd_scope", + "summary": "Systemd Scope", + "description": [{ + "line": "When the job is running in a systemd scope, this field contains the name of the scope." + }], + "type": "lStringT", + "flags": [] + }, { + "name": "usage_collection", + "summary": "Usage Collection Mode", + "description": [{ + "line": "Usage collection mode used for the os job." + }, { + "line": "One of 0 (FALSE), 1 (PDC), 2 (HYBRID), 3 (TRUE)." + }], + "type": "lUlongT", + "flags": [] }, { "name": "ja_task_ID", "summary": "Array Task Id", diff --git a/source/libs/sgeobj/json/PET.json b/source/libs/sgeobj/json/PET.json index f0010b4f2d..8a40c2686b 100644 --- a/source/libs/sgeobj/json/PET.json +++ b/source/libs/sgeobj/json/PET.json @@ -67,10 +67,34 @@ "name": "osjobid", "summary": "OS JobId", "description": [{ - "line": "OS jobid / additional group id of a running pe task." + "line": "OS jobid / additional group id of a running pe task." + }], + "type": "lStringT", + "flags": [] + }, { + "name": "systemd_scope", + "summary": "Systemd Scope", + "description": [{ + "line": "In case a pe task is started in a systemd scope, this is the" + }, { + "line": "scope unit name." + }, { + "line": "Only used in sge_execd, it is set during the startup phase of the" + }, { + "line": "pe task." }], "type": "lStringT", "flags": [] + }, { + "name": "usage_collection", + "summary": "Usage Collection Mode", + "description": [{ + "line": "Usage collection mode of a pe task." + }, { + "line": "One of 0 (FALSE), 1 (PDC), 2 (HYBRID), 3 (TRUE)." 
+      }],
+      "type": "lUlongT",
+      "flags": []
    }, {
       "name": "usage",
       "summary": "Usage",
diff --git a/source/libs/sgeobj/ocs_Job.cc b/source/libs/sgeobj/ocs_Job.cc
index 7a4f669317..30bb206a1e 100644
--- a/source/libs/sgeobj/ocs_Job.cc
+++ b/source/libs/sgeobj/ocs_Job.cc
@@ -40,9 +40,13 @@
 #include "ocs_Job.h"
 #include "sge_ja_task.h"
 #include "sge_job.h"
+#include "sge_pe.h"
+#include "sge_pe_task.h"
 
 #include "cull/sge_eejob_SGEJ_L.h"
 
+#include "uti/ocs_Systemd.h"
+
 /** @brief Sort jobs in the job list based on prio, submit time and job number
  *
  * This function sorts the jobs in the provided job list based on the
@@ -97,3 +101,75 @@ void ocs::Job::sgeee_sort_jobs(lList **job_list) {
 
    DRETURN_VOID;
 }
+
+/**
+ * @brief Build the names of the systemd slice and scope a job, array task or pe task is started in.
+ *
+ * With TOP being the name returned by ocs::uti::Systemd::get_slice_name(), ID being the job number
+ * (array jobs: job number "." array task number) and PET being the pe task id:
+ *   - job without tight integration:    slice TOP-jobs.slice,    scope TOP.ID.scope
+ *   - tightly integrated, master task:  slice TOP-jobs-ID.slice, scope TOP.ID.master.scope
+ *   - tightly integrated, pe task:      slice TOP-jobs-ID.slice, scope TOP.ID.PET.scope
+ *
+ * @param job        the job
+ * @param ja_task    the array task, may be nullptr only if the job is not an array job
+ * @param pe_task    the pe task, nullptr for the master task or a job without pe tasks
+ * @param slice      output: slice name, cleared if false is returned
+ * @param scope      output: scope name, cleared if false is returned
+ * @param error_dstr currently not written to
+ * @return true if the names were built, false if the build has no systemd support
+ *         or an array job was passed without its array task
+ */
+bool
+ocs::Job::job_get_systemd_slice_and_scope(const lListElem *job, const lListElem *ja_task, const lListElem *pe_task,
+                                          std::string &slice, std::string &scope, dstring *error_dstr) {
+   DENTER(TOP_LAYER);
+
+   bool ret = true;
+
+#if defined(OCS_WITH_SYSTEMD)
+
+   bool is_array = job_is_array(job);
+   bool is_tightly_integrated = false;
+   if (ja_task != nullptr) {
+      const lListElem *pe = lGetObject(ja_task, JAT_pe_object);
+      if (pe != nullptr) {
+         is_tightly_integrated = lGetBool(pe, PE_control_slaves);
+      }
+   }
+
+   if (is_array && ja_task == nullptr) {
+      // the array task number is part of both names, we cannot build them without the array task
+      slice.clear();
+      scope.clear();
+      ret = false;
+   } else {
+      // job number, for array jobs followed by the array task number, e.g. 1234 or 1234.1
+      std::string id = std::to_string(lGetUlong(job, JB_job_number));
+      if (is_array) {
+         id += "." + std::to_string(lGetUlong(ja_task, JAT_task_number));
+      }
+
+      std::string toplevel_slice = ocs::uti::Systemd::get_slice_name();
+      slice = toplevel_slice + "-jobs";
+      scope = toplevel_slice + "." + id;
+      if (is_tightly_integrated) {
+         // master and slave tasks share a job specific slice and get individual scopes,
+         // e.g. ocs8012-jobs-1234.1.slice with ocs8012.1234.1.master.scope
+         slice += "-" + id;
+         scope += ".";
+         scope += (pe_task == nullptr) ? "master" : lGetString(pe_task, PET_id);
+      }
+      // else: all other jobs share the slice, e.g. ocs8012-jobs.slice with ocs8012.1234.scope
+
+      slice += ".slice";
+      scope += ".scope";
+   }
+#else
+   slice.clear();
+   scope.clear();
+   ret = false;
+#endif
+
+   DRETURN(ret);
+}
diff --git a/source/libs/sgeobj/ocs_Job.h b/source/libs/sgeobj/ocs_Job.h
index 719202bd02..dc471832e2 100644
--- a/source/libs/sgeobj/ocs_Job.h
+++ b/source/libs/sgeobj/ocs_Job.h
@@ -33,11 +33,15 @@
  ************************************************************************/
 /*___INFO__MARK_END__*/
 
+#include <string>
+
 #include "cull/cull.h"
 
 namespace ocs {
    class Job {
    public:
       static void sgeee_sort_jobs(lList **job_list);
+      static bool job_get_systemd_slice_and_scope(const lListElem *job, const lListElem *ja_task, const lListElem *pe_task,
+                                                  std::string &slice, std::string &scope, dstring *error_dstr);
    };
 }
diff --git a/source/libs/sgeobj/sge_conf.cc b/source/libs/sgeobj/sge_conf.cc
index 87233ef4d3..08909390a1 100644
--- a/source/libs/sgeobj/sge_conf.cc
+++ b/source/libs/sgeobj/sge_conf.cc
@@ -145,6 +145,7 @@ static bool enable_enforce_master_limit = false;
 static bool enable_test_sleep_after_request = false;
 static bool enable_forced_qdel_if_unknown = false;
 static bool ignore_ngroups_max_limit = false;
+static bool enable_systemd = true;
 static bool do_credentials = true;
 static bool do_authentication = true;
 static bool is_monitor_message = true;
@@ -183,6 +184,8 @@ static long ptf_min_priority = -999;
 static int
max_dynamic_event_clients = 1000;
 static keep_active_t keep_active = KEEP_ACTIVE_FALSE;
+static usage_collection_t usage_collection = USAGE_COLLECTION_DEFAULT;
+
 
 static u_long32 script_timeout = 120;
 #ifdef LINUX
 static bool enable_binding = true;
@@ -680,6 +683,7 @@ int merge_configuration(lList **answer_list, u_long32 progid, const char *cell_r
       enable_test_sleep_after_request = false;
       enable_forced_qdel_if_unknown = false;
       ignore_ngroups_max_limit = false;
+      enable_systemd = true;
       do_credentials = true;
       do_authentication = true;
       is_monitor_message = true;
@@ -911,6 +915,7 @@ int merge_configuration(lList **answer_list, u_long32 progid, const char *cell_r
       ptf_max_priority = -999;
       ptf_min_priority = -999;
       keep_active = KEEP_ACTIVE_FALSE;
+      usage_collection = USAGE_COLLECTION_DEFAULT;
       script_timeout = 120;
 #ifdef LINUX
       enable_binding = true;
@@ -976,6 +981,25 @@ int merge_configuration(lList **answer_list, u_long32 progid, const char *cell_r
                   continue;
                }
             }
+            {
+               // Match the name including the '=': only then the value pointer below is guaranteed
+               // to stay within the parameter string (a bare "USAGE_COLLECTION" was read past its end).
+               if (strncasecmp(s, "USAGE_COLLECTION=", sizeof("USAGE_COLLECTION=")-1) == 0) {
+                  const char *usage_collection_str = &s[sizeof("USAGE_COLLECTION=")-1];
+
+                  // TRUE selects the default mode, PDC and HYBRID select these modes, any other value disables it
+                  if (strncasecmp(usage_collection_str, TRUE_STR, sizeof(TRUE_STR)-1) == 0) {
+                     usage_collection = USAGE_COLLECTION_DEFAULT;
+                  } else if (strncasecmp(usage_collection_str, "PDC", sizeof("PDC")-1) == 0) {
+                     usage_collection = USAGE_COLLECTION_PDC;
+                  } else if (strncasecmp(usage_collection_str, "HYBRID", sizeof("HYBRID")-1) == 0) {
+                     usage_collection = USAGE_COLLECTION_HYBRID;
+                  } else {
+                     usage_collection = USAGE_COLLECTION_NONE;
+                  }
+                  continue;
+               }
+            }
             if (parse_time_param(s, "SCRIPT_TIMEOUT", &script_timeout)) {
                continue;
             }
@@ -1092,7 +1116,10 @@ int merge_configuration(lList **answer_list, u_long32 progid, const char *cell_r
             }
             if (parse_bool_param(s, "IGNORE_NGROUPS_MAX_LIMIT", &ignore_ngroups_max_limit)) {
                continue;
-            }
+            }
+            if (parse_bool_param(s, "ENABLE_SYSTEMD", &enable_systemd)) {
+               continue;
+            }
          }
          SGE_UNLOCK(LOCK_MASTER_CONF, LOCK_WRITE);
          sge_free_saved_vars(conf_context);
@@ -2133,6 +2160,18 @@ keep_active_t mconf_get_keep_active() {
    DRETURN(ret);
 }
 
+usage_collection_t mconf_get_usage_collection() {
+   DENTER(BASIS_LAYER);
+
+   usage_collection_t ret;
+
+   SGE_LOCK(LOCK_MASTER_CONF, LOCK_READ);
+   ret = usage_collection;
+   SGE_UNLOCK(LOCK_MASTER_CONF, LOCK_READ);
+
+   DRETURN(ret);
+}
+
 bool mconf_get_enable_binding() {
    bool ret;
 
@@ -2703,6 +2742,16 @@ bool mconf_get_ignore_ngroups_max_limit() {
    DRETURN(ret);
 }
 
+bool mconf_get_enable_systemd() {
+   bool ret;
+
+   DENTER(BASIS_LAYER);
+   SGE_LOCK(LOCK_MASTER_CONF, LOCK_READ);
+   ret = enable_systemd;
+   SGE_UNLOCK(LOCK_MASTER_CONF, LOCK_READ);
+   DRETURN(ret);
+}
+
 bool mconf_get_enable_submit_lib_path() {
    int ret;
 
diff --git a/source/libs/sgeobj/sge_conf.h b/source/libs/sgeobj/sge_conf.h
index 6916577758..5b93766429 100644
--- a/source/libs/sgeobj/sge_conf.h
+++ b/source/libs/sgeobj/sge_conf.h
@@ -52,6 +52,13 @@ typedef enum {
    KEEP_ACTIVE_ERROR
 } keep_active_t;
 
+typedef enum {
+   USAGE_COLLECTION_NONE = 0, // no usage collection
+   USAGE_COLLECTION_PDC,      // usage collection via PDC
+   USAGE_COLLECTION_HYBRID,   // usage collection via PDC and systemd
+   USAGE_COLLECTION_DEFAULT   // default: systemd if available, otherwise PDC
+} usage_collection_t;
+
 typedef int (*tDaemonizeFunc)(void *ctx);
 
 /* This list is *ONLY* used by the execd and should be moved eventually */
@@ -128,6 +135,7 @@ bool mconf_get_do_authentication();
 bool mconf_get_acct_reserved_usage();
 bool mconf_get_sharetree_reserved_usage();
 keep_active_t mconf_get_keep_active();
+usage_collection_t mconf_get_usage_collection();
 bool mconf_get_enable_binding();
 bool mconf_get_simulate_execds();
 bool mconf_get_simulate_jobs();
@@ -183,6 +191,7 @@ void mconf_get_s_locks(char **pret);
 int mconf_get_jsv_timeout();
 int mconf_get_jsv_threshold();
 bool mconf_get_ignore_ngroups_max_limit();
+bool mconf_get_enable_systemd();
 bool mconf_get_enable_submit_lib_path();
 bool mconf_get_enable_submit_ld_preload();
 u_long32
mconf_get_script_timeout(); diff --git a/source/libs/sgeobj/sge_daemonize.cc b/source/libs/sgeobj/sge_daemonize.cc index 5667680804..8a0f9c3dae 100644 --- a/source/libs/sgeobj/sge_daemonize.cc +++ b/source/libs/sgeobj/sge_daemonize.cc @@ -75,7 +75,7 @@ static int fd_pipe[2]; * The parent process will exit with one of the following exit states: * * typedef enum uti_daemonize_state_type { -* SGE_DEAMONIZE_OK = 0, ok +* SGE_DAEMONIZE_OK = 0, ok * SGE_DAEMONIZE_DEAD_CHILD = 1, child exited before sending state * SGE_DAEMONIZE_TIMEOUT = 2 timeout whild waiting for state * } uti_daemonize_state_t; @@ -181,7 +181,7 @@ bool sge_daemonize_prepare() { } switch (exit_status) { - case SGE_DEAMONIZE_OK: + case SGE_DAEMONIZE_OK: INFO(SFNMAX, MSG_UTI_DAEMONIZE_OK); break; case SGE_DAEMONIZE_DEAD_CHILD: @@ -269,7 +269,7 @@ sge_daemonize_finalize() { } /* The response id has 4 byte, send it to father process */ - snprintf(tmp_buffer, 4, "%3d", SGE_DEAMONIZE_OK); + snprintf(tmp_buffer, 4, "%3d", SGE_DAEMONIZE_OK); if (write(fd_pipe[1], tmp_buffer, 4) != 4) { dstring ds = DSTRING_INIT; CRITICAL(MSG_FILE_CANNOT_WRITE_SS, "fd_pipe[1]", sge_strerror(errno, &ds)); diff --git a/source/libs/sgeobj/sge_proc.cc b/source/libs/sgeobj/sge_proc.cc index 7a8cef8311..25237ba543 100644 --- a/source/libs/sgeobj/sge_proc.cc +++ b/source/libs/sgeobj/sge_proc.cc @@ -55,8 +55,8 @@ static lList *procList; *******************************************************************************/ lListElem *get_pr (int pid) { - if (!procList) { - gen_procList (); + if (procList == nullptr) { + gen_procList(); return nullptr; } @@ -65,8 +65,8 @@ lListElem *get_pr (int pid) void append_pr (lListElem *pr) { - if (!procList) { - gen_procList (); + if (procList == nullptr) { + gen_procList(); } lAppendElem(procList, pr); } diff --git a/source/libs/uti/CMakeLists.txt b/source/libs/uti/CMakeLists.txt index 53d3a7e13c..66457f6a15 100644 --- a/source/libs/uti/CMakeLists.txt +++ b/source/libs/uti/CMakeLists.txt @@ -24,6 +24,7 
@@ set(LIBRARY_SOURCES config_file.cc ocs_JsonUtil.cc ocs_Munge.cc + ocs_Systemd.cc ocs_TerminationManager.cc ocs_cond.cc sge_afsutil.cc diff --git a/source/libs/uti/msg_utilib.h b/source/libs/uti/msg_utilib.h index e5baccb9f5..6239f1bcec 100644 --- a/source/libs/uti/msg_utilib.h +++ b/source/libs/uti/msg_utilib.h @@ -1,35 +1,35 @@ #pragma once /*___INFO__MARK_BEGIN__*/ /************************************************************************* - * + * * The Contents of this file are made available subject to the terms of * the Sun Industry Standards Source License Version 1.2 - * + * * Sun Microsystems Inc., March, 2001 - * - * + * + * * Sun Industry Standards Source License Version 1.2 * ================================================= * The contents of this file are subject to the Sun Industry Standards * Source License Version 1.2 (the "License"); You may not use this file * except in compliance with the License. You may obtain a copy of the * License at http://gridengine.sunsource.net/Gridengine_SISSL_license.html - * + * * Software provided under this License is provided on an "AS IS" basis, * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, * WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS, * MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING. * See the License for the specific provisions governing your rights and * obligations concerning the Software. - * + * * The Initial Developer of the Original Code is: Sun Microsystems, Inc. - * + * * Copyright: 2001 by Sun Microsystems, Inc. * * All Rights Reserved. * * Portions of this code are Copyright 2011 Univa Inc. 
- * + * * Portions of this software are Copyright (c) 2023-2024 HPC-Gridware GmbH * ************************************************************************/ @@ -212,4 +212,24 @@ #define MSG_UTI_UNABLE_TO_EXTRACT_GID _MESSAGE(59245, _("unable to extract gid from auth_info")) #define MSG_UTI_UNABLE_TO_EXTRACT_NSUP _MESSAGE(59246, _("unable to extract number of supplementary groups from auth_info")) #define MSG_UTI_UNABLE_TO_EXTRACT_SUP_S _MESSAGE(59247, _("unable to extract supplementary groups from auth_info: " SFN)) + +/* ocs_Systemd.cc */ +#define MSG_SYSTEMD_ALREADY_INITIALIZED _MESSAGE(59250, _("Systemd library already loaded")) +#define MSG_SYSTEMD_OPEN_LIB_SS _MESSAGE(59251, _("can't open shared library " SFN ": " SFN4)) +#define MSG_SYSTEMD_LOAD_FUNC_SS _MESSAGE(59252, _("can't load function " SFN ": " SFN4)) +#define MSG_SYSTEMD_CANNOT_CONNECT_IS _MESSAGE(59253, _("cannot connect to system bus: error %d: " SFN4)) +#define MSG_SYSTEMD_CANNOT_CALL_SSIS _MESSAGE(59254, _("systemd method " SFN "(" SFN ") failed: error %d: " SFN4)) +#define MSG_SYSTEMD_AFTER_RETRIES_I _MESSAGE(59255, _(" (%d retries after EINTR)")) +#define MSG_SYSTEMD_CANNOT_READ_RESULT_SISS _MESSAGE(59256, _("cannot read result from systemd method " SFN ": error %d: " SFN4)) +#define MSG_SYSTEMD_EMPTY_RESULT_S _MESSAGE(59257, _("empty result from systemd method " SFN)) +#define MSG_SYSTEMD_CANNOT_CREATE_MESSAGE_CALL_SIS _MESSAGE(59258, _("cannot create message for systemd method " SFN ": error %d: " SFN4)) +#define MSG_SYSTEMD_CANNOT_APPEND_TO_MESSAGE_SSIS _MESSAGE(59259, _("cannot append " SFN " to message for systemd method " SFN ": error %d: " SFN4)) +#define MSG_SYSTEMD_CANNOT_OPEN_CONTAINER_SSIS _MESSAGE(59260, _("cannot open container " SFN " for systemd method " SFN ": error %d: " SFN4)) +#define MSG_SYSTEMD_CANNOT_CLOSE_CONTAINER_SSIS _MESSAGE(59261, _("cannot close container " SFN " for systemd method " SFN ": error %d: " SFN4)) +#define MSG_SYSTEMD_CANNOT_APPEND_PROPERTY_SSIS 
_MESSAGE(59262, _("cannot append property " SFN " to message for systemd method " SFN ": error %d: " SFN4)) +#define MSG_SYSTEMD_CANNOT_READ_PROPERTY_RESULT_SIS _MESSAGE(59263, _("cannot read result value of property " SFN ": error %d: " SFN4)) +#define MSG_SYSTEMD_CANNOT_ENCODE_PATH_SIS _MESSAGE(59264, _("cannot encode path for unit " SFN ": error %d: " SFN4)) +#define MSG_SYSTEMD_CANNOT_DETECT_CGROUP_VERSION _MESSAGE(59265, _("cannot detect cgroup version")) +#define MSG_SYSTEMD_NOT_ROOT _MESSAGE(59266, _("cannot initialize the systemd integration, we are not running as root user")) + // clang-format on diff --git a/source/libs/uti/ocs_Systemd.cc b/source/libs/uti/ocs_Systemd.cc new file mode 100644 index 0000000000..aa1e37b241 --- /dev/null +++ b/source/libs/uti/ocs_Systemd.cc @@ -0,0 +1,1532 @@ +/*___INFO__MARK_BEGIN_NEW__*/ +/*************************************************************************** + * + * Copyright 2025 HPC-Gridware GmbH + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ***************************************************************************/ +/*___INFO__MARK_END_NEW__*/ + +#if defined(OCS_WITH_SYSTEMD) +#include +#include +#include + +#include "sge_bootstrap_env.h" +#include "sge_log.h" +#include "sge_rmon_macros.h" +#include "sge_time.h" + +#include "msg_utilib.h" +#include "ocs_Systemd.h" + +namespace ocs::uti { + static const int NUM_SD_BUS_RETRIES = 5; // Number of retries for sd_bus_* methods after EINTR + + //================================================================================ + // static members + void *ocs::uti::Systemd::lib_handle = nullptr; + sd_bus_open_system_func_t Systemd::sd_bus_open_system_func = nullptr; + sd_bus_unref_func_t Systemd::sd_bus_unref_func = nullptr; + sd_bus_call_method_func_t Systemd::sd_bus_call_method_func = nullptr; + sd_bus_message_read_func_t Systemd::sd_bus_message_read_func = nullptr; + sd_bus_message_new_method_call_func_t Systemd::sd_bus_message_new_method_call_func = nullptr; + sd_bus_message_unref_func_t Systemd::sd_bus_message_unref_func = nullptr; + sd_bus_message_append_func_t Systemd::sd_bus_message_append_func = nullptr; + sd_bus_message_append_array_func_t Systemd::sd_bus_message_append_array_func = nullptr; + sd_bus_message_open_container_func_t Systemd::sd_bus_message_open_container_func = nullptr; + sd_bus_message_close_container_func_t Systemd::sd_bus_message_close_container_func = nullptr; + sd_bus_call_func_t Systemd::sd_bus_call_func = nullptr; + sd_bus_add_match_func_t Systemd::sd_bus_add_match_func = nullptr; + sd_bus_match_signal_func_t Systemd::sd_bus_match_signal_func = nullptr; + sd_bus_slot_unref_func_t Systemd::sd_bus_slot_unref_func = nullptr; + sd_bus_process_func_t Systemd::sd_bus_process_func = nullptr; + sd_bus_wait_func_t Systemd::sd_bus_wait_func = nullptr; + sd_bus_message_get_member_func_t Systemd::sd_bus_message_get_member_func = nullptr; + sd_bus_message_get_sender_func_t Systemd::sd_bus_message_get_sender_func = nullptr; + 
sd_bus_path_encode_func_t Systemd::sd_bus_path_encode_func = nullptr; + sd_bus_get_property_func_t Systemd::sd_bus_get_property_func = nullptr; + sd_bus_error_free_func_t Systemd::sd_bus_error_free_func = nullptr; + + std::string Systemd::slice_name{}; + std::string Systemd::service_name{}; + bool Systemd::running_as_service{false}; + int Systemd::cgroup_version{}; + int Systemd::systemd_version{}; + + // @todo move somewhere else + static std::string + get_slice_file_name() { + std::string ret = bootstrap_get_sge_root(); + ret += "/"; + ret += bootstrap_get_sge_cell(); + ret += "/common/slice_name"; + return ret; + } + + // @todo move somewhere else + // @todo add a dstring for reporting errors? + static bool + read_one_line_file(const std::string &file_name, std::string &line) { + bool ret = false; + std::ifstream file(file_name); + if (file.is_open()) { + if (std::getline(file, line)) { + ret = true; + } + file.close(); + } + return ret; + } + + // ================================================================================ + // @brief Initialize the Systemd class + // + // This function loads the systemd shared library and retrieves the function pointers + // for the sd-bus API functions. It checks for errors during loading and function + // retrieval, and returns a boolean indicating success or failure. + // + // @param error_dstr A pointer to a dstring where error messages will be stored + // @return true if initialization was successful, false otherwise + // @note This function should be called before using any other methods of the Systemd class. + // @note Must be root when calling this function (in our daemons before switching to admin user). 
+   bool
+   Systemd::initialize(const std::string &service_name_in, dstring *error_dstr) {
+      DENTER(TOP_LAYER);
+      bool ret = true;
+
+      if (std::filesystem::exists("/sys/fs/cgroup/systemd")) {
+         // cgroup v1 is available
+         cgroup_version = 1;
+      } else if (std::filesystem::exists("/sys/fs/cgroup/system.slice")) {
+         // cgroup v2 is available
+         cgroup_version = 2;
+      } else {
+         sge_dstring_sprintf(error_dstr, SFNMAX, MSG_SYSTEMD_CANNOT_DETECT_CGROUP_VERSION);
+         ret = false;
+      }
+      DPRINTF("==> cgroup version: %d", cgroup_version);
+
+      // In order to use systemd, we need to be root (for writing operations)
+      if (ret && getuid() != 0) {
+         sge_dstring_sprintf(error_dstr, SFNMAX, MSG_SYSTEMD_NOT_ROOT);
+         ret = false;
+      }
+
+      // initialize only once
+      if (ret && lib_handle != nullptr) {
+         sge_dstring_sprintf(error_dstr, SFNMAX, MSG_SYSTEMD_ALREADY_INITIALIZED);
+         ret = false;
+      }
+
+      // Load the shared library and the required functions
+      if (ret) {
+         const char *libsystemd = "libsystemd.so.0";
+         lib_handle = dlopen(libsystemd, RTLD_LAZY);
+         if (lib_handle == nullptr) {
+            sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_OPEN_LIB_SS, libsystemd, dlerror());
+            ret = false;
+         }
+      }
+
+      // load the functions
+      const char *func;
+      if (ret) {
+         func = "sd_bus_open_system";
+         sd_bus_open_system_func = reinterpret_cast<sd_bus_open_system_func_t>(dlsym(lib_handle, func));
+         if (sd_bus_open_system_func == nullptr) {
+            sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_LOAD_FUNC_SS, func, dlerror());
+            ret = false;
+         }
+      }
+      if (ret) {
+         // func = "sd_bus_unref";
+         // safer is sd_bus_flush_close_unref()
+         func = "sd_bus_flush_close_unref";
+         sd_bus_unref_func = reinterpret_cast<sd_bus_unref_func_t>(dlsym(lib_handle, func));
+         if (sd_bus_unref_func == nullptr) {
+            sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_LOAD_FUNC_SS, func, dlerror());
+            ret = false;
+         }
+      }
+      if (ret) {
+         func = "sd_bus_call_method";
+         sd_bus_call_method_func = reinterpret_cast<sd_bus_call_method_func_t>(dlsym(lib_handle, func));
+         if (sd_bus_call_method_func == nullptr) { // check the pointer that was just resolved
+            sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_LOAD_FUNC_SS, func, dlerror());
+            ret = false;
+         }
+      }
+      if (ret) {
+         func = "sd_bus_message_read";
+         sd_bus_message_read_func = reinterpret_cast<sd_bus_message_read_func_t>(dlsym(lib_handle, func));
+         if (sd_bus_message_read_func == nullptr) { // check the pointer that was just resolved
+            sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_LOAD_FUNC_SS, func, dlerror());
+            ret = false;
+         }
+      }
+
+      if (ret) {
+         func = "sd_bus_message_new_method_call";
+         sd_bus_message_new_method_call_func = reinterpret_cast<sd_bus_message_new_method_call_func_t>(dlsym(lib_handle, func));
+         if (sd_bus_message_new_method_call_func == nullptr) {
+            sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_LOAD_FUNC_SS, func, dlerror());
+            ret = false;
+         }
+      }
+      if (ret) {
+         func = "sd_bus_message_unref";
+         sd_bus_message_unref_func = reinterpret_cast<sd_bus_message_unref_func_t>(dlsym(lib_handle, func));
+         if (sd_bus_message_unref_func == nullptr) {
+            sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_LOAD_FUNC_SS, func, dlerror());
+            ret = false;
+         }
+      }
+      if (ret) {
+         func = "sd_bus_message_append";
+         sd_bus_message_append_func = reinterpret_cast<sd_bus_message_append_func_t>(dlsym(lib_handle, func));
+         if (sd_bus_message_append_func == nullptr) {
+            sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_LOAD_FUNC_SS, func, dlerror());
+            ret = false;
+         }
+      }
+      if (ret) {
+         func = "sd_bus_message_append_array";
+         sd_bus_message_append_array_func = reinterpret_cast<sd_bus_message_append_array_func_t>(dlsym(lib_handle, func));
+         if (sd_bus_message_append_array_func == nullptr) {
+            sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_LOAD_FUNC_SS, func, dlerror());
+            ret = false;
+         }
+      }
+      if (ret) {
+         func = "sd_bus_message_open_container";
+         sd_bus_message_open_container_func = reinterpret_cast<sd_bus_message_open_container_func_t>(dlsym(lib_handle, func));
+         if (sd_bus_message_open_container_func == nullptr) {
+            sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_LOAD_FUNC_SS, func, dlerror());
+            ret = false;
+         }
+      }
+      if (ret) {
+         func = "sd_bus_message_close_container";
+         sd_bus_message_close_container_func = reinterpret_cast<sd_bus_message_close_container_func_t>(dlsym(lib_handle, func));
+         if (sd_bus_message_close_container_func == nullptr) {
+            sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_LOAD_FUNC_SS, func,
dlerror()); + ret = false; + } + } + if (ret) { + func = "sd_bus_call"; + sd_bus_call_func = reinterpret_cast(dlsym(lib_handle, func)); + if (sd_bus_call_func == nullptr) { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_LOAD_FUNC_SS, func, dlerror()); + ret = false; + } + } + if (ret) { + func = "sd_bus_add_match"; + sd_bus_add_match_func = reinterpret_cast(dlsym(lib_handle, func)); + if (sd_bus_add_match_func == nullptr) { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_LOAD_FUNC_SS, func, dlerror()); + ret = false; + } + } + if (ret) { + func = "sd_bus_match_signal"; + sd_bus_match_signal_func = reinterpret_cast(dlsym(lib_handle, func)); + if (sd_bus_match_signal_func == nullptr) { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_LOAD_FUNC_SS, func, dlerror()); + ret = false; + } + } + if (ret) { + func = "sd_bus_slot_unref"; + sd_bus_slot_unref_func = reinterpret_cast(dlsym(lib_handle, func)); + if (sd_bus_slot_unref_func == nullptr) { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_LOAD_FUNC_SS, func, dlerror()); + ret = false; + } + } + if (ret) { + func = "sd_bus_process"; + sd_bus_process_func = reinterpret_cast(dlsym(lib_handle, func)); + if (sd_bus_process_func == nullptr) { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_LOAD_FUNC_SS, func, dlerror()); + ret = false; + } + } + if (ret) { + func = "sd_bus_wait"; + sd_bus_wait_func = reinterpret_cast(dlsym(lib_handle, func)); + if (sd_bus_wait_func == nullptr) { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_LOAD_FUNC_SS, func, dlerror()); + ret = false; + } + } + if (ret) { + func = "sd_bus_message_get_sender"; + sd_bus_message_get_sender_func = reinterpret_cast(dlsym(lib_handle, func)); + if (sd_bus_message_get_sender_func == nullptr) { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_LOAD_FUNC_SS, func, dlerror()); + ret = false; + } + } + if (ret) { + func = "sd_bus_message_get_member"; + sd_bus_message_get_member_func = reinterpret_cast(dlsym(lib_handle, func)); + if (sd_bus_message_get_member_func == nullptr) { + 
sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_LOAD_FUNC_SS, func, dlerror()); + ret = false; + } + } + if (ret) { + func = "sd_bus_path_encode"; + sd_bus_path_encode_func = reinterpret_cast(dlsym(lib_handle, func)); + if (sd_bus_path_encode_func == nullptr) { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_LOAD_FUNC_SS, func, dlerror()); + ret = false; + } + } + if (ret) { + func = "sd_bus_get_property"; + sd_bus_get_property_func = reinterpret_cast(dlsym(lib_handle, func)); + if (sd_bus_get_property_func == nullptr) { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_LOAD_FUNC_SS, func, dlerror()); + ret = false; + } + } + if (ret) { + func = "sd_bus_error_free"; + sd_bus_error_free_func = reinterpret_cast(dlsym(lib_handle, func)); + if (sd_bus_error_free_func == nullptr) { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_LOAD_FUNC_SS, func, dlerror()); + ret = false; + } + } + + if (ret) { + // We will need the slice name for the service name, read it once. + // If we have a slice name, we can figure out if we are running as service. + service_name = service_name_in; + std::string slice_file_name = get_slice_file_name(); + if (read_one_line_file(slice_file_name, slice_name)) { + // build service name, e.g. 
"ocs6444-execd.service" + std::string full_service_name = slice_name + "-" + service_name; + // get unit path of service + Systemd systemd; + if (systemd.connect(error_dstr)) { + // get systemd version + std::string systemd_version_str; + systemd.sd_bus_get_property("Manager", "", "Version", systemd_version_str, error_dstr); + systemd_version = std::stoi(systemd_version_str); + DPRINTF("systemd version: %d", systemd_version); + + std::string service_unit_path; + bool have_service_unit = systemd.sd_bus_method_s_o("GetUnit", full_service_name, service_unit_path, error_dstr); + if (have_service_unit) { + DPRINTF("have_service_unit: %s", service_unit_path.c_str()); + // get the unit path of this process + std::string pid_unit_path; + bool have_pid_unit = systemd.sd_bus_method_u_o("GetUnitByPID", getpid(), pid_unit_path, error_dstr); + // compare both unit paths, if they are equal then we are running as service + if (have_pid_unit) { + DPRINTF("have_pid_unit: %s", pid_unit_path.c_str()); + if (service_unit_path.compare(pid_unit_path) == 0) { + running_as_service = true; + DPRINTF("we are running as systemd service: %s", service_unit_path.c_str()); + } + } else { + DPRINTF("could not get unit path for pid %d: %s", getpid(), sge_dstring_get_string(error_dstr)); + // this is OK, we might not be running as a service + } + } else { + DPRINTF("could not get unit path for service %s: %s", full_service_name.c_str(), sge_dstring_get_string(error_dstr)); + // this is OK, we might not be running as a service + } + } else { + DPRINTF("cannot connect to systemd: %s", sge_dstring_get_string(error_dstr)); + ret = false; + } + } + } + + // if we could not load the library or the functions, + // or we cannot connect to the systemd, + // then close the library + if (!ret && lib_handle != nullptr) { + dlclose(lib_handle); + lib_handle = nullptr; + } + + return ret; + } + + /*! 
+ * @brief Check if systemd is available + * + * This function checks if the systemd library is loaded and available for use. + * It returns true if the library is loaded, otherwise false. + * + * @return true if systemd is available, false otherwise + */ + bool + Systemd::is_systemd_available() { + // Check if systemd is available + return lib_handle != nullptr; + } + + /*! + * @brief Check if we are running as a systemd service + * + * This function checks if the current process is running as a systemd service. + * It returns true if the process is running as a service, otherwise false. + * + * @return true if running as a service, false otherwise + */ + bool + Systemd::is_running_as_service() { + return running_as_service; + } + + //================================================================================ + // instance methods + + /*! + * @brief Constructor for the Systemd class + * + * This constructor initializes the Systemd object and sets the bus pointer to nullptr. + */ + Systemd::Systemd() + : bus(nullptr) { + } + + /*! + * @brief Destructor for the Systemd class + * + * This destructor cleans up the Systemd object by unreferencing the bus pointer + * if it is not nullptr. + */ + Systemd::~Systemd() { + // Destructor implementation + if (bus != nullptr) { + sd_bus_unref_func(bus); + bus = nullptr; + } + } + + /*! + * @brief Connect to the systemd bus + * + * This function attempts to connect to the systemd bus using the sd_bus_open_system_func. + * If the connection is successful, it returns true. If there is an error, it sets the + * error_dstr with an appropriate error message and returns false. 
+ * + * @param error_dstr A pointer to a dstring where error messages will be stored + * @return true if the connection was successful, false otherwise + */ + bool + Systemd::connect(dstring *error_dstr) { + bool ret = true; + + int r = sd_bus_open_system_func(&bus); + if (r < 0) { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_CANNOT_CONNECT_IS, r, strerror(-r)); + ret = false; + } + + return ret; + } + + /*! + * @brief Check if the systemd bus is connected + * + * This function checks if the bus pointer is not nullptr, indicating that a connection + * to the systemd bus has been established. + * + * @return true if connected to the systemd bus, false otherwise + */ + bool + Systemd::connected() const { + return bus != nullptr; + } + + /*! + * @brief Call a systemd method with a string input and get an object output + * + * This function calls a systemd method with a string input and retrieves an object output. + * An object is represented as a string in this context, which is the path to the object. + * It handles retries in case of EINTR errors and returns true if the call was successful, + * otherwise it sets the error_dstr with an appropriate error message. 
+ * + * @param method The name of the systemd method to call + * @param input The input string to pass to the method + * @param output A reference to a string where the output will be stored + * @param error_dstr A pointer to a dstring where error messages will be stored + * @return true if the method call was successful, false otherwise + */ + bool + Systemd::sd_bus_method_s_o(const std::string &method, std::string &input, std::string &output, dstring *error_dstr) const { + bool ret = true; + + bool retry_on_interrupt = true; // retry on EINTR + int retries = 0; + while (ret && retry_on_interrupt) { + retry_on_interrupt = false; + sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus_message *m = nullptr; + int r = sd_bus_call_method_func(bus, "org.freedesktop.systemd1", // service to contact + "/org/freedesktop/systemd1", // object path + "org.freedesktop.systemd1.Manager", // interface name + method.c_str(), // method name + &error, // object to return error in + &m, // return message on success + "s", // input signature + input.c_str()); // first argument + if (r < 0) { + if (-r == EINTR && retries++ < NUM_SD_BUS_RETRIES) { + retry_on_interrupt = true; + } else { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_CANNOT_CALL_SSIS, method.c_str(), input.c_str(), r, error.message); + if (retries > 0) { + sge_dstring_sprintf_append(error_dstr, MSG_SYSTEMD_AFTER_RETRIES_I, retries); + } + ret = false; + } + } else { + const char *result = nullptr; + r = sd_bus_message_read_func(m, "o", &result); + if (r < 0) { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_CANNOT_READ_RESULT_SISS, method.c_str(), r, error.message); + ret = false; + } else { + if (result == nullptr) { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_EMPTY_RESULT_S, method.c_str()); + ret = false; + } + output = result; + } + } + sd_bus_error_free_func(&error); + sd_bus_message_unref_func(m); + } + + return ret; + } + + /*! 
+ * @brief Call a systemd method with a uint32_t input and get an object output + * + * This function calls a systemd method with a uint32_t input and retrieves an object output. + * An object is represented as a string in this context, which is the path to the object. + * It handles retries in case of EINTR errors and returns true if the call was successful, + * otherwise it sets the error_dstr with an appropriate error message. + * + * @param method The name of the systemd method to call + * @param input The input uint32_t to pass to the method + * @param output A reference to a string where the output will be stored + * @param error_dstr A pointer to a dstring where error messages will be stored + * @return true if the method call was successful, false otherwise + */ + bool + Systemd::sd_bus_method_u_o(const std::string &method, uint32_t input, std::string &output, dstring *error_dstr) const { + bool ret = true; + + bool retry_on_interrupt = true; // retry on EINTR + int retries = 0; + while (ret && retry_on_interrupt) { + retry_on_interrupt = false; + sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus_message *m = nullptr; + int r = sd_bus_call_method_func(bus, "org.freedesktop.systemd1", // service to contact + "/org/freedesktop/systemd1", // object path + "org.freedesktop.systemd1.Manager", // interface name + method.c_str(), // method name + &error, // object to return error in + &m, // return message on success + "u", // input signature + input); // first argument + if (r < 0) { + if (-r == EINTR && retries++ < NUM_SD_BUS_RETRIES) { + retry_on_interrupt = true; + } else { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_CANNOT_CALL_SSIS, method.c_str(), std::to_string(input), r, error.message); + if (retries > 0) { + sge_dstring_sprintf_append(error_dstr, MSG_SYSTEMD_AFTER_RETRIES_I, retries); + } + ret = false; + } + } else { + const char *result = nullptr; + r = sd_bus_message_read_func(m, "o", &result); + if (r < 0) { + sge_dstring_sprintf(error_dstr, 
MSG_SYSTEMD_CANNOT_READ_RESULT_SISS, method.c_str(), r, error.message); + ret = false; + } else { + if (result == nullptr) { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_EMPTY_RESULT_S, method.c_str()); + ret = false; + } + output = result; + } + } + sd_bus_error_free_func(&error); + sd_bus_message_unref_func(m); + } + + return ret; + } + + /*! + * @brief Move a shepherd process to a systemd scope + * + * This function moves a shepherd process identified by its PID to a systemd scope. + * It first checks if the scope exists, and if not, it creates it. If the scope already + * exists, it attaches the shepherd process to it. It handles retries in case of EINTR errors + * and returns true if the operation was successful, otherwise it sets the error_dstr with an + * appropriate error message. + * + * @param pid The PID of the shepherd process to move + * @param error_dstr A pointer to a dstring where error messages will be stored + * @return true if the operation was successful, false otherwise + */ + bool + Systemd::move_shepherd_to_scope(pid_t pid, dstring *error_dstr) const { + DENTER(TOP_LAYER); + + bool ret = true; + bool create = false; + std::string full_scope_name = slice_name + "-" + "shepherds.scope"; + std::string full_slice_name = slice_name + ".slice"; + + bool retry_on_interrupt = true; // retry on EINTR + int retries = 0; + while (ret && retry_on_interrupt) { + retry_on_interrupt = false; + DPRINTF("Systemd::move_shepherd_to_scope: Calling GetUnit\n"); + sd_bus_message *m = nullptr; + sd_bus_error error = SD_BUS_ERROR_NULL; + // search the unit (full_scope_name) + // if it does not exist, then call StartTransientUnit + // if it exists, then call AttachProcessesToUnit + // if AttachProcessesToUnit fails with -2 (ENOENT), we might have run into a race condition: + // the unit was removed just between GetUnit and AttachProcessesToUnit. 
+ // then call StartTransientUnit which will re-create the unit + int r = sd_bus_call_method_func(bus, "org.freedesktop.systemd1", // service to contact + "/org/freedesktop/systemd1", // object path + "org.freedesktop.systemd1.Manager", // interface name + "GetUnit", // method name + &error, // object to return error in + &m, // return message on success + "s", // input signature + full_scope_name.c_str()); // input argument + if (r < 0) { + // ENOENT (-2): scope does not exist + if (-r == ENOENT) { + DPRINTF("Systemd::move_shepherd_to_scope: scope does not exist, create it\n"); + create = true; + } else if (-r == EINTR && retries < NUM_SD_BUS_RETRIES) { + retry_on_interrupt = true; + } else { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_CANNOT_CALL_SSIS, "GetUnit", full_scope_name.c_str(), r, error.message); + if (retries > 0) { + sge_dstring_sprintf_append(error_dstr, MSG_SYSTEMD_AFTER_RETRIES_I, retries); + } + ret = false; + } + } + + sd_bus_error_free_func(&error); + sd_bus_message_unref_func(m); + } + +#if 0 + // Enable for debugging the situation, that the scope did not exist, according to GetUnit, + // but when we try to create it, it suddenly exists. + // This can happen if sge_execd forks multiple shepherds in short succession, + // they all see that the unit does not exist, but only one of them can create it, the others fail. + // Submit an array job to the test host, and you should see the DPRINTF output below. + srand(pid); + int microsecs = rand() % 1000000; + DPRINTF("===> sleeping for %d µsec, then %s shepherd scope %s", microsecs, create ? 
"creating" : "attaching to", full_scope_name.c_str()); + usleep(microsecs); +#endif + + // the scope exists, we can attach the shepherd to it + if (ret && !create) { + DPRINTF("Systemd::move_shepherd_to_scope: calling AttachProcessesToUnit\n"); + bool scope_not_exists = false; + ret = attach_pid_to_scope(full_scope_name, pid, scope_not_exists, error_dstr); + if (!ret && scope_not_exists) { + // The scope vanished between GetUnit and AttachProcessesToUnit, + // we have to create it again. + + // Clear dstring as we handle the error. + sge_dstring_clear(error_dstr); + DPRINTF("Systemd::move_shepherd_to_scope: scope no longer exists, calling StartTransientUnit\n"); + create = true; + } + } + + if (ret && create) { + DPRINTF("Systemd::move_shepherd_to_scope: calling create_scope_with_pid\n"); + SystemdProperties_t properties; + bool scope_already_exists = false; + ret = create_scope_with_pid(full_scope_name, full_slice_name, properties, pid, scope_already_exists, error_dstr); + if (!ret) { + if (scope_already_exists) { + // The scope already exists, someone (another shepherd) created it in the meantime. + // We can try to attach the pid to the existing scope. + DPRINTF("===> Scope already exists, while GetUnit called earlier said it didn't exist."); + + // Clear dstring as we handle the error. + sge_dstring_clear(error_dstr); + + bool scope_not_exists = false; + ret = attach_pid_to_scope(full_scope_name, pid, scope_not_exists, error_dstr); + // if it fails here again, then there was really a problem + } + } + } + + DRETURN(ret); + } + + /*! + * @brief Attach a PID to a systemd scope + * + * This function attaches a given PID to a specified systemd scope. + * If the operation fails as the scope does not exist, it sets the scope_not_exists flag to true and + * returns false. It handles retries in case of EINTR errors and returns true + * if the operation was successful, otherwise it sets the error_dstr with an + * appropriate error message. 
+ * + * Background for the scope_not_exists flag: We can run into race conditions: + * - We call GetUnit to check if the scope exists. + * - If it exists, we call AttachProcessesToUnit to attach the PID to the scope. + * - But between GetUnit and AttachProcessesToUnit, the scope might have been removed. + * + * @param scope The name of the systemd scope to attach the PID to + * @param pid The PID to attach to the scope + * @param scope_not_exists A reference to a boolean that will be set to true if the scope does not exist + * @param error_dstr A pointer to a dstring where error messages will be stored + * @return true if the operation was successful, false otherwise + */ + bool + Systemd::attach_pid_to_scope(const std::string &scope, pid_t pid, bool &scope_not_exists, dstring *error_dstr) const { + DENTER(TOP_LAYER); + + bool ret = true; + scope_not_exists = false; + + DPRINTF("Systemd::attach_pid_to_scope: calling AttachProcessesToUnit\n"); + sd_bus_message *m = nullptr; + sd_bus_error error = SD_BUS_ERROR_NULL; + bool retry_on_interrupt = true; // retry on EINTR + int retries = 0; + while (ret && retry_on_interrupt) { + retry_on_interrupt = false; + int r = sd_bus_call_method_func(bus, "org.freedesktop.systemd1", // service to contact + "/org/freedesktop/systemd1", // object path + "org.freedesktop.systemd1.Manager", // interface name + "AttachProcessesToUnit", // method name + &error, // object to return error in + &m, // return message on success + "ssau", // input signature + scope.c_str(), // -> scope + "", // -> subcgroup + 1, pid); // array containing one pid + if (r < 0) { + if (-r == ENOENT) { + scope_not_exists = true; + DPRINTF("Systemd::attach_pid_to_scope: scope does not exist\n"); + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_CANNOT_CALL_SSIS, "AttachProcessesToUnit", scope.c_str(), r, error.message); + ret = false; + } else if (-r == EINTR && retries++ < NUM_SD_BUS_RETRIES) { + retry_on_interrupt = true; + } else { + sge_dstring_sprintf(error_dstr, 
MSG_SYSTEMD_CANNOT_CALL_SSIS, "AttachProcessesToUnit", scope.c_str(), r, error.message); + if (retries > 0) { + sge_dstring_sprintf_append(error_dstr, MSG_SYSTEMD_AFTER_RETRIES_I, retries); + } + ret = false; + } + } + + sd_bus_message_unref_func(m); + sd_bus_error_free_func(&error); + } + + DRETURN(ret); + } + + /*! + * @brief Create a systemd scope with a given PID + * + * This function creates a systemd scope with the specified name and slice, + * and attaches the given PID to it. It also sets properties for the scope. + * If the scope already exists, it sets the scope_already_exists flag to true + * and returns false. + * It handles retries in case of EINTR errors and returns true if the operation + * was successful, otherwise it sets the error_dstr with an appropriate error message. + * + * Background for the scope_already_exists flag: We can run into race conditions: + * - We call GetUnit to check if the scope exists. + * - If it does not exist, we call this function to create it. + * - But between GetUnit and StartTransientUnit, the scope might have been created by another process, + * e.g., a second execd child (sge_shepherd) having been forked and calling move_shepherd_to_scope(). 
+ * + * @param scope The name of the systemd scope to create + * @param slice The name of the systemd slice to use + * @param properties A map of properties to set for the scope + * @param pid The PID to attach to the scope + * @param scope_already_exists A reference to a boolean that will be set to true if the scope already exists + * @param error_dstr A pointer to a dstring where error messages will be stored + * @return true if the operation was successful, false otherwise + */ + bool + Systemd::create_scope_with_pid(const std::string &scope, const std::string &slice, + const SystemdProperties_t &properties, pid_t pid, bool &scope_already_exists, + dstring *error_dstr) const { + DENTER(TOP_LAYER); + + bool ret = true; + + DPRINTF("Systemd::move_shepherd_to_scope: calling StartTransientUnit\n"); + + // StartTransientUnit will start a systemd job, we have to wait for it to finish + // @todo AI claims that this is only needed with systemd version >= 239, but is this correct? + // from man page: sd_bus_wait() was added in version 240. + // but older shared libs have it already - so with which version shall we start to use it? 
+ sd_bus_slot *slot = nullptr; + if (systemd_version >= 240) { + slot = sd_bus_wait_for_job_subscribe("JobRemoved", error_dstr); + if (slot == nullptr) { + ret = false; + } + } + + sd_bus_message *m = nullptr; + int r; + if (ret) { + // build the method step by step as we add arrays + r = sd_bus_message_new_method_call_func(bus, &m, + "org.freedesktop.systemd1", // service to contact + "/org/freedesktop/systemd1", // object path + "org.freedesktop.systemd1.Manager", // interface name + "StartTransientUnit"); // method name + if (r < 0) { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_CANNOT_CREATE_MESSAGE_CALL_SIS, "StartTransientUnit", r, strerror(-r)); + ret = false; + } + } + + if (ret) { + r = sd_bus_message_append_func(m, "ss", scope.c_str(), "fail"); + if (r < 0) { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_CANNOT_APPEND_TO_MESSAGE_SSIS, "name and mode", "StartTransientUnit", r, strerror(-r)); + ret = false; + } + } + + // we add an array of properties (which are of type struct containing a string and a variant) + if (ret) { + r = sd_bus_message_open_container_func(m, SD_BUS_TYPE_ARRAY, "(sv)"); + if (r < 0) { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_CANNOT_OPEN_CONTAINER_SSIS, "properties", "StartTransientUnit", r, strerror(-r)); + ret = false; + } + } + + if (ret) { + r = sd_bus_message_append_func(m, "(sv)", "Delegate", "b", 1); + if (r < 0) { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_CANNOT_APPEND_PROPERTY_SSIS, "Delegate", "StartTransientUnit", r, strerror(-r)); + ret = false; + } + } + if (ret) { + r = sd_bus_message_append_func(m, "(sv)", "Slice", "s", slice.c_str()); + if (r < 0) { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_CANNOT_APPEND_PROPERTY_SSIS, "Slice", "StartTransientUnit", r, strerror(-r)); + ret = false; + } + } + if (ret) { + r = sd_bus_message_append_func(m, "(sv)", "PIDs", "au", 1, pid); + if (r < 0) { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_CANNOT_APPEND_PROPERTY_SSIS, "PIDs", "StartTransientUnit", r, strerror(-r)); 
+ ret = false; + } + } + + // add properties, e.g. "MemoryMax" or "IOReadBandwidthMax + // @todo need to catch bad_variant_access exception? Not really needed here, as we know the types, but to be on the safe side? + if (ret && properties.size() > 0) { + for (auto const& [key, value] : properties) { + if (ret) { + // open the struct (sv) + r = sd_bus_message_open_container_func(m, SD_BUS_TYPE_STRUCT, "sv"); + if (r < 0) { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_CANNOT_OPEN_CONTAINER_SSIS, "one property struct", "StartTransientUnit", r, strerror(-r)); + ret = false; + } + if (ret) { + r = sd_bus_message_append_func(m, "s", key.c_str()); + if (r < 0) { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_CANNOT_APPEND_PROPERTY_SSIS, "one property key", "StartTransientUnit", r, strerror(-r)); + ret = false; + } + } + if (ret) { + switch (value.index()) { + case 0: // std::string + r = sd_bus_message_open_container_func(m, SD_BUS_TYPE_VARIANT, "s"); + break; + case 1: // uint64_t + r = sd_bus_message_open_container_func(m, SD_BUS_TYPE_VARIANT, "t"); + break; + case 2: // bool + r = sd_bus_message_open_container_func(m, SD_BUS_TYPE_VARIANT, "b"); + break; + case 3: // std::vector + r = sd_bus_message_open_container_func(m, SD_BUS_TYPE_VARIANT, "ay"); + break; + case 4: // SystemdDevice_t (struct of two strings, device and mode) + r = sd_bus_message_open_container_func(m, SD_BUS_TYPE_VARIANT, "a(ss)"); + break; + default: + r = -EINVAL; // invalid type + break; + } + if (r < 0) { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_CANNOT_OPEN_CONTAINER_SSIS, "one property variant", "StartTransientUnit", r, strerror(-r)); + ret = false; + } + } + if (ret) { + switch (value.index()) { + case 0: // std::string + r = sd_bus_message_append_func(m, "s", std::get(value).c_str()); + break; + case 1: // uint64_t + r = sd_bus_message_append_func(m, "t", std::get(value)); + break; + case 2: // bool + r = sd_bus_message_append_func(m, "b", std::get(value) ? 
1 : 0); + break; + case 3: // std::vector + { + // adding the vector as an array + std::vector bits = std::get>(value); + r = sd_bus_message_append_array_func(m, 'y', bits.data(), bits.size()); + } + break; + case 4: + { + std::vector devices = std::get>(value); + // adding the vector as an array of structs (ss) + r = sd_bus_message_open_container_func(m, SD_BUS_TYPE_ARRAY, "(ss)"); + if (r >= 0) { + for (auto const& device : devices) { + // append each device as a struct (ss) + r = sd_bus_message_append_func(m, "(ss)", device.first.c_str(), device.second.c_str()); + if (r < 0) { + break; + } + } + } + if (r >= 0) { + // close the array of structs + r = sd_bus_message_close_container_func(m); + } + } + break; + default: + // cannot really happen + r = -EINVAL; // invalid type + break; + } + if (r < 0) { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_CANNOT_APPEND_PROPERTY_SSIS, key.c_str(), "StartTransientUnit", r, strerror(-r)); + ret = false; + } + } + + if (ret) { + r = sd_bus_message_close_container_func(m); + if (r < 0) { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_CANNOT_CLOSE_CONTAINER_SSIS, "one property variant", "StartTransientUnit", r, strerror(-r)); + ret = false; + } + } + if (ret) { + r = sd_bus_message_close_container_func(m); + if (r < 0) { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_CANNOT_CLOSE_CONTAINER_SSIS, "one property struct", "StartTransientUnit", r, strerror(-r)); + ret = false; + } + } + } + } // loop over properties + } + + if (ret) { + r = sd_bus_message_close_container_func(m); + if (r < 0) { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_CANNOT_CLOSE_CONTAINER_SSIS, "properties", "StartTransientUnit", r, strerror(-r)); + ret = false; + } + } + if (ret) { + r = sd_bus_message_open_container_func(m, SD_BUS_TYPE_ARRAY, "(sa(sv))"); + if (r < 0) { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_CANNOT_OPEN_CONTAINER_SSIS, "aux", "StartTransientUnit", r, strerror(-r)); + ret = false; + } + } + if (ret) { + r = 
sd_bus_message_close_container_func(m); + if (r < 0) { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_CANNOT_CLOSE_CONTAINER_SSIS, "aux", "StartTransientUnit", r, strerror(-r)); + ret = false; + } + } + bool retry_on_interrupt = true; // retry on EINTR + int retries = 0; + while (ret && retry_on_interrupt) { + retry_on_interrupt = false; + sd_bus_message *reply = nullptr; + sd_bus_error error = SD_BUS_ERROR_NULL; + r = sd_bus_call_func(bus, m, 0, &error, &reply); + DPRINTF("===> StartTransientUnit returned %d", r); + if (r < 0) { + if (-r == EEXIST) { + scope_already_exists = true; + DPRINTF("Systemd::attach_pid_to_scope: scope does not exist\n"); + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_CANNOT_CALL_SSIS, "StartTransientUnit", scope.c_str(), r, error.message); + ret = false; + } else if (-r == EINTR && retries++ < NUM_SD_BUS_RETRIES) { + retry_on_interrupt = true; + } else { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_CANNOT_CALL_SSIS, "StartTransientUnit", scope.c_str(), r, error.message); + if (retries > 0) { + sge_dstring_sprintf_append(error_dstr, MSG_SYSTEMD_AFTER_RETRIES_I, retries); + } + ret = false; + } + } else { + const char *job = nullptr; + r = sd_bus_message_read_func(reply, "o", &job); + if (r < 0) { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_CANNOT_READ_RESULT_SISS, "StartTransientUnit", r, strerror(-r)); + ret = false; + } else { + if (job == nullptr) { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_EMPTY_RESULT_S, "StartTransientUnit"); + ret = false; + } else { + // wait for the job to finish + if (slot != nullptr) { + ret = sd_bus_wait_for_job_completion(job, error_dstr); + sd_bus_wait_for_job_unsubscribe(&slot); + } + } + } + } + sd_bus_message_unref_func(reply); + sd_bus_error_free_func(&error); + } + + sd_bus_message_unref_func(m); + + DRETURN(ret); + } + + /*! + * @brief Wait for a systemd job signal + * + * This function subscribes to a systemd job signal, allowing the caller to wait for + * specific job events, such as JobRemoved. 
It returns a pointer to the sd_bus_slot + * that can be used to receive the signals. + * + * @param signal The name of the signal to subscribe to (e.g., "JobRemoved") + * @param error_dstr A pointer to a dstring where error messages will be stored + * @return A pointer to the sd_bus_slot for the subscribed signal, or nullptr on failure + */ + sd_bus_slot * + Systemd::sd_bus_wait_for_job_subscribe(const std::string &signal, dstring *error_dstr) const { + DENTER(TOP_LAYER); + + // add match for JobRemoved signal + sd_bus_slot *slot = nullptr; + std::string match_rule = "type='signal',interface='org.freedesktop.systemd1.Manager',member='" + signal + "'"; + int r = sd_bus_add_match_func(bus, &slot, match_rule.c_str(), nullptr, nullptr); + // @todo should also work but doesn't + // int r = sd_bus_match_signal_func(bus, &slot, nullptr, nullptr, "JobRemoved", nullptr, nullptr); + if (r < 0) { + sge_dstring_sprintf(error_dstr, SFN ": adding match func " SFQ " failed: error %d: " SFN, __func__, match_rule.c_str(), r, strerror(-r)); + } + + DRETURN(slot); + } + + /*! + * @brief Unsubscribe from a systemd job signal + * + * This function unsubscribes from a systemd job signal by releasing the match pattern + * associated with the sd_bus_slot. It sets the slot pointer to nullptr after unreferencing it. + * + * @param slot A pointer to the sd_bus_slot to unsubscribe from + */ + void + Systemd::sd_bus_wait_for_job_unsubscribe(sd_bus_slot **slot) const { + // release the match pattern + sd_bus_slot_unref_func(*slot); + slot = nullptr; + } + + /*! + * @brief Wait for a systemd job to complete + * + * This function waits for a specific systemd job to complete by processing bus messages + * and checking for the JobRemoved signal. It returns true if the job was completed successfully, + * otherwise it sets the error_dstr with an appropriate error message. + * It waits with a 5s timeout for the job to complete, if it times out it returns an error. 
+ * @note we might want to adjust the timeout based on experience, or make it configurable. + * + * @param job_path The path of the job to wait for completion + * @param error_dstr A pointer to a dstring where error messages will be stored + * @return true once the JobRemoved signal for job_path was received (the job result is not evaluated), false on error or timeout + */ + bool + Systemd::sd_bus_wait_for_job_completion(const std::string &job_path, dstring *error_dstr) const { + DENTER(TOP_LAYER); + DPRINTF("==> sd_bus_wait_for_job_completion(%s)", job_path.c_str()); + + bool ret = true; + + // Wait for the job to finish. + // @todo What is an appropriate timeout? + // 1 second was not enough in some cases, so we use 5 seconds. + // Make it configurable? + u_long64 timeout = sge_get_gmt64() + 5000000; // 5 second timeout + while (ret == true) { + if (sge_get_gmt64() > timeout) { + sge_dstring_sprintf(error_dstr, SFN ": timeout waiting for completion of job " SFN, __func__, job_path.c_str()); + ret = false; + break; + } + + // wait for the next signal + sd_bus_message *m = nullptr; + int r = sd_bus_process_func(bus, &m); + DPRINTF("sd_bus_process_func(bus, &m) returned %d", r); + if (r < 0) { + if (-r != EINTR) { + // We ignore EINTR, as we might get it, e.g., when sge_execd gets a SIGCHILD from an exiting sge_shepherd. + sge_dstring_sprintf(error_dstr, SFN ": processing bus failed: error %d: " SFN, __func__, r, strerror(-r)); + ret = false; + } + } else { + // 0 means we need to wait before calling sd_bus_process again + if (r == 0) { + // we wait with timeout + // sd_bus_wait() returns 0 on timeout, not an error, final timeout handled above + // does it actually make sense to use a timeout < our final timeout? We use 100ms for now.
+ r = sd_bus_wait_func(bus, 100000); + DPRINTF("sd_bus_wait_func(bus, 100000) returned %d", r); + // EINTR is ignored here as well (same reason as for sd_bus_process above), the loop simply retries until the final timeout + if (r < 0 && -r != EINTR) { + sge_dstring_sprintf(error_dstr, SFN ": waiting for bus failed: error %d: " SFN, __func__, r, strerror(-r)); + ret = false; + } + // do the next sd_bus_process call + sd_bus_message_unref_func(m); + continue; + } + } + + // sd_bus_process read signal + if (ret && m != nullptr) { + // messages without a member (e.g. method replies) yield nullptr, which must not be passed to strcmp() + if (const char *member = sd_bus_message_get_member_func(m); member != nullptr && strcmp(member, "JobRemoved") == 0) { + DPRINTF("got JobRemoved signal"); + const char *completed_job_path = nullptr; + // message contains for a signal: + // `u`: job id, e.g. `1234` + // `o`: job path, e.g. `/org/freedesktop/systemd1/job/1234` + // `s`: per the systemd D-Bus API this is the unit name; the result (e.g. `"done"`) is a fourth `s` which is not read here + r = sd_bus_message_read_func(m, "uos", nullptr, &completed_job_path, nullptr); + DPRINTF("sd_bus_message_read_func(m, \"uos\", &completed_job_path) returned %d", r); + if (r < 0) { + sge_dstring_sprintf(error_dstr, SFN ": reading job path failed: error %d: " SFN, __func__, r, strerror(-r)); + ret = false; + } else { + DPRINTF("completed_job_path = %s", completed_job_path); + if (job_path.compare(completed_job_path) == 0) { + sd_bus_message_unref_func(m); + break; // job done + } + } + } + } + + sd_bus_message_unref_func(m); + } + + DRETURN(ret); + } + + /*! + * @brief Get a string property from a systemd unit + * + * This function retrieves a property from a specified systemd unit. + * It handles retries in case of EINTR errors and returns true if the operation was successful, + * otherwise it sets the error_dstr with an appropriate error message.
+ * + * @param interface The interface of the systemd unit (e.g., "Unit" or "Scope") + * @param unit The name of the systemd unit to query; an empty name addresses the systemd manager object itself + * @param property The name of the property to retrieve + * @param value A reference to a string where the property value will be stored + * @param error_dstr A pointer to a dstring where error messages will be stored + * @return true if the operation was successful, false otherwise + */ + bool + Systemd::sd_bus_get_property(const std::string &interface, const std::string &unit, const std::string &property, std::string &value, dstring *error_dstr) const { + DENTER(TOP_LAYER); + + bool ret = true; + + // object path: the manager object has a fixed path (no allocation that could fail), only unit names need to be encoded (and freed later) + char *encoded_path = nullptr; + const char *path = "/org/freedesktop/systemd1"; + if (!unit.empty()) { + int r = sd_bus_path_encode_func("/org/freedesktop/systemd1/unit", unit.c_str(), &encoded_path); + if (r < 0) { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_CANNOT_ENCODE_PATH_SIS, unit.c_str(), r, strerror(-r)); + ret = false; + } + path = encoded_path; + } + + int retries = 0; + bool retry_on_interrupt = true; // retry on EINTR + while (ret && retry_on_interrupt) { + retry_on_interrupt = false; + sd_bus_message *m = nullptr; + sd_bus_error error = SD_BUS_ERROR_NULL; + std::string full_interface = "org.freedesktop.systemd1." + interface; // e.g.
"org.freedesktop.systemd1.Unit" + int r = sd_bus_get_property_func(bus, "org.freedesktop.systemd1", // service to contact + path, // object path + full_interface.c_str(), // interface name + property.c_str(), // property name + &error, // object to return error in + &m, // return message on success + "s"); // type signature + if (r < 0) { + DPRINTF("sd_bus_get_property(%s, %s, %s) returned %d: %s", interface.c_str(), unit.c_str(), property.c_str(), r, error.message); + if (-r == EINTR && retries++ < NUM_SD_BUS_RETRIES) { + retry_on_interrupt = true; + } else { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_CANNOT_CALL_SSIS, "sd_bus_get_property", property.c_str(), r, error.message); + if (retries > 0) { + sge_dstring_sprintf_append(error_dstr, MSG_SYSTEMD_AFTER_RETRIES_I, retries); + } + ret = false; + } + } else { + const char *result = nullptr; // points into the message m, must be copied before m is unreferenced + r = sd_bus_message_read_func(m, "s", &result); + if (r < 0) { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_CANNOT_READ_PROPERTY_RESULT_SIS, property.c_str(), r, strerror(-r)); + value.clear(); + ret = false; + } else { + value = result; + } + } + sd_bus_message_unref_func(m); + sd_bus_error_free_func(&error); + } + + free(encoded_path); // free the encoded path (nullptr for the manager object) + + DRETURN(ret); + } + + /*! + * @brief Get an integer property from a systemd unit + * + * This function retrieves a property from a specified systemd unit. + * It handles retries in case of EINTR errors and returns true if the operation was successful, + * otherwise it sets the error_dstr with an appropriate error message.
+ * + * @param interface The interface of the systemd unit (e.g., "Unit" or "Scope") + * @param unit The name of the systemd unit to query; an empty name addresses the systemd manager object itself + * @param property The name of the property to retrieve + * @param value A reference to a uint64_t where the property value will be stored + * @param error_dstr A pointer to a dstring where error messages will be stored + * @return true if the operation was successful, false otherwise + */ + bool + Systemd::sd_bus_get_property(const std::string &interface, const std::string &unit, const std::string &property, uint64_t &value, dstring *error_dstr) const { + DENTER(TOP_LAYER); + + bool ret = true; + + // object path: the manager object has a fixed path, only unit names need to be encoded (and freed later) + char *encoded_path = nullptr; + const char *path = "/org/freedesktop/systemd1"; + int r = 0; + if (!unit.empty()) { + r = sd_bus_path_encode_func("/org/freedesktop/systemd1/unit", unit.c_str(), &encoded_path); + if (r < 0) { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_CANNOT_ENCODE_PATH_SIS, unit.c_str(), r, strerror(-r)); + ret = false; + } + path = encoded_path; + } + + bool retry_on_interrupt = true; // retry on EINTR + int retries = 0; + while (ret && retry_on_interrupt) { + retry_on_interrupt = false; + sd_bus_message *m = nullptr; + sd_bus_error error = SD_BUS_ERROR_NULL; + std::string full_interface = "org.freedesktop.systemd1." + interface; // e.g.
"org.freedesktop.systemd1.Unit" + r = sd_bus_get_property_func(bus, "org.freedesktop.systemd1", // service to contact + path, // object path + full_interface.c_str(), // interface name + property.c_str(), // property name + &error, // object to return error in + &m, // return message on success + "t"); // type signature + if (r < 0) { + DPRINTF("sd_bus_get_property(%s, %s, %s) returned %d: %s", interface.c_str(), unit.c_str(), property.c_str(), r, error.message); + if (-r == EINTR && retries++ < NUM_SD_BUS_RETRIES) { + retry_on_interrupt = true; + } else { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_CANNOT_CALL_SSIS, "sd_bus_get_property", property.c_str(), r, error.message); + if (retries > 0) { + sge_dstring_sprintf_append(error_dstr, MSG_SYSTEMD_AFTER_RETRIES_I, retries); + } + ret = false; + } + } else { + uint64_t result{}; + r = sd_bus_message_read_func(m, "t", &result); + if (r < 0) { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_CANNOT_READ_PROPERTY_RESULT_SIS, property.c_str(), r, strerror(-r)); + value = 0; + ret = false; + } else { + value = result; + } + } + sd_bus_message_unref_func(m); + sd_bus_error_free_func(&error); + } + + free(encoded_path); // free the encoded path (nullptr for the manager object) + + DRETURN(ret); + } + + /*! + * @brief Stop a systemd unit + * + * This function stops a specified systemd unit by calling the StopUnit method. + * It handles retries in case of EINTR errors and returns true if the operation was successful, + * otherwise it sets the error_dstr with an appropriate error message.
+ * + * @param unit The name of the systemd unit to stop + * @param error_dstr A pointer to a dstring where error messages will be stored + * @return true if the StopUnit call succeeded or the unit does not exist (ENOENT), false otherwise + */ + bool + Systemd::stop_unit(const std::string &unit, dstring *error_dstr) const { + bool ret = true; + + bool retry_on_interrupt = true; // retry on EINTR, at most NUM_SD_BUS_RETRIES times + int retries = 0; + while (ret && retry_on_interrupt) { + retry_on_interrupt = false; + sd_bus_error error = SD_BUS_ERROR_NULL; + int r = sd_bus_call_method_func(bus, "org.freedesktop.systemd1", // service to contact + "/org/freedesktop/systemd1", // object path + "org.freedesktop.systemd1.Manager", // interface name + "StopUnit", // method name + &error, // object to return error in + nullptr, // return message on success (not needed) + // @todo Really? Would give us the job we could wait on. + "ss", // input signature + unit.c_str(), // first argument (unit name) + "replace"); // second argument (mode) + // If the unit does not exist, we get -2 (ENOENT) which is OK: it is neither retried nor reported as an error + if (r < 0 && -r != ENOENT) { + if (-r == EINTR && retries++ < NUM_SD_BUS_RETRIES) { + retry_on_interrupt = true; + } else { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_CANNOT_CALL_SSIS, "StopUnit", unit.c_str(), r, error.message); + if (retries > 0) { + sge_dstring_sprintf_append(error_dstr, MSG_SYSTEMD_AFTER_RETRIES_I, retries); + } + ret = false; + } + } + sd_bus_error_free_func(&error); + } + + return ret; + } + + /*! + * @brief Freeze (suspend) a systemd unit + * + * This function freezes (suspends) a specified systemd unit by calling the FreezeUnit method. + * It handles retries in case of EINTR errors and returns true if the operation was successful, + * otherwise it sets the error_dstr with an appropriate error message.
+ * + * @param unit The name of the systemd unit to freeze + * @param error_dstr A pointer to a dstring where error messages will be stored + * @return true if the operation was successful, false otherwise + */ + bool + Systemd::freeze_unit(const std::string &unit, dstring *error_dstr) const { + bool ret = true; + + bool retry_on_interrupt = true; // retry on EINTR, at most NUM_SD_BUS_RETRIES times + int retries = 0; + while (ret && retry_on_interrupt) { + retry_on_interrupt = false; + sd_bus_error error = SD_BUS_ERROR_NULL; + int r = sd_bus_call_method_func(bus, "org.freedesktop.systemd1", // service to contact + "/org/freedesktop/systemd1", // object path + "org.freedesktop.systemd1.Manager", // interface name + "FreezeUnit", // method name + &error, // object to return error in + nullptr, // return message on success (not needed) + "s", // input signature + unit.c_str()); // first argument (unit name) + if (r < 0) { // unlike stop_unit(), every negative result (including ENOENT) is handled as a failure here + if (-r == EINTR && retries++ < NUM_SD_BUS_RETRIES) { + retry_on_interrupt = true; + } else { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_CANNOT_CALL_SSIS, "FreezeUnit", unit.c_str(), r, error.message); + if (retries > 0) { + sge_dstring_sprintf_append(error_dstr, MSG_SYSTEMD_AFTER_RETRIES_I, retries); + } + ret = false; + } + } + + sd_bus_error_free_func(&error); + } + + return ret; + } + + /*! + * @brief Thaw (unsuspend) a systemd unit + * + * This function thaws (unsuspends) a specified systemd unit by calling the ThawUnit method. + * It handles retries in case of EINTR errors and returns true if the operation was successful, + * otherwise it sets the error_dstr with an appropriate error message.
+ * + * @param unit The name of the systemd unit to thaw + * @param error_dstr A pointer to a dstring where error messages will be stored + * @return true if the operation was successful, false otherwise + */ + bool + Systemd::thaw_unit(const std::string &unit, dstring *error_dstr) const { + bool ret = true; + + bool retry_on_interrupt = true; // retry on EINTR, at most NUM_SD_BUS_RETRIES times + int retries = 0; + while (ret && retry_on_interrupt) { + retry_on_interrupt = false; + sd_bus_error error = SD_BUS_ERROR_NULL; + int r = sd_bus_call_method_func(bus, "org.freedesktop.systemd1", // service to contact + "/org/freedesktop/systemd1", // object path + "org.freedesktop.systemd1.Manager", // interface name + "ThawUnit", // method name + &error, // object to return error in + nullptr, // return message on success (not needed) + "s", // input signature + unit.c_str()); // first argument (unit name) + if (r < 0) { // every negative result is a failure; only EINTR is retried + if (-r == EINTR && retries++ < NUM_SD_BUS_RETRIES) { + retry_on_interrupt = true; + } else { + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_CANNOT_CALL_SSIS, "ThawUnit", unit.c_str(), r, error.message); + if (retries > 0) { + sge_dstring_sprintf_append(error_dstr, MSG_SYSTEMD_AFTER_RETRIES_I, retries); + } + ret = false; + } + } + + sd_bus_error_free_func(&error); + } + + return ret; + } + + /*! + * @brief Send a signal to a systemd unit + * + * This function sends a signal to a specified systemd unit by calling the KillUnit method. + * It handles retries in case of EINTR errors and returns true if the operation was successful, + * otherwise it sets the error_dstr with an appropriate error message. + * + * It uses the KillUnit method of the systemd Manager interface to send a signal to the unit. + * Depending on the `only_main` parameter, it can signal either the main process of the unit + * or all processes associated with the unit.
+ * + * @param unit The name of the systemd unit to signal + * @param signal The signal number to send (e.g., SIGTERM) + * @param only_main If true, only the main process of the unit will be signaled, otherwise all processes of the unit + * @param error_dstr A pointer to a dstring where error messages will be stored + * @return true if the operation was successful, false otherwise + */ + bool + Systemd::signal_unit(const std::string &unit, int signal, bool only_main, dstring *error_dstr) const { + bool ret = true; + + bool retry_on_interrupt = true; // retry on EINTR + int retries = 0; + while (ret && retry_on_interrupt) { + retry_on_interrupt = false; + sd_bus_error error = SD_BUS_ERROR_NULL; + int r = sd_bus_call_method_func(bus, "org.freedesktop.systemd1", // service to contact + "/org/freedesktop/systemd1", // object path + "org.freedesktop.systemd1.Manager", // interface name + "KillUnit", // method name + &error, // object to return error in + nullptr, // return message on success (not needed) + "ssi", // input signature + unit.c_str(), // first argument (unit name) + only_main ?
"main" : "all", // second argument (mode) + signal); // third argument (signal number) + if (r < 0) { + if (-r == EINTR && retries++ < NUM_SD_BUS_RETRIES) { + retry_on_interrupt = true; + } else { + // report the method that was actually called (KillUnit), as the other unit operations do + sge_dstring_sprintf(error_dstr, MSG_SYSTEMD_CANNOT_CALL_SSIS, "KillUnit", unit.c_str(), r, error.message); + if (retries > 0) { + sge_dstring_sprintf_append(error_dstr, MSG_SYSTEMD_AFTER_RETRIES_I, retries); + } + ret = false; + } + } + sd_bus_error_free_func(&error); + } + + return ret; + } + +} // namespace +#endif diff --git a/source/libs/uti/ocs_Systemd.h b/source/libs/uti/ocs_Systemd.h new file mode 100644 index 0000000000..6f332fbcab --- /dev/null +++ b/source/libs/uti/ocs_Systemd.h @@ -0,0 +1,158 @@ +#pragma once +/*___INFO__MARK_BEGIN_NEW__*/ +/*************************************************************************** + * + * Copyright 2025 HPC-Gridware GmbH + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ * + ***************************************************************************/ +/*___INFO__MARK_END_NEW__*/ + +#include +#include +#include +#include + +#if defined(OCS_WITH_SYSTEMD) +#include + +#include "sge_dstring.h" +#endif + +namespace ocs::uti { + // systemd properties type + // we use it in the signature of a shepherd function - therefore, we need it outside OCS_WITH_SYSTEMD + using SystemdDevice_t = std::pair; + using SystemdProperty_t = std::variant, std::vector>; + using SystemdProperties_t = std::map; + +#if defined(OCS_WITH_SYSTEMD) + + // function types for the sdbus interface + // each type has to match the prototype in the sd-bus header exactly, as the library functions are called through pointers of these types + using sd_bus_open_system_func_t = int (*)(sd_bus **bus); + using sd_bus_unref_func_t = sd_bus *(*)(sd_bus *bus); + using sd_bus_call_method_func_t = int (*)(sd_bus *bus, const char *destination, const char *path, + const char *interface, const char *member, sd_bus_error *ret_error, sd_bus_message **reply, + const char *types, ...); + using sd_bus_message_read_func_t = int (*)(sd_bus_message *m, const char *types, ...); + using sd_bus_message_new_method_call_func_t = int (*)(sd_bus *bus, sd_bus_message **m, + const char *destination, const char *path, const char *interface, const char *member); + using sd_bus_message_unref_func_t = sd_bus_message *(*)(sd_bus_message *m); // returns the message pointer, not an int * + using sd_bus_message_append_func_t = int (*)(sd_bus_message *m, const char *types, ...); + using sd_bus_message_append_array_func_t = int (*)(sd_bus_message *m, char type, const void *ptr, size_t size); + using sd_bus_message_open_container_func_t = int (*)(sd_bus_message *m, char type, const char *types); + using sd_bus_message_close_container_func_t = int (*)(sd_bus_message *m); + using sd_bus_call_func_t = int (*)(sd_bus *bus, sd_bus_message *m, uint64_t usec, sd_bus_error *error, sd_bus_message **reply); + using sd_bus_add_match_func_t = int (*)(sd_bus *bus, sd_bus_slot **slot, const char *match, sd_bus_message_handler_t callback, void *userdata); + using sd_bus_match_signal_func_t = int (*)(sd_bus *bus,
sd_bus_slot **slot, const char *sender, const char *path, const char *interface, const char *member, sd_bus_message_handler_t callback, void *userdata); // sender precedes path in the library prototype + using sd_bus_slot_unref_func_t = sd_bus_slot *(*)(sd_bus_slot *slot); // returns the slot pointer, not an int + using sd_bus_process_func_t = int (*)(sd_bus *bus, sd_bus_message **m); + using sd_bus_wait_func_t = int (*)(sd_bus *bus, uint64_t timeout_usec); // the timeout is a 64 bit value in microseconds + using sd_bus_message_get_sender_func_t = const char *(*)(sd_bus_message *m); + using sd_bus_message_get_member_func_t = const char *(*)(sd_bus_message *m); + using sd_bus_path_encode_func_t = int (*)(const char *prefix, const char *external_id, char **ret_path); + using sd_bus_get_property_func_t = int (*)(sd_bus *bus, const char *destination, const char *path, const char *interface, const char *member, sd_bus_error *ret_error, sd_bus_message **reply, const char *type); + using sd_bus_error_free_func_t = void (*)(sd_bus_error *error); + + // @brief Systemd class + // + // This class provides an interface to interact with systemd using the sd-bus API. + // It allows for opening a system bus connection, making method calls, and handling messages. + // It has static methods for initialization and checking systemd availability. + // An instance of the class connects to the system bus and provides methods for + // interacting with systemd services.
+ class Systemd { + private: + // static data + // handle and function pointers of the libsystemd.so + static void *lib_handle; + static sd_bus_open_system_func_t sd_bus_open_system_func; + static sd_bus_unref_func_t sd_bus_unref_func; + static sd_bus_call_method_func_t sd_bus_call_method_func; + static sd_bus_message_read_func_t sd_bus_message_read_func; + static sd_bus_message_new_method_call_func_t sd_bus_message_new_method_call_func; + static sd_bus_message_unref_func_t sd_bus_message_unref_func; + static sd_bus_message_append_func_t sd_bus_message_append_func; + static sd_bus_message_append_array_func_t sd_bus_message_append_array_func; + static sd_bus_message_open_container_func_t sd_bus_message_open_container_func; + static sd_bus_message_close_container_func_t sd_bus_message_close_container_func; + static sd_bus_call_func_t sd_bus_call_func; + static sd_bus_add_match_func_t sd_bus_add_match_func; + static sd_bus_match_signal_func_t sd_bus_match_signal_func; + static sd_bus_slot_unref_func_t sd_bus_slot_unref_func; + static sd_bus_process_func_t sd_bus_process_func; + static sd_bus_wait_func_t sd_bus_wait_func; + static sd_bus_message_get_sender_func_t sd_bus_message_get_sender_func; + static sd_bus_message_get_member_func_t sd_bus_message_get_member_func; + static sd_bus_path_encode_func_t sd_bus_path_encode_func; + static sd_bus_get_property_func_t sd_bus_get_property_func; + static sd_bus_error_free_func_t sd_bus_error_free_func; + + // name of toplevel slice (from $SGE_ROOT/$SGE_CELL/common/slice_name, when running under Systemd control) + static std::string slice_name; + static std::string service_name; + static bool running_as_service; + static int cgroup_version; + static int systemd_version; + + public: + // constants + static constexpr std::string execd_service_name{"execd.service"}; + static constexpr std::string shepherd_scope_name{"shepherds.scope"}; + + // static methods + static bool initialize(const std::string &service_name_in, dstring
*error_dstr); + static bool is_systemd_available(); // we can load the systemd library and connect to systemd + static bool is_running_as_service(); // the process is running as a systemd service + static std::string get_slice_name() { return slice_name; } + static int get_cgroup_version() { return cgroup_version; } + static int get_systemd_version() { return systemd_version; } + + private: + // instance data + sd_bus *bus; + + // instance methods + bool sd_bus_method_s_o(const std::string &method, std::string &input, std::string &output, dstring *error_dstr) const; + bool sd_bus_method_u_o(const std::string &method, uint32_t input, std::string &output, dstring *error_dstr) const; + sd_bus_slot *sd_bus_wait_for_job_subscribe(const std::string &signal, dstring *error_dstr) const; // adds a match for the given Manager signal and returns its slot + void sd_bus_wait_for_job_unsubscribe(sd_bus_slot **slot) const; // releases the slot returned by sd_bus_wait_for_job_subscribe() + bool sd_bus_wait_for_job_completion(const std::string &job_path, dstring *error_dstr) const; // processes bus messages until JobRemoved for job_path arrives (5 s overall timeout) + + public: + Systemd(); + ~Systemd(); + + bool connect(dstring *error_dstr); + bool connected() const; + + bool move_shepherd_to_scope(pid_t pid, dstring *error_dstr) const; + bool create_scope_with_pid(const std::string &scope, const std::string &slice, + const SystemdProperties_t &properties, pid_t pid, bool &scope_already_exists, dstring *error_dstr) const; + bool + attach_pid_to_scope(const std::string &scope, pid_t pid, bool &scope_not_exists, dstring *error_dstr) const; + + bool sd_bus_get_property(const std::string &interface, const std::string &unit, const std::string &property, std::string &value, dstring *error_dstr) const; // string property (D-Bus type "s"); an empty unit queries the manager object + bool sd_bus_get_property(const std::string &interface, const std::string &unit, const std::string &property, uint64_t &value, dstring *error_dstr) const; // 64 bit unsigned property (D-Bus type "t") + + bool stop_unit(const std::string &unit, dstring *error_dstr) const; // Manager.StopUnit with mode "replace"; a non-existing unit (ENOENT) is not an error + bool freeze_unit(const std::string &unit, dstring *error_dstr) const; // Manager.FreezeUnit + bool thaw_unit(const std::string &unit, dstring *error_dstr) const; // Manager.ThawUnit + bool signal_unit(const
std::string &unit, int signal, bool only_main, dstring *error_dstr) const; // Manager.KillUnit; only_main selects "main" instead of "all" + }; + +#endif + +} diff --git a/source/libs/uti/sge_log.h b/source/libs/uti/sge_log.h index f278f9b058..446f558cf9 100644 --- a/source/libs/uti/sge_log.h +++ b/source/libs/uti/sge_log.h @@ -301,7 +301,7 @@ sge_log(u_long32 log_level, const char *msg, const char *file, int line); sge_set_message_id_output(1); \ snprintf(log_buffer, log_buffer_size, __VA_ARGS__); \ sge_set_message_id_output(0); \ - sge_log(LOG_DEBUG, SGE_EVENT,__FILE__,__LINE__); \ + sge_log(LOG_DEBUG, log_buffer,__FILE__,__LINE__); \ } \ } void() #else diff --git a/source/libs/uti/sge_os.h b/source/libs/uti/sge_os.h index a680513f61..7c074f4e10 100644 --- a/source/libs/uti/sge_os.h +++ b/source/libs/uti/sge_os.h @@ -54,7 +54,7 @@ * max. supported number = 999 */ typedef enum uti_daemonize_state_type { - SGE_DEAMONIZE_OK = 0, + SGE_DAEMONIZE_OK = 0, SGE_DAEMONIZE_DEAD_CHILD = 100, SGE_DAEMONIZE_TIMEOUT = 101 } uti_daemonize_state_t; diff --git a/source/libs/uti/sge_uidgid.cc b/source/libs/uti/sge_uidgid.cc index 64a7abb7b7..a4735d7bc7 100644 --- a/source/libs/uti/sge_uidgid.cc +++ b/source/libs/uti/sge_uidgid.cc @@ -803,7 +803,7 @@ _sge_set_uid_gid_addgrp(const char *user, const char *intermediate_user, gid_t m #if defined(SOLARIS) || defined(LINUX) || defined(FREEBSD) || defined(DARWIN) /* add Additional group id to current list of groups */ - if (add_grp) { + if (add_grp != 0) { if (sge_add_group(add_grp, err_str, err_str_size, skip_silently) == -1) { return 5; } @@ -1467,5 +1467,3 @@ double sge_normalize_value(double value, double range_min, double range_max) return 0.5; return (value - range_min) / (range_max - range_min); } - - diff --git a/source/security/gss/doc/gss_customer.html b/source/security/gss/doc/gss_customer.html index acc04a9f5c..0653a69a84 100644 --- a/source/security/gss/doc/gss_customer.html +++ b/source/security/gss/doc/gss_customer.html @@ -256,7 +256,7 @@

Instructions for Cluster Scheduler spool directories in DFS

If you would like to maintain the Cluster Scheduler spool directories in DFS, -then the Cluster Scheduler deamons must run under a DCE identity. The best way +then the Cluster Scheduler daemons must run under a DCE identity. The best way to accomplish this is to create a unique DCE account (e.g. "sge_daemon") and put the key into the default keytab. To create the account, follow the DCE Instructions above substituting the DCE account name that you choose @@ -337,7 +337,7 @@

directory containing the following line: johndoe@FUBAR.ORG This would allow her husband "johndoe" to use Cluster Scheduler to access her account. However, in order for this to work, the .k5login file must be accessible by the -qmaster deamon running on the qmaster host. If the qmaster host does not +qmaster daemon running on the qmaster host. If the qmaster host does not have access to the users home directories, then the Cluster Scheduler manager has the option to create "dummy" home directories which simply contain the appropriate .k5login files for the appropriate users. The dummy home