diff --git a/config/bps_remote.yaml b/config/bps_remote.yaml index ff4cb468..082969e5 100644 --- a/config/bps_remote.yaml +++ b/config/bps_remote.yaml @@ -1,6 +1,10 @@ remoteBuild: enabled: true runnerCommand: > + logDir=`pwd`; + chmod ugo+w ${logDir}; + file=${logDir}/memory_monitor_output.txt; touch ${file}; chmod ugo+w ${file}; + file=${logDir}/memory_monitor_summary.json; touch ${file}; chmod ugo+w ${file}; export SHELL=/bin/bash; unset PYTHONPATH; source /cvmfs/sw.lsst.eu/almalinux-x86_64/lsst_distrib/{LSST_VERSION}/loadLSST.bash; @@ -25,4 +29,12 @@ remoteBuild: export IDDS_MAX_NAME_LENGTH={IDDS_MAX_NAME_LENGTH}; {custom_lsst_setup} python3 ${CTRL_BPS_PANDA_DIR}/python/lsst/ctrl/bps/panda/edgenode/build_cmd_line_decoder.py \ - _download_cmd_line_ _build_cmd_line_ _compute_site_ + _download_cmd_line_ _build_cmd_line_ _compute_site_ & pJob=$!; + prmon -i 5 + -f ${logDir}/memory_monitor_output.txt + -j ${logDir}/memory_monitor_summary.json + -p $pJob & mJob=$!; + wait $pJob; + ret=$?; + wait $mJob; + exit $ret; diff --git a/doc/changes/DM-53876.bugfix.rst b/doc/changes/DM-53876.bugfix.rst new file mode 100644 index 00000000..683db310 --- /dev/null +++ b/doc/changes/DM-53876.bugfix.rst @@ -0,0 +1 @@ +Support memory boost and prmon for build task diff --git a/python/lsst/ctrl/bps/panda/edgenode/build_cmd_line_decoder.py b/python/lsst/ctrl/bps/panda/edgenode/build_cmd_line_decoder.py index b6246e76..1e238efd 100644 --- a/python/lsst/ctrl/bps/panda/edgenode/build_cmd_line_decoder.py +++ b/python/lsst/ctrl/bps/panda/edgenode/build_cmd_line_decoder.py @@ -13,6 +13,7 @@ import os import sys +from lsst.ctrl.bps import BpsSubprocessError from lsst.ctrl.bps.constants import DEFAULT_MEM_FMT, DEFAULT_MEM_UNIT from lsst.ctrl.bps.drivers import prepare_driver from lsst.ctrl.bps.panda.constants import PANDA_DEFAULT_MAX_COPY_WORKERS, PANDA_DEFAULT_MAX_REQUEST_LENGTH @@ -79,7 +80,7 @@ def create_idds_workflow(config_file, compute_site): print("IDDS_BUIL_SIGNATURE is not defined") sys.exit(-1) -print(f"INFO: start {datetime.datetime.utcnow()}") +print(f"INFO: start {datetime.datetime.now(datetime.UTC)}") print(f"INFO: config file: {config_file}") print(f"INFO: compute site: {compute_site}") @@ -87,7 +88,15 @@ def create_idds_workflow(config_file, compute_site): print(f"INFO: current dir: {current_dir}") -config, bps_workflow = create_idds_workflow(config_file, compute_site) +try: + config, bps_workflow = create_idds_workflow(config_file, compute_site) +except BpsSubprocessError as e: + code = e.errno + if code < 0: + print(f"BPS prepare caught exception: {e.strerror}") + sys.exit(128 + abs(code)) + elif code != 0: + sys.exit(code) idds_workflow = bps_workflow.idds_client_workflow _, max_copy_workers = config.search("maxCopyWorkers", opt={"default": PANDA_DEFAULT_MAX_COPY_WORKERS}) diff --git a/python/lsst/ctrl/bps/panda/utils.py b/python/lsst/ctrl/bps/panda/utils.py index 3aa49904..6d64a1fa 100644 --- a/python/lsst/ctrl/bps/panda/utils.py +++ b/python/lsst/ctrl/bps/panda/utils.py @@ -1128,8 +1128,13 @@ def create_idds_build_workflow(**kwargs): task_site = get_task_parameter(config, remote_build, "computeSite") task_queue = get_task_parameter(config, remote_build, "queue") task_rss = get_task_parameter(config, remote_build, "requestMemory") + task_rss_max = get_task_parameter(config, remote_build, "requestMemoryMax") + memory_multiplier = get_task_parameter(config, remote_build, "memoryMultiplier") + task_rss_retry_step = task_rss * memory_multiplier if memory_multiplier else 0 + task_rss_retry_offset = 0 if task_rss_retry_step else task_rss nretries = get_task_parameter(config, remote_build, "numberOfRetries") processing_type = get_task_parameter(config, remote_build, "processingType") + priority = get_task_parameter(config, remote_build, "priority") _LOG.info("requestMemory: %s", task_rss) _LOG.info("Site: %s", task_site) # _LOG.info("executable: %s", executable) @@ -1155,8 +1160,12 @@ def create_idds_build_workflow(**kwargs): "value": "log.tgz", }, task_rss=task_rss if task_rss else PANDA_DEFAULT_RSS, + task_rss_max=task_rss_max if task_rss_max else PANDA_DEFAULT_RSS_MAX, + task_rss_retry_offset=task_rss_retry_offset, + task_rss_retry_step=task_rss_retry_step, task_cloud=task_cloud, task_site=task_site, + task_priority=int(priority) if priority else PANDA_DEFAULT_PRIORITY, maxattempt=nretries if nretries > 0 else PANDA_DEFAULT_MAX_ATTEMPTS, )