Skip to content

Commit 29a9ffe

Browse files
author
zhaoyu
committed
Support memory boost and prmon for build task
1 parent 59d1ae4 commit 29a9ffe

3 files changed

Lines changed: 30 additions & 2 deletions

File tree

config/bps_remote.yaml

Lines changed: 13 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,10 @@
11
remoteBuild:
22
enabled: true
33
runnerCommand: >
4+
logDir=`pwd`;
5+
chmod ugo+w ${logDir};
6+
file=${logDir}/memory_monitor_output.txt; touch ${file}; chmod ugo+w ${file};
7+
file=${logDir}/memory_monitor_summary.json; touch ${file}; chmod ugo+w ${file};
48
export SHELL=/bin/bash;
59
unset PYTHONPATH;
610
source /cvmfs/sw.lsst.eu/almalinux-x86_64/lsst_distrib/{LSST_VERSION}/loadLSST.bash;
@@ -25,4 +29,12 @@ remoteBuild:
2529
export IDDS_MAX_NAME_LENGTH={IDDS_MAX_NAME_LENGTH};
2630
{custom_lsst_setup}
2731
python3 ${CTRL_BPS_PANDA_DIR}/python/lsst/ctrl/bps/panda/edgenode/build_cmd_line_decoder.py \
28-
_download_cmd_line_ _build_cmd_line_ _compute_site_
32+
_download_cmd_line_ _build_cmd_line_ _compute_site_ & pJob=$!;
33+
prmon -i 5
34+
-f ${logDir}/memory_monitor_output.txt
35+
-j ${logDir}/memory_monitor_summary.json
36+
-p $pJob & mJob=$!;
37+
wait $pJob;
38+
ret=$?;
39+
wait $mJob;
40+
exit $ret;

python/lsst/ctrl/bps/panda/edgenode/build_cmd_line_decoder.py

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@
1313
import os
1414
import sys
1515

16+
from lsst.ctrl.bps import BpsSubprocessError
1617
from lsst.ctrl.bps.constants import DEFAULT_MEM_FMT, DEFAULT_MEM_UNIT
1718
from lsst.ctrl.bps.drivers import prepare_driver
1819
from lsst.ctrl.bps.panda.constants import PANDA_DEFAULT_MAX_COPY_WORKERS, PANDA_DEFAULT_MAX_REQUEST_LENGTH
@@ -87,7 +88,15 @@ def create_idds_workflow(config_file, compute_site):
8788

8889
print(f"INFO: current dir: {current_dir}")
8990

90-
config, bps_workflow = create_idds_workflow(config_file, compute_site)
91+
try:
92+
config, bps_workflow = create_idds_workflow(config_file, compute_site)
93+
except BpsSubprocessError as e:
94+
code = e.errno
95+
if abs(code) == 9:
96+
print(f"BPS prepare caught exception: {e.strerror}")
97+
sys.exit(137)
98+
elif code != 0:
99+
sys.exit(code)
91100
idds_workflow = bps_workflow.idds_client_workflow
92101

93102
_, max_copy_workers = config.search("maxCopyWorkers", opt={"default": PANDA_DEFAULT_MAX_COPY_WORKERS})

python/lsst/ctrl/bps/panda/utils.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1110,8 +1110,12 @@ def create_idds_build_workflow(**kwargs):
11101110
task_site = get_task_parameter(config, remote_build, "computeSite")
11111111
task_queue = get_task_parameter(config, remote_build, "queue")
11121112
task_rss = get_task_parameter(config, remote_build, "requestMemory")
1113+
memory_multiplier = get_task_parameter(config, remote_build, "memoryMultiplier")
1114+
task_rss_retry_step = task_rss * memory_multiplier if memory_multiplier else 0
1115+
task_rss_retry_offset = 0 if task_rss_retry_step else task_rss
11131116
nretries = get_task_parameter(config, remote_build, "numberOfRetries")
11141117
processing_type = get_task_parameter(config, remote_build, "processingType")
1118+
priority = get_task_parameter(config, remote_build, "priority")
11151119
_LOG.info("requestMemory: %s", task_rss)
11161120
_LOG.info("Site: %s", task_site)
11171121
# _LOG.info("executable: %s", executable)
@@ -1137,8 +1141,11 @@ def create_idds_build_workflow(**kwargs):
11371141
"value": "log.tgz",
11381142
},
11391143
task_rss=task_rss if task_rss else PANDA_DEFAULT_RSS,
1144+
task_rss_retry_offset=task_rss_retry_offset,
1145+
task_rss_retry_step=task_rss_retry_step,
11401146
task_cloud=task_cloud,
11411147
task_site=task_site,
1148+
task_priority=int(priority) if priority else PANDA_DEFAULT_PRIORITY,
11421149
maxattempt=nretries if nretries > 0 else PANDA_DEFAULT_MAX_ATTEMPTS,
11431150
)
11441151

0 commit comments

Comments
 (0)