Skip to content

Commit 395fc2f

Browse files
author
zhaoyu
committed
Support memory boost and prmon for build task
1 parent c634e51 commit 395fc2f

4 files changed

Lines changed: 34 additions & 3 deletions

File tree

config/bps_remote.yaml

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
remoteBuild:
22
enabled: true
33
runnerCommand: >
4+
logDir=`pwd`;
5+
chmod ugo+w ${logDir};
6+
file=${logDir}/memory_monitor_output.txt; touch ${file}; chmod ugo+w ${file};
7+
file=${logDir}/memory_monitor_summary.json; touch ${file}; chmod ugo+w ${file};
48
export SHELL=/bin/bash;
59
unset PYTHONPATH;
610
source /cvmfs/sw.lsst.eu/almalinux-x86_64/lsst_distrib/{LSST_VERSION}/loadLSST.bash;
@@ -25,4 +29,12 @@ remoteBuild:
2529
export IDDS_MAX_NAME_LENGTH={IDDS_MAX_NAME_LENGTH};
2630
{custom_lsst_setup}
2731
python3 ${CTRL_BPS_PANDA_DIR}/python/lsst/ctrl/bps/panda/edgenode/build_cmd_line_decoder.py \
28-
_download_cmd_line_ _build_cmd_line_ _compute_site_
32+
_download_cmd_line_ _build_cmd_line_ _compute_site_ & pJob=$!;
33+
prmon -i 5
34+
-f ${logDir}/memory_monitor_output.txt
35+
-j ${logDir}/memory_monitor_summary.json
36+
-p $pJob & mJob=$!;
37+
wait $pJob;
38+
ret=$?;
39+
wait $mJob;
40+
exit $ret;

doc/changes/DM-53876.bugfix.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Support memory boost and ``prmon`` memory monitoring for the remote build task.

python/lsst/ctrl/bps/panda/edgenode/build_cmd_line_decoder.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import os
1414
import sys
1515

16+
from lsst.ctrl.bps import BpsSubprocessError
1617
from lsst.ctrl.bps.constants import DEFAULT_MEM_FMT, DEFAULT_MEM_UNIT
1718
from lsst.ctrl.bps.drivers import prepare_driver
1819
from lsst.ctrl.bps.panda.constants import PANDA_DEFAULT_MAX_COPY_WORKERS, PANDA_DEFAULT_MAX_REQUEST_LENGTH
@@ -79,15 +80,23 @@ def create_idds_workflow(config_file, compute_site):
7980
print("IDDS_BUIL_SIGNATURE is not defined")
8081
sys.exit(-1)
8182

82-
print(f"INFO: start {datetime.datetime.utcnow()}")
83+
print(f"INFO: start {datetime.datetime.now(datetime.UTC)}")
8384
print(f"INFO: config file: {config_file}")
8485
print(f"INFO: compute site: {compute_site}")
8586

8687
current_dir = os.getcwd()
8788

8889
print(f"INFO: current dir: {current_dir}")
8990

90-
config, bps_workflow = create_idds_workflow(config_file, compute_site)
91+
try:
92+
config, bps_workflow = create_idds_workflow(config_file, compute_site)
93+
except BpsSubprocessError as e:
94+
code = e.errno
95+
if code < 0:
96+
print(f"BPS prepare caught exception: {e.strerror}")
97+
sys.exit(128 + abs(code))
98+
elif code != 0:
99+
sys.exit(code)
91100
idds_workflow = bps_workflow.idds_client_workflow
92101

93102
_, max_copy_workers = config.search("maxCopyWorkers", opt={"default": PANDA_DEFAULT_MAX_COPY_WORKERS})

python/lsst/ctrl/bps/panda/utils.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1128,8 +1128,13 @@ def create_idds_build_workflow(**kwargs):
11281128
task_site = get_task_parameter(config, remote_build, "computeSite")
11291129
task_queue = get_task_parameter(config, remote_build, "queue")
11301130
task_rss = get_task_parameter(config, remote_build, "requestMemory")
1131+
task_rss_max = get_task_parameter(config, remote_build, "requestMemoryMax")
1132+
memory_multiplier = get_task_parameter(config, remote_build, "memoryMultiplier")
1133+
task_rss_retry_step = task_rss * memory_multiplier if memory_multiplier else 0
1134+
task_rss_retry_offset = 0 if task_rss_retry_step else task_rss
11311135
nretries = get_task_parameter(config, remote_build, "numberOfRetries")
11321136
processing_type = get_task_parameter(config, remote_build, "processingType")
1137+
priority = get_task_parameter(config, remote_build, "priority")
11331138
_LOG.info("requestMemory: %s", task_rss)
11341139
_LOG.info("Site: %s", task_site)
11351140
# _LOG.info("executable: %s", executable)
@@ -1155,8 +1160,12 @@ def create_idds_build_workflow(**kwargs):
11551160
"value": "log.tgz",
11561161
},
11571162
task_rss=task_rss if task_rss else PANDA_DEFAULT_RSS,
1163+
task_rss_max=task_rss_max if task_rss_max else PANDA_DEFAULT_RSS_MAX,
1164+
task_rss_retry_offset=task_rss_retry_offset,
1165+
task_rss_retry_step=task_rss_retry_step,
11581166
task_cloud=task_cloud,
11591167
task_site=task_site,
1168+
task_priority=int(priority) if priority else PANDA_DEFAULT_PRIORITY,
11601169
maxattempt=nretries if nretries > 0 else PANDA_DEFAULT_MAX_ATTEMPTS,
11611170
)
11621171

0 commit comments

Comments
 (0)