Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion config/bps_remote.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
remoteBuild:
enabled: true
runnerCommand: >
logDir=`pwd`;
chmod ugo+w ${logDir};
file=${logDir}/memory_monitor_output.txt; touch ${file}; chmod ugo+w ${file};
file=${logDir}/memory_monitor_summary.json; touch ${file}; chmod ugo+w ${file};
export SHELL=/bin/bash;
unset PYTHONPATH;
source /cvmfs/sw.lsst.eu/almalinux-x86_64/lsst_distrib/{LSST_VERSION}/loadLSST.bash;
Expand All @@ -25,4 +29,12 @@ remoteBuild:
export IDDS_MAX_NAME_LENGTH={IDDS_MAX_NAME_LENGTH};
{custom_lsst_setup}
python3 ${CTRL_BPS_PANDA_DIR}/python/lsst/ctrl/bps/panda/edgenode/build_cmd_line_decoder.py \
_download_cmd_line_ _build_cmd_line_ _compute_site_
_download_cmd_line_ _build_cmd_line_ _compute_site_ & pJob=$!;
prmon -i 5
-f ${logDir}/memory_monitor_output.txt
-j ${logDir}/memory_monitor_summary.json
-p $pJob & mJob=$!;
wait $pJob;
ret=$?;
wait $mJob;
exit $ret;
1 change: 1 addition & 0 deletions doc/changes/DM-53876.bugfix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Support memory boost and prmon for the build task.
13 changes: 11 additions & 2 deletions python/lsst/ctrl/bps/panda/edgenode/build_cmd_line_decoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import os
import sys

from lsst.ctrl.bps import BpsSubprocessError
from lsst.ctrl.bps.constants import DEFAULT_MEM_FMT, DEFAULT_MEM_UNIT
from lsst.ctrl.bps.drivers import prepare_driver
from lsst.ctrl.bps.panda.constants import PANDA_DEFAULT_MAX_COPY_WORKERS, PANDA_DEFAULT_MAX_REQUEST_LENGTH
Expand Down Expand Up @@ -79,15 +80,23 @@ def create_idds_workflow(config_file, compute_site):
print("IDDS_BUIL_SIGNATURE is not defined")
sys.exit(-1)

print(f"INFO: start {datetime.datetime.utcnow()}")
print(f"INFO: start {datetime.datetime.now(datetime.UTC)}")
print(f"INFO: config file: {config_file}")
print(f"INFO: compute site: {compute_site}")

current_dir = os.getcwd()

print(f"INFO: current dir: {current_dir}")

config, bps_workflow = create_idds_workflow(config_file, compute_site)
try:
config, bps_workflow = create_idds_workflow(config_file, compute_site)
except BpsSubprocessError as e:
code = e.errno
if code < 0:
print(f"BPS prepare caught exception: {e.strerror}")
sys.exit(128 + abs(code))
elif code != 0:
sys.exit(code)
idds_workflow = bps_workflow.idds_client_workflow

_, max_copy_workers = config.search("maxCopyWorkers", opt={"default": PANDA_DEFAULT_MAX_COPY_WORKERS})
Expand Down
9 changes: 9 additions & 0 deletions python/lsst/ctrl/bps/panda/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1128,8 +1128,13 @@ def create_idds_build_workflow(**kwargs):
task_site = get_task_parameter(config, remote_build, "computeSite")
task_queue = get_task_parameter(config, remote_build, "queue")
task_rss = get_task_parameter(config, remote_build, "requestMemory")
task_rss_max = get_task_parameter(config, remote_build, "requestMemoryMax")
memory_multiplier = get_task_parameter(config, remote_build, "memoryMultiplier")
task_rss_retry_step = task_rss * memory_multiplier if memory_multiplier else 0
task_rss_retry_offset = 0 if task_rss_retry_step else task_rss
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add task_rss_max to limit the maximum memory. It will be supported.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll add this. I asked in the Jira ticket whether this parameter will break production jobs if it's not in iDDS yet.

nretries = get_task_parameter(config, remote_build, "numberOfRetries")
processing_type = get_task_parameter(config, remote_build, "processingType")
priority = get_task_parameter(config, remote_build, "priority")
_LOG.info("requestMemory: %s", task_rss)
_LOG.info("Site: %s", task_site)
# _LOG.info("executable: %s", executable)
Expand All @@ -1155,8 +1160,12 @@ def create_idds_build_workflow(**kwargs):
"value": "log.tgz",
},
task_rss=task_rss if task_rss else PANDA_DEFAULT_RSS,
task_rss_max=task_rss_max if task_rss_max else PANDA_DEFAULT_RSS_MAX,
task_rss_retry_offset=task_rss_retry_offset,
task_rss_retry_step=task_rss_retry_step,
task_cloud=task_cloud,
task_site=task_site,
task_priority=int(priority) if priority else PANDA_DEFAULT_PRIORITY,
maxattempt=nretries if nretries > 0 else PANDA_DEFAULT_MAX_ATTEMPTS,
)

Expand Down
Loading