Skip to content

Commit f2b533c

Browse files
authored
Merge pull request #309 from rynge/ospool-cm-auto-update
OSPool: refactoring cm/ccb, enable live reconfig
2 parents 56f497b + 2ee6102 commit f2b533c

16 files changed

Lines changed: 532 additions & 10 deletions
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
[program:condor_master]
2+
command=/usr/sbin/condor_master_wrapper
3+
autorestart=True
4+
startsecs=60
5+
stdout_logfile=/dev/stdout
6+
stdout_logfile_maxbytes=0
7+
redirect_stderr=true
8+
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# Specify the opensciencegrid/software-base image tag
ARG BASE_OSG_SERIES=25
ARG BASE_YUM_REPO=release
ARG BASE_OS=el9

FROM opensciencegrid/software-base:$BASE_OSG_SERIES-$BASE_OS-$BASE_YUM_REPO

# An ARG declared before FROM goes out of scope after it, so it has to be
# redefined for use in the RUN stages
ARG BASE_YUM_REPO

RUN yum -y install \
        bc \
        git \
        lsof \
        rrdtool \
        python3-pip \
        vim \
        wget \
    && \
    yum clean all

# Pull HTCondor from the proper repo. For "release" we need to use
# osg-upcoming-testing to meet the patch tuesday requirements.
# A POSIX test with a quoted variable is used so this step does not depend
# on /bin/sh being bash; the yum cache is dropped in the same layer.
RUN if [ "$BASE_YUM_REPO" = release ]; then \
        yum -y --enablerepo=osg-upcoming-testing install condor; \
    else \
        yum -y install condor; \
    fi && \
    yum clean all

# basic config is a collector, so we can test
RUN echo "DAEMON_LIST = MASTER, COLLECTOR" >/etc/condor/config.d/05-ospool-base.config && \
    echo "USE_SHARED_PORT = TRUE" >>/etc/condor/config.d/05-ospool-base.config

COPY condor_master_wrapper /usr/sbin/
RUN chmod 755 /usr/sbin/condor_master_wrapper

# Override the software-base supervisord.conf to throw away supervisord logs
COPY supervisord.conf /etc/supervisord.conf

COPY 10-htcondor.conf /etc/supervisord.d/

COPY ospool-ccb-config.sh /etc/osg/image-init.d/60-ospool-ccb-config.sh
RUN chmod 755 /etc/osg/image-init.d/60-ospool-ccb-config.sh

COPY ospool-ccb.cron /etc/cron.d/ospool-ccb

# COPY rather than ADD: these are plain local files/directories, so none of
# ADD's extra behavior (URL fetch, archive extraction) is wanted.
COPY opt/ospool /opt/ospool

COPY healthy.sh /healthy.sh
50+
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# ospool-ccb
2+
3+
Container image for the OSPool CCB (HTCondor Connection Brokering) instances.
4+
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
{
2+
"standard_build": true,
3+
"repo_build": false,
4+
"base_os": ["el9"],
5+
"osg_series": ["25"],
6+
"base_repo": ["release"]
7+
}
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#!/bin/bash
#
# Wrapper around condor_master: mirrors the MasterLog to this process's
# stdout and then replaces itself with condor_master in the foreground.

# Resolve the HTCondor log directory with $(...) rather than backticks.
log_dir=$(condor_config_val LOG)

# The path is quoted so an unusual LOG value cannot be word-split or globbed.
# -F follows by name and retries, so it works before MasterLog exists and
# across rotation; tail's own diagnostics are discarded.
tail -F "${log_dir}/MasterLog" 2>/dev/null &

# exec so that condor_master takes over this process (-f: foreground)
exec /usr/sbin/condor_master -f
6+
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#!/bin/bash
#
# Container health check.
#
# Exit codes:
#   0 - healthy
#   2 - supervisord reports a service that is not RUNNING
#   4 - more than 3 zombie (Z) processes
#   5 - more than 15 uninterruptible (D) processes

#######################################
# Count processes whose ps STAT column starts with the given state letter.
# Arguments: $1 - state letter (e.g. Z or D)
# Outputs:   the count on stdout (0 when there are none)
#######################################
count_procs_in_state() {
  ps axo pid,stat \
    | awk -v state="$1" 'index($2, state) == 1 { n++ } END { print n + 0 }'
}

# Every status line that does not say RUNNING is a failure; the
# container_cleanup entry is excluded from the check.
failures=$(supervisorctl status | grep -Ev 'container_cleanup|RUNNING')
if [[ -n "$failures" ]]; then
  # Collapse the multi-line status output into one single-spaced line.
  # Fed through a here-string so the text is never glob-expanded.
  failures=$(xargs <<<"$failures")
  echo "supervisord non-RUNNING service: $failures" >&2
  exit 2
fi

procs_z=$(count_procs_in_state Z)
if (( procs_z > 3 )); then
  echo "Found $procs_z zombie (Z) processes" >&2
  exit 4
fi

procs_d=$(count_procs_in_state D)
if (( procs_d > 15 )); then
  echo "Found $procs_d uninterruptible (D) processes" >&2
  exit 5
fi

exit 0
25+
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#!/bin/bash

# For performance reasons, the pods are set up to log to local
# disk. This script is run from cron, and keeps copies of the
# logs for a certain number of days.

# where the daily copies live, and how many of them to keep
readonly SAVE_ROOT=/state/htcondor/logs
readonly KEEP=6

target_dir="save-$(date +'%Y%m%d')"

mkdir -p "$SAVE_ROOT"
cd "$SAVE_ROOT" || exit 1

mkdir -p "$target_dir"
rsync -a /var/log/condor/. "$target_dir/."

# Only keep the newest KEEP sets of saved logs.
#
# The sets are ordered by NAME, not by mtime: rsync -a copies the mtime of
# /var/log/condor onto the save directory, so several save-* directories can
# end up with identical or misleading mtimes. The save-YYYYMMDD names sort
# chronologically, and a glob expands in sorted order, oldest first.
shopt -s nullglob
saved=(save-*)
excess=$(( ${#saved[@]} - KEEP ))
if (( excess > 0 )); then
  rm -rf -- "${saved[@]:0:excess}"
fi
19+
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
#!/usr/bin/python3
2+
3+
# This script is designed to adjust user priorities in the OSPool based on
4+
# the current state of user workloads. The HTCondor priority settings are
5+
# modified according to attributes such as the number of held jobs and the
6+
# ratio of goodput to badput.
7+
8+
import os
9+
import re
10+
import sys
11+
12+
import htcondor2 as htcondor
13+
import classad2 as classad
14+
from htcondor2 import AdTypes
15+
16+
17+
MIN_PRIORITY_FACTOR = 500
18+
MAX_PRIORITY_FACTOR = 20000
19+
HELD_JOB_PENALTY_MULTIPLIER = 5
20+
21+
22+
def ad_int(ad, key, default=0):
    """Return ``ad[key]`` as an int, or ``default`` if that is not possible.

    Args:
        ad: mapping-like object (supports ``in`` and ``[]``) to read from.
        key: attribute name to look up.
        default: value returned when the key is missing, the value cannot
            be converted with ``int()``, or the conversion would lose
            information (for example ``7.5``).

    Returns:
        The integer value of ``ad[key]`` when it is integral, else ``default``.
    """
    if key not in ad:
        return default
    try:
        value = ad[key]
        number = int(value)
        # only accept values that are truly integral (7 or 7.0, not 7.5)
        if number == value:
            return number
    except Exception:
        # Unconvertible or incomparable value. Unlike a bare "except:", this
        # lets KeyboardInterrupt and SystemExit propagate.
        return default
    return default
31+
32+
33+
def step(n):
    """Round ``n`` to the nearest multiple of 100; exact halves round up.

    The built-in ``round()`` rounds halves to the nearest even number, which
    makes the buckets uneven (1050 -> 1000 but 1150 -> 1200). Floor division
    on ``n + 50`` gives every bucket the same width.

    Args:
        n: number (int or float) to round.

    Returns:
        The rounded value as an int.
    """
    return int((n + 50) // 100) * 100
36+
37+
38+
def main():
    """Recompute and apply priority factors for all submitters in the pool.

    Locates the pool's main negotiator through the collector, reads the
    current priority factors from it, derives a target factor for every
    submitter ad and calls ``setFactor`` for those whose factor changed.
    Exits with status 1 when no matching negotiator is found.
    """
    col = htcondor.Collector()

    # find the right negotiator
    neg_ad = None
    for ad in col.query(AdTypes.Negotiator):
        # the main negotiator of the pool starts with cm-1. or cm-2.
        # (raw string: "\." is not a valid escape in a plain string literal)
        if re.search(r"^cm-[12]\.", ad["Name"]):
            neg_ad = ad
    if not neg_ad:
        print("Unable to find the main negotiator")
        sys.exit(1)
    neg = htcondor.Negotiator(neg_ad)
    print(f"Updating negotiator {neg_ad['Name']}")

    # get the current prio ads so we can determine if we need
    # an update or not; indexed by name once instead of rescanning the
    # whole list for every submitter
    current_factors = {}
    for prio_ad in neg.getPriorities():
        if "Name" in prio_ad and "PriorityFactor" in prio_ad:
            current_factors[prio_ad["Name"]] = round(prio_ad["PriorityFactor"])

    for ad in col.query(AdTypes.Submitter):
        name = ad["Name"]

        # current factor; submitters the negotiator has no factor for are
        # treated as having a factor of 1000
        current_factor = current_factors.get(name, 1000)

        factor = 1000

        # held jobs
        factor += ad_int(ad, "HeldJobs", 0) * HELD_JOB_PENALTY_MULTIPLIER

        # make sure we finish jobs on ap2X before migration
        if ".uc.osg-htc.org" in name:
            factor -= 1000

        # upper/lower limits on the adjustments
        factor = min(factor, MAX_PRIORITY_FACTOR)
        factor = max(factor, MIN_PRIORITY_FACTOR)
        factor = step(factor)

        if factor != current_factor:
            print(f"  {name}  {current_factor} -> {factor}")
            neg.setFactor(name, factor)
83+
84+
85+
if __name__ == "__main__":
86+
main()
87+
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
#!/bin/bash
2+
3+
# this script is invoked by the OSG container init setup
4+
5+
6+
#######################################
# Report a fatal setup problem and terminate the script.
# Despite the name, this function itself only logs, waits and exits.
# Arguments: $1 - reason, appended to the log message
# Outputs:   the message on stderr
# Returns:   never returns; exits the script with status 1
#######################################
function kill_container()
{
    # diagnostics go to stderr (the original "2>&1" on echo was a no-op)
    echo "Restarting container: $1" >&2
    # NOTE(review): the pause presumably throttles restart loops - confirm
    sleep 180
    exit 1
}
12+
13+
# create an env file so we can have the cron jobs run with the
# same environment as the init.d run
if [ -e /etc/ospool.env ]; then
    . /etc/ospool.env
else
    # %q shell-quotes the values, so the file can be sourced back safely
    # whatever characters they contain
    {
        printf 'export OSPOOL_ENVIRONMENT=%q\n' "$OSPOOL_ENVIRONMENT"
        printf 'export OSPOOL_CCB_HOSTNAME=%q\n' "$OSPOOL_CCB_HOSTNAME"
    } >/etc/ospool.env
fi

# git repo - fetch / check for updates
NEEDS_UPDATE=1
cd /opt || kill_container "Unable to cd to /opt"
if [ ! -e osg-flock ]; then
    git clone https://github.com/opensciencegrid/osg-flock.git || kill_container "Unable to pull Git repo"
else
    # only continue with the config if changes are found
    cd osg-flock || kill_container "Unable to cd to /opt/osg-flock"
    git fetch --quiet
    # Count commits that exist on the upstream branch but NOT on local branch
    CHANGES_COUNT=$(git rev-list --count 'HEAD..@{u}')
    # string comparison: an empty count (rev-list failed) is simply "not 0",
    # so the configuration is refreshed instead of printing a test error
    if [[ "$CHANGES_COUNT" == "0" ]]; then
        echo "The osg-flock checkout is in sync with git repo. Nothing to do."
        NEEDS_UPDATE=0
    fi
    git pull
fi

if [[ "$NEEDS_UPDATE" -eq 1 ]]; then

    # fix ownership/permissions on mounted directories
    chown -R condor:condor /var/log/condor
    chown -R condor:condor /var/lib/condor/spool

    # most config comes from the shared github repo below, but here is
    # what is specific to the ccb pod
    cat >/etc/condor/config.d/10-ospool-ccb.config <<EOF

DAEMON_LIST = MASTER, SHARED_PORT, COLLECTOR

# FULL_HOSTNAME seems to be causing issues with HTCondor 23
#CONDOR_HOST = \$(FULL_HOSTNAME)
CONDOR_HOST = 127.0.0.1

HOST_ALIAS = $OSPOOL_CCB_HOSTNAME
TCP_FORWARDING_HOST = $OSPOOL_CCB_HOSTNAME

UPDATE_COLLECTOR_WITH_TCP = True

USE_SHARED_PORT = True
SHARED_PORT_MAX_WORKERS = 1000
SHARED_PORT_PORT = 9618

# Setup 10 child collectors
use feature:ChildCollector(1)
use feature:ChildCollector(2)
use feature:ChildCollector(3)
use feature:ChildCollector(4)
use feature:ChildCollector(5)
use feature:ChildCollector(6)
use feature:ChildCollector(7)
use feature:ChildCollector(8)
use feature:ChildCollector(9)
use feature:ChildCollector(10)

# no forwarding here - these are only used for CCB
TOP_COLLECTOR_HOST =

# limit logging
COLLECTOR1.MAX_COLLECTOR_LOG = \$(MAX_DEFAULT_LOG)
COLLECTOR2.MAX_COLLECTOR_LOG = \$(MAX_DEFAULT_LOG)
COLLECTOR3.MAX_COLLECTOR_LOG = \$(MAX_DEFAULT_LOG)
COLLECTOR4.MAX_COLLECTOR_LOG = \$(MAX_DEFAULT_LOG)
COLLECTOR5.MAX_COLLECTOR_LOG = \$(MAX_DEFAULT_LOG)
COLLECTOR6.MAX_COLLECTOR_LOG = \$(MAX_DEFAULT_LOG)
COLLECTOR7.MAX_COLLECTOR_LOG = \$(MAX_DEFAULT_LOG)
COLLECTOR8.MAX_COLLECTOR_LOG = \$(MAX_DEFAULT_LOG)
COLLECTOR9.MAX_COLLECTOR_LOG = \$(MAX_DEFAULT_LOG)
COLLECTOR10.MAX_COLLECTOR_LOG = \$(MAX_DEFAULT_LOG)

EOF

    echo "Installing HTCondor credentials..."
    # Glob the credential directories by absolute path instead of cd-ing
    # into them: if a directory is missing, nothing is installed, rather
    # than whatever happens to be in the current directory.
    for cred in /etc/ospool-creds/idkeys.d/*; do
        [ -e "$cred" ] || continue
        install -o root -g root -m 0600 "$cred" "/etc/condor/passwords.d/${cred##*/}"
    done
    for cred in /etc/ospool-creds/idtokens.d/*; do
        [ -e "$cred" ] || continue
        install -o root -g root -m 0600 "$cred" "/etc/condor/tokens.d/${cred##*/}"
    done
    # the gwms frontend generates tokens with kid=FRONTEND - for now make
    # sure we have a copy of our flock.opensciencegrid.org password in the
    # correct location
    install -o root -g root -m 0600 \
        /etc/condor/passwords.d/flock.opensciencegrid.org \
        /etc/condor/passwords.d/FRONTEND
    # SSL auth - the main hostcert comes from k8s certmanager
    install -o root -g root -m 0644 /etc/ospool-creds/tls.d/tls.crt /etc/pki/tls/certs/localhost.crt
    install -o root -g root -m 0600 /etc/ospool-creds/tls.d/tls.key /etc/pki/tls/private/localhost.key

    # condor config
    rm -f /etc/condor/config.d/*ospoolgit*
    cp "/opt/osg-flock/ospool.osg-htc.org/$OSPOOL_ENVIRONMENT/htcondor-config.d/"* /etc/condor/config.d/
    rm -f /etc/condor/config.d/90_high_availability.config
    rm -f /etc/condor/config.d/95_negotiator_osgflockgit.config

    # fe-admin is run from its own directory; without it the redirections
    # below would only truncate the live files, so bail out instead
    cd /opt/osg-flock/ospool.osg-htc.org || kill_container "Unable to cd to /opt/osg-flock/ospool.osg-htc.org"
    echo "Writing out new /etc/condor/certs/condor_mapfile ..."
    mkdir -p /etc/condor/certs
    ./fe-admin --target-env "$OSPOOL_ENVIRONMENT" --htcondor-mapfile >/etc/condor/certs/condor_mapfile
    echo "Writing out new /etc/condor/config.d/95_flocking_ospoolgit.config ..."
    ./fe-admin --target-env "$OSPOOL_ENVIRONMENT" --htcondor-config >/etc/condor/config.d/95_flocking_ospoolgit.config

    # this will fail during initial configuration, but work once the pool is up
    /usr/sbin/condor_reconfig || true

fi
133+
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
2+
# configuration changes
3+
*/5 * * * * root /bin/bash -c 'sleep $(( RANDOM \% 240 )); /etc/osg/image-init.d/60-ospool-ccb-config.sh' >/var/log/ospool-ccb-config.log 2>&1
4+
5+
6+
# logs
7+
0 * * * * root /opt/ospool/retain-logs >/dev/null 2>&1
8+
9+

0 commit comments

Comments
 (0)