-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathdist_longjob.sh
More file actions
executable file
·495 lines (399 loc) · 17 KB
/
dist_longjob.sh
File metadata and controls
executable file
·495 lines (399 loc) · 17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
#!/bin/bash
#DESCRIPTION checkpoint restart for long jobs, and batching of subsequent jobs into arrays
#
# Inspired by longjob, written by Dirk Colbry
# Written by Rosangela Canino-Koning
#
## Script acts as the initial job runner (CPR=0),
## or as the restorer of a checkpointed job (any other CPR value).
#
# Inputs (environment). These must be passed in via qsub -v, or be exported in
# the environment if calling dist_longjob.sh directly (not recommended):
#   TARGETDIR, STARTSEED, JOBNAME, JOBSEEDS, DEST_DIR, LSTRING, JOBCOMMAND,
#   CONFIGDIR, CPR, EMAILSCRIPT, USESCRATCH, DIST_QSUB_DIR, QSUB_DIR,
#   QSUB_FILE, MAX_QUEUE
# e.g. qsub -v JOBNAME=JOB_YES,TARGETDIR="/mnt/home/caninoko/tmp/qsub_dev/output/101"
# PBS_ARRAYID and PBS_JOBID are also read.
#
# Globals set here and used by the rest of the script:
#   BLCR_WAIT_SEC, seed, JOBTARGET, user, timeout_retries, PID

## Setup and Environment Variables

# Set the default wait time to just under four hours
export BLCR_WAIT_SEC=$(( 4 * 60 * 60 - 600 ))
#export BLCR_WAIT_SEC=30 # 30 seconds, for testing

# Log the configuration we were started with.
for var_name in TARGETDIR STARTSEED; do
  echo "${var_name} ${!var_name}"
done

# Each sub-job of the array works on its own seed / output directory.
seed=$(( STARTSEED + PBS_ARRAYID ))
JOBTARGET="${JOBNAME}_${seed}"

for var_name in seed JOBTARGET JOBNAME JOBSEEDS DEST_DIR LSTRING JOBCOMMAND \
  CONFIGDIR CPR EMAILSCRIPT USESCRATCH DIST_QSUB_DIR QSUB_DIR QSUB_FILE \
  MAX_QUEUE; do
  echo "${var_name} ${!var_name}"
done
unset var_name

user=$(whoami)
timeout_retries=0
job_dir="${TARGETDIR}/${JOBTARGET}"

###### get the job going
# An unset or non-numeric CPR falls through to the restart branch, which is
# what the previous unquoted test did as well.
if [ "${CPR:-1}" -eq 0 ]; then ## initial
  ## do the initial work
  # We have no clue where this was actually submitted from, but we know
  # the configdir is at the level below it

  # create the directory where we will do our work
  mkdir -p "$job_dir"
  echo "mkdir $job_dir"

  # copy the config dir
  cp -r "${CONFIGDIR}"/* "$job_dir"
  echo "cp -r ${CONFIGDIR}/* $job_dir"

  # head to the working directory; without it command.sh and run.log would
  # be written wherever the job happened to start
  cd "$job_dir" || { echo "Cannot cd to $job_dir" >&2; exit 1; }
  echo "cd $job_dir"

  # dump out the JOBCOMMAND
  {
    printf '%s\n' '#!/bin/bash'
    printf '%s\n' "$JOBCOMMAND"
  } > command.sh
  chmod 755 ./command.sh

  # Add this ID to the list of ids associated with this chunk of jobs.
  # The array id is everything before the last '[' of PBS_JOBID.
  trimmedid=${PBS_JOBID%\[*}
  echo "$trimmedid" >> "${QSUB_FILE}_successor_jobs.txt"

  # and run it with cr_run
  cr_run ./command.sh 1> run.log 2>&1 &
  export PID=$!
else ## restart an existing job!
  # Double-check that this job isn't already done (someone might have been
  # trying to resubmit other jobs in the array). Any number of matching
  # entries counts as done, the same rule resubmit_array applies.
  isdone=$(grep -c -w -- "${PBS_ARRAYID}" "${QSUB_FILE}_done_arrayjobs.txt")
  if [ "${isdone:-0}" -ge 1 ]; then
    echo "Job already done"
    exit 0
  fi

  # go to the final location, where we should've stashed our checkpoint
  cd "$job_dir" || { echo "Cannot cd to $job_dir" >&2; exit 1; }

  # restart our job, using the pwd we saved before!
  echo "Restarting!"
  echo "HEYA RESTARTING" >> run.log
  # One handler per cr_restart failure class. The args handler used to be
  # spelled --run-on-fail-temp a second time.
  cr_restart --no-restore-pid \
    --run-on-fail-temp="echo temp_fail" \
    --run-on-fail-perm="echo perm_fail" \
    --run-on-fail-env="echo env_fail" \
    --run-on-fail-args="echo args_fail" \
    --run-on-success="echo Success" \
    --file checkpoint.blcr >> run.log 2>&1 &
  PID=$!
fi
#######################################
# Pack the current working directory into a tarball, move the tarball into
# the job's output directory and unpack it there.
# No caller is visible in this file.
# Globals:   TARGETDIR, JOBTARGET (read)
# Effects:   on success the shell's working directory is
#            $TARGETDIR/$JOBTARGET and the tarball has been removed.
# Returns:   1 if the destination cannot be created, the tarball cannot be
#            moved or the destination cannot be entered; otherwise the
#            status of the final rm.
#######################################
copy_out() {
  local archive="dist_transfer.tar.gz"
  local dest="${TARGETDIR}/${JOBTARGET}"

  # Without an existing directory, mv would rename the tarball to $dest.
  mkdir -p -- "$dest" || return 1

  # tar's status is deliberately not checked: the archive is written into
  # the directory being archived, which tar can report as a warning.
  tar czf "$archive" .
  mv -- "$archive" "$dest" || return 1
  cd -- "$dest" || return 1
  tar xzf "$archive"
  rm -- "$archive"
}
# Succeeds when at least one of the given paths is a regular file. Called
# with an unquoted glob so the shell does the matching; an unmatched glob
# is passed through literally and simply does not exist.
_dlj_marker_exists() {
  local candidate
  for candidate in "$@"; do
    [ -f "$candidate" ] && return 0
  done
  return 1
}

# Print the array job id (the first qstat column up to, not including, its
# last '[') of the first job of $PBS_O_LOGNAME whose qstat line contains the
# literal string $1. Prints nothing when no line matches.
_dlj_successor_array_id() {
  qstat -u "$PBS_O_LOGNAME" \
    | awk -v name="$1" 'index($0, name) { id = $1; sub(/\[[^[]*$/, "", id); print id; exit }'
}

#######################################
# Make sure a held successor array exists for this array, release this
# sub-job's slot in it, and delete sub-jobs that are no longer needed.
#
# Every sub-job of the array runs this, so the sub-jobs elect one of
# themselves to submit the successor: each drops a marker file
# $TARGETDIR/<successor name>.<array index>, and the one whose marker sorts
# first submits. The winner also releases "orphaned" sub-jobs: ones still
# held in the current array that are neither recorded as done nor running
# in any recorded array.
#
# Globals read: PBS_JOBID, PBS_ARRAYID, PBS_O_LOGNAME, JOBNAME, TARGETDIR,
#   DEST_DIR, JOBSEEDS, STARTSEED, LSTRING, EMAILSCRIPT, DIST_QSUB_DIR,
#   QSUB_DIR, QSUB_FILE, MAX_QUEUE
# Files: appends to ${QSUB_FILE}_successor_jobs.txt,
#        reads ${QSUB_FILE}_done_arrayjobs.txt
# Outputs: progress messages on stdout
#######################################
resubmit_array() {
  local trimmedid sname combinedstatus leader corrected_lstring mysid sid
  local jid isdone running hits suc j p
  local succ_file="${QSUB_FILE}_successor_jobs.txt"
  local done_file="${QSUB_FILE}_done_arrayjobs.txt"
  local -a qsub_cmd

  ## calculate what the successor job's name should be
  # trim out the excess after the [ from the jobID
  trimmedid=${PBS_JOBID%\[*}
  # now, trim the completed name down to 16 characters because that's
  # what'll show up on qstat
  sname="${trimmedid}_${JOBNAME}"
  sname=${sname:0:16}
  echo "$sname"

  # sleep a random amount of time (to break up the identical stacks of jobs)
  sleep "$(( 3 + RANDOM % 10 ))"

  # look through qstat until you find the name
  echo "Looking for a queued successor named $sname (qstat -u $PBS_O_LOGNAME)"
  combinedstatus=$(qstat -u "$PBS_O_LOGNAME" | grep -c -F -- "$sname")

  # if we didn't find it, go ahead and race to make the successor job
  # ourselves and start it in a held state
  if [ "${combinedstatus:-0}" -lt 1 ]; then
    # check if someone else is already in charge.
    if ! _dlj_marker_exists "$TARGETDIR/$sname".*; then
      # throw my hat in the ring
      touch "$TARGETDIR/${sname}.${PBS_ARRAYID}"
      sleep 5

      # ooh, it's a race: the marker that sorts first wins.
      leader=$(printf '%s\n' "$TARGETDIR/$sname".* | sort | head -n 1)
      if [[ "$leader" == "$TARGETDIR/$sname.${PBS_ARRAYID}" ]]; then
        ## it's me!
        echo "WON THE RACE"

        # Word splitting is intentional here: it collapses runs of blanks so
        # the resource list becomes one comma-separated value for -l.
        # shellcheck disable=SC2086
        corrected_lstring=$(echo $LSTRING | tr ' ' ',')

        # Build the command once so what is logged is what is run.
        # QSUB_DIR is forwarded because handle_didnt_timeout reads it when
        # the restarted job completes.
        qsub_cmd=(
          qsub -h
          -l "$corrected_lstring"
          -N "$sname"
          -o "${DEST_DIR}/${JOBNAME}_message.log"
          -t "$JOBSEEDS"
          -v "STARTSEED=${STARTSEED},TARGETDIR=${TARGETDIR},JOBNAME=${JOBNAME},DEST_DIR=${DEST_DIR},JOBSEEDS=${JOBSEEDS},LSTRING=${LSTRING},CPR=1,EMAILSCRIPT=${EMAILSCRIPT},DIST_QSUB_DIR=${DIST_QSUB_DIR},QSUB_DIR=${QSUB_DIR},QSUB_FILE=${QSUB_FILE},MAX_QUEUE=${MAX_QUEUE}"
          "${DIST_QSUB_DIR}/dist_longjob.sh"
        )
        echo "${qsub_cmd[*]}"
        "${qsub_cmd[@]}"

        sleep 10
        rm -f -- "$TARGETDIR/$sname".* # clean up

        ### Grab the ID of the job we just made and stuff it into the jobs file
        ### Original ID should have already been added at start-up
        echo "Looking up the id of the new successor array $sname"
        mysid=$(_dlj_successor_array_id "$sname")

        if [[ -z "$mysid" ]]; then
          # Without an id there is nothing to release orphans into, and the
          # clean-up below would delete held sub-jobs for no benefit.
          echo "Could not find the new successor array in qstat; skipping orphan recovery."
        else
          echo "$mysid" >> "$succ_file"

          # Attempt to restart any orphaned jobs (i.e. jobs that should run
          # but that don't have any jobs around that could possibly start
          # them - this happens if the precursor dies in a weird way).
          # Start by iterating over all jobs in the current array that are
          # still in the held state (since it's suspicious that they haven't
          # even started running and this one is already done). The match is
          # anchored so that other arrays whose id merely contains ours are
          # not picked up.
          for jid in $(qselect -s H -u "$PBS_O_LOGNAME" \
            | grep -- "^${trimmedid}\[" | cut -d '[' -f 2 | cut -d ']' -f 1); do
            # If this job is already completely done, go to the next
            # iteration so we don't accidentally restart it
            isdone=$(grep -c -w -- "$jid" "$done_file")
            if [ "${isdone:-0}" -ge 1 ]; then
              continue
            fi

            running=0
            echo "Checking for orphaned jobs. JID:" "$jid"
            while read -r suc || [[ -n $suc ]]; do
              [[ -n "$suc" ]] || continue
              # Is this job id running in any prior array? If so, it just
              # got really far behind. No action required.
              hits=$(qstat -t "${suc}[${jid}]" | tail -n +3 | tr -s ' ' \
                | cut -f 5 -d ' ' | grep -c "R")
              running=$(( running + ${hits:-0} ))
            done < "$succ_file"

            if [ "$running" -lt 1 ]; then
              echo "Job isn't running. Restarting it"
              # Cleanup any jobs that were supposed to run this but never
              # got released - we're in charge now
              while read -r suc || [[ -n $suc ]]; do
                [[ -n "$suc" ]] || continue
                if [[ "$suc" != "$mysid" ]]; then
                  qdel "${suc}[${jid}]"
                fi
              done < "$succ_file"
              # Run the job
              echo qrls -t "$jid" "${mysid}[]"
              qrls -t "$jid" "${mysid}[]"
            fi
          done
        fi
      else
        # oop, lost the race
        echo "Lost the race, letting winner do the thing."
        sleep 10
      fi
    else
      echo "Someone else is already in charge, letting leader do the thing."
      # someone else is already doing it.
      sleep 10
    fi
  fi

  # now, find the ID of the successor job
  # trim it down so we can send messages to it.
  echo "Looking up the successor array id for $sname"
  sid=$(_dlj_successor_array_id "$sname")

  # send an un-hold message to our particular successor sub-job
  echo "qrls -t $PBS_ARRAYID ${sid}[]"
  qrls -t "$PBS_ARRAYID" "${sid}[]"

  # delete all the finished jobs we know about (for sanity)
  echo "Deleting all other unneeded successor subjobs."
  while read -r j || [[ -n $j ]]; do
    [[ -n "$j" ]] || continue
    while read -r p || [[ -n $p ]]; do
      [[ -n "$p" ]] || continue
      echo qdel "${j}[${p}]"
      qdel "${j}[${p}]"
    done < "$done_file"
  done < "$succ_file"

  echo "Done with Timeout and Checkpoint Processing"
}
#######################################
# Checkpoint the running job with cr_checkpoint (asking it to terminate the
# job) and then hand over to resubmit_array.
# Globals:  PID (read) - process to checkpoint
# Files:    writes checkpoint.blcr, keeping the previous one as
#           checkpoint_safe.blcr; removes a stale .checkpoint.blcr.tmp
# Exits:    with status 2 when checkpointing failed and neither checkpoint
#           file exists. The visible callers run this function inside a
#           background subshell, so that exit ends the subshell.
#######################################
checkpoint_timeout() {
  local status

  echo "Timeout. Checkpointing Job"

  # Sometimes, when checkpointing fails, it leaves behind a file called
  # .checkpoint.blcr.tmp, which causes all future attempts to run
  # cr_checkpoint to fail. There is no reason that a file like this should
  # exist immediately before we call cr_checkpoint, so this is a safe time
  # to get rid of it if necessary.
  if [ -f .checkpoint.blcr.tmp ]; then
    echo "Removing .checkpoint.blcr.tmp so it doesn't confuse cr_checkpoint"
    rm -f .checkpoint.blcr.tmp
  fi

  time cr_checkpoint --term -f checkpoint.blcr --backup=checkpoint_safe.blcr --kmsg-warning --time 300 "$PID"
  status=$?

  if [ "$status" -ne 0 ]; then
    echo "Failed to checkpoint."
    # If there were no successful checkpoints letting this get resubmitted
    # again won't help. It will have to be resubmitted manually.
    # TODO: There's probably a way to make that happen automatically
    if [ ! -f checkpoint.blcr ] && [ ! -f checkpoint_safe.blcr ]; then
      exit 2
    fi
  fi

  resubmit_array
}
# Arm the checkpoint timer as a background job. If the job is still running
# when BLCR_WAIT_SEC seconds have passed, the timer calls checkpoint_timeout.
# checkpoint_timeout asks cr_checkpoint to terminate the job, which is what
# makes the wait on ${PID} further down return in that case. The timer can
# still be busy (re-submitting the array, etc.) after that wait has
# returned.
echo $BLCR_WAIT_SEC
{
  sleep $BLCR_WAIT_SEC
  echo 'Timer Done'
  checkpoint_timeout
} &
timeout=$!
echo "starting timer (${timeout}) for $BLCR_WAIT_SEC seconds"
printf '%s\n' "Waiting on cr_run job: $PID" "ZZzzzzz"
# RET must capture the status of this wait; keep the two lines adjacent.
wait ${PID}
RET=$?
###############################################################################
############### NOW WE WAIT ###################################################
###############################################################################
# Restart the job from checkpoint file $1 (first renamed to $2 so the same
# file is not tried twice), drop a debugging marker named $3_<pid>, arm a
# half-length checkpoint timer, wait for the job and process its result with
# handle_didnt_timeout. Sets the globals PID, timeout, RET and increments
# timeout_retries. Never returns: if the nested handle_didnt_timeout comes
# back, the restarted job ran to completion and has been cleaned up, so the
# script exits 0 instead of falling into the caller's give-up path.
_dlj_retry_from_checkpoint() {
  local checkpoint=$1 tried=$2 marker=$3

  echo "Restarting..."
  mv "$checkpoint" "$tried"
  # One handler per cr_restart failure class. The args handler used to be
  # spelled --run-on-fail-temp a second time.
  cr_restart --no-restore-pid \
    --run-on-fail-temp="echo temp_fail" \
    --run-on-fail-perm="echo perm_fail" \
    --run-on-fail-env="echo env_fail" \
    --run-on-fail-args="echo args_fail" \
    --run-on-success="echo Success" \
    --file "$tried" >> run.log 2>&1 &
  PID=$!

  #debugging
  touch "${marker}_${PID}"
  timeout_retries=$(( timeout_retries + 1 ))

  #Dividing it by 2 is probably overkill - just trying to play it safe.
  (sleep "$(( BLCR_WAIT_SEC / 2 ))"; echo 'Timer Done'; checkpoint_timeout;) &
  timeout=$!
  echo "starting timer (${timeout}) for $BLCR_WAIT_SEC / 2 seconds"
  echo "Waiting on cr_run job: $PID"
  echo "ZZzzzzz"
  wait "${PID}"
  RET=$?

  handle_didnt_timeout
  exit 0
}

#######################################
# Decide what to do once the wait on the job has returned.
#
#   RET = 143       the job was terminated by the checkpoint timer: wait
#                   for the timer to finish its processing, then exit 0.
#   RET = 132, 139  the job crashed (128+4 and 128+11, i.e. SIGILL and
#                   SIGSEGV on Linux): retry from checkpoint.blcr, then from
#                   checkpoint_safe.blcr; once timeout_retries reaches 3,
#                   restore the checkpoint files and resubmit the array,
#                   giving up after three such failures in a row. Exits 0.
#   anything else   the job finished: delete successor sub-jobs that are no
#                   longer needed, notify EMAILSCRIPT, retire the qsub file,
#                   possibly start the scheduler, and record this array
#                   index as done. This is the only path that returns.
#
# Globals read: RET, timeout, timeout_retries, PID, PBS_JOBID, PBS_ARRAYID,
#   PBS_O_LOGNAME, JOBNAME, EMAILSCRIPT, USER, user, QSUB_FILE, QSUB_DIR,
#   DIST_QSUB_DIR, MAX_QUEUE, BLCR_WAIT_SEC
# Globals written: RET (exported), and PID, timeout, timeout_retries on the
#   retry path
#######################################
handle_didnt_timeout() {
  local trimmedid sname sid j p current_jobs stale

  # Ooh, we're executing again. Something musta happened.
  # Check to see if we're moving along again because the job checkpointed
  if [ "${RET}" = "143" ]; then # Job terminated due to cr_checkpoint
    # Clear repeated failure tracker
    rm -f last_failed last_two_failed 2> /dev/null
    echo "AWAKE - Job seems to have been checkpointed, waiting for checkpoint_timeout function to finish processing."
    wait "${timeout}"
    echo "See you next time around..."
    exit 0
  fi

  echo "$timeout_retries timeouts"

  # ELSE:
  ######################### JOB COMPLETED ##############################
  # We're actually executing again because the job finished (no
  # checkpointing). This could happen for a couple reasons.
  # 1. Either the job legit finished,
  # 2. The job crashed on checkpoint restart, as in, it never started up. :(
  # Either way, we have some cleanup to do. :/

  #Kill timeout timer
  kill "${timeout}" # prevent it from doing anything dumb.

  echo "Sub-job seems to have finished. Here's the return code: "
  echo "${RET}"

  if [ "${RET}" = "132" ] || [ "${RET}" = "139" ]; then # Job crashed
    echo "CRASH - Job seems to have crashed, but it's unclear how."
    echo "Attempting crash recovery. Retries: $timeout_retries"

    # Try the latest checkpoint first, then the backup. Neither call
    # returns; see _dlj_retry_from_checkpoint.
    if [ -f checkpoint.blcr ] && [ "$timeout_retries" -lt 2 ]; then
      _dlj_retry_from_checkpoint checkpoint.blcr checkpoint_tried.blcr \
        attempted_recovery_check
    elif [ -f checkpoint_safe.blcr ] && [ "$timeout_retries" -lt 3 ]; then
      _dlj_retry_from_checkpoint checkpoint_safe.blcr checkpoint_safe_tried.blcr \
        attempted_recovery_checksafe
    fi

    #debugging
    if [ "$timeout_retries" -eq 3 ]; then
      touch "array_resubmited_$PID"
      echo "Restoring checkpoint files since it's unlikely they're both corrupted. This was probably caused by something else, like running on the wrong node"
      for stale in checkpoint_safe checkpoint; do
        if [ -f "${stale}_tried.blcr" ]; then
          mv "${stale}_tried.blcr" "${stale}.blcr"
        fi
      done

      if [ -f last_failed ]; then
        if [ -f last_two_failed ]; then
          echo "Third array resubmit fail in a row... this isn't working"
          echo "Letting this job die. Maybe it will get recovered by another job"
          rm last_failed
          rm last_two_failed
          touch complete_array_resubmit_failure
          exit 0
        fi
        echo "Hmmm... second array resubmit fail in a row. This isn't looking good."
        touch last_two_failed
      else
        touch last_failed
      fi

      echo "Resubmitting array... hopefully this will work next time around"
      resubmit_array
    fi
    exit 0
  fi

  echo "Cleanup time"
  ## delete our successor job, should there be one
  echo "Cleanup - PREPPING TO DELETE UN-NEEDED SUBJOBS"
  # trim out the excess after the [ from the jobID
  trimmedid=${PBS_JOBID%\[*}
  echo "trimmedid = $trimmedid"
  # now, trim the completed name down to 16 characters because that's
  # what'll show up on qstat
  sname="${trimmedid}_${JOBNAME}"
  sname=${sname:0:16}
  echo "sname = $sname"

  # First qstat line mentioning the successor name; keep the first column up
  # to its last '['.
  sid=$(qstat -u "$PBS_O_LOGNAME" \
    | awk -v name="$sname" 'index($0, name) { id = $1; sub(/\[[^[]*$/, "", id); print id; exit }')
  echo "Found Successor ID: $sid"

  if [ -n "$sid" ]; then
    echo "Deleting unneeded successor subjob:" "$sid"
    echo qdel -t "$PBS_ARRAYID" "${sid}[]"
    qdel -t "$PBS_ARRAYID" "${sid}[]"
  else
    echo "No successor job found."
  fi

  #delete all the finished jobs we know about (for sanity)
  echo "Deleting all other unneeded successor subjobs."
  while read -r j || [[ -n $j ]]; do
    [[ -n "$j" ]] || continue
    # Successors for jobs that have already been marked as done
    while read -r p || [[ -n $p ]]; do
      [[ -n "$p" ]] || continue
      echo qdel "${j}[${p}]"
      qdel "${j}[${p}]"
    done < "${QSUB_FILE}_done_arrayjobs.txt"
    # Delete this job from other arrays
    if [[ "$j" != "$trimmedid" ]]; then
      qdel "${j}[${PBS_ARRAYID}]"
    fi
  done < "${QSUB_FILE}_successor_jobs.txt"

  #Notify the email script that we're done.
  # If all sub-jobs are done, it'll email the user that
  # the job has completed
  if [ -n "$EMAILSCRIPT" ]; then
    # EMAILSCRIPT is left unquoted on purpose in case it holds a command
    # together with arguments. NOTE(review): confirm against the submitter.
    # shellcheck disable=SC2086
    $EMAILSCRIPT "$PBS_JOBID" "$USER" " " "$JOBNAME"
  fi
  echo "Sub-job completed with exit status ${RET}"

  #create task finished file
  cp "${QSUB_FILE}" "${QSUB_FILE}_done"
  echo "${QSUB_FILE} is done"
  #remove lock file
  rm "${QSUB_FILE}_done.lock" 2> /dev/null
  echo "Lock removed"
  #remove original qsub file so we don't have to keep trying to submit it
  rm "${QSUB_FILE}" 2> /dev/null
  echo "Original qsub file removed"

  echo "Checking to see if there are more jobs that should be started"
  qstat -f "${PBS_JOBID}" | grep "used"
  export RET

  # Make sure not to submit too many jobs
  current_jobs=$(showq -u "$user" | tail -n 2 | head -n 1 | cut -d " " -f 4)
  echo "There are currently ${current_jobs} jobs in the queue"
  if [ ! -f "$QSUB_DIR/finished.txt" ]; then # If "finished.txt" exists, no more tasks need to be done
    # submits the next job, but only when both numbers are usable
    if [[ "$current_jobs" =~ ^[0-9]+$ && "$MAX_QUEUE" =~ ^[0-9]+$ ]] \
      && [ "$current_jobs" -lt "$MAX_QUEUE" ]; then
      echo "Trying to submit another job"
      # An empty QSUB_DIR is dropped rather than passed as an empty
      # argument, as the unquoted call did.
      python "$DIST_QSUB_DIR/scheduler.py" "${PBS_JOBID}" ${QSUB_DIR:+"$QSUB_DIR"}
    fi
  fi

  ## mark our job as being complete, so it gets cleaned up in later
  ## iterations. This happens at the very end so no other jobs try to clean
  ## it up while it's still doing cleanup
  echo "$PBS_ARRAYID" >> "${QSUB_FILE}_done_arrayjobs.txt"
}
# The first pass through handle_didnt_timeout counts as attempt number one.
timeout_retries=$(( timeout_retries + 1 ))
handle_didnt_timeout
# Only reached when handle_didnt_timeout returns instead of exiting.
printf '%s\n' "Done with everything"