@@ -143,7 +143,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
143143 res = await session .execute (
144144 select (RunModel )
145145 .where (RunModel .id == job_model .run_id )
146- .options (joinedload (RunModel .project ). joinedload ( ProjectModel . backends ) )
146+ .options (joinedload (RunModel .project ))
147147 .options (joinedload (RunModel .user ))
148148 .options (joinedload (RunModel .repo ))
149149 .options (joinedload (RunModel .jobs ))
@@ -163,22 +163,18 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
163163
164164 job = find_job (run .jobs , job_model .replica_num , job_model .job_num )
165165
166- # Wait until all other jobs in the replica are provisioned
167- for other_job in run .jobs :
168- if (
169- other_job .job_spec .replica_num == job .job_spec .replica_num
170- and other_job .job_submissions [- 1 ].status == JobStatus .SUBMITTED
171- ):
172- job_model .last_processed_at = common_utils .get_current_datetime ()
173- await session .commit ()
174- return
175-
176- server_ssh_private_keys = get_instance_ssh_private_keys (
177- common_utils .get_or_error (job_model .instance )
178- )
179-
180166 initial_status = job_model .status
181167 if initial_status in [JobStatus .PROVISIONING , JobStatus .PULLING ]:
168+ # Wait until all other jobs in the replica are provisioned
169+ for other_job in run .jobs :
170+ if (
171+ other_job .job_spec .replica_num == job .job_spec .replica_num
172+ and other_job .job_submissions [- 1 ].status == JobStatus .SUBMITTED
173+ ):
174+ job_model .last_processed_at = common_utils .get_current_datetime ()
175+ await session .commit ()
176+ return
177+
182178 cluster_info = _get_cluster_info (
183179 jobs = run .jobs ,
184180 replica_num = job .job_spec .replica_num ,
@@ -210,94 +206,98 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
210206 job_model .last_processed_at = common_utils .get_current_datetime ()
211207 return
212208
209+ server_ssh_private_keys = get_instance_ssh_private_keys (
210+ common_utils .get_or_error (job_model .instance )
211+ )
212+
213213 if initial_status == JobStatus .PROVISIONING :
214214 if job_provisioning_data .hostname is None :
215215 await _wait_for_instance_provisioning_data (job_model = job_model )
216+ job_model .last_processed_at = common_utils .get_current_datetime ()
217+ await session .commit ()
218+ return
219+ if _should_wait_for_other_nodes (run , job , job_model ):
220+ job_model .last_processed_at = common_utils .get_current_datetime ()
221+ await session .commit ()
222+ return
223+
224+ # fails are acceptable until timeout is exceeded
225+ if job_provisioning_data .dockerized :
226+ logger .debug (
227+ "%s: process provisioning job with shim, age=%s" ,
228+ fmt (job_model ),
229+ job_submission .age ,
230+ )
231+ ssh_user = job_provisioning_data .username
232+ user_ssh_key = run .run_spec .ssh_key_pub .strip ()
233+ public_keys = [project .ssh_public_key .strip (), user_ssh_key ]
234+ if job_provisioning_data .backend == BackendType .LOCAL :
235+ # No need to update ~/.ssh/authorized_keys when running shim locally
236+ user_ssh_key = ""
237+ success = await common_utils .run_async (
238+ _process_provisioning_with_shim ,
239+ server_ssh_private_keys ,
240+ job_provisioning_data ,
241+ None ,
242+ run ,
243+ job_model ,
244+ job_provisioning_data ,
245+ volumes ,
246+ job .job_spec .registry_auth ,
247+ public_keys ,
248+ ssh_user ,
249+ user_ssh_key ,
250+ )
216251 else :
217- if _should_wait_for_other_nodes (run , job , job_model ):
218- job_model .last_processed_at = common_utils .get_current_datetime ()
219- await session .commit ()
220- return
252+ logger .debug (
253+ "%s: process provisioning job without shim, age=%s" ,
254+ fmt (job_model ),
255+ job_submission .age ,
256+ )
257+ # FIXME: downloading file archives and code here is a waste of time if
258+ # the runner is not ready yet
259+ file_archives = await _get_job_file_archives (
260+ session = session ,
261+ archive_mappings = job .job_spec .file_archives ,
262+ user = run_model .user ,
263+ )
264+ code = await _get_job_code (
265+ session = session ,
266+ project = project ,
267+ repo = repo_model ,
268+ code_hash = _get_repo_code_hash (run , job ),
269+ )
221270
222- # fails are acceptable until timeout is exceeded
223- if job_provisioning_data .dockerized :
224- logger .debug (
225- "%s: process provisioning job with shim, age=%s" ,
226- fmt (job_model ),
227- job_submission .age ,
228- )
229- ssh_user = job_provisioning_data .username
230- user_ssh_key = run .run_spec .ssh_key_pub .strip ()
231- public_keys = [project .ssh_public_key .strip (), user_ssh_key ]
232- if job_provisioning_data .backend == BackendType .LOCAL :
233- # No need to update ~/.ssh/authorized_keys when running shim locally
234- user_ssh_key = ""
235- success = await common_utils .run_async (
236- _process_provisioning_with_shim ,
237- server_ssh_private_keys ,
238- job_provisioning_data ,
239- None ,
240- run ,
241- job_model ,
242- job_provisioning_data ,
243- volumes ,
244- job .job_spec .registry_auth ,
245- public_keys ,
246- ssh_user ,
247- user_ssh_key ,
248- )
249- else :
250- logger .debug (
251- "%s: process provisioning job without shim, age=%s" ,
271+ success = await common_utils .run_async (
272+ _submit_job_to_runner ,
273+ server_ssh_private_keys ,
274+ job_provisioning_data ,
275+ None ,
276+ run ,
277+ job_model ,
278+ job ,
279+ cluster_info ,
280+ code ,
281+ file_archives ,
282+ secrets ,
283+ repo_creds ,
284+ success_if_not_available = False ,
285+ )
286+
287+ if not success :
288+ # check timeout
289+ if job_submission .age > get_provisioning_timeout (
290+ backend_type = job_provisioning_data .get_base_backend (),
291+ instance_type_name = job_provisioning_data .instance_type .name ,
292+ ):
293+ logger .warning (
294+ "%s: failed because runner has not become available in time, age=%s" ,
252295 fmt (job_model ),
253296 job_submission .age ,
254297 )
255- # FIXME: downloading file archives and code here is a waste of time if
256- # the runner is not ready yet
257- file_archives = await _get_job_file_archives (
258- session = session ,
259- archive_mappings = job .job_spec .file_archives ,
260- user = run_model .user ,
261- )
262- code = await _get_job_code (
263- session = session ,
264- project = project ,
265- repo = repo_model ,
266- code_hash = _get_repo_code_hash (run , job ),
267- )
268-
269- success = await common_utils .run_async (
270- _submit_job_to_runner ,
271- server_ssh_private_keys ,
272- job_provisioning_data ,
273- None ,
274- run ,
275- job_model ,
276- job ,
277- cluster_info ,
278- code ,
279- file_archives ,
280- secrets ,
281- repo_creds ,
282- success_if_not_available = False ,
283- )
284-
285- if not success :
286- # check timeout
287- if job_submission .age > get_provisioning_timeout (
288- backend_type = job_provisioning_data .get_base_backend (),
289- instance_type_name = job_provisioning_data .instance_type .name ,
290- ):
291- logger .warning (
292- "%s: failed because runner has not become available in time, age=%s" ,
293- fmt (job_model ),
294- job_submission .age ,
295- )
296- job_model .status = JobStatus .TERMINATING
297- job_model .termination_reason = (
298- JobTerminationReason .WAITING_RUNNER_LIMIT_EXCEEDED
299- )
300- # instance will be emptied by process_terminating_jobs
298+ job_model .status = JobStatus .TERMINATING
299+ job_model .termination_reason = JobTerminationReason .WAITING_RUNNER_LIMIT_EXCEEDED
300+ # instance will be emptied by process_terminating_jobs
301301
302302 else : # fails are not acceptable
303303 if initial_status == JobStatus .PULLING :
0 commit comments