Skip to content

Commit efa1e47

Browse files
authored
Merge pull request #325 from jedwards4b/pio_asyncio_in_cmeps
enable asyncio using pio ### Description of changes Allows IO tasks to be independent of compute tasks in cesm ### Specific notes (testing in progress) Contributors other than yourself, if any: Depends on share (ESCOMP/CESM_share#37) and cime (ESMCI/cime#4340). CMEPS Issues Fixed (include github issue #): Are changes expected to change answers? (specify if bfb, different at roundoff, more substantial) Any User Interface Changes (namelist or namelist defaults changes)? ### Testing performed Testing performed if application target is CESM: - [ ] (recommended) CIME_DRIVER=nuopc scripts_regression_tests.py - machines: - details (e.g. failed tests): - [ ] (recommended) CESM testlist_drv.xml - machines and compilers: - details (e.g. failed tests): - [X] (optional) CESM prealpha test - machines and compilers cheyenne intel - details (e.g. failed tests): results consistent with cesm2_3_alpha10d - [ ] (other) please describe in detail - machines and compilers - details (e.g. failed tests): Testing performed if application target is UFS-coupled: - [ ] (recommended) UFS-coupled testing - description: - details (e.g. failed tests): Testing performed if application target is UFS-HAFS: - [X] (recommended) UFS-HAFS testing - description: - details (e.g. failed tests): ### Hashes used for testing: - [ ] CESM: - repository to check out: https://github.com/ESCOMP/CESM.git - branch/hash: - [ ] UFS-coupled, then umbrella repository to check out and associated hash: - repository to check out: - branch/hash: - [ ] UFS-HAFS, then umbrella repository to check out and associated hash: - repository to check out: - branch/hash:
2 parents 1f06c58 + c2f8792 commit efa1e47

12 files changed

Lines changed: 832 additions & 327 deletions

File tree

cesm/driver/ensemble_driver.F90

Lines changed: 198 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,13 @@ module Ensemble_driver
1616

1717
public :: SetServices
1818
private :: SetModelServices
19+
private :: ensemble_finalize
1920

21+
integer, allocatable :: asyncio_petlist(:)
22+
logical :: asyncio_task=.false.
23+
logical :: asyncIO_available=.false.
24+
integer :: number_of_members
25+
integer :: inst ! ensemble instance containing this task
2026
character(*),parameter :: u_FILE_u = &
2127
__FILE__
2228

@@ -26,9 +32,12 @@ module Ensemble_driver
2632

2733
subroutine SetServices(ensemble_driver, rc)
2834

29-
use NUOPC , only : NUOPC_CompDerive, NUOPC_CompSpecialize
35+
use NUOPC , only : NUOPC_CompDerive, NUOPC_CompSpecialize, NUOPC_CompAttributeSet
36+
use NUOPC , only : NUOPC_CompAttributeGet
3037
use NUOPC_Driver , only : driver_routine_SS => SetServices
3138
use NUOPC_Driver , only : ensemble_label_SetModelServices => label_SetModelServices
39+
use NUOPC_Driver , only : ensemble_label_PostChildrenAdvertise => label_PostChildrenAdvertise
40+
use NUOPC_Driver , only : label_Finalize
3241
use ESMF , only : ESMF_GridComp, ESMF_GridCompSet
3342
use ESMF , only : ESMF_Config, ESMF_ConfigCreate, ESMF_ConfigLoadFile
3443
use ESMF , only : ESMF_SUCCESS, ESMF_LogWrite, ESMF_LOGMSG_INFO
@@ -38,6 +47,7 @@ subroutine SetServices(ensemble_driver, rc)
3847

3948
! local variables
4049
type(ESMF_Config) :: config
50+
logical :: isPresent
4151
character(len=*), parameter :: subname = "(ensemble_driver.F90:SetServices)"
4252
!---------------------------------------
4353

@@ -53,6 +63,14 @@ subroutine SetServices(ensemble_driver, rc)
5363
specRoutine=SetModelServices, rc=rc)
5464
if (chkerr(rc,__LINE__,u_FILE_u)) return
5565

66+
! PostChildrenAdvertise is a NUOPC specialization which happens after Advertise but before Realize
67+
! We have overloaded this specialization location to initialize IO.
68+
! So after all components have called Advertise but before any component calls Realize
69+
! IO will be initialized and any async IO tasks will be split off to the PIO async IO driver.
70+
call NUOPC_CompSpecialize(ensemble_driver, specLabel=ensemble_label_PostChildrenAdvertise, &
71+
specRoutine=InitializeIO, rc=rc)
72+
if (chkerr(rc,__LINE__,u_FILE_u)) return
73+
5674
! Create, open and set the config
5775
config = ESMF_ConfigCreate(rc=rc)
5876
if (chkerr(rc,__LINE__,u_FILE_u)) return
@@ -63,6 +81,26 @@ subroutine SetServices(ensemble_driver, rc)
6381
call ESMF_GridCompSet(ensemble_driver, config=config, rc=rc)
6482
if (chkerr(rc,__LINE__,u_FILE_u)) return
6583

84+
! NUOPC component drivers end the initialization process with an internal call to InitializeDataResolution.
85+
! The ensemble_driver does not need to InitializeDataResolution and doing so will cause a hang
86+
! if asynchronous IO is used. This attribute is available after ESMF8.4.0b03 to toggle that control.
87+
! Cannot use asyncIO with older ESMF versions.
88+
call NUOPC_CompAttributeGet(ensemble_driver, name="InitializeDataResolution", &
89+
isPresent=isPresent, rc=rc)
90+
if (chkerr(rc,__LINE__,u_FILE_u)) return
91+
92+
if(isPresent) then
93+
call ESMF_LogWrite(trim(subname)//": setting InitializeDataResolution false", ESMF_LOGMSG_INFO)
94+
call NUOPC_CompAttributeSet(ensemble_driver, name="InitializeDataResolution", value="false", rc=rc)
95+
if (chkerr(rc,__LINE__,u_FILE_u)) return
96+
asyncIO_available = .true.
97+
call ESMF_LogWrite(trim(subname)//": asyncio is available", ESMF_LOGMSG_INFO)
98+
endif
99+
! Set a finalize method, it calls pio_finalize
100+
call NUOPC_CompSpecialize(ensemble_driver, specLabel=label_Finalize, &
101+
specRoutine=ensemble_finalize, rc=rc)
102+
if (chkerr(rc,__LINE__,u_FILE_u)) return
103+
66104
call ESMF_LogWrite(trim(subname)//": done", ESMF_LOGMSG_INFO)
67105

68106
end subroutine SetServices
@@ -99,9 +137,13 @@ subroutine SetModelServices(ensemble_driver, rc)
99137
character(len=512) :: logfile
100138
logical :: read_restart
101139
character(len=CS) :: read_restart_string
102-
integer :: inst
103-
integer :: number_of_members
104140
integer :: ntasks_per_member
141+
integer :: iopetcnt
142+
integer :: petcnt
143+
logical :: comp_task
144+
integer :: pio_asyncio_ntasks
145+
integer :: pio_asyncio_stride
146+
integer :: pio_asyncio_rootpe
105147
integer :: Global_Comm
106148
character(CL) :: start_type ! Type of startup
107149
character(len=7) :: drvrinst
@@ -196,13 +238,25 @@ subroutine SetModelServices(ensemble_driver, rc)
196238
if (chkerr(rc,__LINE__,u_FILE_u)) return
197239
read(cvalue,*) number_of_members
198240

241+
call NUOPC_CompAttributeGet(ensemble_driver, name="pio_asyncio_ntasks", value=cvalue, rc=rc)
242+
if (chkerr(rc,__LINE__,u_FILE_u)) return
243+
read(cvalue,*) pio_asyncio_ntasks
244+
245+
call NUOPC_CompAttributeGet(ensemble_driver, name="pio_asyncio_stride", value=cvalue, rc=rc)
246+
if (chkerr(rc,__LINE__,u_FILE_u)) return
247+
read(cvalue,*) pio_asyncio_stride
248+
249+
call NUOPC_CompAttributeGet(ensemble_driver, name="pio_asyncio_rootpe", value=cvalue, rc=rc)
250+
if (chkerr(rc,__LINE__,u_FILE_u)) return
251+
read(cvalue,*) pio_asyncio_rootpe
252+
199253
call ESMF_VMGet(vm, localPet=localPet, PetCount=PetCount, rc=rc)
200254
if (chkerr(rc,__LINE__,u_FILE_u)) return
201255

202-
ntasks_per_member = PetCount/number_of_members
203-
if(ntasks_per_member*number_of_members .ne. PetCount) then
256+
ntasks_per_member = PetCount/number_of_members - pio_asyncio_ntasks
257+
if(modulo(PetCount-pio_asyncio_ntasks*number_of_members, number_of_members) .ne. 0) then
204258
write (msgstr,'(a,i5,a,i3,a,i3,a)') &
205-
"PetCount (",PetCount,") must be evenly divisable by number of members (",number_of_members,")"
259+
"PetCount (",PetCount,") - Async IOtasks (",pio_asyncio_ntasks*number_of_members,") must be evenly divisable by number of members (",number_of_members,")"
206260
call ESMF_LogSetError(ESMF_RC_ARG_BAD, msg=msgstr, line=__LINE__, file=__FILE__, rcToReturn=rc)
207261
return
208262
endif
@@ -212,33 +266,70 @@ subroutine SetModelServices(ensemble_driver, rc)
212266
!-------------------------------------------
213267

214268
allocate(petList(ntasks_per_member))
215-
! We need to loop over instances
216-
call t_startf('compute_drivers')
217-
do inst = 1, number_of_members
218-
219-
! Determine pet list for driver instance
220-
petList(1) = (inst-1) * ntasks_per_member
221-
do n=2,ntasks_per_member
222-
petList(n) = petList(n-1) + 1
269+
allocate(asyncio_petlist(pio_asyncio_ntasks))
270+
!
271+
! Logic for asyncio variables is handled in cmeps buildnml.
272+
! here we assume that pio_asyncio_stride and pio_asyncio_ntasks are only set
273+
! if asyncio is enabled.
274+
!
275+
inst = localPet/(ntasks_per_member+pio_asyncio_ntasks) + 1
276+
277+
petcnt=1
278+
iopetcnt = 1
279+
comp_task = .false.
280+
asyncio_task = .false.
281+
! Determine pet list for driver instance
282+
if(pio_asyncio_ntasks > 0) then
283+
do n=pio_asyncio_rootpe,pio_asyncio_rootpe+pio_asyncio_stride*(pio_asyncio_ntasks-1),pio_asyncio_stride
284+
asyncio_petlist(iopetcnt) = (inst-1)*(ntasks_per_member+pio_asyncio_ntasks) + n
285+
if(asyncio_petlist(iopetcnt) == localPet) asyncio_task = .true.
286+
iopetcnt = iopetcnt+1
223287
enddo
224-
225-
! Add driver instance to ensemble driver
226-
write(drvrinst,'(a,i4.4)') "ESM",inst
227-
call NUOPC_DriverAddComp(ensemble_driver, drvrinst, ESMSetServices, petList=petList, rc=rc)
228-
if (chkerr(rc,__LINE__,u_FILE_u)) return
229-
enddo
230-
call t_stopf('compute_drivers')
231-
232-
inst = localPet/ntasks_per_member + 1
233-
petList(1) = (inst-1) * ntasks_per_member
234-
do n=2,ntasks_per_member
235-
petList(n) = petList(n-1) + 1
288+
iopetcnt = 1
289+
endif
290+
do n=0,ntasks_per_member+pio_asyncio_ntasks-1
291+
if(pio_asyncio_ntasks > 0) then
292+
if( asyncio_petlist(iopetcnt)==(inst-1)*(ntasks_per_member+pio_asyncio_ntasks) + n) then
293+
! Here if asyncio is true and this is an io task
294+
iopetcnt = iopetcnt+1
295+
else if(petcnt <= ntasks_per_member) then
296+
! Here if this is a compute task
297+
petList(petcnt) = n + (inst-1)*(ntasks_per_member + pio_asyncio_ntasks)
298+
if (petList(petcnt) == localPet) then
299+
comp_task=.true.
300+
endif
301+
petcnt = petcnt+1
302+
else
303+
msgstr = "ERROR task cannot be neither a compute task nor an asyncio task"
304+
call ESMF_LogSetError(ESMF_RC_NOT_VALID, msg=msgstr, line=__LINE__, file=__FILE__, rcToReturn=rc)
305+
return ! bail out
306+
endif
307+
else
308+
! Here if asyncio is false
309+
petList(petcnt) = (inst-1)*ntasks_per_member + n
310+
if (petList(petcnt) == localPet) comp_task=.true.
311+
petcnt = petcnt+1
312+
endif
236313
enddo
237-
if (localpet >= petlist(1) .and. localpet <= petlist(ntasks_per_member)) then
238-
write(drvrinst,'(a,i4.4)') "ESM",inst
239-
call NUOPC_DriverGetComp(ensemble_driver, drvrinst, comp=driver, rc=rc)
240-
if (chkerr(rc,__LINE__,u_FILE_u)) return
314+
if(comp_task .and. asyncio_task) then
315+
msgstr = "ERROR task cannot be both a compute task and an asyncio task"
316+
call ESMF_LogSetError(ESMF_RC_NOT_VALID, msg=msgstr, line=__LINE__, file=__FILE__, rcToReturn=rc)
317+
return ! bail out
318+
elseif (.not. comp_task .and. .not. asyncio_task) then
319+
msgstr = "ERROR task is nether a compute task nor an asyncio task"
320+
call ESMF_LogSetError(ESMF_RC_NOT_VALID, msg=msgstr, line=__LINE__, file=__FILE__, rcToReturn=rc)
321+
return ! bail out
322+
endif
323+
! Add driver instance to ensemble driver
324+
write(drvrinst,'(a,i4.4)') "ESM",inst
325+
326+
call NUOPC_DriverAddComp(ensemble_driver, drvrinst, ESMSetServices, petList=petList, comp=driver, rc=rc)
327+
if (chkerr(rc,__LINE__,u_FILE_u)) return
328+
write(msgstr, *) ": driver added on PETS ",petlist(1),' to ',petlist(petcnt-1)
329+
call ESMF_LogWrite(trim(subname)//msgstr)
241330

331+
maintask = .false.
332+
if (comp_task) then
242333
if(number_of_members > 1) then
243334
call NUOPC_CompAttributeAdd(driver, attrList=(/'inst_suffix'/), rc=rc)
244335
if (chkerr(rc,__LINE__,u_FILE_u)) return
@@ -265,7 +356,8 @@ subroutine SetModelServices(ensemble_driver, rc)
265356
if (chkerr(rc,__LINE__,u_FILE_u)) return
266357

267358
! Set the driver log to the driver task 0
268-
if (mod(localPet, ntasks_per_member) == 0) then
359+
360+
if (localPet == petList(1)) then
269361
call NUOPC_CompAttributeGet(driver, name="diro", value=diro, rc=rc)
270362
if (chkerr(rc,__LINE__,u_FILE_u)) return
271363
call NUOPC_CompAttributeGet(driver, name="logfile", value=logfile, rc=rc)
@@ -274,15 +366,12 @@ subroutine SetModelServices(ensemble_driver, rc)
274366
maintask = .true.
275367
else
276368
logUnit = 6
277-
maintask = .false.
278369
endif
279370
call shr_log_setLogUnit (logunit)
280-
281-
! Create a clock for each driver instance
282-
call esm_time_clockInit(ensemble_driver, driver, logunit, maintask, rc)
283-
if (chkerr(rc,__LINE__,u_FILE_u)) return
284-
285371
endif
372+
! Create a clock for each driver instance
373+
call esm_time_clockInit(ensemble_driver, driver, logunit, maintask, rc)
374+
if (chkerr(rc,__LINE__,u_FILE_u)) return
286375

287376
deallocate(petList)
288377
call t_stopf(subname)
@@ -291,4 +380,76 @@ subroutine SetModelServices(ensemble_driver, rc)
291380

292381
end subroutine SetModelServices
293382

383+
subroutine InitializeIO(ensemble_driver, rc)
  !-----------------------------------------------------------------------
  ! Initialize PIO for the driver instance that contains this task.
  !
  ! Registered in SetServices as the specialization routine for the NUOPC
  ! PostChildrenAdvertise label. It:
  !   1. obtains the VM, local PET index, PET count and MPI communicator of
  !      the ensemble driver,
  !   2. for multi-member runs, splits the global communicator by ensemble
  !      instance (module variable inst) and reduces the entries of the
  !      module-level asyncio_petlist modulo the number of PETs per instance,
  !   3. calls driver_pio_init and driver_pio_component_init on the first
  !      child component of the ensemble driver,
  !   4. releases the component list and asyncio_petlist.
  !
  ! Arguments:
  !   ensemble_driver : the ensemble driver grid component
  !   rc              : ESMF return code; ESMF_SUCCESS unless a step fails
  !
  ! NOTE(review): asyncio_petlist is deallocated on exit, so this routine is
  ! presumably meant to run exactly once per task -- confirm.
  !-----------------------------------------------------------------------
  use ESMF, only: ESMF_GridComp, ESMF_LOGMSG_INFO, ESMF_LogWrite
  use ESMF, only: ESMF_SUCCESS, ESMF_VM, ESMF_GridCompGet, ESMF_VMGet
  use ESMF, only: ESMF_LogSetError, ESMF_RC_NOT_VALID
  use NUOPC, only: NUOPC_CompGet
  use NUOPC_DRIVER, only: NUOPC_DriverGetComp
  use driver_pio_mod , only: driver_pio_init, driver_pio_component_init
#ifndef NO_MPI2
  use MPI, only : MPI_Comm_split, MPI_SUCCESS
#endif

  ! input/output variables
  type(ESMF_GridComp)  :: ensemble_driver
  integer, intent(out) :: rc

  ! local variables
  character(len=*), parameter :: subname = '('//__FILE__//':InitializeIO)'
  type(ESMF_VM)                :: ensemble_vm
  type(ESMF_GridComp), pointer :: dcomp(:)
  integer                      :: iam                ! local PET index in the ensemble VM
  integer                      :: Global_Comm        ! communicator of the ensemble VM
  integer                      :: Instance_Comm      ! communicator handed to PIO
  integer                      :: PetCount
  integer                      :: pets_per_instance  ! PetCount/number_of_members
  integer                      :: key, color, i
  integer                      :: ierr               ! MPI status, kept separate from rc
  character(len=8)             :: compname
  !---------------------------------------

  rc = ESMF_SUCCESS
  call ESMF_LogWrite(trim(subname)//": called", ESMF_LOGMSG_INFO)
  call shr_log_setLogUnit (logunit)

  ! asyncio_petlist is read, rewritten and deallocated below; fail cleanly
  ! instead of touching an unallocated array.
  if (.not. allocated(asyncio_petlist)) then
     call ESMF_LogSetError(ESMF_RC_NOT_VALID, msg=trim(subname)//": asyncio_petlist is not allocated", &
          line=__LINE__, file=__FILE__, rcToReturn=rc)
     return
  endif

  call ESMF_GridCompGet(ensemble_driver, vm=ensemble_vm, rc=rc)
  if (chkerr(rc,__LINE__,u_FILE_u)) return
  call ESMF_VMGet(ensemble_vm, localpet=iam, mpiCommunicator=Global_Comm, PetCount=PetCount, rc=rc)
  if (chkerr(rc,__LINE__,u_FILE_u)) return

  ! Default to the global communicator so Instance_Comm is always defined,
  ! including builds with NO_MPI2 where no split is performed.
  Instance_Comm = Global_Comm

  if (number_of_members > 1) then
     pets_per_instance = PetCount/number_of_members
     ! Tasks with the same color (ensemble instance) end up in the same
     ! communicator, ordered by key.
     color = inst
     key = modulo(iam, pets_per_instance)
#ifndef NO_MPI2
     call MPI_Comm_split(Global_Comm, color, key, Instance_Comm, ierr)
     if (ierr /= MPI_SUCCESS) then
        call ESMF_LogSetError(ESMF_RC_NOT_VALID, msg=trim(subname)//": MPI_Comm_split failed", &
             line=__LINE__, file=__FILE__, rcToReturn=rc)
        return
     endif
#endif
     ! Reduce the IO PET numbers modulo the number of PETs per instance.
     ! NOTE(review): presumably this maps them to ranks within
     ! Instance_Comm -- confirm against driver_pio_component_init.
     do i = 1, size(asyncio_petlist)
        asyncio_petlist(i) = modulo(asyncio_petlist(i), pets_per_instance)
     enddo
  endif

  nullify(dcomp)
  call NUOPC_DriverGetComp(ensemble_driver, complist=dcomp, rc=rc)
  if (chkerr(rc,__LINE__,u_FILE_u)) return

  ! dcomp(1) is dereferenced below; make sure it exists.
  if (.not. associated(dcomp)) then
     call ESMF_LogSetError(ESMF_RC_NOT_VALID, msg=trim(subname)//": no child component list returned", &
          line=__LINE__, file=__FILE__, rcToReturn=rc)
     return
  endif
  if (size(dcomp) < 1) then
     deallocate(dcomp)
     call ESMF_LogSetError(ESMF_RC_NOT_VALID, msg=trim(subname)//": ensemble driver has no child component", &
          line=__LINE__, file=__FILE__, rcToReturn=rc)
     return
  endif

  call NUOPC_CompGet(dcomp(1), name=compname, rc=rc)
  if (chkerr(rc,__LINE__,u_FILE_u)) return
  call ESMF_LogWrite(trim(subname)//": call driver_pio_init "//compname, ESMF_LOGMSG_INFO)
  call driver_pio_init(dcomp(1), rc=rc)
  if (chkerr(rc,__LINE__,u_FILE_u)) return

  call ESMF_LogWrite(trim(subname)//": call driver_pio_component_init "//compname, ESMF_LOGMSG_INFO)
  call driver_pio_component_init(dcomp(1), Instance_Comm, asyncio_petlist, rc)
  if (chkerr(rc,__LINE__,u_FILE_u)) return
  call ESMF_LogWrite(trim(subname)//": driver_pio_component_init done "//compname, ESMF_LOGMSG_INFO)

  deallocate(dcomp)
  deallocate(asyncio_petlist)
  call ESMF_LogWrite(trim(subname)//": done", ESMF_LOGMSG_INFO)
end subroutine InitializeIO
444+
445+
subroutine ensemble_finalize(ensemble_driver, rc)
  !-----------------------------------------------------------------------
  ! Finalize specialization for the ensemble driver (registered under
  ! label_Finalize in SetServices).
  !
  ! Points the shared log unit at this module's logunit, shuts PIO down
  ! through driver_pio_finalize, and reports success.
  !
  ! Arguments:
  !   ensemble_driver : the ensemble driver grid component (not referenced)
  !   rc              : always set to ESMF_SUCCESS
  !-----------------------------------------------------------------------
  use driver_pio_mod, only: driver_pio_finalize
  use ESMF, only : ESMF_SUCCESS, ESMF_GridComp

  ! input/output variables
  type(ESMF_GridComp)  :: ensemble_driver
  integer, intent(out) :: rc
  !---------------------------------------

  call shr_log_setLogUnit (logunit)
  call driver_pio_finalize()

  rc = ESMF_SUCCESS
end subroutine ensemble_finalize
294455
end module Ensemble_driver

0 commit comments

Comments
 (0)