@@ -334,7 +334,8 @@ def _test__native_dist_model_create_from_context_set_local_rank(true_conf):
334334def _test__native_dist_model_create_from_context_no_dist (true_backend , true_device ):
335335 assert _NativeDistModel .create_from_context () is None
336336
337- dist .init_process_group (true_backend , "tcp://0.0.0.0:2222" , world_size = 1 , rank = 0 )
337+ store = dist .TCPStore ("0.0.0.0" , 2222 , world_size = 1 , is_master = True )
338+ dist .init_process_group (true_backend , store = store , world_size = 1 , rank = 0 )
338339 dist .barrier ()
339340
340341 _test__native_dist_model_create_from_context_no_local_rank ()
@@ -358,7 +359,9 @@ def _test__native_dist_model_create_from_context_no_dist(true_backend, true_devi
358359def _test__native_dist_model_create_from_context_dist (local_rank , rank , world_size , true_backend , true_device ):
359360 assert _NativeDistModel .create_from_context () is None
360361
361- dist .init_process_group (true_backend , "tcp://0.0.0.0:2222" , world_size = world_size , rank = rank )
362+ is_master = rank == 0
363+ store = dist .TCPStore ("0.0.0.0" , 2222 , world_size = world_size , is_master = is_master )
364+ dist .init_process_group (true_backend , store = store , world_size = world_size , rank = rank )
362365 dist .barrier ()
363366 if torch .cuda .is_available ():
364367 torch .cuda .set_device (local_rank )
@@ -397,7 +400,7 @@ def test__native_dist_model_create_no_dist_nccl(clean_env):
397400
398401
399402@pytest .mark .distributed
400- @pytest .mark .parametrize ("init_method" , [None , "tcp://0.0.0.0:22334" , " FILE" ])
403+ @pytest .mark .parametrize ("init_method" , [None , "FILE" ])
401404def test__native_dist_model_create_dist_gloo_1 (init_method , get_fixed_dirname , local_rank , world_size ):
402405 if init_method == "FILE" :
403406 init_method = f"file://{ get_fixed_dirname ('native_dist_model_create_dist_gloo_1' )} /shared"
@@ -418,7 +421,7 @@ def test__native_dist_model_create_dist_gloo_2(local_rank, world_size):
418421
419422@pytest .mark .distributed
420423@pytest .mark .skipif (torch .cuda .device_count () < 1 , reason = "Skip if no GPU" )
421- @pytest .mark .parametrize ("init_method" , [None , "tcp://0.0.0.0:22334" , " FILE" ])
424+ @pytest .mark .parametrize ("init_method" , [None , "FILE" ])
422425def test__native_dist_model_create_dist_nccl_1 (init_method , get_fixed_dirname , local_rank , world_size ):
423426 if init_method == "FILE" :
424427 init_method = f"file://{ get_fixed_dirname ('native_dist_model_create_dist_nccl_1' )} /shared"
@@ -444,7 +447,9 @@ def test__native_dist_model_create_dist_nccl_2(local_rank, world_size):
444447def test__native_dist_model_warning_index_less_localrank (local_rank , world_size ):
445448 assert _NativeDistModel .create_from_context () is None
446449
447- dist .init_process_group ("nccl" , "tcp://0.0.0.0:2222" , world_size = world_size , rank = local_rank )
450+ is_master = local_rank == 0
451+ store = dist .TCPStore ("0.0.0.0" , 2222 , world_size = world_size , is_master = is_master )
452+ dist .init_process_group ("nccl" , store = store , world_size = world_size , rank = local_rank )
448453 dist .barrier ()
449454 # We deliberately incorrectly set cuda device to 0
450455 torch .cuda .set_device (0 )
@@ -496,7 +501,7 @@ def _test__native_dist_model_spawn(backend, num_workers_per_machine, device, ini
496501
497502@pytest .mark .distributed
498503@pytest .mark .skipif ("WORLD_SIZE" in os .environ , reason = "Skip if launched as multiproc" )
499- @pytest .mark .parametrize ("init_method" , [None , "CUSTOM_ADDR_PORT" , "env://" , "tcp://0.0.0.0:22334" , " FILE" ])
504+ @pytest .mark .parametrize ("init_method" , [None , "CUSTOM_ADDR_PORT" , "env://" , "FILE" ])
500505def test__native_dist_model_spawn_gloo (init_method , dirname ):
501506 spawn_kwargs = {}
502507
@@ -532,7 +537,7 @@ def test__native_dist_model_spawn_gloo(init_method, dirname):
532537@pytest .mark .distributed
533538@pytest .mark .skipif ("WORLD_SIZE" in os .environ , reason = "Skip if launched as multiproc" )
534539@pytest .mark .skipif (torch .cuda .device_count () < 1 , reason = "Skip if no GPU" )
535- @pytest .mark .parametrize ("init_method" , [None , "CUSTOM_ADDR_PORT" , "tcp://0.0.0.0:22334" , " FILE" ])
540+ @pytest .mark .parametrize ("init_method" , [None , "CUSTOM_ADDR_PORT" , "FILE" ])
536541def test__native_dist_model_spawn_nccl (init_method , dirname ):
537542 spawn_kwargs = {}
538543
@@ -720,3 +725,8 @@ def test__setup_ddp_vars_from_slurm_env_bad_configs():
720725 "SLURM_JOB_ID" : "12345" ,
721726 }
722727 _setup_ddp_vars_from_slurm_env (environ )
728+
729+
730+ def test__native_dist_model_tcp_init_method_error ():
731+ with pytest .raises (ValueError , match = "is not supported by PyTorch. To fix this, please configure a TCPStore" ):
732+ _NativeDistModel .create_from_backend (backend = "gloo" , init_method = "tcp://10.1.1.20:23456" , rank = 0 , world_size = 1 )
0 commit comments