Skip to content

Commit 1127fd7

Browse files
committed
Reverted to warn against tcp ip use
1 parent 9debb72 commit 1127fd7

6 files changed

Lines changed: 120 additions & 41 deletions

File tree

ignite/distributed/comp_models/native.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,30 @@ def _create_from_backend(
107107
rank: int | None = None,
108108
**kwargs: Any,
109109
) -> None:
110+
if init_method is not None and init_method.startswith("tcp://"):
111+
from urllib.parse import urlparse
112+
113+
try:
114+
parsed = urlparse(init_method)
115+
host = parsed.hostname
116+
port = parsed.port
117+
except Exception:
118+
host = None
119+
port = None
120+
121+
host = host or "127.0.0.1"
122+
port = port or 29500
123+
124+
raise ValueError(
125+
f"init_method='{init_method}' is not supported by PyTorch. "
126+
"To fix this, please configure a TCPStore and initialize the process group using the store instead. "
127+
"For example:\n\n"
128+
" import torch.distributed as dist\n"
129+
" from datetime import timedelta\n\n"
130+
f' store = dist.TCPStore("{host}", {port}, world_size, is_master, timedelta(seconds=30))\n'
131+
" dist.init_process_group(backend, store=store, rank=rank, world_size=world_size)"
132+
)
133+
110134
if backend == dist.Backend.NCCL and not torch.cuda.is_available():
111135
raise RuntimeError("Nccl backend is required but no cuda capable devices")
112136
self._backend = backend

ignite/distributed/launcher.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,30 @@ def __init__(
233233
if value is not None:
234234
raise ValueError(f"If backend is None, argument '{name}' should be also None, but given {value}")
235235

236+
if init_method is not None and init_method.startswith("tcp://"):
237+
from urllib.parse import urlparse
238+
239+
try:
240+
parsed = urlparse(init_method)
241+
host = parsed.hostname
242+
port = parsed.port
243+
except Exception:
244+
host = None
245+
port = None
246+
247+
host = host or "127.0.0.1"
248+
port = port or 29500
249+
250+
raise ValueError(
251+
f"init_method='{init_method}' is not supported by PyTorch. "
252+
"To fix this, please configure a TCPStore and initialize the process group using the store instead. "
253+
"For example:\n\n"
254+
" import torch.distributed as dist\n"
255+
" from datetime import timedelta\n\n"
256+
f' store = dist.TCPStore("{host}", {port}, world_size, is_master, timedelta(seconds=30))\n'
257+
" dist.init_process_group(backend, store=store, rank=rank, world_size=world_size)"
258+
)
259+
236260
self.backend = backend
237261
self._spawn_params = None
238262
self.init_method = init_method

tests/ignite/conftest.py

Lines changed: 43 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -230,15 +230,19 @@ def _setup_free_port(local_rank):
230230
@pytest.fixture()
231231
def distributed_context_single_node_nccl(local_rank, world_size):
232232
free_port = _setup_free_port(local_rank)
233+
os.environ["MASTER_ADDR"] = "localhost"
234+
os.environ["MASTER_PORT"] = str(free_port)
233235

234236
dist_info = {
235237
"backend": "nccl",
236238
"world_size": world_size,
237239
"rank": local_rank,
238-
"init_method": f"tcp://localhost:{free_port}",
240+
"init_method": "env://",
239241
}
240242
yield _create_dist_context(dist_info, local_rank)
241243
_destroy_dist_context()
244+
os.environ.pop("MASTER_ADDR", None)
245+
os.environ.pop("MASTER_PORT", None)
242246

243247

244248
@pytest.fixture()
@@ -250,22 +254,32 @@ def distributed_context_single_node_gloo(local_rank, world_size):
250254
# can't use backslashes in f-strings
251255
backslash = "\\"
252256
init_method = f"file:///{temp_file.name.replace(backslash, '/')}"
257+
dist_info = {
258+
"backend": "gloo",
259+
"world_size": world_size,
260+
"rank": local_rank,
261+
"init_method": init_method,
262+
"timeout": timedelta(seconds=30),
263+
}
264+
yield _create_dist_context(dist_info, local_rank)
265+
_destroy_dist_context()
266+
temp_file.close()
253267
else:
254268
free_port = _setup_free_port(local_rank)
255-
init_method = f"tcp://localhost:{free_port}"
256-
temp_file = None
269+
os.environ["MASTER_ADDR"] = "localhost"
270+
os.environ["MASTER_PORT"] = str(free_port)
257271

258-
dist_info = {
259-
"backend": "gloo",
260-
"world_size": world_size,
261-
"rank": local_rank,
262-
"init_method": init_method,
263-
"timeout": timedelta(seconds=30),
264-
}
265-
yield _create_dist_context(dist_info, local_rank)
266-
_destroy_dist_context()
267-
if temp_file:
268-
temp_file.close()
272+
dist_info = {
273+
"backend": "gloo",
274+
"world_size": world_size,
275+
"rank": local_rank,
276+
"init_method": "env://",
277+
"timeout": timedelta(seconds=30),
278+
}
279+
yield _create_dist_context(dist_info, local_rank)
280+
_destroy_dist_context()
281+
os.environ.pop("MASTER_ADDR", None)
282+
os.environ.pop("MASTER_PORT", None)
269283

270284

271285
@pytest.fixture()
@@ -471,16 +485,21 @@ def distributed(request, local_rank, world_size):
471485
# can't use backslashes in f-strings
472486
backslash = "\\"
473487
init_method = f"file:///{temp_file.name.replace(backslash, '/')}"
488+
dist_info = {
489+
"world_size": world_size,
490+
"rank": local_rank,
491+
"init_method": init_method,
492+
}
474493
else:
475494
temp_file = None
476495
free_port = _setup_free_port(local_rank)
477-
init_method = f"tcp://localhost:{free_port}"
478-
479-
dist_info = {
480-
"world_size": world_size,
481-
"rank": local_rank,
482-
"init_method": init_method,
483-
}
496+
os.environ["MASTER_ADDR"] = "localhost"
497+
os.environ["MASTER_PORT"] = str(free_port)
498+
dist_info = {
499+
"world_size": world_size,
500+
"rank": local_rank,
501+
"init_method": "env://",
502+
}
484503

485504
if request.param == "nccl":
486505
dist_info["backend"] = "nccl"
@@ -493,6 +512,9 @@ def distributed(request, local_rank, world_size):
493512
_destroy_dist_context()
494513
if temp_file:
495514
temp_file.close()
515+
else:
516+
os.environ.pop("MASTER_ADDR", None)
517+
os.environ.pop("MASTER_PORT", None)
496518

497519
elif request.param == "horovod":
498520
request.node.stash[is_horovod_stash_key] = True

tests/ignite/distributed/comp_models/test_native.py

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -334,7 +334,8 @@ def _test__native_dist_model_create_from_context_set_local_rank(true_conf):
334334
def _test__native_dist_model_create_from_context_no_dist(true_backend, true_device):
335335
assert _NativeDistModel.create_from_context() is None
336336

337-
dist.init_process_group(true_backend, "tcp://0.0.0.0:2222", world_size=1, rank=0)
337+
store = dist.TCPStore("0.0.0.0", 2222, world_size=1, is_master=True)
338+
dist.init_process_group(true_backend, store=store, world_size=1, rank=0)
338339
dist.barrier()
339340

340341
_test__native_dist_model_create_from_context_no_local_rank()
@@ -358,7 +359,9 @@ def _test__native_dist_model_create_from_context_no_dist(true_backend, true_devi
358359
def _test__native_dist_model_create_from_context_dist(local_rank, rank, world_size, true_backend, true_device):
359360
assert _NativeDistModel.create_from_context() is None
360361

361-
dist.init_process_group(true_backend, "tcp://0.0.0.0:2222", world_size=world_size, rank=rank)
362+
is_master = rank == 0
363+
store = dist.TCPStore("0.0.0.0", 2222, world_size=world_size, is_master=is_master)
364+
dist.init_process_group(true_backend, store=store, world_size=world_size, rank=rank)
362365
dist.barrier()
363366
if torch.cuda.is_available():
364367
torch.cuda.set_device(local_rank)
@@ -397,7 +400,7 @@ def test__native_dist_model_create_no_dist_nccl(clean_env):
397400

398401

399402
@pytest.mark.distributed
400-
@pytest.mark.parametrize("init_method", [None, "tcp://0.0.0.0:22334", "FILE"])
403+
@pytest.mark.parametrize("init_method", [None, "FILE"])
401404
def test__native_dist_model_create_dist_gloo_1(init_method, get_fixed_dirname, local_rank, world_size):
402405
if init_method == "FILE":
403406
init_method = f"file://{get_fixed_dirname('native_dist_model_create_dist_gloo_1')}/shared"
@@ -418,7 +421,7 @@ def test__native_dist_model_create_dist_gloo_2(local_rank, world_size):
418421

419422
@pytest.mark.distributed
420423
@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
421-
@pytest.mark.parametrize("init_method", [None, "tcp://0.0.0.0:22334", "FILE"])
424+
@pytest.mark.parametrize("init_method", [None, "FILE"])
422425
def test__native_dist_model_create_dist_nccl_1(init_method, get_fixed_dirname, local_rank, world_size):
423426
if init_method == "FILE":
424427
init_method = f"file://{get_fixed_dirname('native_dist_model_create_dist_nccl_1')}/shared"
@@ -444,7 +447,9 @@ def test__native_dist_model_create_dist_nccl_2(local_rank, world_size):
444447
def test__native_dist_model_warning_index_less_localrank(local_rank, world_size):
445448
assert _NativeDistModel.create_from_context() is None
446449

447-
dist.init_process_group("nccl", "tcp://0.0.0.0:2222", world_size=world_size, rank=local_rank)
450+
is_master = local_rank == 0
451+
store = dist.TCPStore("0.0.0.0", 2222, world_size=world_size, is_master=is_master)
452+
dist.init_process_group("nccl", store=store, world_size=world_size, rank=local_rank)
448453
dist.barrier()
449454
# We deliberately incorrectly set cuda device to 0
450455
torch.cuda.set_device(0)
@@ -496,7 +501,7 @@ def _test__native_dist_model_spawn(backend, num_workers_per_machine, device, ini
496501

497502
@pytest.mark.distributed
498503
@pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
499-
@pytest.mark.parametrize("init_method", [None, "CUSTOM_ADDR_PORT", "env://", "tcp://0.0.0.0:22334", "FILE"])
504+
@pytest.mark.parametrize("init_method", [None, "CUSTOM_ADDR_PORT", "env://", "FILE"])
500505
def test__native_dist_model_spawn_gloo(init_method, dirname):
501506
spawn_kwargs = {}
502507

@@ -532,7 +537,7 @@ def test__native_dist_model_spawn_gloo(init_method, dirname):
532537
@pytest.mark.distributed
533538
@pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
534539
@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
535-
@pytest.mark.parametrize("init_method", [None, "CUSTOM_ADDR_PORT", "tcp://0.0.0.0:22334", "FILE"])
540+
@pytest.mark.parametrize("init_method", [None, "CUSTOM_ADDR_PORT", "FILE"])
536541
def test__native_dist_model_spawn_nccl(init_method, dirname):
537542
spawn_kwargs = {}
538543

@@ -720,3 +725,8 @@ def test__setup_ddp_vars_from_slurm_env_bad_configs():
720725
"SLURM_JOB_ID": "12345",
721726
}
722727
_setup_ddp_vars_from_slurm_env(environ)
728+
729+
730+
def test__native_dist_model_tcp_init_method_error():
731+
with pytest.raises(ValueError, match="is not supported by PyTorch. To fix this, please configure a TCPStore"):
732+
_NativeDistModel.create_from_backend(backend="gloo", init_method="tcp://10.1.1.20:23456", rank=0, world_size=1)

tests/ignite/distributed/test_launcher.py

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -103,13 +103,6 @@ def _test_check_idist_parallel_torch_launch(init_method, fp, backend, nprocs):
103103
"init_method",
104104
[
105105
None,
106-
pytest.param(
107-
"tcp://0.0.0.0:29500",
108-
marks=pytest.mark.skipif(
109-
"dev" in torch.__version__,
110-
reason="Skip tcp:// init_method with torchrun on nightly due to incompatibility",
111-
),
112-
),
113106
"FILE",
114107
],
115108
)
@@ -241,7 +234,7 @@ def _test_func(index, ws, device, backend, true_init_method):
241234
@pytest.mark.distributed
242235
@pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
243236
@pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
244-
@pytest.mark.parametrize("init_method", ["env://", "tcp://0.0.0.0:29500", "FILE"])
237+
@pytest.mark.parametrize("init_method", ["env://", "FILE"])
245238
@pytest.mark.parametrize(
246239
"backend",
247240
["gloo", pytest.param("nccl", marks=pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU"))],
@@ -259,7 +252,7 @@ def test_idist_parallel_spawn_n_procs_native(init_method, backend, dirname):
259252
@pytest.mark.distributed
260253
@pytest.mark.skipif("WORLD_SIZE" not in os.environ, reason="Skip if not launched as multiproc")
261254
@pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
262-
@pytest.mark.parametrize("init_method", ["env://", "tcp://0.0.0.0:29500", "FILE"])
255+
@pytest.mark.parametrize("init_method", ["env://", "FILE"])
263256
@pytest.mark.parametrize(
264257
"backend",
265258
["gloo", pytest.param("nccl", marks=pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU"))],
@@ -296,3 +289,9 @@ def test_idist_parallel_spawn_params_xla():
296289
res = parallel._spawn_params
297290
assert "nproc_per_node" in res and res["nproc_per_node"] == 8
298291
assert "start_method" in res and res["start_method"] == "fork"
292+
293+
294+
def test_idist_parallel_tcp_init_method_error():
295+
with pytest.raises(ValueError, match="is not supported by PyTorch. To fix this, please configure a TCPStore"):
296+
with idist.Parallel(backend="gloo", init_method="tcp://10.1.1.20:23456", nproc_per_node=1) as parallel:
297+
pass

tests/ignite/distributed/utils/test_native.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ def _test_native_distrib_single_node_launch_tool(backend, device, local_rank, wo
3838

3939
@pytest.mark.distributed
4040
@pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
41-
@pytest.mark.parametrize("init_method", [None, "tcp://0.0.0.0:22334", "FILE"])
41+
@pytest.mark.parametrize("init_method", [None, "FILE"])
4242
def test_native_distrib_single_node_launch_tool_gloo(init_method, get_fixed_dirname, local_rank, world_size):
4343
from datetime import timedelta
4444

@@ -56,7 +56,7 @@ def test_native_distrib_single_node_launch_tool_gloo(init_method, get_fixed_dirn
5656
@pytest.mark.distributed
5757
@pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
5858
@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
59-
@pytest.mark.parametrize("init_method", [None, "tcp://0.0.0.0:22334", "FILE"])
59+
@pytest.mark.parametrize("init_method", [None, "FILE"])
6060
def test_native_distrib_single_node_launch_tool_nccl(init_method, get_fixed_dirname, local_rank, world_size):
6161
if init_method == "FILE":
6262
init_method = f"file://{get_fixed_dirname('native_distrib_single_node_launch_tool_nccl')}/shared"
@@ -80,7 +80,7 @@ def _test_native_distrib_single_node_spawn(init_method, backend, device, **kwarg
8080
@pytest.mark.distributed
8181
@pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
8282
@pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
83-
@pytest.mark.parametrize("init_method", [None, "tcp://0.0.0.0:22334", "FILE"])
83+
@pytest.mark.parametrize("init_method", [None, "FILE"])
8484
def test_native_distrib_single_node_spawn_gloo(init_method, dirname):
8585
from datetime import timedelta
8686

@@ -97,7 +97,7 @@ def test_native_distrib_single_node_spawn_gloo(init_method, dirname):
9797
@pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
9898
@pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
9999
@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
100-
@pytest.mark.parametrize("init_method", [None, "tcp://0.0.0.0:22334", "FILE"])
100+
@pytest.mark.parametrize("init_method", [None, "FILE"])
101101
def test_native_distrib_single_node_spawn_nccl(init_method, dirname):
102102
if init_method == "FILE":
103103
init_method = f"file://{dirname}/shared"

0 commit comments

Comments
 (0)