diff --git a/tests/unit/checkpoint/test_zero_optimizer.py b/tests/unit/checkpoint/test_zero_optimizer.py index 9ad785071602..6447d1821cd1 100644 --- a/tests/unit/checkpoint/test_zero_optimizer.py +++ b/tests/unit/checkpoint/test_zero_optimizer.py @@ -653,7 +653,9 @@ def test_save_tensor_clone(self, tmpdir, zero_stage, use_cpu_device): class TestZeRONonDistributed(DistributedTest): world_size = 1 - init_distributed = False + # This test calls deepspeed.initialize(), so use the harness' file-store + # initialization instead of env:// TCP rendezvous ports under xdist. + init_distributed = True @pytest.mark.parametrize('zero_stage', [1, 2, 3]) def test_chmod_exception_handling(self, monkeypatch, zero_stage): diff --git a/tests/unit/inference/quantization/test_intX_quantization.py b/tests/unit/inference/quantization/test_intX_quantization.py index 8169912ae487..18a491123b84 100644 --- a/tests/unit/inference/quantization/test_intX_quantization.py +++ b/tests/unit/inference/quantization/test_intX_quantization.py @@ -54,7 +54,7 @@ def quantization_test_helper(pre_quant_type: torch.dtype, num_bits: int): assert mean_diff < 0.15 and max_diff < 0.5, f'Numeric error exceed threshold, mean diff {mean_diff} (threshold 0.15), max diff {max_diff} (threshold 0.5)' -def zero3_post_init_quantization_test_helper(cpu_offload: bool, nvme_offload: bool, bits: int): +def zero3_post_init_quantization_test_helper(cpu_offload: bool, nvme_offload: bool, bits: int, nvme_path=None): import deepspeed from transformers.integrations.deepspeed import HfDeepSpeedConfig @@ -131,7 +131,7 @@ def get_zero3_ds_config(hf_config: OPTConfig, cpu_offload: bool, nvme_offload: b ds_config["zero_optimization"]["offload_param"] = dict( device="nvme", pin_memory=True, - nvme_path='~/tmp_offload_dir', + nvme_path=nvme_path or '~/tmp_offload_dir', buffer_count=5, buffer_size=1 * GB, ) @@ -174,7 +174,7 @@ def get_zero3_ds_config(hf_config: OPTConfig, cpu_offload: bool, nvme_offload: b assert mean_diff < 0.4, f'Numeric error exceed threshold, relative error {mean_diff} (threshold 0.4)' -def zero3_quantized_initialization_test_helper(cpu_offload: bool, nvme_offload: bool, bits: int): +def zero3_quantized_initialization_test_helper(cpu_offload: bool, nvme_offload: bool, bits: int, nvme_path=None): import deepspeed from transformers.integrations.deepspeed import HfDeepSpeedConfig @@ -213,7 +213,7 @@ def get_zero3_ds_config(hf_config: OPTConfig, cpu_offload: bool, nvme_offload: b ds_config["zero_optimization"]["offload_param"] = dict( device="nvme", pin_memory=True, - nvme_path='~/tmp_offload_dir', + nvme_path=nvme_path or '~/tmp_offload_dir', buffer_count=5, buffer_size=1 * GB, ) @@ -393,9 +393,12 @@ def test_zero3_int4_post_init_quant_cpu_offload(self, quantization_bits): zero3_post_init_quantization_test_helper(cpu_offload=True, nvme_offload=False, bits=quantization_bits) @pytest.mark.skipif(device == 'cpu', reason='CPU does support FP16 GEMM') - def test_zero3_int4_post_init_quant_nvme_offload(self): + def test_zero3_int4_post_init_quant_nvme_offload(self, tmpdir): reset_random() - zero3_post_init_quantization_test_helper(cpu_offload=False, nvme_offload=True, bits=4) + zero3_post_init_quantization_test_helper(cpu_offload=False, + nvme_offload=True, + bits=4, + nvme_path=str(tmpdir.join("nvme_offload"))) @pytest.mark.skipif(device == 'cpu', reason='CPU does support FP16 GEMM') def test_zero3_int4_quantized_initialization(self, quantization_bits): @@ -408,6 +411,9 @@ def test_zero3_int4_quantized_initialization_cpu_offload(self, quantization_bits zero3_quantized_initialization_test_helper(cpu_offload=True, nvme_offload=False, bits=quantization_bits) @pytest.mark.skipif(device == 'cpu', reason='CPU does support FP16 GEMM') - def test_zero3_int4_quantized_initialization_nvme_offload(self): + def test_zero3_int4_quantized_initialization_nvme_offload(self, tmpdir): reset_random() - zero3_quantized_initialization_test_helper(cpu_offload=False, nvme_offload=True, bits=4) + zero3_quantized_initialization_test_helper(cpu_offload=False, + nvme_offload=True, + bits=4, + nvme_path=str(tmpdir.join("nvme_offload")))