Skip to content

Commit 01a8ba5

Browse files
committed
Add MPS (Apple Silicon) support and update dependencies
This commit adds support for Apple's MPS backend throughout the codebase, allowing usage on Apple Silicon devices. Device selection logic now detects MPS, and memory management calls are updated accordingly. Requirements are relaxed and updated for broader compatibility, including newer versions of faiss-cpu and removal of strict version pins for several packages. Also removes PID file handling in infer.py and improves error handling for config file loading. Training precision now defaults to float32 for better MPS compatibility.
1 parent c7da750 commit 01a8ba5

9 files changed

Lines changed: 97 additions & 63 deletions

File tree

requirements.txt

Lines changed: 19 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -2,44 +2,41 @@
22
pip>=23.3; sys_platform == 'darwin'
33
wheel; sys_platform == 'darwin'
44
PyYAML; sys_platform == 'darwin'
5-
numpy==1.26.4
6-
requests>=2.31.0,<2.32.0
5+
numpy<2
6+
requests
77
tqdm
88
wget
99

1010
# Audio processing
11-
ffmpeg-python>=0.2.0
12-
faiss-cpu==1.7.3
13-
librosa==0.11.0
14-
scipy==1.11.1
15-
soundfile==0.12.1
11+
ffmpeg-python
12+
faiss-cpu==1.7.4
13+
librosa
14+
scipy
15+
soundfile
1616
noisereduce
1717
pedalboard
1818
stftpitchshift
1919
soxr
2020

2121
# Machine learning and deep learning
22-
omegaconf>=2.0.6; sys_platform == 'darwin'
23-
numba; sys_platform == 'linux'
24-
numba==0.61.0; sys_platform == 'darwin' or sys_platform == 'win32'
25-
torch==2.7.1; sys_platform == 'darwin'
26-
torch==2.7.1+cu128; sys_platform == 'linux' or sys_platform == 'win32'
27-
torchaudio==2.7.1; sys_platform == 'darwin'
28-
torchaudio==2.7.1+cu128; sys_platform == 'linux' or sys_platform == 'win32'
29-
torchvision==0.22.1; sys_platform == 'darwin'
30-
torchvision==0.22.1+cu128; sys_platform == 'linux' or sys_platform == 'win32'
31-
torchcrepe==0.0.23
22+
omegaconf; sys_platform == 'darwin'
23+
numba
24+
torch==2.4.0; sys_platform == 'darwin'
25+
torchaudio==2.4.0; sys_platform == 'darwin'
26+
torchvision==0.19.0; sys_platform == 'darwin'
27+
torchcrepe
3228
torchfcpe
3329
einops
34-
transformers==4.44.2
30+
transformers
31+
beautifulsoup4
3532

3633
# Visualization and UI
37-
matplotlib==3.7.2
34+
matplotlib
3835
tensorboard
3936
tensorboardX
4037

4138
# Miscellaneous utilities
42-
certifi>=2023.07.22; sys_platform == 'darwin'
43-
antlr4-python3-runtime==4.8; sys_platform == 'darwin'
44-
edge-tts==7.2.0
39+
certifi; sys_platform == 'darwin'
40+
antlr4-python3-runtime
41+
edge-tts
4542
webrtcvad

rvc/.DS_Store

8 KB
Binary file not shown.

rvc/configs/config.py

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,17 @@ def get_instance(*args, **kwargs):
2323
@singleton
2424
class Config:
2525
def __init__(self):
26-
self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
26+
if torch.cuda.is_available():
27+
self.device = "cuda:0"
28+
elif torch.backends.mps.is_available():
29+
self.device = "mps"
30+
else:
31+
self.device = "cpu"
32+
2733
self.gpu_name = (
2834
torch.cuda.get_device_name(int(self.device.split(":")[-1]))
2935
if self.device.startswith("cuda")
30-
else None
36+
else "Apple M-Series" if self.device == "mps" else None
3137
)
3238
self.json_config = self.load_config_json()
3339
self.gpu_mem = None
@@ -44,6 +50,8 @@ def load_config_json(self):
4450
def device_config(self):
4551
if self.device.startswith("cuda"):
4652
self.set_cuda_config()
53+
elif self.device == "mps":
54+
self.gpu_mem = 16  # Default assumes decent unified memory; could try to detect actual size
4755
else:
4856
self.device = "cpu"
4957

@@ -63,36 +71,44 @@ def set_cuda_config(self):
6371
)
6472

6573

74+
6675
def max_vram_gpu(gpu):
6776
if torch.cuda.is_available():
6877
gpu_properties = torch.cuda.get_device_properties(gpu)
6978
total_memory_gb = round(gpu_properties.total_memory / 1024 / 1024 / 1024)
7079
return total_memory_gb
80+
elif torch.backends.mps.is_available():
81+
return 16 # Default placeholder
7182
else:
7283
return "8"
7384

7485

7586
def get_gpu_info():
76-
ngpu = torch.cuda.device_count()
77-
gpu_infos = []
78-
if torch.cuda.is_available() or ngpu != 0:
87+
if torch.cuda.is_available():
88+
ngpu = torch.cuda.device_count()
89+
gpu_infos = []
7990
for i in range(ngpu):
8091
gpu_name = torch.cuda.get_device_name(i)
8192
mem = int(
8293
torch.cuda.get_device_properties(i).total_memory / 1024 / 1024 / 1024
8394
+ 0.4
8495
)
8596
gpu_infos.append(f"{i}: {gpu_name} ({mem} GB)")
86-
if len(gpu_infos) > 0:
87-
gpu_info = "\n".join(gpu_infos)
97+
return "\n".join(gpu_infos)
98+
elif torch.backends.mps.is_available():
99+
return "0: Apple M-Series (Unified Memory)"
88100
else:
89-
gpu_info = "Unfortunately, there is no compatible GPU available to support your training."
90-
return gpu_info
101+
return "Unfortunately, there is no compatible GPU available to support your training."
91102

92103

93104
def get_number_of_gpus():
94105
if torch.cuda.is_available():
95106
num_gpus = torch.cuda.device_count()
96107
return "-".join(map(str, range(num_gpus)))
108+
elif torch.backends.mps.is_available():
109+
return "0"
97110
else:
98111
return "-"
112+
113+
114+

rvc/infer/infer.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -363,12 +363,7 @@ def convert_audio_batch(
363363
sid (int, optional): Speaker ID. Default is 0.
364364
**kwargs: Additional keyword arguments.
365365
"""
366-
pid = os.getpid()
367366
try:
368-
with open(
369-
os.path.join(now_dir, "assets", "infer_pid.txt"), "w"
370-
) as pid_file:
371-
pid_file.write(str(pid))
372367
start_time = time.time()
373368
print(f"Converting audio batch '{audio_input_paths}'...")
374369
audio_files = [
@@ -410,8 +405,6 @@ def convert_audio_batch(
410405
except Exception as error:
411406
print(f"An error occurred during audio batch conversion: {error}")
412407
print(traceback.format_exc())
413-
finally:
414-
os.remove(os.path.join(now_dir, "assets", "infer_pid.txt"))
415408

416409
def get_vc(self, weight_root, sid):
417410
"""
@@ -425,6 +418,8 @@ def get_vc(self, weight_root, sid):
425418
self.cleanup_model()
426419
if torch.cuda.is_available():
427420
torch.cuda.empty_cache()
421+
elif torch.backends.mps.is_available():
422+
torch.mps.empty_cache()
428423

429424
if not self.loaded_model or self.loaded_model != weight_root:
430425
self.load_model(weight_root)
@@ -445,10 +440,14 @@ def cleanup_model(self):
445440
self.hubert_model = self.net_g = self.n_spk = self.vc = self.tgt_sr = None
446441
if torch.cuda.is_available():
447442
torch.cuda.empty_cache()
443+
elif torch.backends.mps.is_available():
444+
torch.mps.empty_cache()
448445

449446
del self.net_g, self.cpt
450447
if torch.cuda.is_available():
451448
torch.cuda.empty_cache()
449+
elif torch.backends.mps.is_available():
450+
torch.mps.empty_cache()
452451
self.cpt = None
453452

454453
def load_model(self, weight_root):

rvc/lib/predictors/FCPE.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -726,7 +726,12 @@ def gaussian_blurred_cent(self, cents):
726726
class FCPEInfer:
727727
def __init__(self, model_path, device=None, dtype=torch.float32):
728728
if device is None:
729-
device = "cuda" if torch.cuda.is_available() else "cpu"
729+
if torch.cuda.is_available():
730+
device = "cuda"
731+
elif torch.backends.mps.is_available():
732+
device = "mps"
733+
else:
734+
device = "cpu"
730735
self.device = device
731736
ckpt = torch.load(
732737
model_path, map_location=torch.device(self.device), weights_only=True
@@ -769,7 +774,12 @@ def __init__(self, args, device=None, dtype=torch.float32):
769774
self.sample_rate = args.mel.sampling_rate
770775
self.hop_size = args.mel.hop_size
771776
if device is None:
772-
device = "cuda" if torch.cuda.is_available() else "cpu"
777+
if torch.cuda.is_available():
778+
device = "cuda"
779+
elif torch.backends.mps.is_available():
780+
device = "mps"
781+
else:
782+
device = "cpu"
773783
self.device = device
774784
self.dtype = dtype
775785
self.stft = STFT(
@@ -849,7 +859,15 @@ def __init__(
849859
self.hop_length = hop_length
850860
self.f0_min = f0_min
851861
self.f0_max = f0_max
852-
self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
862+
if device is None:
863+
if torch.cuda.is_available():
864+
self.device = "cuda"
865+
elif torch.backends.mps.is_available():
866+
self.device = "mps"
867+
else:
868+
self.device = "cpu"
869+
else:
870+
self.device = device
853871
self.threshold = threshold
854872
self.sample_rate = sample_rate
855873
self.dtype = dtype

rvc/lib/predictors/RMVPE.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -506,7 +506,10 @@ def infer_from_audio(self, audio, thred=0.03):
506506
mel = self.mel_extractor(audio, center=True)
507507
del audio
508508
with torch.no_grad():
509-
torch.cuda.empty_cache()
509+
if torch.cuda.is_available():
510+
torch.cuda.empty_cache()
511+
elif torch.backends.mps.is_available():
512+
torch.mps.empty_cache()
510513
hidden = self.mel2hidden(mel)
511514
hidden = hidden.squeeze(0).cpu().numpy()
512515
f0 = self.decode(hidden, thred=thred)

rvc/train/extract/extract.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,14 @@ def run_embedding_extraction(
212212
]
213213
files.append(file_info)
214214

215-
devices = ["cpu"] if gpus == "-" else [f"cuda:{idx}" for idx in gpus.split("-")]
215+
if gpus == "-":
216+
devices = ["cpu"]
217+
elif torch.cuda.is_available():
218+
devices = [f"cuda:{idx}" for idx in gpus.split("-")]
219+
elif torch.backends.mps.is_available():
220+
devices = ["mps"]
221+
else:
222+
devices = ["cpu"]
216223

217224
run_pitch_extraction(files, devices, f0_method, num_processes)
218225

rvc/train/process/extract_model.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,12 @@ def extract_model(
5050
else:
5151
dataset_length = None
5252

53-
with open(os.path.join(now_dir, "assets", "config.json"), "r") as f:
54-
data = json.load(f)
55-
model_author = data.get("model_author", None)
53+
try:
54+
with open(os.path.join(now_dir, "assets", "config.json"), "r") as f:
55+
data = json.load(f)
56+
model_author = data.get("model_author", None)
57+
except (FileNotFoundError, json.JSONDecodeError):
58+
model_author = None
5659

5760
opt = OrderedDict(
5861
weight={

rvc/train/train.py

Lines changed: 8 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -76,20 +76,11 @@
7676

7777
current_dir = os.getcwd()
7878

79+
7980
try:
80-
with open(os.path.join(current_dir, "assets", "config.json"), "r") as f:
81-
config = json.load(f)
82-
precision = config["precision"]
83-
if (
84-
precision == "bf16"
85-
and torch.cuda.is_available()
86-
and torch.cuda.is_bf16_supported()
87-
):
88-
train_dtype = torch.bfloat16
89-
elif precision == "fp16" and torch.cuda.is_available():
90-
train_dtype = torch.float16
91-
else:
92-
train_dtype = torch.float32
81+
# Removed assets/config.json reading logic as assets dir is gone.
82+
# Defaulting to float32, which is safe for MPS.
83+
train_dtype = torch.float32
9384
except (FileNotFoundError, json.JSONDecodeError, KeyError):
9485
train_dtype = torch.float32
9586

@@ -693,7 +684,7 @@ def train_and_evaluate(
693684
) = info
694685

695686
with torch.amp.autocast(
696-
device_type="cuda", enabled=use_amp, dtype=train_dtype
687+
device_type="cuda" if device.type == "cuda" else "cpu", enabled=use_amp, dtype=train_dtype
697688
):
698689
# Forward pass
699690
model_output = net_g(
@@ -712,7 +703,7 @@ def train_and_evaluate(
712703
)
713704
for _ in range(d_step_per_g_step): # default x1
714705
with torch.amp.autocast(
715-
device_type="cuda", enabled=use_amp, dtype=train_dtype
706+
device_type="cuda" if device.type == "cuda" else "cpu", enabled=use_amp, dtype=train_dtype
716707
):
717708
y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach())
718709
loss_disc, _, _ = discriminator_loss(y_d_hat_r, y_d_hat_g)
@@ -729,7 +720,7 @@ def train_and_evaluate(
729720
optim_d.step()
730721

731722
with torch.amp.autocast(
732-
device_type="cuda", enabled=use_amp, dtype=train_dtype
723+
device_type="cuda" if device.type == "cuda" else "cpu", enabled=use_amp, dtype=train_dtype
733724
):
734725
# Generator backward and update
735726
_, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat)
@@ -886,7 +877,7 @@ def train_and_evaluate(
886877

887878
if epoch % save_every_epoch == 0:
888879
with torch.amp.autocast(
889-
device_type="cuda", enabled=use_amp, dtype=train_dtype
880+
device_type="cuda" if device.type == "cuda" else "cpu", enabled=use_amp, dtype=train_dtype
890881
):
891882
with torch.no_grad():
892883
if hasattr(net_g, "module"):

0 commit comments

Comments
 (0)