Politrees
diff --git a/‎PolUVR/separator/architectures/mdxc_separator.py‎
Lines changed: 50 additions & 31 deletions b/‎PolUVR/separator/architectures/mdxc_separator.py‎
Lines changed: 50 additions & 31 deletions
diff --git a/‎PolUVR/separator/common_separator.py‎
Lines changed: 86 additions & 22 deletions b/‎PolUVR/separator/common_separator.py‎
Lines changed: 86 additions & 22 deletions
@@ -9,9 +9,8 @@
 
 from PolUVR.separator.common_separator import CommonSeparator
 from PolUVR.separator.uvr_lib_v5 import spec_utils
-from PolUVR.separator.uvr_lib_v5.roformer.bs_roformer import BSRoformer
-from PolUVR.separator.uvr_lib_v5.roformer.mel_band_roformer import MelBandRoformer
 from PolUVR.separator.uvr_lib_v5.tfc_tdf_v3 import TFC_TDF_net
+# Roformer direct constructors removed; loading handled via RoformerLoader in CommonSeparator.
 
 
 class MDXCSeparator(CommonSeparator):
@@ -88,7 +87,8 @@ def __init__(self, common_config, arch_config):
         self.logger.debug(f"MDXC arch params: override_model_segment_size={self.override_model_segment_size}, pitch_shift={self.pitch_shift}")
         self.logger.debug(f"MDXC multi-stem params: process_all_stems={self.process_all_stems}")
 
-        self.is_roformer = "is_roformer" in self.model_data
+        # Align Roformer detection flag with CommonSeparator to ensure consistent stats/logging
+        self.is_roformer = getattr(self, "is_roformer_model", False)
 
         self.load_model()
 
@@ -115,28 +115,29 @@ def load_model(self):
 
         try:
             if self.is_roformer:
-                self.logger.debug("Loading Roformer model...")
-
-                # Determine the model type based on the configuration and instantiate it
-                if "num_bands" in self.model_data_cfgdict.model:
-                    self.logger.debug("Loading MelBandRoformer model...")
-                    model = MelBandRoformer(**self.model_data_cfgdict.model)
-                elif "freqs_per_bands" in self.model_data_cfgdict.model:
-                    self.logger.debug("Loading BSRoformer model...")
-                    model = BSRoformer(**self.model_data_cfgdict.model)
+                # Use the RoformerLoader exclusively; no legacy fallback
+                self.logger.debug("Loading Roformer model via RoformerLoader...")
+                result = self.roformer_loader.load_model(
+                    model_path=self.model_path,
+                    config=self.model_data,
+                    device=str(self.torch_device),
+                )
+
+                if getattr(result, "success", False) and getattr(result, "model", None) is not None:
+                    self.model_run = result.model
+                    self.model_run.to(self.torch_device).eval()
                 else:
-                    raise ValueError("Unknown Roformer model type in the configuration.")
-
-                # Load model checkpoint
-                checkpoint = torch.load(self.model_path, map_location="cpu", weights_only=True)
-                self.model_run = model if not isinstance(model, torch.nn.DataParallel) else model.module
-                self.model_run.load_state_dict(checkpoint)
-                self.model_run.to(self.torch_device).eval()
+                    error_msg = getattr(result, "error_message", "RoformerLoader unsuccessful")
+                    self.logger.error(f"Failed to load Roformer model: {error_msg}")
+                    raise RuntimeError(error_msg)
 
             else:
                 self.logger.debug("Loading TFC_TDF_net model...")
                 self.model_run = TFC_TDF_net(self.model_data_cfgdict, device=self.torch_device)
-                self.model_run.load_state_dict(torch.load(self.model_path, map_location=self.torch_device))
+                self.logger.debug("Loading model onto cpu")
+                # For some reason, loading the state directly onto a hardware-accelerated device causes issues,
+                # so we load it onto the CPU first and then move it to the device.
+                self.model_run.load_state_dict(torch.load(self.model_path, map_location="cpu"))
                 self.model_run.to(self.torch_device).eval()
 
         except RuntimeError as e:
@@ -273,9 +274,12 @@ def pitch_fix(self, source, sr_pitched, orig_mix):
         return source
 
     def overlap_add(self, result, x, weights, start, length):
-        """Adds the overlapping part of the result to the result tensor.
-        """
-        result[..., start : start + length] += x[..., :length] * weights[:length]
+        """Add the weighted chunk x into result in place, starting at index start on the last axis, and return result."""
+        # Guard against minor shape mismatches in the model output length:
+        # clamp to the shortest of length, x's last axis and weights so the x and weights slices have equal length; a non-positive length leaves result unchanged.
+        safe_len = min(length, x.shape[-1], weights.shape[0])
+        if safe_len > 0:
+            result[..., start : start + safe_len] += x[..., :safe_len] * weights[:safe_len]
         return result
 
     def demix(self, mix: np.ndarray) -> dict:
@@ -311,11 +315,21 @@ def demix(self, mix: np.ndarray) -> dict:
             self.logger.debug(f"Number of stems: {num_stems}")
 
             # chunk_size aka "C" in UVR
-            chunk_size = self.model_data_cfgdict.audio.hop_length * (mdx_segment_size - 1)
-            self.logger.debug(f"Chunk size: {chunk_size}")
-
-            step = int(self.overlap * self.model_data_cfgdict.audio.sample_rate)
-            self.logger.debug(f"Step: {step}")
+            # IMPORTANT: For Roformer models, use the model's STFT hop length to derive the temporal chunk size
+            stft_hop_len = getattr(self.model_data_cfgdict.model, "stft_hop_length", None)
+            if stft_hop_len is None:
+                # Fall back to audio.hop_length if it is not present, but log it for visibility
+                stft_hop_len = self.model_data_cfgdict.audio.hop_length
+                self.logger.debug(f"Model.stft_hop_length missing; falling back to audio.hop_length={stft_hop_len}")
+
+            chunk_size = int(stft_hop_len) * (int(mdx_segment_size) - 1)
+            self.logger.debug(f"Chunk size: {chunk_size} (using stft_hop_length={stft_hop_len} and dim_t={mdx_segment_size})")
+
+            # Align step to chunk_size by default for Roformer to avoid stride mismatches
+            # If a user-specified overlap (in seconds) results in a step larger than chunk_size, clamp it
+            desired_step = int(self.overlap * self.model_data_cfgdict.audio.sample_rate)
+            step = chunk_size if desired_step <= 0 else min(desired_step, chunk_size)
+            self.logger.debug(f"Step: {step} (desired={desired_step})")
 
             # Create a weighting table and convert it to a PyTorch tensor
             window = torch.tensor(signal.windows.hamming(chunk_size), dtype=torch.float32)
@@ -340,11 +354,16 @@ def demix(self, mix: np.ndarray) -> dict:
                     # Perform overlap_add on CPU
                     if i + chunk_size > mix.shape[1]:
                         # Fixed to correctly add to the end of the tensor
-                        result = self.overlap_add(result, x, window, result.shape[-1] - chunk_size, length)
-                        counter[..., result.shape[-1] - chunk_size :] += window[:length]
+                        start_idx = result.shape[-1] - chunk_size
+                        result = self.overlap_add(result, x, window, start_idx, length)
+                        safe_len = min(length, x.shape[-1], window.shape[0])
+                        if safe_len > 0:
+                            counter[..., start_idx : start_idx + safe_len] += window[:safe_len]
                     else:
                         result = self.overlap_add(result, x, window, i, length)
-                        counter[..., i : i + length] += window[:length]
+                        safe_len = min(length, x.shape[-1], window.shape[0])
+                        if safe_len > 0:
+                            counter[..., i : i + safe_len] += window[:safe_len]
 
             inferenced_outputs = result / counter.clamp(min=1e-10)
 
 
@@ -15,8 +15,7 @@
 
 
 class CommonSeparator:
-    """This class contains the common methods and attributes common to all architecture-specific Separator classes.
-    """
+    """This class contains the common methods and attributes common to all architecture-specific Separator classes."""
 
     ALL_STEMS = "All Stems"
     VOCAL_STEM = "Vocals"
@@ -84,6 +83,12 @@ def __init__(self, config):
         self.invert_using_spec = config.get("invert_using_spec")
         self.sample_rate = config.get("sample_rate")
         self.use_soundfile = config.get("use_soundfile")
+        
+        # Roformer-specific loading support
+        self.roformer_loader = None
+        self.is_roformer_model = self._detect_roformer_model()
+        if self.is_roformer_model:
+            self._initialize_roformer_loader()
 
         # Model specific properties
 
@@ -138,13 +143,11 @@ def secondary_stem(self, primary_stem: str):
         return secondary_stem
 
     def separate(self, audio_file_path):
-        """Placeholder method for separating audio sources. Should be overridden by subclasses.
-        """
+        """Placeholder for separating audio sources; subclasses must override it. This base version always raises NotImplementedError."""
         raise NotImplementedError("This method should be overridden by subclasses.")
 
     def final_process(self, stem_path, source, stem_name):
-        """Finalizes the processing of a stem by writing the audio to a file and returning the processed source.
-        """
+        """Finalizes the processing of a stem by writing the audio to a file and returning the processed source."""
         self.logger.debug(f"Finalizing {stem_name} stem processing and writing audio...")
         self.write_audio(stem_path, source)
 
@@ -189,7 +192,7 @@ def cached_model_source_holder(self, model_architecture, sources, model_name=Non
         """Update the dictionary for the given model_architecture with the new model name and its sources.
         Use the model_architecture as a key to access the corresponding cache source mapper dictionary.
         """
-        self.cached_sources_map[model_architecture] = {**self.cached_sources_map.get(model_architecture, {}), model_name: sources}
+        self.cached_sources_map[model_architecture] = {**self.cached_sources_map.get(model_architecture, {}), **{model_name: sources}}
 
     def prepare_mix(self, mix):
         """Prepares the mix for processing. This includes loading the audio from a file if necessary,
@@ -246,8 +249,7 @@ def write_audio(self, stem_path: str, stem_source):
             self.write_audio_pydub(stem_path, stem_source)
 
     def write_audio_pydub(self, stem_path: str, stem_source):
-        """Writes the separated audio source to a file using pydub (ffmpeg)
-        """
+        """Writes the separated audio source to a file using pydub (ffmpeg)."""
         self.logger.debug(f"Entering write_audio_pydub with stem_path: {stem_path}")
 
         stem_source = spec_utils.normalize(wave=stem_source, max_peak=self.normalization_threshold, min_peak=self.amplification_threshold)
@@ -305,10 +307,21 @@ def write_audio_pydub(self, stem_path: str, stem_source):
             self.logger.error(f"Error exporting audio file: {e}")
 
     def write_audio_soundfile(self, stem_path: str, stem_source):
-        """Writes the separated audio source to a file using soundfile library.
-        """
+        """Writes the separated audio source to a file using soundfile library."""
         self.logger.debug(f"Entering write_audio_soundfile with stem_path: {stem_path}")
 
+        stem_source = spec_utils.normalize(wave=stem_source, max_peak=self.normalization_threshold, min_peak=self.amplification_threshold)
+
+        # Check if the numpy array is empty or contains very low values
+        if np.max(np.abs(stem_source)) < 1e-6:
+            self.logger.warning("Warning: stem_source array is near-silent or empty.")
+            return
+
+        # If output_dir is specified, create it and join it with stem_path
+        if self.output_dir:
+            os.makedirs(self.output_dir, exist_ok=True)
+            stem_path = os.path.join(self.output_dir, stem_path)
+
         # Correctly interleave stereo channels if needed
         if stem_source.shape[1] == 2:
             # If the audio is already interleaved, ensure it's in the correct order
@@ -327,9 +340,7 @@ def write_audio_soundfile(self, stem_path: str, stem_source):
 
         self.logger.debug(f"Interleaved audio data shape: {stem_source.shape}")
 
-        """
-        Write audio using soundfile (for formats other than M4A).
-        """
+        """Write audio using soundfile (for formats other than M4A)."""
         # Save audio using soundfile
         try:
             # Specify the subtype to define the sample width
@@ -339,8 +350,7 @@ def write_audio_soundfile(self, stem_path: str, stem_source):
             self.logger.error(f"Error exporting audio file: {e}")
 
     def clear_gpu_cache(self):
-        """This method clears the GPU cache to free up memory.
-        """
+        """This method clears the GPU cache to free up memory."""
         self.logger.debug("Running garbage collection...")
         gc.collect()
         if self.torch_device == torch.device("mps"):
@@ -351,8 +361,7 @@ def clear_gpu_cache(self):
             torch.cuda.empty_cache()
 
     def clear_file_specific_paths(self):
-        """Clears the file-specific variables which need to be cleared between processing different audio inputs.
-        """
+        """Clears the file-specific variables which need to be cleared between processing different audio inputs."""
         self.logger.info("Clearing input audio file paths, sources and stems...")
 
         self.audio_file_path = None
@@ -365,16 +374,14 @@ def clear_file_specific_paths(self):
         self.secondary_stem_output_path = None
 
     def sanitize_filename(self, filename):
-        """Cleans the filename by replacing invalid characters with underscores.
-        """
+        """Return filename with invalid characters replaced by '_', runs of '_' collapsed to one, and leading/trailing '_', '.' and spaces stripped."""
         sanitized = re.sub(r'[<>:"/\\|?*]', "_", filename)
         sanitized = re.sub(r"_+", "_", sanitized)
         sanitized = sanitized.strip("_. ")
         return sanitized
 
     def get_stem_output_path(self, stem_name, custom_output_names):
-        """Gets the output path for a stem based on the stem name and custom output names.
-        """
+        """Gets the output path for a stem based on the stem name and custom output names."""
         # Convert custom_output_names keys to lowercase for case-insensitive comparison
         if custom_output_names:
             custom_output_names_lower = {k.lower(): v for k, v in custom_output_names.items()}
@@ -389,3 +396,60 @@ def get_stem_output_path(self, stem_name, custom_output_names):
 
         filename = f"{sanitized_audio_base}_({sanitized_stem_name})_{sanitized_model_name}.{self.output_format.lower()}"
         return os.path.join(filename)
+    
+    def _detect_roformer_model(self):
+        """Detect whether the current model is a Roformer model from its model data, path or name.
+        
+        Returns:
+            bool: True if model_data has a truthy "is_roformer" entry, or "roformer" occurs (case-insensitively) in model_path or model_name; False otherwise, including whenever model_data is empty.
+        """
+        if not self.model_data:
+            return False
+            
+        # Check for explicit Roformer flag
+        if self.model_data.get("is_roformer", False):
+            return True
+            
+        # Check model path for Roformer indicators
+        if self.model_path and "roformer" in self.model_path.lower():
+            return True
+            
+        # Check model name for Roformer indicators
+        if self.model_name and "roformer" in self.model_name.lower():
+            return True
+            
+        return False
+    
+    def _initialize_roformer_loader(self):
+        """Create self.roformer_loader; if RoformerLoader cannot be imported, log a warning and leave the loader as None."""
+        try:
+            from .roformer.roformer_loader import RoformerLoader
+            self.roformer_loader = RoformerLoader()
+            self.logger.debug("Initialized Roformer loader for CommonSeparator")
+        except ImportError as e:
+            self.logger.warning(f"Could not import RoformerLoader: {e}")
+            self.roformer_loader = None
+    
+    def get_roformer_loading_stats(self):
+        """Get Roformer loading statistics if a loader is available.
+        
+        Returns:
+            dict: Result of roformer_loader.get_loading_stats() (presumably a dict; confirm against RoformerLoader), or an empty dict when no loader is set
+        """
+        if self.roformer_loader:
+            return self.roformer_loader.get_loading_stats()
+        return {}
+    
+    def validate_roformer_config(self, config, model_type):
+        """Validate Roformer configuration if loader is available.
+        
+        Args:
+            config: Configuration dictionary to validate (forwarded as-is to the loader)
+            model_type: Type of model to validate for (forwarded as-is to the loader)
+            
+        Returns:
+            bool: Result of roformer_loader.validate_configuration(config, model_type), or True when no loader is available (validation is skipped, not performed)
+        """
+        if self.roformer_loader:
+            return self.roformer_loader.validate_configuration(config, model_type)
+        return True  # Assume valid if no loader available