Politrees
diff --git a/PolUVR/separator/architectures/mdxc_separator.py
Lines changed: 25 additions & 8 deletions b/PolUVR/separator/architectures/mdxc_separator.py
Lines changed: 25 additions & 8 deletions
diff --git a/PolUVR/separator/architectures/vr_separator.py
Lines changed: 29 additions & 3 deletions b/PolUVR/separator/architectures/vr_separator.py
Lines changed: 29 additions & 3 deletions
diff --git a/PolUVR/separator/audio_chunking.py
Lines changed: 136 additions & 0 deletions b/PolUVR/separator/audio_chunking.py
Lines changed: 136 additions & 0 deletions
@@ -97,9 +97,9 @@ def __init__(self, common_config, arch_config):
         self.audio_file_path = None
         self.audio_file_base = None
 
-        self.is_primary_stem_main_target = False
-        if self.model_data_cfgdict.training.target_instrument == "Vocals" or len(self.model_data_cfgdict.training.instruments) > 1:
-            self.is_primary_stem_main_target = True
+        # Only mark primary stem as main target for single-target models.
+        # Multi-stem models should not trigger residual subtraction logic.
+        self.is_primary_stem_main_target = bool(self.model_data_cfgdict.training.target_instrument)
 
         self.logger.debug(f"is_primary_stem_main_target: {self.is_primary_stem_main_target}")
 
@@ -428,8 +428,8 @@ def demix(self, mix: np.ndarray) -> dict:
             self.logger.debug("Deleting accumulated outputs to free up memory")
             del accumulated_outputs
 
-        if num_stems > 1 or self.is_primary_stem_main_target:
-            self.logger.debug("Number of stems is greater than 1 or vocals are main target, detaching individual sources and correcting pitch if necessary...")
+        if num_stems > 1:
+            self.logger.debug("Number of stems is greater than 1, detaching individual sources and correcting pitch if necessary...")
 
             sources = {}
 
@@ -445,7 +445,8 @@ def demix(self, mix: np.ndarray) -> dict:
                 else:
                     sources[key] = value
 
-            if self.is_primary_stem_main_target:
+            # Residual subtraction is only applicable for single-target models (not multi-stem)
+            if self.is_primary_stem_main_target and num_stems == 1:
                 self.logger.debug(f"Primary stem: {self.primary_stem_name} is main target, detaching and matching array shapes if necessary...")
                 if sources[self.primary_stem_name].shape[1] != orig_mix.shape[1]:
                     sources[self.primary_stem_name] = spec_utils.match_array_shapes(sources[self.primary_stem_name], orig_mix)
@@ -456,6 +457,7 @@ def demix(self, mix: np.ndarray) -> dict:
 
             self.logger.debug("Returning separated sources")
             return sources
+
         self.logger.debug("Processing single source...")
 
         if self.is_roformer:
@@ -469,8 +471,23 @@ def demix(self, mix: np.ndarray) -> dict:
         self.logger.debug("Deleting inferenced outputs to free up memory")
         del inferenced_outputs
 
+        # For single-target models (e.g., karaoke), also return the residual as secondary
         if self.pitch_shift != 0:
             self.logger.debug("Applying pitch correction for single instrument")
-            return self.pitch_fix(inferenced_output, sample_rate, orig_mix)
+            primary = self.pitch_fix(inferenced_output, sample_rate, orig_mix)
+        else:
+            primary = inferenced_output
+
+        if self.is_primary_stem_main_target:
+            self.logger.debug("Single-target model detected; computing residual secondary stem from original mix")
+            # Ensure shapes match before residual subtraction
+            if primary.shape[1] != orig_mix.shape[1]:
+                primary = spec_utils.match_array_shapes(primary, orig_mix)
+            secondary = orig_mix - primary
+            return {
+                self.primary_stem_name: primary,
+                self.secondary_stem_name: secondary,
+            }
+
         self.logger.debug("Returning inferenced output for single instrument")
-        return inferenced_output
+        return primary
@@ -138,8 +138,8 @@ def __init__(self, common_config, arch_config: dict):
 
         self.model_run = lambda *args, **kwargs: self.logger.error("Model run method is not initialised yet.")
 
-        # This should go away once we refactor to remove soundfile.write and replace with pydub like we did for the MDX rewrite
-        self.wav_subtype = "PCM_16"
+        # wav_subtype is set in separate(), from the bit depth detected in the input audio file.
+        # The hardcoded "PCM_16" was removed here so the input bit depth can be preserved.
 
         self.logger.info("VR Separator initialisation complete")
 
@@ -161,6 +161,32 @@ def separate(self, audio_file_path, custom_output_names=None):
         self.audio_file_path = audio_file_path
         self.audio_file_base = os.path.splitext(os.path.basename(audio_file_path))[0]
 
+        # Detect input audio bit depth for output preservation
+        try:
+            import soundfile as sf
+            info = sf.info(audio_file_path)
+            self.input_audio_subtype = info.subtype
+            self.logger.info(f"Input audio subtype: {self.input_audio_subtype}")
+            
+            # Map subtype to wav_subtype for soundfile and set input_bit_depth for pydub
+            if "24" in self.input_audio_subtype:
+                self.wav_subtype = "PCM_24"
+                self.input_bit_depth = 24
+                self.logger.info("Detected 24-bit input audio")
+            elif "32" in self.input_audio_subtype:
+                self.wav_subtype = "PCM_32"
+                self.input_bit_depth = 32
+                self.logger.info("Detected 32-bit input audio")
+            else:
+                self.wav_subtype = "PCM_16"
+                self.input_bit_depth = 16
+                self.logger.info("Detected 16-bit input audio")
+        except Exception as e:
+            self.logger.warning(f"Could not detect input audio bit depth: {e}. Defaulting to PCM_16")
+            self.wav_subtype = "PCM_16"
+            self.input_audio_subtype = None
+            self.input_bit_depth = 16
+
         self.logger.debug(f"Starting separation for input audio file {self.audio_file_path}...")
 
         nn_arch_sizes = [31191, 33966, 56817, 123821, 123812, 129605, 218409, 537238, 537227]  # default
@@ -177,7 +203,7 @@ def separate(self, audio_file_path, custom_output_names=None):
             self.logger.debug("Determining model capacity...")
             self.model_run = nets.determine_model_capacity(self.model_params.param["bins"] * 2, nn_arch_size)
 
-        self.model_run.load_state_dict(torch.load(self.model_path, map_location=self.torch_device_cpu))
+        self.model_run.load_state_dict(torch.load(self.model_path, map_location="cpu"))
         self.model_run.to(self.torch_device)
         self.logger.debug("Model loaded and moved to device.")
 
 
@@ -0,0 +1,136 @@
+"""Audio chunking utilities for processing large audio files to prevent OOM errors."""
+
+import os
+import logging
+from typing import List
+from pydub import AudioSegment
+
+
+class AudioChunker:
+    """Split long audio files into fixed-duration chunks and merge them back.
+
+    The chunker offers three operations:
+    - ``split_audio``: cut one file into consecutive chunks of equal length
+      (the final chunk may be shorter)
+    - ``merge_chunks``: concatenate chunk files, in order, into one output file
+    - ``should_chunk``: tell whether a duration exceeds the chunk length
+
+    Example:
+        >>> chunker = AudioChunker(chunk_duration_seconds=600)  # 10-minute chunks
+        >>> chunk_paths = chunker.split_audio("long_audio.wav", "/tmp/chunks")
+        >>> # Process each chunk...
+        >>> output_path = chunker.merge_chunks(processed_chunks, "output.wav")
+    """
+
+    # File extensions that are not usable as an export format name as-is,
+    # mapped to the format that should be requested instead.
+    _FORMAT_ALIASES = {"m4a": "mp4", "wave": "wav"}
+
+    def __init__(self, chunk_duration_seconds: float, logger: logging.Logger = None):
+        """Initialize the AudioChunker.
+
+        Args:
+            chunk_duration_seconds: Duration of each chunk in seconds. It is
+                stored in whole milliseconds, so fractions of a millisecond
+                are truncated.
+            logger: Optional logger instance; defaults to this module's logger
+        """
+        self.chunk_duration_ms = int(chunk_duration_seconds * 1000)
+        self.logger = logger or logging.getLogger(__name__)
+
+    @classmethod
+    def _export_format(cls, path: str) -> str:
+        """Return the export format name implied by the extension of ``path``.
+
+        The extension is lower-cased and passed through ``_FORMAT_ALIASES``;
+        a missing or empty extension falls back to ``"wav"``.
+        """
+        extension = os.path.splitext(path)[1].lstrip(".").lower()
+        if not extension:
+            return "wav"
+        return cls._FORMAT_ALIASES.get(extension, extension)
+
+    def split_audio(self, input_path: str, output_dir: str) -> List[str]:
+        """Split audio file into fixed-size chunks.
+
+        Args:
+            input_path: Path to the input audio file
+            output_dir: Directory where chunk files will be saved; created if
+                it does not exist
+
+        Returns:
+            List of paths to the created chunk files, in playback order
+
+        Raises:
+            ValueError: If the configured chunk duration is not positive
+            FileNotFoundError: If input file doesn't exist
+            IOError: If there's an error reading or writing audio files
+        """
+        step_ms = self.chunk_duration_ms
+        if step_ms <= 0:
+            # A zero step would divide by zero below and a negative one would
+            # silently yield no chunks, so reject both with a clear message.
+            raise ValueError(f"Chunk duration must be positive, got {step_ms} ms")
+
+        if not os.path.exists(input_path):
+            raise FileNotFoundError(f"Input file not found: {input_path}")
+
+        # exist_ok avoids the check-then-create race of a separate exists() test
+        os.makedirs(output_dir, exist_ok=True)
+
+        self.logger.debug(f"Loading audio file: {input_path}")
+        audio = AudioSegment.from_file(input_path)
+
+        total_duration_ms = len(audio)
+
+        # Ceiling division: a trailing partial chunk still counts as a chunk
+        num_chunks = (total_duration_ms + step_ms - 1) // step_ms
+        self.logger.info(f"Splitting {total_duration_ms / 1000:.1f}s audio into {num_chunks} chunks of {step_ms / 1000:.1f}s each")
+
+        # Chunk files keep the input's extension (WAV when there is none);
+        # the export format is the normalized form of that same extension.
+        ext = os.path.splitext(input_path)[1] or ".wav"
+        export_format = self._export_format(input_path)
+
+        chunk_paths = []
+        for i, start_ms in enumerate(range(0, total_duration_ms, step_ms)):
+            end_ms = min(start_ms + step_ms, total_duration_ms)
+            chunk_path = os.path.join(output_dir, f"chunk_{i:04d}{ext}")
+
+            self.logger.debug(f"Exporting chunk {i + 1}/{num_chunks}: {start_ms / 1000:.1f}s - {end_ms / 1000:.1f}s to {chunk_path}")
+            audio[start_ms:end_ms].export(chunk_path, format=export_format)
+            chunk_paths.append(chunk_path)
+
+        return chunk_paths
+
+    def merge_chunks(self, chunk_paths: List[str], output_path: str) -> str:
+        """Merge processed chunks with simple concatenation.
+
+        Args:
+            chunk_paths: List of paths to chunk files to merge, in order
+            output_path: Path where the merged output will be saved; its
+                extension selects the output format (WAV when there is none)
+
+        Returns:
+            Path to the merged output file
+
+        Raises:
+            ValueError: If chunk_paths is empty
+            FileNotFoundError: If any chunk file doesn't exist
+            IOError: If there's an error reading or writing audio files
+        """
+        if not chunk_paths:
+            raise ValueError("Cannot merge empty list of chunks")
+
+        # Verify every chunk up front so nothing is loaded if one is missing
+        for chunk_path in chunk_paths:
+            if not os.path.exists(chunk_path):
+                raise FileNotFoundError(f"Chunk file not found: {chunk_path}")
+
+        self.logger.info(f"Merging {len(chunk_paths)} chunks into {output_path}")
+
+        # Start from an empty segment and append each chunk in order
+        combined = AudioSegment.empty()
+        for i, chunk_path in enumerate(chunk_paths):
+            self.logger.debug(f"Loading chunk {i + 1}/{len(chunk_paths)}: {chunk_path}")
+            combined += AudioSegment.from_file(chunk_path)
+
+        self.logger.info(f"Exporting merged audio ({len(combined) / 1000:.1f}s) to {output_path}")
+        combined.export(output_path, format=self._export_format(output_path))
+
+        return output_path
+
+    def should_chunk(self, audio_duration_seconds: float) -> bool:
+        """Determine if file is large enough to benefit from chunking.
+
+        Args:
+            audio_duration_seconds: Duration of the audio file in seconds
+
+        Returns:
+            True if the duration is strictly longer than one chunk, False otherwise
+        """
+        return audio_duration_seconds > (self.chunk_duration_ms / 1000)