1- from librosa . effects import hpss
1+ from scipy . ndimage import median_filter
22import numpy as np
33from .pvtsm import phase_vocoder
44from .olatsm import ola
5- from .utils import _validate_audio
5+ from .utils import _validate_audio , stft , istft
66
77
8- def hptsm (x , s , hp_kernel_size = 31 , hp_power = 2.0 , hp_mask = False , hp_margin = 1.0 ,
8+ def hptsm (x , s , hp_len_harm = 10 , hp_len_perc = 10 , hp_mask_mode = 'binary' , hp_win_type = 'hann' ,
9+ hp_win_size = 1024 , hp_hop_size = 256 , hp_zero_pad = 0 , hp_fft_shift = False ,
910 pv_win_type = 'hann' , pv_win_size = 2048 , pv_syn_hop_size = 512 ,
1011 pv_zero_pad = 0 , pv_restore_energy = False , pv_fft_shift = False ,
1112 pv_phase_lock = True , ola_win_type = 'hann' ,
@@ -35,15 +36,10 @@ def hptsm(x, s, hp_kernel_size=31, hp_power=2.0, hp_mask=False, hp_margin=1.0,
3536 the modified output audio sequence.
3637 """
3738 x = _validate_audio (x )
38- x_harm = np .zeros (x .shape )
39- x_perc = np .zeros (x .shape )
4039
41- for c , x_chan in enumerate (x ):
42- x_harm_chan , x_perc_chan = hpss (x_chan , kernel_size = hp_kernel_size ,
43- power = hp_power , mask = hp_mask ,
44- margin = hp_margin )
45- x_harm [c , :] = x_harm_chan
46- x_perc [c , :] = x_perc_chan
40+ x_harm , x_perc = _hpss (x , len_harm = hp_len_harm , len_perc = hp_len_perc , mask_mode = hp_mask_mode ,
41+ win_type = hp_win_type , win_size = hp_win_size , hop_size = hp_hop_size ,
42+ zero_pad = hp_zero_pad , fft_shift = hp_fft_shift )
4743
4844 y_harm = phase_vocoder (x_harm , s , win_type = pv_win_type ,
4945 win_size = pv_win_size , syn_hop_size = pv_syn_hop_size ,
@@ -55,3 +51,71 @@ def hptsm(x, s, hp_kernel_size=31, hp_power=2.0, hp_mask=False, hp_margin=1.0,
5551 syn_hop_size = ola_syn_hop_size )
5652
5753 return y_harm + y_perc
54+
55+
def _hpss(x, len_harm=10, len_perc=10, mask_mode='binary', win_type='hann',
          win_size=1024, hop_size=256, zero_pad=0, fft_shift=False):
    """Separate the input audio sequence into a harmonic and a percussive source.

    Each channel is transformed with the STFT, its magnitude spectrogram is
    median-filtered once along each axis, and the two filtered spectrograms
    are turned into a pair of masks that are applied to the complex
    spectrogram before inverting it with the ISTFT.

    The algorithm is from the following paper.

    Derry Fitzgerald, "Harmonic/percussive separation using median
    filtering." Proc. of the Int. Conf. on Digital Audio Effects (DAFx).
    Vol. 13. 2010.

    Parameters
    ----------

    x : numpy.ndarray [shape=(channel, num_samples) or (num_samples)]
        the input audio sequence to separate.
    len_harm : int
        length of the median filter kernel for the harmonic source.
    len_perc : int
        length of the median filter kernel for the percussive source.
    mask_mode : str
        mask mode for the separation. binary and relative are available.
    win_type : str
        type of the window function for the STFT. hann and sin are available.
    win_size : int > 0 [scalar]
        size of the window function for the STFT and the ISTFT.
    hop_size : int > 0 [scalar]
        hop size of the analysis/synthesis window for the STFT and the ISTFT.
    zero_pad : int [scalar]
        the size of the zero pad in the window function.
    fft_shift : bool
        apply circular shift to STFT and ISTFT.

    Returns
    -------

    x_harm : numpy.ndarray [shape=(channel, num_samples) or (num_samples)]
        the separated harmonic audio sequence.
    x_perc : numpy.ndarray [shape=(channel, num_samples) or (num_samples)]
        the separated percussive audio sequence.

    Raises
    ------

    ValueError
        if mask_mode is neither 'binary' nor 'relative'.
    """
    # Reject an unknown mask mode up front, before any STFT or filtering
    # work is spent on the first channel.
    if mask_mode not in ('binary', 'relative'):
        raise ValueError("Please use the valid mask mode. (binary, relative)")

    # Promote a mono (num_samples,) sequence to (1, num_samples) so that the
    # per-channel loop and the x.shape[1] lookup below are valid for it.
    x = np.atleast_2d(x)
    num_samples = x.shape[1]

    x_harm = np.zeros(x.shape)
    x_perc = np.zeros(x.shape)
    eps = np.finfo(float).eps  # keeps the relative-mask division finite

    for c, x_chan in enumerate(x):
        spec = stft(x_chan, ana_hop=hop_size, win_type=win_type,
                    win_size=win_size, zero_pad=zero_pad, fft_shift=fft_shift)
        mag_spec = np.abs(spec)

        # Median filter along the second axis for the harmonic estimate and
        # along the first axis for the percussive estimate.
        # NOTE(review): this presumes stft returns (freq_bins, frames), so the
        # second axis is time -- confirm against utils.stft.
        mag_spec_harm = median_filter(mag_spec, size=[1, len_harm],
                                      mode='reflect')
        mag_spec_perc = median_filter(mag_spec, size=[len_perc, 1],
                                      mode='reflect')

        if mask_mode == 'binary':
            # Each bin goes entirely to one source; ties go to percussive.
            mask_harm = mag_spec_harm > mag_spec_perc
            mask_perc = mag_spec_harm <= mag_spec_perc
        else:  # 'relative'
            # Soft masks proportional to each filtered magnitude.
            total = mag_spec_harm + mag_spec_perc + eps
            mask_harm = mag_spec_harm / total
            mask_perc = mag_spec_perc / total

        spec_harm = mask_harm * spec
        spec_perc = mask_perc * spec

        x_harm[c, :] = istft(spec_harm, syn_hop=hop_size, win_type=win_type,
                             win_size=win_size, zero_pad=zero_pad,
                             original_length=num_samples, fft_shift=fft_shift)
        x_perc[c, :] = istft(spec_perc, syn_hop=hop_size, win_type=win_type,
                             win_size=win_size, zero_pad=zero_pad,
                             original_length=num_samples, fft_shift=fft_shift)

    # Drop the channel axis again for mono signals.
    return x_harm.squeeze(), x_perc.squeeze()
0 commit comments