Merge pull request #4 from seyong92/implement_median_filter

seyong92 · web-flow · commit 3aba45e0463b · 2022-05-28T19:53:12.000+09:00
Implement median filter
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
@@ -16,7 +16,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.6", "3.7", "3.8", "3.9", "3.10"]
+        python-version: ["3.8", "3.9", "3.10"]
 
     steps:
     - uses: actions/checkout@v3
diff --git a/pyproject.toml b/pyproject.toml
@@ -8,16 +8,15 @@ license = "GPL-3.0"
 readme = "README.md"
 
 [tool.poetry.dependencies]
-python = "^3.6"
-numpy = "^1.16.0"
-scipy = "^1.0.0"
-soundfile = "^0.10.0"
-librosa = "^0.8"
+python = ">=3.8,<3.11"
+numpy = "^1.20"
+scipy = "^1.8"
+soundfile = "^0.10"
 
 [tool.poetry.dev-dependencies]
-pytest = "^5.2"
-flake8 = "^3.8.3"
-numpydoc = "^1.0.0"
+pytest = "^7.0"
+flake8 = "^4.0.1"
+numpydoc = "^1.0"
 
 [tool.poetry.scripts]
 tsmod = 'pytsmod.console:run'
diff --git a/pytsmod/hptsm.py b/pytsmod/hptsm.py
@@ -1,11 +1,12 @@
-from librosa.effects import hpss
+from scipy.ndimage import median_filter
 import numpy as np
 from .pvtsm import phase_vocoder
 from .olatsm import ola
-from .utils import _validate_audio
+from .utils import _validate_audio, stft, istft
 
 
-def hptsm(x, s, hp_kernel_size=31, hp_power=2.0, hp_mask=False, hp_margin=1.0,
+def hptsm(x, s, hp_len_harm=10, hp_len_perc=10, hp_mask_mode='binary', hp_win_type='hann',
+          hp_win_size=1024, hp_hop_size=256, hp_zero_pad=0, hp_fft_shift=False,
           pv_win_type='hann', pv_win_size=2048, pv_syn_hop_size=512,
           pv_zero_pad=0, pv_restore_energy=False, pv_fft_shift=False,
           pv_phase_lock=True, ola_win_type='hann',
@@ -35,15 +36,10 @@ def hptsm(x, s, hp_kernel_size=31, hp_power=2.0, hp_mask=False, hp_margin=1.0,
         the modified output audio sequence.
     """
     x = _validate_audio(x)
-    x_harm = np.zeros(x.shape)
-    x_perc = np.zeros(x.shape)
 
-    for c, x_chan in enumerate(x):
-        x_harm_chan, x_perc_chan = hpss(x_chan, kernel_size=hp_kernel_size,
-                                        power=hp_power, mask=hp_mask,
-                                        margin=hp_margin)
-        x_harm[c, :] = x_harm_chan
-        x_perc[c, :] = x_perc_chan
+    x_harm, x_perc = _hpss(x, len_harm=hp_len_harm, len_perc=hp_len_perc, mask_mode=hp_mask_mode,
+                           win_type=hp_win_type, win_size=hp_win_size, hop_size=hp_hop_size,
+                           zero_pad=hp_zero_pad, fft_shift=hp_fft_shift)
 
     y_harm = phase_vocoder(x_harm, s, win_type=pv_win_type,
                            win_size=pv_win_size, syn_hop_size=pv_syn_hop_size,
@@ -55,3 +51,71 @@ def hptsm(x, s, hp_kernel_size=31, hp_power=2.0, hp_mask=False, hp_margin=1.0,
                  syn_hop_size=ola_syn_hop_size)
 
     return y_harm + y_perc
+
+
+def _hpss(x, len_harm=10, len_perc=10, mask_mode='binary', win_type='hann',
+          win_size=1024, hop_size=256, zero_pad=0, fft_shift=False):
+    """Separate the input audio sequence into a harmonic and a percussive source.
+    The algorithm is from the following paper.
+
+    Derry Fitzgerald, "Harmonic/percussive separation using median filtering." Proc. of the Int. Conf. on Digital Audio Effects (DAFx). Vol. 13. 2010.
+
+    Parameters
+    ----------
+
+    x : numpy.ndarray [shape=(channel, num_samples) or (num_samples)]
+        the input audio sequence to separate.
+    len_harm : int
+               length of the median filter kernel size for the harmonic source.
+    len_perc : int
+               length of the median filter kernel size for the percussive source.
+    mask_mode : str
+                mask mode for the separation. binary and relative are available; any other value raises ValueError.
+    win_type : str
+               type of the window function for the STFT. hann and sin are available.
+    win_size : int > 0 [scalar]
+               size of the window function for the STFT and the ISTFT.
+    hop_size : int > 0 [scalar]
+               hop size of the analysis/synthesis window for the STFT and the ISTFT.
+    zero_pad : int >= 0 [scalar]
+               the size of the zero pad in the window function.
+    fft_shift : bool
+                apply circular shift to STFT and ISTFT.
+
+    Returns
+    -------
+
+    x_harm : numpy.ndarray [shape=(channel, num_samples) or (num_samples)]
+             the separated harmonic audio sequence.
+    x_perc : numpy.ndarray [shape=(channel, num_samples) or (num_samples)]
+             the separated percussive audio sequence.
+    """
+    if mask_mode not in ('binary', 'relative'):  # fail fast, before any STFT or filtering work
+        raise ValueError("Please use the valid mask mode. (binary, relative)")
+
+    x = np.atleast_2d(x)  # the documented 1-D (num_samples) input becomes a single channel
+    x_harm = np.zeros(x.shape)
+    x_perc = np.zeros(x.shape)
+
+    for c, x_chan in enumerate(x):
+        spec = stft(x_chan, ana_hop=hop_size, win_type=win_type, win_size=win_size,
+                    zero_pad=zero_pad, fft_shift=fft_shift)
+        mag_spec = np.abs(spec)
+        # Median-smooth the magnitudes along axis 1 (harmonic estimate) and axis 0 (percussive estimate).
+        mag_spec_harm = median_filter(mag_spec, size=[1, len_harm], mode='reflect')
+        mag_spec_perc = median_filter(mag_spec, size=[len_perc, 1], mode='reflect')
+
+        if mask_mode == 'binary':
+            mask_harm = mag_spec_harm > mag_spec_perc
+            mask_perc = mag_spec_harm <= mag_spec_perc
+        else:
+            mag_spec_sum = mag_spec_harm + mag_spec_perc + np.finfo(float).eps  # eps avoids 0 / 0
+            mask_harm = mag_spec_harm / mag_spec_sum
+            mask_perc = mag_spec_perc / mag_spec_sum
+        # Mask the complex spectrogram and resynthesize each source at the input length.
+        x_harm[c, :] = istft(mask_harm * spec, syn_hop=hop_size, win_type=win_type, win_size=win_size,
+                             zero_pad=zero_pad, original_length=x.shape[1], fft_shift=fft_shift)
+        x_perc[c, :] = istft(mask_perc * spec, syn_hop=hop_size, win_type=win_type, win_size=win_size,
+                             zero_pad=zero_pad, original_length=x.shape[1], fft_shift=fft_shift)
+
+    return x_harm.squeeze(), x_perc.squeeze()
diff --git a/pytsmod/utils/stft.py b/pytsmod/utils/stft.py
@@ -22,7 +22,7 @@ def stft(x, ana_hop=2048, win_type='hann', win_size=4096, zero_pad=0, sr=44100,
     zero_pad : int > 0 [scalar]
                the size of the zero pad in the window function.
     sr : int > 0 [scalar]
-         the sample rate of the audio sequence.
+         the sample rate of the audio sequence. Only used for time_frequency_out.
     fft_shift : bool
                 apply circular shift to STFT.
     time_frequency_out : bool