-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathtest_all_f0_methods.py
More file actions
131 lines (107 loc) · 4.22 KB
/
test_all_f0_methods.py
File metadata and controls
131 lines (107 loc) · 4.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/env python3
"""Test all F0 extraction methods with real audio inference."""
import os
import sys
import time
import numpy as np
import librosa
import soundfile as sf
# Add project to path
sys.path.insert(0, '/Users/mcruz/Developer/Retrieval-based-Voice-Conversion-MLX')
from rvc_mlx.lib.mlx.pitch_extractors import PitchExtractor
# Test parameters
AUDIO_FILE = "test-audio/input_16k.wav"
OUTPUT_DIR = "test_results/f0_comparison"
METHODS = ["rmvpe", "dio", "pm", "harvest", "fcpe"] # Skip crepe (needs weights)
def main():
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Load audio
print(f"Loading audio: {AUDIO_FILE}")
audio, sr = librosa.load(AUDIO_FILE, sr=16000)
duration = len(audio) / sr
print(f" Duration: {duration:.2f}s, Sample rate: {sr} Hz")
results = {}
print("\n" + "="*60)
print("Testing F0 extraction methods")
print("="*60)
for method in METHODS:
print(f"\n[{method.upper()}]")
try:
# Create extractor
start = time.time()
extractor = PitchExtractor(method=method, sample_rate=16000, hop_size=160)
init_time = time.time() - start
# Extract F0
start = time.time()
f0 = extractor.extract(audio, f0_min=50, f0_max=1100)
extract_time = time.time() - start
# Analyze results
voiced_mask = f0 > 0
voiced_ratio = voiced_mask.mean()
if voiced_mask.sum() > 0:
voiced_f0 = f0[voiced_mask]
mean_f0 = voiced_f0.mean()
std_f0 = voiced_f0.std()
min_f0 = voiced_f0.min()
max_f0 = voiced_f0.max()
else:
mean_f0 = std_f0 = min_f0 = max_f0 = 0
results[method] = {
'f0': f0,
'voiced_ratio': voiced_ratio,
'mean_f0': mean_f0,
'std_f0': std_f0,
'min_f0': min_f0,
'max_f0': max_f0,
'extract_time': extract_time,
}
print(f" Frames: {len(f0)}")
print(f" Voiced: {voiced_ratio*100:.1f}%")
print(f" F0 range: {min_f0:.1f} - {max_f0:.1f} Hz")
print(f" Mean F0: {mean_f0:.1f} Hz (±{std_f0:.1f})")
print(f" Time: {extract_time:.3f}s ({duration/extract_time:.1f}x realtime)")
# Save F0 contour
np.save(f"{OUTPUT_DIR}/{method}_f0.npy", f0)
except Exception as e:
print(f" ERROR: {e}")
results[method] = None
# Compare methods
print("\n" + "="*60)
print("Method Comparison")
print("="*60)
valid_methods = [m for m in METHODS if results.get(m) is not None]
if len(valid_methods) >= 2:
# Correlation matrix
print("\nF0 Correlation Matrix:")
print(f"{'':>10}", end='')
for m in valid_methods:
print(f"{m:>10}", end='')
print()
for m1 in valid_methods:
print(f"{m1:>10}", end='')
f0_1 = results[m1]['f0']
for m2 in valid_methods:
f0_2 = results[m2]['f0']
# Resample if different lengths
if len(f0_1) != len(f0_2):
from scipy.ndimage import zoom
f0_2_resampled = zoom(f0_2, len(f0_1) / len(f0_2))
else:
f0_2_resampled = f0_2
# Calculate correlation on voiced regions
both_voiced = (f0_1 > 0) & (f0_2_resampled > 0)
if both_voiced.sum() > 10:
corr = np.corrcoef(f0_1[both_voiced], f0_2_resampled[both_voiced])[0, 1]
print(f"{corr:>10.3f}", end='')
else:
print(f"{'N/A':>10}", end='')
print()
# Speed comparison
print("\nSpeed Comparison:")
for method in valid_methods:
r = results[method]
rtf = duration / r['extract_time']
print(f" {method:>10}: {r['extract_time']:.3f}s ({rtf:.1f}x realtime)")
print(f"\nResults saved to: {OUTPUT_DIR}/")
if __name__ == "__main__":
main()