Skip to content

Commit 1f795fb

Browse files
committed
Refactor: Pure MLX implementation and repository restructuring
1 parent 8934e00 · commit 1f795fb

79 files changed

Lines changed: 13287 additions & 980 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.gitignore

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,7 @@ Intermediates.noindex/
2626
*.xcresult/
2727
*.xccheckout
2828
*.xcscmblueprint
29-
xcuserdata/
29+
/xcuserdata/
3030
*.orig
3131

3232
# Logs
@@ -38,10 +38,14 @@ xcuserdata/
3838
*.onnx
3939
*.index
4040
*.safetensors
41+
*.npz
4142
assets/audios/
4243
assets/datasets/
4344
logs/
4445
rvc/models/
46+
rvc_mlx/models/
47+
weights/
48+
test-audio/
4549

4650
# Media
4751
*.mp3
@@ -51,5 +55,6 @@ rvc/models/
5155
*.wav
5256
*.png
5357

58+
5459
# Misc
5560
*.txt

Demos/iOS/RVCNative/RVCNativePackage/Sources/RVCNativeFeature/RVC/HubertModel.swift

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -310,6 +310,7 @@ public class HubertModel: Module {
310310
x = feature_projection(x)
311311
x = encoder(x)
312312
// x = final_proj(x) // Skip projection for RVC v2 (model expects 768-dim)
313+
print("DEBUG: HubertModel output shape: \(x.shape)")
313314
return x
314315
}
315316
}

Demos/iOS/RVCNative/RVCNativePackage/Sources/RVCNativeFeature/RVC/RMVPE.swift

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -388,6 +388,7 @@ class RMVPE: Module {
388388
// audio: [T]
389389
let melProcessor = MelSpectrogram()
390390
let mel = melProcessor(audio) // [n_mels, T_frames] (Log Mel) (128, T)
391+
print("DEBUG: Mel Spectrogram Stats: min \(mel.min().item(Float.self)), max \(mel.max().item(Float.self)), shape \(mel.shape)")
391392

392393
// Model expects [N, T, n_mels] (transposed)
393394
// mel.T -> [T_frames, n_mels]. Add dim [1, T, n_mels]
@@ -398,6 +399,13 @@ class RMVPE: Module {
398399

399400
// Decode
400401
let f0 = self.decode(hidden, thred: thred) // [1, T, 1]
402+
403+
// DEBUG Stats
404+
let f0_min = f0.min().item(Float.self)
405+
let f0_max = f0.max().item(Float.self)
406+
let f0_mean = f0.mean().item(Float.self)
407+
print("DEBUG: RMVPE F0 Stats: min \(f0_min), max \(f0_max), mean \(f0_mean)")
408+
401409
return f0
402410
}
403411
}

Demos/iOS/RVCNative/RVCNativePackage/Sources/RVCNativeFeature/RVC/RVCInference.swift

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -298,6 +298,6 @@ import MLXNN
298298
sid: sid
299299
)
300300

301-
return audioOut // [1, T_out, 1]
301+
return audioOut // [1, T_out, 1]
302302
}
303303
}

Demos/iOS/RVCNative/RVCNativePackage/Sources/RVCNativeFeature/RVC/RVCModel.swift

Lines changed: 8 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -310,17 +310,20 @@ class Generator: Module {
310310

311311
func callAsFunction(_ x: MLXArray, f0: MLXArray, g: MLXArray? = nil) -> MLXArray {
312312
var out = conv_pre(x)
313+
print("DEBUG: Generator.conv_pre out: [\(out.min().item(Float.self))...\(out.max().item(Float.self))]")
313314

314315
// Add speaker conditioning if available
315316
if let g = g, let condLayer = cond {
316317
out = out + condLayer(g)
318+
print("DEBUG: Generator.cond added: [\(out.min().item(Float.self))...\(out.max().item(Float.self))]")
317319
}
318320

319321

320322
// NSF Source Signal: [B, L*U, 1] (High Resolution)
321323
// upp = prod(upsampleRates) = 400
322324
let upp = 400
323325
let har_source = m_source(f0, upsampling_factor: upp)
326+
print("DEBUG: Generator.har_source: [\(har_source.min().item(Float.self))...\(har_source.max().item(Float.self))]")
324327
// har_source is now [B, AudioLen, 1]
325328

326329
var resIdx = 0
@@ -332,16 +335,12 @@ class Generator: Module {
332335
// Add NSF Noise
333336
// noise_conv reduces har_source resolution to match current 'out'
334337
let noise_conv = noise_convs[i]
335-
// We need to transpose har_source to [B, 1, T] ?? No. MLX is [N, L, C].
336-
// har_source is [N, T, 1]. noise_conv(1->C) -> [N, T_down, C].
337-
// Should match 'out' shape.
338338
let n = noise_conv(har_source)
339339

340-
// Crop if necessary (Python does: if x.shape[1] != n.shape[1])
340+
// Crop if necessary
341341
if out.shape[1] != n.shape[1] {
342342
let minLen = min(out.shape[1], n.shape[1])
343343
out = out[0..., 0..<minLen, 0...]
344-
// n = n[0..., 0..<minLen, 0...] // n is let constant, but we add it.
345344
out = out + n[0..., 0..<minLen, 0...]
346345
} else {
347346
out = out + n
@@ -358,11 +357,15 @@ class Generator: Module {
358357
}
359358
// Average
360359
out = xs! / 3.0
360+
print("DEBUG: Generator.ups[\(i)] (after resblocks) out: [\(out.min().item(Float.self))...\(out.max().item(Float.self))]")
361361
}
362362

363363
out = leakyRelu(out)
364364
out = conv_post(out)
365+
print("DEBUG: Generator.conv_post out: [\(out.min().item(Float.self))...\(out.max().item(Float.self))]")
366+
365367
out = tanh(out)
368+
print("DEBUG: Generator.final (tanh): [\(out.min().item(Float.self))...\(out.max().item(Float.self))]")
366369
return out
367370
}
368371
}

Demos/iOS/RVCNative/RVCNativePackage/Sources/RVCNativeFeature/RVC/Synthesizer.swift

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -430,17 +430,20 @@ public class Synthesizer: Module {
430430

431431
// Encode features
432432
let (m_p, logs_p, xMask) = enc_p(phone, pitch: pitch, lengths: phoneLengths)
433-
print("DEBUG: TextEncoder output - m_p: \(m_p.shape), logs_p: \(logs_p.shape)")
433+
print("DEBUG: TextEncoder output - m_p: \(m_p.shape) [\(m_p.min().item(Float.self))...\(m_p.max().item(Float.self))], logs_p: \(logs_p.shape) [\(logs_p.min().item(Float.self))...\(logs_p.max().item(Float.self))]")
434434

435435
// Sample from encoded distribution
436436
let xMaskExpanded = xMask.expandedDimensions(axis: -1)
437437
let z_p = (m_p + exp(logs_p) * MLXRandom.normal(m_p.shape).asType(m_p.dtype) * 0.66666) * xMaskExpanded
438+
print("DEBUG: z_p stats: min \(z_p.min().item(Float.self)), max \(z_p.max().item(Float.self))")
438439

439440
// Flow reverse pass
440441
let z = flow(z_p, xMask: xMaskExpanded, g: g, reverse: true)
442+
print("DEBUG: z (after flow) stats: min \(z.min().item(Float.self)), max \(z.max().item(Float.self))")
441443

442444
// Decode to audio
443445
let output = dec(z * xMaskExpanded, f0: nsff0 ?? MLX.zeros([phone.shape[0], phone.shape[1], 1]), g: g)
446+
print("DEBUG: Generator output stats: min \(output.min().item(Float.self)), max \(output.max().item(Float.self))")
444447

445448
return output
446449
}
Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
Subproject commit 15b6ade64bd6aefdb35d732a90678c3b264ecfac

0 commit comments

Comments
 (0)