Support switching between compiled and eager (non-compiled) execution; only omni full-duplex mode runs compiled by default

bokesyo · bokesyo · commit 5169e6322187 · 2026-03-08T17:42:17.000Z
diff --git a/MiniCPMO45/modeling_minicpmo_unified.py b/MiniCPMO45/modeling_minicpmo_unified.py
@@ -443,23 +443,23 @@ def apply_torch_compile(
         compiled_modules: list = []
         skipped_modules: list = []
 
-        if hasattr(self, "vpm") and "vpm" not in skip:
-            self.vpm = torch.compile(self.vpm, **compile_kwargs)
-            compiled_modules.append("vpm")
-        elif "vpm" in skip:
-            skipped_modules.append("vpm")
+        # if hasattr(self, "vpm") and "vpm" not in skip:
+        #     self.vpm = torch.compile(self.vpm, **compile_kwargs)
+        #     compiled_modules.append("vpm")
+        # elif "vpm" in skip:
+        #     skipped_modules.append("vpm")
 
         if hasattr(self, "llm") and "llm.model" not in skip:
             self.llm.model = torch.compile(self.llm.model, **compile_kwargs)
             compiled_modules.append("llm.model")
         elif "llm.model" in skip:
             skipped_modules.append("llm.model")
 
-        if hasattr(self, "resampler") and "resampler" not in skip:
-            self.resampler = torch.compile(self.resampler, **compile_kwargs)
-            compiled_modules.append("resampler")
-        elif "resampler" in skip:
-            skipped_modules.append("resampler")
+        # if hasattr(self, "resampler") and "resampler" not in skip:
+        #     self.resampler = torch.compile(self.resampler, **compile_kwargs)
+        #     compiled_modules.append("resampler")
+        # elif "resampler" in skip:
+        #     skipped_modules.append("resampler")
 
         if hasattr(self, "tts") and hasattr(self.tts, "model") and "tts.model" not in skip:
             self.tts.model = torch.compile(self.tts.model, **compile_kwargs)
@@ -472,6 +472,7 @@ def apply_torch_compile(
 
         elapsed = _time.time() - t0
         self._compiled = True
+        self._compile_active = True
         logger.info(
             f"[torch.compile] Wrapping done ({elapsed:.2f}s), "
             f"compiled: {compiled_modules}"
@@ -480,6 +481,51 @@ def apply_torch_compile(
         )
         return self
 
+    def set_compile_enabled(self, enabled: bool) -> None:
+        """Switch the compiled sub-modules between compiled and eager execution.
+
+        Does nothing until apply_torch_compile() has run, or when ``enabled``
+        already matches the active state. Only ``llm.model`` and ``tts.model``
+        are swapped. The compiled wrapper exposes the eager module as
+        ``_orig_mod``, so both forms share the same weights and switching
+        costs no extra memory.
+        A sub-module that was never wrapped is left untouched.
+
+        Args:
+            enabled: True to run the compiled wrappers, False to run eager.
+        """
+        if not getattr(self, "_compiled", False):
+            return
+        if enabled == getattr(self, "_compile_active", True):
+            return
+
+        # (owner, log name) for every sub-module this method knows how to swap.
+        targets: list = []
+        if hasattr(self, "llm"):
+            targets.append((self.llm, "llm.model"))
+        if hasattr(self, "tts") and hasattr(self.tts, "model"):
+            targets.append((self.tts, "tts.model"))
+
+        swapped: list = []
+        for owner, name in targets:
+            cur = owner.model
+            if enabled:
+                replacement = getattr(cur, "_compiled_ref", None)
+            else:
+                replacement = getattr(cur, "_orig_mod", None)
+                if replacement is not None:
+                    # Store the wrapper without going through
+                    # nn.Module.__setattr__: a plain assignment would register
+                    # it as a child of the module it wraps, and that cycle
+                    # makes recursive walks (train()/eval()/state_dict()) loop.
+                    object.__setattr__(replacement, "_compiled_ref", cur)
+            if replacement is not None:
+                owner.model = replacement
+                swapped.append(name)
+
+        self._compile_active = enabled
+        logger.info(f"[torch.compile] {'enabled' if enabled else 'disabled'} → swapped {swapped}")
+
     def warmup_compile(
         self,
         warmup_video_path: Optional[str] = None,
diff --git a/TODO.md b/TODO.md
@@ -0,0 +1,4 @@
+- [x] compile coverage, test?
+- [ ] add tts case in audio chat
+- [ ] add custom voice in audio chat
+- [ ] calibration dataset for quantization
diff --git a/core/processors/unified.py b/core/processors/unified.py
@@ -1432,6 +1432,11 @@ def _release_resources(self) -> None:
 
     # ==================== Mode Switching ====================
 
+    def _sync_compile_state(self, want_compiled: bool) -> None:
+        """Ask the model to run compiled (True) or eager (False); no-op when self.compile is falsy or no model is set."""
+        if self.compile and self.model is not None:
+            self.model.set_compile_enabled(want_compiled)
+
     def set_chat_mode(self) -> ChatView:
         """Switch to Chat mode.
 
@@ -1442,6 +1447,7 @@ def set_chat_mode(self) -> ChatView:
 
         if self._current_mode != ProcessorMode.CHAT:
             start = time.time()
+            self._sync_compile_state(False)
             self.model.set_mode(ModelProcessorMode.CHAT)
             self._current_mode = ProcessorMode.CHAT
             logger.info(f"Switched to CHAT mode in {(time.time()-start)*1000:.1f}ms")
@@ -1458,6 +1464,7 @@ def set_half_duplex_mode(self) -> HalfDuplexView:
 
         if self._current_mode != ProcessorMode.HALF_DUPLEX:
             start = time.time()
+            self._sync_compile_state(False)
             self.model.set_mode(ModelProcessorMode.STREAMING)
             self._current_mode = ProcessorMode.HALF_DUPLEX
             logger.info(f"Switched to HALF_DUPLEX mode in {(time.time()-start)*1000:.1f}ms")
@@ -1474,6 +1481,7 @@ def set_duplex_mode(self) -> DuplexView:
 
         if self._current_mode != ProcessorMode.DUPLEX:
             start = time.time()
+            self._sync_compile_state(True)
             self.model.set_mode(ModelProcessorMode.DUPLEX)
             self._current_mode = ProcessorMode.DUPLEX
             logger.info(f"Switched to DUPLEX mode in {(time.time()-start)*1000:.1f}ms")
diff --git a/static/audio-duplex/audio-duplex-app.js b/static/audio-duplex/audio-duplex-app.js
@@ -38,7 +38,7 @@ import { initRefAudio } from '../duplex/ui/ref-audio-init.js';
 const SAMPLE_RATE_IN = 16000;
 const SAMPLE_RATE_OUT = 24000;
 const CHUNK_MS = 1000;
-const FILE_MAX_DURATION = 120; // 2 minutes
+const FILE_MAX_DURATION = 300; // 5 minutes
 
 let currentMode = 'live';
 let session = null;
diff --git a/tests/test_compile_bench.py b/tests/test_compile_bench.py
@@ -0,0 +1,194 @@
+#!/usr/bin/env python3
+"""Benchmark compiled vs eager duplex inference on the same video.
+
+Loads the model once (with compile=True), then runs the same omni full-duplex
+session twice: once with compiled modules, once with eager modules.
+Prints a side-by-side timing comparison at the end.
+
+Usage:
+    CUDA_VISIBLE_DEVICES=0 TORCHINDUCTOR_CACHE_DIR=./torch_compile_cache \
+        PYTHONPATH=. .venv/base/bin/python tests/test_compile_bench.py
+"""
+
+import os
+import sys
+import time
+import logging
+import torch
+from config import get_config
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+)
+logger = logging.getLogger("compile_bench")
+
+VIDEO_PATH = os.path.join(  # sample video, resolved relative to this file's directory
+    os.path.dirname(os.path.abspath(__file__)),
+    "assets", "samples", "compile.mp4",
+)
+MAX_CHUNKS = 8  # passed to model.benchmark() as max_chunks_per_video
+
+
+def module_type_label(mod) -> str:
+    """Return the class name of ``mod`` tagged "(compiled)" for OptimizedModule, else "(eager)"."""
+    name = type(mod).__name__
+    mode = "compiled" if name == "OptimizedModule" else "eager"
+    return f"{name} ({mode})"
+
+
+def print_header(label: str, model):
+    """Print a banner showing ``label`` and whether llm/tts on ``model`` are compiled or eager."""
+    active = getattr(model, "_compile_active", "N/A")
+    llm_label = module_type_label(model.llm.model)
+    tts_label = "N/A"
+    if hasattr(model.tts, "model"):
+        tts_label = module_type_label(model.tts.model)
+    rule = "=" * 70
+    print(f"\n{rule}\n  {label}\n  _compile_active = {active}")
+    print(f"  llm.model = {llm_label}\n  tts.model = {tts_label}\n{rule}")
+
+
+def run_bench(model, label: str) -> dict:
+    """Run ``model.benchmark`` on VIDEO_PATH, print a one-line summary and return its result."""
+    print_header(label, model)
+    started = time.time()
+    result = model.benchmark(video_paths=[VIDEO_PATH], max_chunks_per_video=MAX_CHUNKS)
+    elapsed = time.time() - started
+    units = result.get("num_units", 0)
+    listens = result.get("listen_count", 0)
+    speaks = result.get("speak_count", 0)
+    print(
+        f"  [{label}] done in {elapsed:.1f}s, units={units}, listen={listens}, speak={speaks}"
+    )
+    return result
+
+
+def format_stats(stats: dict, key_path: str) -> str:
+    """Format avg/min/max (ms) found at dotted ``key_path`` in ``stats``; "N/A" if missing or not a dict."""
+    node = stats
+    for key in key_path.split("."):
+        node = node.get(key, {}) if isinstance(node, dict) else {}
+    if not isinstance(node, dict) or not node:
+        return "N/A"
+    return f"avg={node.get('avg', 0):.0f}ms  min={node.get('min', 0):.0f}ms  max={node.get('max', 0):.0f}ms"
+
+
+def print_comparison(compiled_result: dict, eager_result: dict):
+    """Print a compiled-vs-eager timing report for two benchmark results.
+
+    Prints ``total_time`` and ``num_units`` of both dicts, then, for each of
+    ``listen`` / ``speak`` with a non-zero ``count`` on either side, the ``avg``
+    of every prefill/generate metric. Percentages are eager relative to
+    compiled (positive: eager took longer; 0 when compiled is not positive).
+    Metrics whose ``avg`` is 0 on both sides are omitted.
+    """
+    rule = "=" * 70
+    metric_paths = (
+        ("prefill total",    "prefill.total"),
+        ("  vision_process", "prefill.vision_process"),
+        ("  vision_embed",   "prefill.vision_embed"),
+        ("  vision_feed",    "prefill.vision_feed"),
+        ("  audio_process",  "prefill.audio_process"),
+        ("  audio_embed",    "prefill.audio_embed"),
+        ("  audio_feed",     "prefill.audio_feed"),
+        ("generate total",   "generate.total"),
+        ("  llm",            "generate.llm"),
+        ("  tts_prep",       "generate.tts_prep"),
+        ("  tts",            "generate.tts"),
+        ("  token2wav",      "generate.token2wav"),
+        ("unit_total",       "unit_total"),
+    )
+
+    def avg_at(stats, dotted_path):
+        # Follow the dotted path; a missing key or non-dict node yields 0.
+        node = stats
+        for part in dotted_path.split("."):
+            node = node.get(part, {}) if isinstance(node, dict) else {}
+        return node.get("avg", 0) if isinstance(node, dict) else 0
+
+    def percent_diff(compiled_value, eager_value):
+        # Eager relative to compiled, guarded against a non-positive base.
+        return (eager_value - compiled_value) / compiled_value * 100 if compiled_value > 0 else 0
+
+    print("\n")
+    print(rule)
+    print("  Compiled vs Eager 对比")
+    print(rule)
+
+    total_c = compiled_result.get("total_time", 0)
+    total_e = eager_result.get("total_time", 0)
+    total_pct = percent_diff(total_c, total_e)
+    print(f"  {'总用时':20s}  compiled={total_c:.1f}s  eager={total_e:.1f}s  "
+          f"差异={total_pct:+.1f}%")
+    print(f"  {'单位数':20s}  compiled={compiled_result.get('num_units',0)}  "
+          f"eager={eager_result.get('num_units',0)}")
+
+    for decision in ("listen", "speak"):
+        cs = compiled_result.get(f"{decision}_stats", {})
+        es = eager_result.get(f"{decision}_stats", {})
+        cc, ec = cs.get("count", 0), es.get("count", 0)
+        if cc == 0 and ec == 0:
+            continue
+        print(f"\n  ── {decision.upper()} (compiled n={cc}, eager n={ec}) ──")
+        for metric_label, path in metric_paths:
+            c_avg, e_avg = avg_at(cs, path), avg_at(es, path)
+            if c_avg == 0 and e_avg == 0:
+                continue
+            diff_pct = percent_diff(c_avg, e_avg)
+            if diff_pct > 2:
+                arrow = "↑ slower"
+            elif diff_pct < -2:
+                arrow = "↓ faster"
+            else:
+                arrow = "≈"
+            print(f"    {metric_label:18s}  compiled={c_avg:6.0f}ms  eager={e_avg:6.0f}ms  "
+                  f"{diff_pct:+6.1f}% {arrow}")
+
+    print(rule)
+
+
+def main():
+    """Load the model with compile=True, benchmark compiled then eager, print the comparison."""
+    cfg = get_config()
+    rule = "=" * 70
+
+    print(rule)
+    print("  Compiled vs Eager Duplex Benchmark")
+    print(rule)
+    print(f"  Model:      {cfg.model.model_path}")
+    print(f"  Video:      {VIDEO_PATH}")
+    print(f"  Max chunks: {MAX_CHUNKS}")
+    print()
+
+    # Project import kept local to main().
+    from core.processors.unified import UnifiedProcessor
+
+    logger.info("加载模型 (compile=True)...")
+    load_started = time.time()
+    processor_kwargs = {
+        "model_path": cfg.model.model_path,
+        "pt_path": cfg.model.pt_path,
+        "ref_audio_path": cfg.ref_audio_path,
+        "compile": True,
+        "chat_vocoder": cfg.chat_vocoder,
+        "attn_implementation": cfg.attn_implementation,
+    }
+    processor = UnifiedProcessor(**processor_kwargs)
+    logger.info(f"模型加载完成 ({time.time() - load_started:.1f}s)")
+
+    model = processor.model
+    results = {}
+    # Same model, two rounds: compiled first, then eager.
+    for label, use_compiled in (("COMPILED", True), ("EAGER", False)):
+        if results:
+            # Release cached CUDA memory left by the previous round.
+            torch.cuda.empty_cache()
+        model.set_compile_enabled(use_compiled)
+        results[label] = run_bench(model, label)
+
+    print_comparison(results["COMPILED"], results["EAGER"])
+
+
+if __name__ == "__main__":
+    main()  # run the benchmark only when this file is executed as a script
diff --git a/tests/test_compile_switch.py b/tests/test_compile_switch.py