# Source: MiniCPM-o-Demo/benchmark.py (OpenBMB/MiniCPM-o-Demo, main branch)
#!/usr/bin/env python3
"""Omni duplex benchmark script.

Runs the duplex inference pipeline on video(s) and reports per-module
timing breakdown, separately for LISTEN and SPEAK decisions.

Usage:
    # Single video (defaults)
    CUDA_VISIBLE_DEVICES=0 .venv/base/bin/python benchmark.py

    # Custom video + ref audio
    CUDA_VISIBLE_DEVICES=0 .venv/base/bin/python benchmark.py \
        --video assets/samples/compile.mp4 \
        --ref-audio assets/ref_audio/ref_en_dlc_1.wav

    # Directory of videos
    CUDA_VISIBLE_DEVICES=0 .venv/base/bin/python benchmark.py \
        --video-dir /path/to/videos/

    # With torch.compile
    CUDA_VISIBLE_DEVICES=0 .venv/base/bin/python benchmark.py --compile
"""

import argparse
import gc
import json
import logging
import os
import subprocess
import time
from datetime import datetime

import torch

# Process-wide logging: timestamped records at INFO level and above.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    level=logging.INFO,
)
# Named logger used for all progress messages emitted by this script.
logger = logging.getLogger("benchmark")


def _collect_gpu_info(gpu_id: int = 0) -> dict:
    """Collect NVIDIA GPU information via torch.cuda and nvidia-smi."""
    info: dict = {}
    if not torch.cuda.is_available():
        info["available"] = False
        return info

    info["available"] = True
    info["device_count"] = torch.cuda.device_count()
    info["device_name"] = torch.cuda.get_device_name(gpu_id)
    info["cuda_version"] = torch.version.cuda
    props = torch.cuda.get_device_properties(gpu_id)
    info["total_memory_gb"] = round(props.total_memory / (1024 ** 3), 2)
    info["major"] = props.major
    info["minor"] = props.minor
    info["multi_processor_count"] = props.multi_processor_count

    try:
        out = subprocess.run(
            [
                "nvidia-smi",
                "--query-gpu=driver_version,memory.used,memory.total,temperature.gpu,power.draw,power.limit,clocks.current.sm,clocks.max.sm",
                "--format=csv,noheader,nounits",
                f"--id={gpu_id}",
            ],
            capture_output=True, text=True, timeout=5,
        )
        if out.returncode == 0:
            parts = [p.strip() for p in out.stdout.strip().split(",")]
            if len(parts) >= 8:
                info["driver_version"] = parts[0]
                info["memory_used_mb"] = float(parts[1])
                info["memory_total_mb"] = float(parts[2])
                info["temperature_c"] = int(parts[3])
                info["power_draw_w"] = float(parts[4])
                info["power_limit_w"] = float(parts[5])
                info["sm_clock_mhz"] = int(parts[6])
                info["sm_clock_max_mhz"] = int(parts[7])
    except Exception:
        pass

    return info


def main():
    """Run the duplex benchmark end to end and write ``benchmark.json``.

    Steps:
        1. Parse CLI arguments; model path, extra-weights path and attention
           implementation fall back to the values from ``get_config()``.
        2. Load the model, call ``init_unified`` and, with ``--compile``,
           ``apply_torch_compile``.
        3. Run ``model.benchmark`` on the selected video(s).
        4. Dump environment, configuration, timings and results as JSON to
           ``benchmark.json`` next to this script and to stdout.
    """
    from config import get_config
    cfg = get_config()

    parser = argparse.ArgumentParser(
        description="Omni duplex benchmark: per-module timing for LISTEN and SPEAK",
    )
    parser.add_argument("--model-path", type=str, default=None, help="Base model path")
    parser.add_argument("--pt-path", type=str, default=None, help="Extra weights path (.pt)")
    parser.add_argument("--ref-audio", type=str, default=None, help="Reference audio path for TTS")
    parser.add_argument("--gpu-id", type=int, default=0, help="GPU ID (default: 0)")
    parser.add_argument("--video", type=str, nargs="+", default=None, help="MP4 video path(s)")
    parser.add_argument("--video-dir", type=str, default=None, help="Directory containing MP4 videos")
    parser.add_argument(
        "--system-prompt", type=str, default="Streaming Omni Conversation.",
        help="System prompt content",
    )
    parser.add_argument(
        "--max-chunks", type=int, default=0,
        help="Max 1-second chunks per video (0 = all)",
    )
    parser.add_argument("--compile", action="store_true", help="Apply torch.compile before benchmark")
    parser.add_argument(
        "--compile-mode", type=str, default="default",
        help="torch.compile mode (default/reduce-overhead/max-autotune)",
    )
    parser.add_argument(
        "--attn-implementation", type=str, default=None,
        help="Attention impl (auto/flash_attention_2/sdpa)",
    )
    args = parser.parse_args()

    # CLI values win; otherwise use the project configuration.
    model_path = args.model_path or cfg.model.model_path
    pt_path = args.pt_path or cfg.model.pt_path
    attn_impl = args.attn_implementation or cfg.attn_implementation
    gpu_id = args.gpu_id

    def _is_quantized(path: str) -> bool:
        """Return True if ``<path>/config.json`` declares a quant method.

        Any problem reading or interpreting the file is treated as
        "not quantized" (best-effort detection).
        """
        cfg_file = os.path.join(path, "config.json")
        if not os.path.isfile(cfg_file):
            return False
        try:
            with open(cfg_file, "r", encoding="utf-8") as f:
                c = json.load(f)
            qcfg = c.get("quantization_config")
            return bool(qcfg and qcfg.get("quant_method"))
        except (OSError, ValueError, AttributeError):
            # Unreadable file, invalid JSON/encoding, or unexpected structure.
            return False

    is_quantized = _is_quantized(model_path)

    project_root = os.path.dirname(os.path.abspath(__file__))
    ref_audio_path = args.ref_audio or os.path.join(
        project_root, "assets", "ref_audio", "ref_en_dlc_1.wav"
    )

    video_paths = args.video
    video_dir = args.video_dir
    if video_paths is None and video_dir is None:
        # Neither --video nor --video-dir given: use the bundled sample.
        video_paths = [os.path.join(project_root, "assets", "samples", "compile.mp4")]

    logger.info("=" * 60)
    logger.info("Omni Duplex Benchmark")
    logger.info("=" * 60)
    logger.info(f"PyTorch:       {torch.__version__}")
    logger.info(f"CUDA:          {torch.version.cuda}")
    if torch.cuda.is_available():
        logger.info(f"GPU:           {torch.cuda.get_device_name(gpu_id)}")
    logger.info(f"Model:         {model_path}")
    logger.info(f"PT path:       {pt_path}")
    logger.info(f"Ref audio:     {ref_audio_path}")
    logger.info(f"Attn impl:     {attn_impl}")
    logger.info(f"Quantized:     {is_quantized}")
    logger.info(f"Videos:        {video_paths or video_dir}")
    logger.info(f"System prompt: {args.system_prompt}")
    logger.info(f"Max chunks:    {args.max_chunks or 'all'}")
    logger.info(f"Compile:       {args.compile}")
    logger.info("=" * 60)

    # Make ``gpu_id`` the current device so the bare ``.cuda()`` and
    # ``device="cuda"`` calls below land on it. Exporting CUDA_VISIBLE_DEVICES
    # at this point would be too late: the torch.cuda queries above have
    # already initialised CUDA, after which the variable is no longer consulted
    # by this process. Overwriting it would also clobber a mask exported by
    # the caller for any child process.
    if torch.cuda.is_available():
        torch.cuda.set_device(gpu_id)
    total_start = time.time()

    # ── 1. Load model ──
    logger.info("[1/3] Loading model...")
    t0 = time.time()

    from MiniCPMO45.modeling_minicpmo_unified import MiniCPMO

    # Resolve "auto" to a concrete attention implementation.
    resolved_attn = attn_impl
    if resolved_attn == "auto":
        try:
            from transformers.utils import is_flash_attn_2_available
            if is_flash_attn_2_available():
                resolved_attn = "flash_attention_2"
            else:
                resolved_attn = "sdpa"
        except ImportError:
            resolved_attn = "sdpa"
        logger.info(f"attn_implementation: auto -> {resolved_attn}")

    model = MiniCPMO.from_pretrained(
        model_path,
        trust_remote_code=True,
        _attn_implementation=resolved_attn,
    )
    if is_quantized:
        model.eval().cuda()
        logger.info("Quantized model detected — skipping .bfloat16() cast")
    else:
        model.bfloat16().eval().cuda()
    load_time = time.time() - t0
    logger.info(f"[1/3] Model loaded ({load_time:.1f}s)")

    # ── 2. Unified initialization ──
    logger.info("[2/3] init_unified...")
    t0 = time.time()
    model.init_unified(
        pt_path=pt_path,
        preload_both_tts=True,
        device="cuda",
        chat_vocoder=cfg.chat_vocoder,
    )
    init_time = time.time() - t0
    logger.info(f"[2/3] init_unified done ({init_time:.1f}s)")

    # ── 2.5 Optional torch.compile ──
    if args.compile:
        # For quantized checkpoints "llm.model" is excluded from compilation.
        skip_modules = ["llm.model"] if is_quantized else None
        logger.info("[2.5/3] apply_torch_compile (mode=%s, skip=%s)...", args.compile_mode, skip_modules)
        t0 = time.time()
        model.apply_torch_compile(mode=args.compile_mode, dynamic=True, skip_modules=skip_modules)
        logger.info(f"[2.5/3] apply_torch_compile done ({time.time() - t0:.1f}s)")

    # ── 3. Benchmark ──
    logger.info("[3/3] Running benchmark...")
    t0 = time.time()
    results = model.benchmark(
        video_paths=video_paths,
        video_dir=video_dir,
        ref_audio_path=ref_audio_path,
        system_prompt=args.system_prompt,
        max_chunks_per_video=args.max_chunks,
    )
    benchmark_time = time.time() - t0
    logger.info(f"[3/3] Benchmark done ({benchmark_time:.1f}s)")

    # ── Collect GPU info (after inference, before cleanup) ──
    gpu_info = _collect_gpu_info(gpu_id)

    # ── Cleanup ──
    del model
    gc.collect()
    torch.cuda.empty_cache()

    total = time.time() - total_start

    # ── Build and save benchmark.json ──
    output = {
        "timestamp": datetime.now().isoformat(),
        "environment": {
            "pytorch_version": torch.__version__,
            "cuda_version": torch.version.cuda,
            "gpu": gpu_info,
        },
        "config": {
            "model_path": model_path,
            "pt_path": pt_path,
            "attn_implementation": resolved_attn,
            "quantized": is_quantized,
            "chat_vocoder": cfg.chat_vocoder,
        },
        "parameters": {
            "video_paths": video_paths,
            "video_dir": video_dir,
            "ref_audio_path": ref_audio_path,
            "system_prompt": args.system_prompt,
            "max_chunks_per_video": args.max_chunks,
            "compile": args.compile,
            "compile_mode": args.compile_mode if args.compile else None,
            "gpu_id": gpu_id,
        },
        "timing": {
            "model_load_s": round(load_time, 2),
            "init_unified_s": round(init_time, 2),
            "benchmark_s": round(benchmark_time, 2),
            "total_script_s": round(total, 2),
        },
        "results": results,
    }

    output_path = os.path.join(project_root, "benchmark.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

    logger.info("=" * 60)
    logger.info(f"Total script time: {total:.1f}s")
    logger.info(f"Results saved to: {output_path}")
    logger.info("=" * 60)

    print("\n" + json.dumps(output, indent=2, ensure_ascii=False))


# Run the benchmark only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()