VFC-Audio-Restoration-Benchmark/scores_metadata.json at main · Diffio-AI/VFC-Audio-Restoration-Benchmark · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
{
  "benchmark": {
    "name": "Voices for Christ Benchmark",
    "submission_process": "Submit a pull request that adds a new benchmark_data/<mode> directory with one output file per original filename.",
    "task": "speech restoration and denoising",
    "version": "0.3.0"
  },
  "dataset": {
    "clip_count": 100,
    "clip_duration_seconds": 30,
    "distribution_status": "public",
    "license_note": "Current working assumption: the source Voices for Christ recordings date from before 1990 and are in the public domain or otherwise freely redistributable. This assumption is unverified and should be confirmed before wider redistribution or downstream commercial reuse.",
    "license_status": "assumed_public_domain_unverified",
    "name": "Voices for Christ archive excerpts",
    "source_format": "mp3",
    "splits": [
      "public_test"
    ],
    "storage": "git-lfs",
    "total_duration_seconds": 3000,
    "version": "0.1.0"
  },
  "results": {
    "mode_summaries": {
      "adobe_podcast": {
        "mean": 3.8695788383483887,
        "median": 3.941006660461426,
        "n": 100,
        "std": 0.5502485632896423
      },
      "diffio_3_5": {
        "mean": 4.257707118988037,
        "median": 4.33013391494751,
        "n": 100,
        "std": 0.41899362206459045
      },
      "original": {
        "mean": 2.147495746612549,
        "median": 2.142691135406494,
        "n": 100,
        "std": 0.4837152063846588
      }
    }
  },
  "run": {
    "argv": [
      "score_scoreq.py"
    ],
    "file_count": 100,
    "mode_names": [
      "original",
      "adobe_podcast",
      "diffio_3_5"
    ],
    "package_versions": {
      "numpy": "2.2.6",
      "onnxruntime-gpu": "1.23.2",
      "soundfile": "0.13.1",
      "torch": "2.8.0",
      "torchaudio": "2.8.0",
      "tqdm": "4.67.3"
    },
    "platform": "Linux-6.5.0-15-generic-x86_64-with-glibc2.35",
    "python_version": "3.10.12",
    "timestamp_utc": "2026-03-17T22:50:55.306102Z"
  },
  "scoring": {
    "data_domain": "natural",
    "mode": "nr",
    "model_filename": "adapt_nr_telephone.onnx",
    "model_url": "https://zenodo.org/records/15739280/files/adapt_nr_telephone.onnx",
    "onnx_execution_providers": [
      "CUDAExecutionProvider",
      "CPUExecutionProvider"
    ],
    "primary_metric": "SCOREQ",
    "resolved_model_filename": "adapt_nr_telephone.onnx",
    "resolved_model_path": "/home/nharmon/.cache/scoreq/onnx-models/adapt_nr_telephone.onnx",
    "resolved_model_url": "https://zenodo.org/records/15739280/files/adapt_nr_telephone.onnx",
    "secondary_metric": "WER",
    "secondary_metric_rationale": "Use a stronger decode to freeze proxy transcripts for the noisy originals once, then use a weaker decode on restored outputs so ASR WER remains sensitive to restoration gains in a no-reference benchmark.",
    "secondary_metric_reference_asr_model": "faster-whisper large-v3",
    "secondary_metric_reference_decode": {
      "beam_size": 15,
      "best_of": 15,
      "compute_type": "float16",
      "condition_on_previous_text": true,
      "device": "cuda:0",
      "patience": 2.0
    },
    "secondary_metric_reference_policy": "Use the frozen transcript in reference_transcripts.csv for each benchmark_data/original/<filename> as the reference transcript for that filename.",
    "secondary_metric_submission_asr_model": "faster-whisper small.en",
    "secondary_metric_submission_decode": {
      "beam_size": 1,
      "best_of": 1,
      "compute_type": "float16",
      "condition_on_previous_text": false,
      "device": "cuda:1",
      "patience": 1.0
    },
    "tertiary_metric": "DNSMOS_P835",
    "tertiary_metric_config": {
      "mono": true,
      "outputs": [
        "p808_mos",
        "sig",
        "bak",
        "ovr"
      ],
      "personalized": false,
      "sample_rate": 16000
    },
    "tertiary_metric_local_implementation": "torchmetrics.functional.audio.dnsmos.deep_noise_suppression_mean_opinion_score"
  }
}