Commit f216de1

Add openai-whisper-large-v3-turbo recipe

1 parent e83eb4f

14 files changed

Lines changed: 737 additions & 0 deletions
Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 OpenAI

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
## Whisper-large-v3-turbo Optimization with ONNX Runtime QNN EP

This folder outlines the process for optimizing the Whisper-large-v3-turbo model using ONNX Runtime with the QNN Execution Provider. It covers exporting FP32 models, generating representative data for static quantization, creating QDQ models, evaluating the models, and performing audio transcription with the optimized models.

### Prerequisites

```bash
python -m pip install -r requirements_qnn.txt
```

### Generate data for static quantization

For better results, static quantization should be calibrated with real data captured from the original FP32 model rather than with random data. Here we use 100 samples of the LibriSpeech dataset to generate the required calibration data, which takes around 164 GB of disk space.

First, generate the FP32 ONNX models:

1. Encoder FP32 model

   `olive run --config whisper_large_v3_turbo_encoder_fp32.json`

2. Decoder FP32 model

   `olive run --config whisper_large_v3_turbo_decoder_fp32.json`

Then download the dataset and generate the calibration data:

1. `python download_librispeech_asr.py --save_dir .\data`

2. `python .\demo.py --audio-path .\data\librispeech_asr_clean_test --encoder "models\whisper_encoder_fp32\model\model.onnx" --decoder "models\whisper_decoder_fp32\model.onnx" --model_id "openai/whisper-large-v3-turbo" --save_data .\data\quantization_data --num_data 100`
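Each saved calibration file is a NumPy-pickled dict mapping ONNX input names to arrays (this is what `app.py`, later in this commit, writes with `np.save`). A minimal sketch for inspecting one sample, with an illustrative file name:

```python
import numpy as np

# Illustrative path; the layout is <save_data>/<audio_name>/<chunk>_input_features.npy
feed = np.load(r".\data\quantization_data\1320-122617-0000\0_input_features.npy", allow_pickle=True).item()
for name, arr in feed.items():
    print(name, arr.shape, arr.dtype)
```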
### Generate QDQ models

1. `olive run --config whisper_large_v3_turbo_encoder_qdq.json`
2. `olive run --config whisper_large_v3_turbo_decoder_qdq.json`

### Evaluation

Evaluate the model using the LibriSpeech test-clean dataset:

`python .\evaluate_whisper.py --encoder "models\whisper_encoder_qdq\model.onnx" --decoder "models\whisper_decoder_qdq\model.onnx" --model_id "openai/whisper-large-v3-turbo" --execution_provider QNNExecutionProvider`

### Transcribe a single sample

`python .\demo.py --audio-path .\data\librispeech_asr_clean_test\1320-122617-0000.npy --encoder "models\whisper_encoder_qdq\model.onnx" --decoder "models\whisper_decoder_qdq\model.onnx" --model_id "openai/whisper-large-v3-turbo" --execution_provider QNNExecutionProvider`
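The scripts above create the ONNX Runtime sessions internally. If you want to load one of the QDQ models directly, a minimal sketch mirroring the QNN EP options that `demo.py` passes might look like this (model path illustrative):

```python
import onnxruntime as ort

# Same QNN EP options that demo.py uses with QNNExecutionProvider
qnn_options = {
    "backend_path": "QnnHtp.dll",
    "htp_performance_mode": "sustained_high_performance",
    "htp_graph_finalization_optimization_mode": "3",
    "offload_graph_io_quantization": "0",
}

session = ort.InferenceSession(
    r"models\whisper_encoder_qdq\model.onnx",
    providers=["QNNExecutionProvider"],
    provider_options=[qnn_options],
)
print([inp.name for inp in session.get_inputs()])
```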

openai-whisper-large-v3-turbo/olive/__init__.py

Whitespace-only changes.
Lines changed: 195 additions & 0 deletions
@@ -0,0 +1,195 @@
# ---------------------------------------------------------------------
# Copyright (c) 2025 Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
# ---------------------------------------------------------------------

from __future__ import annotations

import os

import numpy as np
import onnxruntime as ort
import torch
from qai_hub_models.models._shared.hf_whisper.app import HfWhisperApp, chunk_and_resample_audio
from qai_hub_models.models._shared.hf_whisper.model import (
    CHUNK_LENGTH,
    SAMPLE_RATE,
)
from transformers import WhisperProcessor


def infer_audio(app, model_id, audio_file, save_data):
    audio_dict = np.load(audio_file, allow_pickle=True).item()

    audio = audio_dict["audio"]["array"]
    sample_rate = audio_dict["audio"]["sampling_rate"]
    audio_name = os.path.splitext(os.path.basename(audio_file))[0] if save_data else None

    processor = WhisperProcessor.from_pretrained(model_id)
    reference = processor.tokenizer._normalize(audio_dict["text"])
    print("Reference:", reference)

    # Perform transcription
    transcription = app.transcribe(audio, sample_rate, audio_name, save_data)
    prediction = processor.tokenizer._normalize(transcription)
    print("Prediction:", prediction)


class HfWhisperAppWithSave(HfWhisperApp):
    def __init__(
        self,
        encoder,
        decoder,
        hf_model_id: str,
        execution_provider: str = "CPUExecutionProvider",
        provider_options: dict | None = None,
        sample_rate: int = SAMPLE_RATE,
        max_audio_seconds: int = CHUNK_LENGTH,
    ):
        super().__init__(None, None, hf_model_id, sample_rate, max_audio_seconds)
        options = ort.SessionOptions()

        self.encoder = ort.InferenceSession(
            encoder, sess_options=options, providers=[execution_provider], provider_options=[provider_options]
        )

        self.decoder = ort.InferenceSession(
            decoder, sess_options=options, providers=[execution_provider], provider_options=[provider_options]
        )

    def transcribe_tokens(self, audio, sample_rate, audio_name, save_data=False) -> list[int]:
        out_chunked_tokens = []
        for ind, x in enumerate(chunk_and_resample_audio(audio, sample_rate)):
            out_chunked_tokens.append(self._transcribe_single_chunk(x, audio_name, ind, save_data))

        out_tokens: list[int] = []
        for chunk_tokens in out_chunked_tokens:
            out_tokens.extend(chunk_tokens)
        return out_tokens

    def transcribe(self, audio, sample_rate, audio_name, save_data=False) -> str:
        tokens = self.transcribe_tokens(audio, sample_rate, audio_name, save_data)
        return self.tokenizer.decode(tokens, skip_special_tokens=True).strip()

    def _transcribe_single_chunk(
        self, audio: np.ndarray, audio_name=None, chunk_number=None, save_data=False
    ) -> list[int]:
        # Extract log-mel input features for this audio chunk
        input_features = self.feature_extractor(audio, sampling_rate=self.sample_rate, return_tensors="np")[
            "input_features"
        ]

        # Run the encoder once to produce the cross-attention KV cache
        output_names_encoder = [output.name for output in self.encoder.get_outputs()]
        input_features_feed = {"input_features": input_features}

        if save_data:
            input_features_save_path = os.path.join(save_data, audio_name, f"{chunk_number}_input_features.npy")
            os.makedirs(os.path.dirname(input_features_save_path), exist_ok=True)
            np.save(input_features_save_path, input_features_feed)

        kv_cache_cross_numpy = self.encoder.run(output_names_encoder, input_features_feed)
        kv_cache_cross = [torch.from_numpy(arr) for arr in kv_cache_cross_numpy]
        # Normalize to a tuple of tensor groups so it can be flattened below
        if not isinstance(kv_cache_cross, tuple):
            kv_cache_cross = (kv_cache_cross,)
        if not isinstance(kv_cache_cross[0], (tuple, list)):
            kv_cache_cross = (kv_cache_cross,)

        sot = self.config.decoder_start_token_id
        num_decoder_blocks = self.config.decoder_layers
        attention_dim = self.config.d_model
        num_decoder_heads = self.config.decoder_attention_heads
        mask_neg = self.config.mask_neg
        eot = self.config.eos_token_id

        # Autoregressive decoding
        output_ids = torch.tensor([[sot]])  # Start of transcript
        output_logits = []
        output_length = output_ids.shape[1]

        position_ids = torch.tensor([0], dtype=torch.int32)
        # Fixed-length additive mask; one position is unmasked per decode step
        attention_mask = torch.full(
            (1, 1, 1, self.mean_decode_len),
            mask_neg,
            dtype=torch.float32,
        )

        # Initialize the self-attention KV cache with zeros
        k_cache_self = torch.zeros(
            (
                num_decoder_heads,
                1,
                attention_dim // num_decoder_heads,
                self.mean_decode_len - 1,
            ),
            dtype=torch.float32,
        )
        v_cache_self = torch.zeros(
            (
                num_decoder_heads,
                1,
                self.mean_decode_len - 1,
                attention_dim // num_decoder_heads,
            ),
            dtype=torch.float32,
        )
        kv_cache_self = tuple((k_cache_self, v_cache_self) for _ in range(num_decoder_blocks))

        for n in range(self.mean_decode_len - 1):
            # Get the current token
            input_ids = output_ids[:, n : n + 1].to(torch.int32)

            # Unmask the next cache position (the mask is filled from the right)
            attention_mask[:, :, :, self.mean_decode_len - n - 1] = 0.0

            # Flatten the KV caches into the decoder's positional inputs
            flattened_kv_cache_self = tuple(item for sublist in kv_cache_self for item in sublist)
            flattened_kv_cache_cross = tuple(item for sublist in kv_cache_cross for item in sublist)

            # Decode one step and update kv_cache_self
            decoder_input = (
                (input_ids, attention_mask) + flattened_kv_cache_self + flattened_kv_cache_cross + (position_ids,)
            )

            input_names_decoder = [inp.name for inp in self.decoder.get_inputs()]
            output_names_decoder = [output.name for output in self.decoder.get_outputs()]

            decoder_input_feed = {
                name: tensor.numpy() if isinstance(tensor, torch.Tensor) else tensor
                for name, tensor in zip(input_names_decoder, decoder_input)
            }

            if save_data:
                decoder_input_save_path = os.path.join(save_data, audio_name, f"{chunk_number}_{n}_decoder_input.npy")
                os.makedirs(os.path.dirname(decoder_input_save_path), exist_ok=True)
                np.save(decoder_input_save_path, decoder_input_feed)

            decoder_output_numpy = self.decoder.run(output_names_decoder, decoder_input_feed)
            decoder_output = [torch.from_numpy(arr) for arr in decoder_output_numpy]
            if isinstance(decoder_output, tuple) and len(decoder_output) == 2:
                logits, kv_cache_self = decoder_output
            else:
                # Outputs are [logits, k_0, v_0, k_1, v_1, ...]; regroup into (key, value) pairs
                logits = decoder_output[0]
                kv_cache_self = tuple(decoder_output[i : i + 2] for i in range(1, len(decoder_output), 2))

            # Update output_logits
            output_logits.append(logits.detach().clone())

            # Update output_ids with the greedy (argmax) token
            output_id = torch.argmax(logits, 1).squeeze(0)
            # Stop at end of transcript or at the decode-length budget
            if len(output_logits) == (self.mean_decode_len - 1) or output_id == eot:
                output_ids = torch.cat((output_ids, output_id), -1)
                break
            if n >= output_length - 1:
                output_ids = torch.cat((output_ids, output_id), -1)

            # Advance position for the next step
            position_ids += 1

        return output_ids[0].tolist()
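For reference, a minimal usage sketch of `HfWhisperAppWithSave` on the CPU execution provider; paths are illustrative, and `demo.py` (below) wires this up with full argument parsing:

```python
import numpy as np

from app import HfWhisperAppWithSave

# Illustrative model paths from the FP32 export step
app = HfWhisperAppWithSave(
    encoder=r"models\whisper_encoder_fp32\model\model.onnx",
    decoder=r"models\whisper_decoder_fp32\model.onnx",
    hf_model_id="openai/whisper-large-v3-turbo",
    execution_provider="CPUExecutionProvider",
    provider_options={},
)

# Load one saved LibriSpeech sample (a pickled dict; see download_librispeech_asr.py)
sample = np.load(r"data\librispeech_asr_clean_test\1320-122617-0000.npy", allow_pickle=True).item()
text = app.transcribe(sample["audio"]["array"], sample["audio"]["sampling_rate"], audio_name=None)
print(text)
```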
Lines changed: 83 additions & 0 deletions
@@ -0,0 +1,83 @@
# ---------------------------------------------------------------------
# Copyright (c) 2025 Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
# ---------------------------------------------------------------------

import argparse
import os

from app import HfWhisperAppWithSave, infer_audio


def main():
    parser = argparse.ArgumentParser(description="Demo")
    parser.add_argument(
        "--audio-path",
        type=str,
        help="Path to a folder containing audio files or a single audio file path",
    )
    parser.add_argument(
        "--encoder",
        type=str,
        help="Path to the encoder ONNX file",
    )
    parser.add_argument(
        "--decoder",
        type=str,
        help="Path to the decoder ONNX file",
    )
    parser.add_argument(
        "--model_id",
        type=str,
        default="openai/whisper-large-v3-turbo",
        help="Hugging Face Whisper model id",
    )
    parser.add_argument(
        "--execution_provider",
        type=str,
        default="CPUExecutionProvider",
        help="ORT execution provider",
    )
    parser.add_argument(
        "--save_data",
        type=str,
        default=None,
        help="(Optional) Path to save quantization data",
    )
    parser.add_argument(
        "--num_data",
        type=int,
        default=100,
        help="Number of data samples to use for quantization. Only applicable if --save_data is enabled",
    )

    args = parser.parse_args()

    encoder_path = args.encoder
    decoder_path = args.decoder

    provider_options = {}
    if args.execution_provider == "QNNExecutionProvider":
        provider_options = {
            "backend_path": "QnnHtp.dll",
            "htp_performance_mode": "sustained_high_performance",
            "htp_graph_finalization_optimization_mode": "3",
            "offload_graph_io_quantization": "0",
        }

    app = HfWhisperAppWithSave(encoder_path, decoder_path, args.model_id, args.execution_provider, provider_options)

    if os.path.isdir(args.audio_path):
        for i, item in enumerate(os.listdir(args.audio_path)):
            # When saving quantization data, stop after num_data samples
            if args.save_data and i == args.num_data:
                break

            full_path = os.path.join(args.audio_path, item)
            infer_audio(app, args.model_id, full_path, args.save_data)
    else:
        infer_audio(app, args.model_id, args.audio_path, args.save_data)


if __name__ == "__main__":
    main()
Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
import argparse
import os

import numpy as np
from datasets import load_dataset


def download_librispeech_asr(save_dir):
    # Create save_dir if it doesn't exist
    save_dir = os.path.join(save_dir, "librispeech_asr_clean_test")
    os.makedirs(save_dir, exist_ok=True)

    # Load the dataset in streaming mode so samples can be saved one at a time
    streamed_dataset = load_dataset("librispeech_asr", "clean", split="test", streaming=True)

    # Save each sample (audio array, sampling rate, transcript, ...) as a pickled .npy file
    for batch in streamed_dataset:
        file_path = os.path.join(save_dir, f"{batch['id']}.npy")
        np.save(file_path, batch)

    print("Download complete!")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Download and save samples from the librispeech_asr dataset.")
    parser.add_argument("--save_dir", type=str, required=True, help="Directory to save the dataset samples.")
    args = parser.parse_args()

    download_librispeech_asr(args.save_dir)
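Each saved `.npy` file holds the full dataset row as a pickled dict; loading one back (as `infer_audio` in `app.py` does) looks like this, with an illustrative sample id:

```python
import numpy as np

# Illustrative sample id from the LibriSpeech test-clean split
sample = np.load(r".\data\librispeech_asr_clean_test\1320-122617-0000.npy", allow_pickle=True).item()

audio = sample["audio"]["array"]                 # waveform as a float array
sample_rate = sample["audio"]["sampling_rate"]   # 16 kHz for LibriSpeech
print(sample["text"], audio.shape, sample_rate)
```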
