executorch/examples/models/voxtral/multimodal.cpp at main · pytorch/executorch · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <exception>
#include <fstream>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

#include <gflags/gflags.h>

#include <executorch/extension/module/module.h>
#include <executorch/extension/tensor/tensor_ptr_maker.h>
#include <executorch/runtime/core/evalue.h>

#include <executorch/extension/llm/runner/audio.h>
#include <executorch/extension/llm/runner/image.h>
#include <executorch/extension/llm/runner/llm_runner_helper.h>
#include <executorch/extension/llm/runner/multimodal_input.h>
#include <executorch/extension/llm/runner/multimodal_runner.h>
#include <executorch/extension/llm/runner/wav_loader.h>
#include <executorch/runtime/core/error.h>
#include <executorch/runtime/platform/log.h>

#if defined(ET_USE_THREADPOOL)
#include <executorch/extension/threadpool/cpuinfo_utils.h>
#include <executorch/extension/threadpool/threadpool.h>
#endif

// Command-line flags (gflags). Each DEFINE_* below creates a FLAGS_<name>
// variable that main() reads after gflags::ParseCommandLineFlags().

// Path to the model program; main() passes it to create_multimodal_runner().
DEFINE_string(
    model_path,
    "multimodal.pte",
    "Model serialized in flatbuffer format.");

// Optional data file path, forwarded to create_multimodal_runner() as-is.
DEFINE_string(data_path, "", "Path to data file.");
// Tokenizer file handed to load_tokenizer() in main().
DEFINE_string(tokenizer_path, "tekken.json", "Tokenizer stuff.");

// Text prompt; main() appends "[/INST]" and places it after the audio input.
DEFINE_string(prompt, "What is happening in this audio?", "Text prompt.");

// Audio input; processAudioFile() accepts paths ending in .bin or .wav only.
DEFINE_string(audio_path, "", "Path to input audio file.");

// Audio processor program. Required for .wav input; when empty, a .bin input
// is loaded as already-preprocessed features instead of raw audio.
DEFINE_string(
    processor_path,
    "",
    "Path to processor .pte file for raw audio processing.");

// Sampling temperature, copied into GenerationConfig::temperature by main().
// NOTE(review): the default is spelled as a float literal (0.8f) although the
// flag is a double, and main() narrows the value back to float.
DEFINE_double(
    temperature,
    0.8f,
    "Temperature; Default is 0.8f. 0 = greedy argmax sampling (deterministic). Lower temperature = more deterministic");

// Thread count for the threadpool; only consulted when ET_USE_THREADPOOL is
// defined at compile time.
DEFINE_int32(
    cpu_threads,
    -1,
    "Number of CPU threads for inference. Defaults to -1, which implies we'll use a heuristic to derive the # of performant cores for a specific device.");

// When true, main() runs one extra generate() call and resets the runner
// before the real generation.
DEFINE_bool(warmup, false, "Whether to run a warmup run.");

namespace {

using ::executorch::extension::from_blob;
using ::executorch::extension::Module;
using ::executorch::extension::llm::Image;
using ::executorch::extension::llm::make_image_input;
using ::executorch::extension::llm::make_text_input;
using ::executorch::extension::llm::MultimodalInput;
using ::executorch::runtime::EValue;

/**
 * @brief Tests whether a string finishes with the given suffix
 *
 * An empty suffix matches every string. The comparison is case-sensitive.
 *
 * @param str String to inspect
 * @param suffix Suffix to look for at the end of str
 * @return true if the trailing characters of str equal suffix
 */
bool ends_with(const std::string& str, const std::string& suffix) {
  // A suffix longer than the string can never match; bail out before
  // computing the (otherwise underflowing) start offset.
  if (suffix.size() > str.size()) {
    return false;
  }
  const std::string::size_type tail_start = str.size() - suffix.size();
  return str.compare(tail_start, std::string::npos, suffix) == 0;
}

/**
 * @brief Loads preprocessed audio data from a binary file
 *
 * Reads mel spectrogram features that have been pre-computed and saved as a
 * binary file. The audio data is expected to be stored as float values in
 * binary format, typically saved using:
 *   with open("tensor.bin", "wb") as f:
 *       f.write(t.numpy().tobytes())
 *
 * The values are grouped into batches of n_bins * n_frames (128 * 3000)
 * floats. The batch count is rounded up; when the file does not hold a whole
 * number of batches, the tail of the last batch is filled with zeros so the
 * buffer always holds exactly batch_size * n_bins * n_frames values.
 *
 * @param audio_path Path to the binary audio file (.bin)
 * @return MultimodalInput containing the loaded audio data
 * @throws std::runtime_error if the file cannot be opened, is empty, or
 * cannot be sized or read
 */
MultimodalInput loadPreprocessedAudio(const std::string& audio_path) {
  std::ifstream f(audio_path, std::ios::binary | std::ios::ate);
  if (!f.is_open()) {
    ET_LOG(Error, "Failed to open audio file: %s", audio_path.c_str());
    throw std::runtime_error("Failed to open audio file");
  }

  // tellg() reports -1 on failure; check it while the value is still signed
  // so it cannot turn into an enormous element count.
  const std::streamoff n_bytes = static_cast<std::streamoff>(f.tellg());
  if (n_bytes < 0) {
    ET_LOG(
        Error,
        "Failed to determine size of audio file: %s",
        audio_path.c_str());
    throw std::runtime_error("Failed to determine audio file size");
  }
  const std::size_t n_floats =
      static_cast<std::size_t>(n_bytes) / sizeof(float);
  if (n_floats == 0) {
    ET_LOG(Error, "Audio file contains no data: %s", audio_path.c_str());
    throw std::runtime_error("Audio file contains no data");
  }
  f.seekg(0, std::ios::beg);

  constexpr int32_t n_bins = 128;
  constexpr int32_t n_frames = 3000;
  constexpr std::size_t floats_per_batch =
      static_cast<std::size_t>(n_bins) * static_cast<std::size_t>(n_frames);

  // Batch in increments of n_bins * n_frames, rounding up. This is an integer
  // ceiling division: ceil() applied to an integer quotient would round down.
  const std::size_t n_batches =
      (n_floats + floats_per_batch - 1) / floats_per_batch;
  const int32_t batch_size = static_cast<int32_t>(n_batches);
  const std::size_t padded_floats = n_batches * floats_per_batch;

  ET_LOG(Info, "audio_data len = %zu", n_floats);
  if (padded_floats != n_floats) {
    // NOTE(review): zero is assumed to be an acceptable fill value for the
    // missing feature values -- confirm against the feature exporter.
    ET_LOG(
        Info,
        "Padding audio data from %zu to %zu floats to fill the last batch",
        n_floats,
        padded_floats);
  }

  // Allocate the full (padded) buffer up front so its size always matches the
  // shape passed to Audio below.
  std::vector<float> audio_data(padded_floats, 0.0f);
  f.read(
      reinterpret_cast<char*>(audio_data.data()),
      static_cast<std::streamsize>(n_floats * sizeof(float)));
  if (!f) {
    ET_LOG(Error, "Failed to read audio file: %s", audio_path.c_str());
    throw std::runtime_error("Failed to read audio file");
  }
  f.close();

  auto audio = ::executorch::extension::llm::Audio(
      std::move(audio_data), batch_size, n_bins, n_frames);
  return ::executorch::extension::llm::make_audio_input(std::move(audio));
}

/**
 * @brief Loads raw audio from a .bin or .wav file and processes it using a
 * .pte processor
 *
 * This function loads raw audio data from either a .bin file (raw float array)
 * or a .wav file (WAV format with headers), creates a 1-D float tensor from
 * it, and then passes it through the "forward" method of a processor module
 * loaded from a .pte file to generate processed audio features.
 *
 * The first processor output must be a rank-3 float tensor; its dimensions
 * are used as (batch_size, n_bins, n_frames) for the resulting Audio input.
 *
 * @param audio_path Path to the .bin or .wav audio file
 * @param processor_path Path to the .pte processor file
 * @return MultimodalInput containing the processed audio data
 * @throws std::runtime_error if file loading or processing fails, or if the
 * processor output does not have the expected type and rank
 */
MultimodalInput processRawAudioFile(
    const std::string& audio_path,
    const std::string& processor_path) {
  if (processor_path.empty()) {
    ET_LOG(Error, "Processor path is required for raw audio processing");
    throw std::runtime_error(
        "Processor path is required for raw audio processing");
  }

  // Load the audio data from file (.bin or .wav)
  std::vector<float> audio_data;
  if (ends_with(audio_path, ".wav")) {
    audio_data = ::executorch::extension::llm::load_wav_audio_data(audio_path);
    ET_LOG(
        Info,
        "Loaded WAV file: %s, %zu samples",
        audio_path.c_str(),
        audio_data.size());
  } else if (ends_with(audio_path, ".bin")) {
    std::ifstream f(audio_path, std::ios::binary | std::ios::ate);
    if (!f.is_open()) {
      ET_LOG(Error, "Failed to open audio file: %s", audio_path.c_str());
      throw std::runtime_error("Failed to open audio file");
    }

    // tellg() reports -1 on failure; check it before converting to an
    // unsigned element count.
    const std::streamoff n_bytes = static_cast<std::streamoff>(f.tellg());
    if (n_bytes < 0) {
      ET_LOG(
          Error,
          "Failed to determine size of audio file: %s",
          audio_path.c_str());
      throw std::runtime_error("Failed to determine audio file size");
    }
    const std::size_t n_floats =
        static_cast<std::size_t>(n_bytes) / sizeof(float);
    f.seekg(0, std::ios::beg);

    audio_data.resize(n_floats);
    f.read(
        reinterpret_cast<char*>(audio_data.data()),
        static_cast<std::streamsize>(audio_data.size() * sizeof(float)));
    if (!f) {
      ET_LOG(Error, "Failed to read audio file: %s", audio_path.c_str());
      throw std::runtime_error("Failed to read audio file");
    }
    f.close();

    ET_LOG(
        Info, "Loaded .bin file: %s, %zu floats", audio_path.c_str(), n_floats);
  } else {
    ET_LOG(
        Error,
        "Unsupported audio file format: %s (only .bin and .wav files are supported)",
        audio_path.c_str());
    throw std::runtime_error("Unsupported audio file format");
  }

  // Load the audio processor .pte. The status check lives outside the try
  // block so a load failure is reported once, with its own message, instead
  // of being caught and re-labelled as an exception.
  std::unique_ptr<Module> processor_module;
  auto load_error = ::executorch::runtime::Error::Ok;
  try {
    processor_module =
        std::make_unique<Module>(processor_path, Module::LoadMode::File);
    load_error = processor_module->load();
  } catch (const std::exception& e) {
    ET_LOG(Error, "Exception while loading processor module: %s", e.what());
    throw std::runtime_error("Exception while loading processor module");
  }
  if (load_error != ::executorch::runtime::Error::Ok) {
    ET_LOG(
        Error,
        "Failed to load processor module from: %s",
        processor_path.c_str());
    throw std::runtime_error("Failed to load processor module");
  }

  // Execute the processor. The input tensor aliases audio_data, which stays
  // alive until the end of this function.
  std::vector<executorch::aten::SizesType> tensor_shape = {
      static_cast<executorch::aten::SizesType>(audio_data.size())};
  auto input_tensor = from_blob(
      audio_data.data(), tensor_shape, ::executorch::aten::ScalarType::Float);

  ET_LOG(Info, "Processing audio through processor module...");
  auto result = processor_module->execute("forward", input_tensor);
  if (!result.ok()) {
    ET_LOG(Error, "Failed to execute processor's forward method");
    throw std::runtime_error("Failed to execute processor forward method");
  }

  // Bind by reference: the outputs are owned by `result`, no copy is needed.
  const auto& outputs = result.get();
  if (outputs.empty()) {
    ET_LOG(Error, "Processor returned no outputs");
    throw std::runtime_error("Processor returned no outputs");
  }
  if (!outputs[0].isTensor()) {
    ET_LOG(Error, "Processor output is not a tensor");
    throw std::runtime_error("Processor output is not a tensor");
  }

  // Extract processed audio features, validating type and rank before the
  // data is reinterpreted as float and the three dimensions are indexed.
  const auto& processed_tensor = outputs[0].toTensor();
  if (processed_tensor.scalar_type() !=
      ::executorch::aten::ScalarType::Float) {
    ET_LOG(Error, "Processor output must be a float tensor");
    throw std::runtime_error("Processor output must be a float tensor");
  }
  const auto& sizes = processed_tensor.sizes();
  if (sizes.size() != 3) {
    ET_LOG(
        Error,
        "Processor output must have 3 dimensions, got %zu",
        static_cast<std::size_t>(sizes.size()));
    throw std::runtime_error("Processor output must have 3 dimensions");
  }
  const float* processed_data = processed_tensor.const_data_ptr<float>();

  ET_LOG(
      Info,
      "Processed audio tensor shape: [%d, %d, %d]",
      static_cast<int>(sizes[0]),
      static_cast<int>(sizes[1]),
      static_cast<int>(sizes[2]));

  // Create Audio multimodal input from processed features
  const int32_t batch_size = static_cast<int32_t>(sizes[0]);
  const int32_t n_bins = static_cast<int32_t>(sizes[1]);
  const int32_t n_frames = static_cast<int32_t>(sizes[2]);
  // Widen before multiplying so the product cannot overflow int32_t.
  const std::size_t total_elements = static_cast<std::size_t>(batch_size) *
      static_cast<std::size_t>(n_bins) * static_cast<std::size_t>(n_frames);
  std::vector<float> audio_vec(processed_data, processed_data + total_elements);
  auto processed_audio = ::executorch::extension::llm::Audio(
      std::move(audio_vec), batch_size, n_bins, n_frames);
  ET_LOG(
      Info,
      "Created processed Audio: batch_size=%d, n_bins=%d, n_frames=%d",
      batch_size,
      n_bins,
      n_frames);
  return ::executorch::extension::llm::make_audio_input(
      std::move(processed_audio));
}

/**
 * @brief Processes audio files for multimodal input
 *
 * Dispatches audio file processing based on file extension and processor
 * availability:
 * - .wav files: Requires processor, processes raw audio through processor
 * - .bin files with processor: Loads raw audio from .bin and processes through
 * processor
 * - .bin files without processor: Loads preprocessed mel spectrogram features
 * directly
 *
 * Errors are reported by throwing, matching the loaders this function
 * delegates to, so every path either returns a value or throws.
 *
 * @param audio_path Path to the audio file (.bin or .wav)
 * @param processor_path Path to the processor .pte file (optional for .bin,
 * required for .wav)
 * @return MultimodalInput containing the processed audio data
 * @throws std::runtime_error if file format is unsupported, if a .wav file is
 * given without a processor, or if processing fails
 */
MultimodalInput processAudioFile(
    const std::string& audio_path,
    const std::string& processor_path = "") {
  const bool is_wav = ends_with(audio_path, ".wav");
  const bool is_bin = ends_with(audio_path, ".bin");

  if (!is_wav && !is_bin) {
    ET_LOG(
        Error,
        "Unsupported audio file format: %s (only .bin and .wav files are supported)",
        audio_path.c_str());
    throw std::runtime_error("Unsupported audio file format");
  }

  // With a processor, both .wav and .bin are treated as raw audio.
  if (!processor_path.empty()) {
    return processRawAudioFile(audio_path, processor_path);
  }

  if (is_wav) {
    ET_LOG(
        Error,
        "Processor path is required for .wav file processing: %s",
        audio_path.c_str());
    throw std::runtime_error(
        "Processor path is required for .wav file processing");
  }

  // Load preprocessed audio stored as a binary file (existing behavior)
  return loadPreprocessedAudio(audio_path);
}

} // namespace

int32_t main(int32_t argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);

  const char* model_path = FLAGS_model_path.c_str();

  const char* tokenizer_path = FLAGS_tokenizer_path.c_str();
  const char* prompt = FLAGS_prompt.c_str();
  const char* audio_path = FLAGS_audio_path.c_str();
  const char* processor_path = FLAGS_processor_path.c_str();
  const char* data_path = FLAGS_data_path.c_str();
  float temperature = FLAGS_temperature;
  int32_t cpu_threads = FLAGS_cpu_threads;
  bool warmup = FLAGS_warmup;

#if defined(ET_USE_THREADPOOL)
  uint32_t num_performant_cores = cpu_threads == -1
      ? ::executorch::extension::cpuinfo::get_num_performant_cores()
      : static_cast<uint32_t>(cpu_threads);
  ET_LOG(
      Info, "Resetting threadpool with num threads = %d", num_performant_cores);
  if (num_performant_cores > 0) {
    ::executorch::extension::threadpool::get_threadpool()
        ->_unsafe_reset_threadpool(num_performant_cores);
  }
#endif

  // Load tokenizer
  std::unique_ptr<::tokenizers::Tokenizer> tokenizer =
      ::executorch::extension::llm::load_tokenizer(tokenizer_path);
  if (tokenizer == nullptr) {
    ET_LOG(Error, "Failed to load tokenizer from: %s", tokenizer_path);
    return 1;
  }

  // Create multimodal runner
  std::unique_ptr<::executorch::extension::llm::MultimodalRunner> runner =
      ::executorch::extension::llm::create_multimodal_runner(
          model_path, std::move(tokenizer), data_path, Module::LoadMode::Mmap);
  if (runner == nullptr) {
    ET_LOG(Error, "Failed to create multimodal runner");
    return 1;
  }

  // Load runner
  auto load_error = runner->load();
  if (load_error != ::executorch::runtime::Error::Ok) {
    ET_LOG(Error, "Failed to load multimodal runner");
    return 1;
  }

  // Prepare inputs
  std::vector<MultimodalInput> inputs = {
      make_text_input("<s>[INST][BEGIN_AUDIO]"),
      processAudioFile(audio_path, processor_path),
      make_text_input(std::string(prompt) + "[/INST]"),
  };

  ::executorch::extension::llm::GenerationConfig config;
  config.max_new_tokens = 100;
  config.temperature = temperature;

  // Run warmup if requested
  if (warmup) {
    ET_LOG(Info, "Running warmup...");
    auto warmup_error = runner->generate(inputs, config);
    if (warmup_error != ::executorch::runtime::Error::Ok) {
      ET_LOG(Error, "Failed to run warmup");
      return 1;
    }
    runner->reset();
  }

  // Generate
  ET_LOG(Info, "Starting generation...");
  auto error = runner->generate(inputs, config);
  if (error != ::executorch::runtime::Error::Ok) {
    ET_LOG(Error, "Failed to generate with multimodal runner");
    return 1;
  }

  printf("\n");
  return 0;
}