Skip to content

Commit b6cec38

Browse files
authored
Skip CUDA operations when running Qwen 3.5 MoE on other backends (#19095)
This PR makes the GPU-related operations CUDA-backend specific, to bring the Metal Qwen 3.5 MoE CI back
1 parent eef7921 commit b6cec38

1 file changed

Lines changed: 12 additions & 5 deletions

File tree

examples/models/qwen3_5_moe/main.cpp

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include <executorch/runtime/platform/log.h>
1717
#include <pytorch/tokenizers/hf_tokenizer.h>
1818

19+
#include <cinttypes>
1920
#include <fstream>
2021
#include <string>
2122
#include <vector>
@@ -58,11 +59,13 @@ int main(int argc, char** argv) {
5859

5960
llm::Stats stats;
6061

62+
#ifdef EXECUTORCH_BUILD_CUDA
6163
// GPU memory before load
62-
size_t gpu_free_bytes, gpu_total_bytes;
64+
size_t gpu_free_bytes = 0, gpu_total_bytes = 0;
6365
cudaMemGetInfo(&gpu_free_bytes, &gpu_total_bytes);
6466
stats.gpu_total_bytes = gpu_total_bytes;
6567
stats.gpu_free_before_load_bytes = gpu_free_bytes;
68+
#endif
6669

6770
stats.model_load_start_ms = llm::time_in_ms();
6871

@@ -127,9 +130,11 @@ int main(int argc, char** argv) {
127130

128131
stats.model_load_end_ms = llm::time_in_ms();
129132

133+
#ifdef EXECUTORCH_BUILD_CUDA
130134
// GPU memory after load
131135
cudaMemGetInfo(&gpu_free_bytes, &gpu_total_bytes);
132136
stats.gpu_free_after_load_bytes = gpu_free_bytes;
137+
#endif
133138

134139
// Get EOS ids
135140
auto eos_ids = llm::get_eos_ids(tokenizer.get(), module.get());
@@ -155,7 +160,7 @@ int main(int argc, char** argv) {
155160
}
156161
auto prompt_tokens = std::move(*encode_result);
157162
int64_t num_prompt_tokens = prompt_tokens.size();
158-
printf("Prompt tokens: %ld\n", num_prompt_tokens);
163+
printf("Prompt tokens: %" PRId64 "\n", num_prompt_tokens);
159164

160165
stats.num_prompt_tokens = num_prompt_tokens;
161166
stats.inference_start_ms = llm::time_in_ms();
@@ -209,7 +214,7 @@ int main(int argc, char** argv) {
209214
double prefill_ms =
210215
(double)(stats.prompt_eval_end_ms - stats.inference_start_ms);
211216
printf(
212-
"Prefill: %ld tokens in %.1f ms (%.1f tok/s)\n",
217+
"Prefill: %" PRId64 " tokens in %.1f ms (%.1f tok/s)\n",
213218
num_prompt_tokens,
214219
prefill_ms,
215220
num_prompt_tokens * 1000.0 / prefill_ms);
@@ -290,17 +295,19 @@ int main(int argc, char** argv) {
290295
double decode_ms =
291296
(double)(stats.inference_end_ms - stats.prompt_eval_end_ms);
292297
printf(
293-
"Decode: %ld tokens in %.1f ms (%.1f tok/s)\n",
298+
"Decode: %" PRId64 " tokens in %.1f ms (%.1f tok/s)\n",
294299
num_generated,
295300
decode_ms,
296301
num_generated * 1000.0 / decode_ms);
297-
printf("Prompt tokens: %ld\n", num_prompt_tokens);
302+
printf("Prompt tokens: %" PRId64 "\n", num_prompt_tokens);
298303

304+
#ifdef EXECUTORCH_BUILD_CUDA
299305
// GPU memory after generation
300306
cudaMemGetInfo(&gpu_free_bytes, &gpu_total_bytes);
301307
stats.gpu_free_after_generate_bytes = gpu_free_bytes;
302308
stats.gpu_peak_usage_mb =
303309
(stats.gpu_total_bytes - gpu_free_bytes) / 1024.0 / 1024.0;
310+
#endif
304311

305312
llm::print_report(stats);
306313

0 commit comments

Comments (0)