diff --git a/examples/models/qwen3_5_moe/main.cpp b/examples/models/qwen3_5_moe/main.cpp index bae4cfc183c..00c91a685e1 100644 --- a/examples/models/qwen3_5_moe/main.cpp +++ b/examples/models/qwen3_5_moe/main.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -58,11 +59,13 @@ int main(int argc, char** argv) { llm::Stats stats; +#ifdef EXECUTORCH_BUILD_CUDA // GPU memory before load - size_t gpu_free_bytes, gpu_total_bytes; + size_t gpu_free_bytes = 0, gpu_total_bytes = 0; cudaMemGetInfo(&gpu_free_bytes, &gpu_total_bytes); stats.gpu_total_bytes = gpu_total_bytes; stats.gpu_free_before_load_bytes = gpu_free_bytes; +#endif stats.model_load_start_ms = llm::time_in_ms(); @@ -127,9 +130,11 @@ int main(int argc, char** argv) { stats.model_load_end_ms = llm::time_in_ms(); +#ifdef EXECUTORCH_BUILD_CUDA // GPU memory after load cudaMemGetInfo(&gpu_free_bytes, &gpu_total_bytes); stats.gpu_free_after_load_bytes = gpu_free_bytes; +#endif // Get EOS ids auto eos_ids = llm::get_eos_ids(tokenizer.get(), module.get()); @@ -155,7 +160,7 @@ int main(int argc, char** argv) { } auto prompt_tokens = std::move(*encode_result); int64_t num_prompt_tokens = prompt_tokens.size(); - printf("Prompt tokens: %ld\n", num_prompt_tokens); + printf("Prompt tokens: %" PRId64 "\n", num_prompt_tokens); stats.num_prompt_tokens = num_prompt_tokens; stats.inference_start_ms = llm::time_in_ms(); @@ -209,7 +214,7 @@ int main(int argc, char** argv) { double prefill_ms = (double)(stats.prompt_eval_end_ms - stats.inference_start_ms); printf( - "Prefill: %ld tokens in %.1f ms (%.1f tok/s)\n", + "Prefill: %" PRId64 " tokens in %.1f ms (%.1f tok/s)\n", num_prompt_tokens, prefill_ms, num_prompt_tokens * 1000.0 / prefill_ms); @@ -290,17 +295,19 @@ int main(int argc, char** argv) { double decode_ms = (double)(stats.inference_end_ms - stats.prompt_eval_end_ms); printf( - "Decode: %ld tokens in %.1f ms (%.1f tok/s)\n", + "Decode: %" PRId64 " tokens in %.1f ms (%.1f tok/s)\n", num_generated, decode_ms, num_generated * 1000.0 / decode_ms); - printf("Prompt tokens: %ld\n", num_prompt_tokens); + printf("Prompt tokens: %" PRId64 "\n", num_prompt_tokens); +#ifdef EXECUTORCH_BUILD_CUDA // GPU memory after generation cudaMemGetInfo(&gpu_free_bytes, &gpu_total_bytes); stats.gpu_free_after_generate_bytes = gpu_free_bytes; stats.gpu_peak_usage_mb = (stats.gpu_total_bytes - gpu_free_bytes) / 1024.0 / 1024.0; +#endif llm::print_report(stats);