 #include <executorch/runtime/platform/log.h>
 #include <pytorch/tokenizers/hf_tokenizer.h>
 
+#include <algorithm>
 #include <cinttypes>
 #include <fstream>
 #include <string>
@@ -110,6 +111,17 @@ int main(int argc, char** argv) {
     return 1;
   }
 
+  // GPU memory: before load
+  {
+    size_t free = 0, total = 0;
+    if (cudaMemGetInfo(&free, &total) == cudaSuccess) {
+      stats.gpu_total_bytes = total;
+      stats.gpu_free_before_load_bytes = free;
+    }
+  }
+
+  stats.model_load_start_ms = llm::time_in_ms();
+
   // Create Module with share_memory_arenas=true so prefill and decode
   // share mutable buffers (KV cache, conv_state, recurrent_state).
   std::vector<std::string> data_files;
@@ -184,11 +196,13 @@ int main(int argc, char** argv) {
 
   stats.model_load_end_ms = llm::time_in_ms();
 
-#ifdef EXECUTORCH_BUILD_CUDA
-  // GPU memory after load
-  cudaMemGetInfo(&gpu_free_bytes, &gpu_total_bytes);
-  stats.gpu_free_after_load_bytes = gpu_free_bytes;
-#endif
+  // GPU memory: after load
+  {
+    size_t free = 0, total = 0;
+    if (cudaMemGetInfo(&free, &total) == cudaSuccess) {
+      stats.gpu_free_after_load_bytes = free;
+    }
+  }
 
   // Get EOS ids
   auto eos_ids = llm::get_eos_ids(tokenizer.get(), module.get());
@@ -231,6 +245,9 @@ int main(int argc, char** argv) {
   auto temp_tensor =
       from_blob(&temp_val, {1}, executorch::aten::ScalarType::Float);
 
+  stats.inference_start_ms = llm::time_in_ms();
+  stats.num_prompt_tokens = num_prompt_tokens;
+
   // ---------------------------------------------------------------
   // Prefill
   // ---------------------------------------------------------------
@@ -272,14 +289,14 @@ int main(int argc, char** argv) {
   cur_token = read_token(prefill_outputs[0].toTensor());
 
   stats.prompt_eval_end_ms = llm::time_in_ms();
-
+  stats.first_token_ms = stats.prompt_eval_end_ms;
   double prefill_ms =
       (double)(stats.prompt_eval_end_ms - stats.inference_start_ms);
   printf(
       "Prefill: %" PRId64 " tokens in %.1f ms (%.1f tok/s)\n",
       num_prompt_tokens,
       prefill_ms,
-      num_prompt_tokens * 1000.0 / prefill_ms);
+      num_prompt_tokens / prefill_ms * stats.SCALING_FACTOR_UNITS_PER_SECOND);
 
 #ifdef EXECUTORCH_BUILD_CUDA
   // Synchronize CUDA device to ensure prefill's writes to shared mutable
@@ -344,24 +361,104 @@ int main(int argc, char** argv) {
   int64_t num_generated = pos - num_prompt_tokens;
   stats.num_generated_tokens = num_generated;
 
+  // GPU memory: after generate + peak usage
+  {
+    size_t free = 0, total = 0;
+    if (cudaMemGetInfo(&free, &total) == cudaSuccess) {
+      stats.gpu_free_after_generate_bytes = free;
+      size_t min_free = free;
+      if (stats.gpu_free_before_load_bytes != static_cast<uint64_t>(-1)) {
+        min_free = std::min(min_free, (size_t)stats.gpu_free_before_load_bytes);
+      }
+      if (stats.gpu_free_after_load_bytes != static_cast<uint64_t>(-1)) {
+        min_free = std::min(min_free, (size_t)stats.gpu_free_after_load_bytes);
+      }
+      stats.gpu_peak_usage_mb = (double)(total - min_free) / 1024.0 / 1024.0;
+    }
+  }
+
+  printf("\n");
+
   double decode_ms =
       (double)(stats.inference_end_ms - stats.prompt_eval_end_ms);
+  printf(
+      "Prefill: %" PRId64 " tokens in %.1f ms (%.1f tok/s)\n",
+      num_prompt_tokens,
+      prefill_ms,
+      num_prompt_tokens / prefill_ms * stats.SCALING_FACTOR_UNITS_PER_SECOND);
   printf(
       "Decode: %" PRId64 " tokens in %.1f ms (%.1f tok/s)\n",
       num_generated,
       decode_ms,
-      num_generated * 1000.0 / decode_ms);
+      num_generated / decode_ms * stats.SCALING_FACTOR_UNITS_PER_SECOND);
   printf("Prompt tokens: %" PRId64 "\n", num_prompt_tokens);
 
-#ifdef EXECUTORCH_BUILD_CUDA
-  // GPU memory after generation
-  cudaMemGetInfo(&gpu_free_bytes, &gpu_total_bytes);
-  stats.gpu_free_after_generate_bytes = gpu_free_bytes;
-  stats.gpu_peak_usage_mb =
-      (stats.gpu_total_bytes - gpu_free_bytes) / 1024.0 / 1024.0;
-#endif
+  // Structured stats report (matches stats.h print_report)
+  printf("PyTorchObserver %s\n", llm::stats_to_json_string(stats).c_str());
+
+  double ms_per_s = stats.SCALING_FACTOR_UNITS_PER_SECOND;
 
-  llm::print_report(stats);
+  double model_load_s =
+      (double)(stats.model_load_end_ms - stats.model_load_start_ms) / ms_per_s;
+  double inference_time_ms =
+      (double)(stats.inference_end_ms - stats.inference_start_ms);
+  double prompt_eval_ms =
+      (double)(stats.prompt_eval_end_ms - stats.inference_start_ms);
+  double eval_ms = (double)(stats.inference_end_ms - stats.prompt_eval_end_ms);
+  double ttft_s =
+      (double)(stats.first_token_ms - stats.inference_start_ms) / ms_per_s;
+  double sampling_s = (double)stats.aggregate_sampling_time_ms / ms_per_s;
+
+  printf("\n");
+  printf(
+      "\tPrompt Tokens: %" PRId64 " Generated Tokens: %" PRId64 "\n",
+      stats.num_prompt_tokens,
+      stats.num_generated_tokens);
+  printf("\tModel Load Time:\t\t%f (seconds)\n", model_load_s);
+  printf(
+      "\tTotal inference time:\t\t%f (seconds)\t\t Rate: \t%f (tokens/second)\n",
+      inference_time_ms / ms_per_s,
+      stats.num_generated_tokens / inference_time_ms * ms_per_s);
+  printf(
+      "\t\tPrompt evaluation:\t%f (seconds)\t\t Rate: \t%f (tokens/second)\n",
+      prompt_eval_ms / ms_per_s,
+      stats.num_prompt_tokens / prompt_eval_ms * ms_per_s);
+  printf(
+      "\t\tGenerated %" PRId64
+      " tokens:\t%f (seconds)\t\t Rate: \t%f (tokens/second)\n",
+      stats.num_generated_tokens,
+      eval_ms / ms_per_s,
+      stats.num_generated_tokens / eval_ms * ms_per_s);
+  printf("\tTime to first generated token:\t%f (seconds)\n", ttft_s);
+  printf(
+      "\tSampling time over %" PRId64 " tokens:\t%f (seconds)\n",
+      stats.num_prompt_tokens + stats.num_generated_tokens,
+      sampling_s);
+
+  // GPU memory reporting
+  if (stats.gpu_total_bytes != static_cast<uint64_t>(-1)) {
+    printf(
+        "\tGPU total memory: %.2f MB\n",
+        stats.gpu_total_bytes / 1024.0 / 1024.0);
+    if (stats.gpu_free_before_load_bytes != static_cast<uint64_t>(-1)) {
+      printf(
+          "\tGPU free before load: %.2f MB\n",
+          stats.gpu_free_before_load_bytes / 1024.0 / 1024.0);
+    }
+    if (stats.gpu_free_after_load_bytes != static_cast<uint64_t>(-1)) {
+      printf(
+          "\tGPU free after load: %.2f MB\n",
+          stats.gpu_free_after_load_bytes / 1024.0 / 1024.0);
+    }
+    if (stats.gpu_free_after_generate_bytes != static_cast<uint64_t>(-1)) {
+      printf(
+          "\tGPU free after generate: %.2f MB\n",
+          stats.gpu_free_after_generate_bytes / 1024.0 / 1024.0);
+    }
+    if (stats.gpu_peak_usage_mb >= 0.0) {
+      printf("\tGPU peak usage: %.2f MB\n", stats.gpu_peak_usage_mb);
+    }
+  }
 
   return 0;
 }
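
Note: the three cudaMemGetInfo probes above repeat the same sample-and-record pattern, and unlike the cudaDeviceSynchronize call they are not wrapped in #ifdef EXECUTORCH_BUILD_CUDA. A minimal sketch of how they could be folded into one guarded helper (the sample_gpu_free_bytes name and the kNotSampled constant are illustrative, not part of this PR; only cudaMemGetInfo, cudaSuccess, cuda_runtime.h, and the EXECUTORCH_BUILD_CUDA guard come from the surrounding code):

#include <cstdint>

#ifdef EXECUTORCH_BUILD_CUDA
#include <cuda_runtime.h>
#endif

// Sentinel meaning "this probe never ran"; mirrors the
// static_cast<uint64_t>(-1) checks in the reporting code above.
constexpr uint64_t kNotSampled = static_cast<uint64_t>(-1);

// Returns the device's free bytes (and, optionally, total bytes via the
// out-parameter), or kNotSampled when CUDA is compiled out or the query
// fails. Keeping the #ifdef inside the helper lets every call site stay
// unconditional while non-CUDA builds still compile.
inline uint64_t sample_gpu_free_bytes(uint64_t* total_out = nullptr) {
#ifdef EXECUTORCH_BUILD_CUDA
  size_t free = 0, total = 0;
  if (cudaMemGetInfo(&free, &total) == cudaSuccess) {
    if (total_out != nullptr) {
      *total_out = total;
    }
    return free;
  }
#else
  (void)total_out;
#endif
  return kNotSampled;
}

Each probe would then collapse to one line, e.g. stats.gpu_free_before_load_bytes = sample_gpu_free_bytes(&stats.gpu_total_bytes);, and the peak-usage computation can skip any kNotSampled sample, exactly as the std::min chain above already does.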