
Commit dd2e8d3

Add structured stats reporting and GPU memory tracking to Qwen3.5 MoE runner
The runner now uses llm::Stats with proper timestamps for model load, prefill, decode, and GPU memory (via cudaMemGetInfo). Output matches the stats.h print_report format: a PyTorchObserver JSON line plus a human-readable table.

This commit was authored with the assistance of Claude Code.

ghstack-source-id: fea9eb8
Pull Request resolved: #19190
1 parent bdc8094 commit dd2e8d3

3 files changed

Lines changed: 120 additions & 23 deletions
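In brief, the GPU-memory tracking the commit message describes queries cudaMemGetInfo at fixed checkpoints (before load, after load, after generate), skips a sample if the query fails, and derives peak usage from the smallest free reading. Below is a minimal standalone C++ sketch of that pattern; the GpuMemStats struct and sample helper are illustrative stand-ins for the llm::Stats fields used in the diff, and the UINT64_MAX sentinel mirrors the runner's static_cast<uint64_t>(-1) checks.

#include <algorithm>
#include <cstdint>
#include <cstdio>

#include <cuda_runtime.h>

// Illustrative stand-in for the GPU fields the runner keeps on llm::Stats.
// UINT64_MAX means "never sampled", mirroring the static_cast<uint64_t>(-1)
// sentinel the runner checks before reporting.
struct GpuMemStats {
  uint64_t total_bytes = UINT64_MAX;
  uint64_t free_before_load = UINT64_MAX;
  uint64_t free_after_load = UINT64_MAX;
  uint64_t free_after_generate = UINT64_MAX;
};

// Sample free memory; cudaMemGetInfo reports device-wide numbers, so other
// processes on the GPU affect them. On failure the slot keeps its sentinel,
// letting the report skip lines it has no data for.
static void sample(uint64_t* free_slot, uint64_t* total_slot) {
  size_t free = 0, total = 0;
  if (cudaMemGetInfo(&free, &total) == cudaSuccess) {
    *free_slot = free;
    if (total_slot) {
      *total_slot = total;
    }
  }
}

int main() {
  GpuMemStats s;
  sample(&s.free_before_load, &s.total_bytes);
  // ... load model ...
  sample(&s.free_after_load, nullptr);
  // ... prefill + decode ...
  sample(&s.free_after_generate, nullptr);

  if (s.total_bytes != UINT64_MAX) {
    // Peak usage = total minus the smallest free reading at any checkpoint.
    // Unsampled checkpoints hold UINT64_MAX, so they never win the min.
    uint64_t min_free = std::min(
        {s.free_before_load, s.free_after_load, s.free_after_generate});
    printf(
        "GPU peak usage: %.2f MB\n",
        (double)(s.total_bytes - min_free) / 1024.0 / 1024.0);
  }
  return 0;
}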


examples/models/qwen3_5_moe/export.py

Lines changed: 5 additions & 5 deletions
@@ -667,12 +667,12 @@ def _apply_turboquant(model, config):
 # ---------------------------------------------------------------------------
 
 
-def _set_batched_moe(model, enabled, activation_dtype="bf16"):
+def _set_batched_moe(model, enabled, moe_activation_dtype="bf16"):
     """Toggle batched tensor-core MoE kernel for all MoE layers."""
     for layer in model.layers:
         if hasattr(layer, "mlp") and hasattr(layer.mlp, "experts"):
             layer.mlp.experts.use_batched_moe = enabled
-            layer.mlp.experts.activation_dtype = activation_dtype
+            layer.mlp.experts.moe_activation_dtype = moe_activation_dtype
 
 
 def export_and_lower(model, config, args):
@@ -916,8 +916,8 @@ def _export_cuda(model, config, args):
     # chunk_gated_delta_rule with CHUNK_SIZE=64) for the full range of sequence
     # lengths. Smaller examples cause AOTI to bake in intermediate buffer sizes
     # that reject longer prompts at runtime.
-    activation_dtype = getattr(args, "activation_dtype", "bf16")
-    _set_batched_moe(model, True, activation_dtype=activation_dtype)
+    moe_activation_dtype = getattr(args, "moe_activation_dtype", "bf16")
+    _set_batched_moe(model, True, moe_activation_dtype=moe_activation_dtype)
     dense_prefill = getattr(args, "dense_prefill", "tinygemm")
     _set_dequant_prefill(model, dense_prefill == "dequant")
     print("Exporting prefill method...")
@@ -1139,7 +1139,7 @@ def main():  # noqa: C901
             "(dense weights must be W4 quantized)"
         )
 
-    if args.activation_dtype != "bf16" and args.backend != "cuda":
+    if args.moe_activation_dtype != "bf16" and args.backend != "cuda":
        parser.error("--moe-activation-dtype int8 requires --backend cuda")
 
    model, config = load_and_quantize(args)

examples/models/qwen3_5_moe/main.cpp

Lines changed: 113 additions & 16 deletions
@@ -18,6 +18,7 @@
 #include <executorch/runtime/platform/log.h>
 #include <pytorch/tokenizers/hf_tokenizer.h>
 
+#include <algorithm>
 #include <cinttypes>
 #include <fstream>
 #include <string>
@@ -110,6 +111,17 @@ int main(int argc, char** argv) {
     return 1;
   }
 
+  // GPU memory: before load
+  {
+    size_t free = 0, total = 0;
+    if (cudaMemGetInfo(&free, &total) == cudaSuccess) {
+      stats.gpu_total_bytes = total;
+      stats.gpu_free_before_load_bytes = free;
+    }
+  }
+
+  stats.model_load_start_ms = llm::time_in_ms();
+
   // Create Module with share_memory_arenas=true so prefill and decode
   // share mutable buffers (KV cache, conv_state, recurrent_state).
   std::vector<std::string> data_files;
@@ -184,11 +196,13 @@ int main(int argc, char** argv) {
 
   stats.model_load_end_ms = llm::time_in_ms();
 
-#ifdef EXECUTORCH_BUILD_CUDA
-  // GPU memory after load
-  cudaMemGetInfo(&gpu_free_bytes, &gpu_total_bytes);
-  stats.gpu_free_after_load_bytes = gpu_free_bytes;
-#endif
+  // GPU memory: after load
+  {
+    size_t free = 0, total = 0;
+    if (cudaMemGetInfo(&free, &total) == cudaSuccess) {
+      stats.gpu_free_after_load_bytes = free;
+    }
+  }
 
   // Get EOS ids
   auto eos_ids = llm::get_eos_ids(tokenizer.get(), module.get());
@@ -231,6 +245,9 @@ int main(int argc, char** argv) {
   auto temp_tensor =
       from_blob(&temp_val, {1}, executorch::aten::ScalarType::Float);
 
+  stats.inference_start_ms = llm::time_in_ms();
+  stats.num_prompt_tokens = num_prompt_tokens;
+
   // ---------------------------------------------------------------
   // Prefill
   // ---------------------------------------------------------------
@@ -272,14 +289,14 @@ int main(int argc, char** argv) {
   cur_token = read_token(prefill_outputs[0].toTensor());
 
   stats.prompt_eval_end_ms = llm::time_in_ms();
-
+  stats.first_token_ms = stats.prompt_eval_end_ms;
   double prefill_ms =
       (double)(stats.prompt_eval_end_ms - stats.inference_start_ms);
   printf(
       "Prefill: %" PRId64 " tokens in %.1f ms (%.1f tok/s)\n",
       num_prompt_tokens,
       prefill_ms,
-      num_prompt_tokens * 1000.0 / prefill_ms);
+      num_prompt_tokens / prefill_ms * stats.SCALING_FACTOR_UNITS_PER_SECOND);
 
 #ifdef EXECUTORCH_BUILD_CUDA
   // Synchronize CUDA device to ensure prefill's writes to shared mutable
@@ -344,24 +361,104 @@ int main(int argc, char** argv) {
   int64_t num_generated = pos - num_prompt_tokens;
   stats.num_generated_tokens = num_generated;
 
+  // GPU memory: after generate + peak usage
+  {
+    size_t free = 0, total = 0;
+    if (cudaMemGetInfo(&free, &total) == cudaSuccess) {
+      stats.gpu_free_after_generate_bytes = free;
+      size_t min_free = free;
+      if (stats.gpu_free_before_load_bytes != static_cast<uint64_t>(-1)) {
+        min_free = std::min(min_free, (size_t)stats.gpu_free_before_load_bytes);
+      }
+      if (stats.gpu_free_after_load_bytes != static_cast<uint64_t>(-1)) {
+        min_free = std::min(min_free, (size_t)stats.gpu_free_after_load_bytes);
+      }
+      stats.gpu_peak_usage_mb = (double)(total - min_free) / 1024.0 / 1024.0;
+    }
+  }
+
+  printf("\n");
+
   double decode_ms =
       (double)(stats.inference_end_ms - stats.prompt_eval_end_ms);
+  printf(
+      "Prefill: %" PRId64 " tokens in %.1f ms (%.1f tok/s)\n",
+      num_prompt_tokens,
+      prefill_ms,
+      num_prompt_tokens / prefill_ms * stats.SCALING_FACTOR_UNITS_PER_SECOND);
   printf(
       "Decode: %" PRId64 " tokens in %.1f ms (%.1f tok/s)\n",
       num_generated,
       decode_ms,
-      num_generated * 1000.0 / decode_ms);
+      num_generated / decode_ms * stats.SCALING_FACTOR_UNITS_PER_SECOND);
   printf("Prompt tokens: %" PRId64 "\n", num_prompt_tokens);
 
-#ifdef EXECUTORCH_BUILD_CUDA
-  // GPU memory after generation
-  cudaMemGetInfo(&gpu_free_bytes, &gpu_total_bytes);
-  stats.gpu_free_after_generate_bytes = gpu_free_bytes;
-  stats.gpu_peak_usage_mb =
-      (stats.gpu_total_bytes - gpu_free_bytes) / 1024.0 / 1024.0;
-#endif
+  // Structured stats report (matches stats.h print_report)
+  printf("PyTorchObserver %s\n", llm::stats_to_json_string(stats).c_str());
+
+  double ms_per_s = stats.SCALING_FACTOR_UNITS_PER_SECOND;
 
-  llm::print_report(stats);
+  double model_load_s =
+      (double)(stats.model_load_end_ms - stats.model_load_start_ms) / ms_per_s;
+  double inference_time_ms =
+      (double)(stats.inference_end_ms - stats.inference_start_ms);
+  double prompt_eval_ms =
+      (double)(stats.prompt_eval_end_ms - stats.inference_start_ms);
+  double eval_ms = (double)(stats.inference_end_ms - stats.prompt_eval_end_ms);
+  double ttft_s =
+      (double)(stats.first_token_ms - stats.inference_start_ms) / ms_per_s;
+  double sampling_s = (double)stats.aggregate_sampling_time_ms / ms_per_s;
+
+  printf("\n");
+  printf(
+      "\tPrompt Tokens: %" PRId64 " Generated Tokens: %" PRId64 "\n",
+      stats.num_prompt_tokens,
+      stats.num_generated_tokens);
+  printf("\tModel Load Time:\t\t%f (seconds)\n", model_load_s);
+  printf(
+      "\tTotal inference time:\t\t%f (seconds)\t\t Rate: \t%f (tokens/second)\n",
+      inference_time_ms / ms_per_s,
+      stats.num_generated_tokens / inference_time_ms * ms_per_s);
+  printf(
+      "\t\tPrompt evaluation:\t%f (seconds)\t\t Rate: \t%f (tokens/second)\n",
+      prompt_eval_ms / ms_per_s,
+      stats.num_prompt_tokens / prompt_eval_ms * ms_per_s);
+  printf(
+      "\t\tGenerated %" PRId64
+      " tokens:\t%f (seconds)\t\t Rate: \t%f (tokens/second)\n",
+      stats.num_generated_tokens,
+      eval_ms / ms_per_s,
+      stats.num_generated_tokens / eval_ms * ms_per_s);
+  printf("\tTime to first generated token:\t%f (seconds)\n", ttft_s);
+  printf(
+      "\tSampling time over %" PRId64 " tokens:\t%f (seconds)\n",
+      stats.num_prompt_tokens + stats.num_generated_tokens,
+      sampling_s);
+
+  // GPU memory reporting
+  if (stats.gpu_total_bytes != static_cast<uint64_t>(-1)) {
+    printf(
+        "\tGPU total memory: %.2f MB\n",
+        stats.gpu_total_bytes / 1024.0 / 1024.0);
+    if (stats.gpu_free_before_load_bytes != static_cast<uint64_t>(-1)) {
+      printf(
+          "\tGPU free before load: %.2f MB\n",
+          stats.gpu_free_before_load_bytes / 1024.0 / 1024.0);
+    }
+    if (stats.gpu_free_after_load_bytes != static_cast<uint64_t>(-1)) {
+      printf(
+          "\tGPU free after load: %.2f MB\n",
+          stats.gpu_free_after_load_bytes / 1024.0 / 1024.0);
+    }
+    if (stats.gpu_free_after_generate_bytes != static_cast<uint64_t>(-1)) {
+      printf(
+          "\tGPU free after generate: %.2f MB\n",
+          stats.gpu_free_after_generate_bytes / 1024.0 / 1024.0);
+    }
+    if (stats.gpu_peak_usage_mb >= 0.0) {
+      printf("\tGPU peak usage: %.2f MB\n", stats.gpu_peak_usage_mb);
+    }
+  }
 
   return 0;
 }
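A footnote on the arithmetic above: the new tok/s expressions and the table values are plain differences and ratios of the recorded millisecond timestamps, scaled by SCALING_FACTOR_UNITS_PER_SECOND (1000 in stats.h, since timestamps are in ms). A tiny self-contained sketch of that arithmetic; the variable names mirror llm::Stats fields, and the numeric values are hypothetical:

#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical timestamps (ms), in the order the runner records them.
  int64_t model_load_start_ms = 0;
  int64_t model_load_end_ms = 8500;             // load: 8.5 s
  int64_t inference_start_ms = 9000;
  int64_t prompt_eval_end_ms = 9400;            // prefill: 400 ms
  int64_t first_token_ms = prompt_eval_end_ms;  // first token comes out of prefill
  int64_t inference_end_ms = 12400;             // decode: 3 s
  int64_t num_prompt_tokens = 128;
  int64_t num_generated_tokens = 60;

  // Plays the role of SCALING_FACTOR_UNITS_PER_SECOND: ms -> s.
  const double kMsPerS = 1000.0;

  printf("Model load: %.3f s\n",
         (model_load_end_ms - model_load_start_ms) / kMsPerS);
  printf("TTFT:       %.3f s\n",
         (first_token_ms - inference_start_ms) / kMsPerS);
  printf("Prefill:    %.1f tok/s\n",  // 128 / 400 ms * 1000 = 320 tok/s
         num_prompt_tokens /
             (double)(prompt_eval_end_ms - inference_start_ms) * kMsPerS);
  printf("Decode:     %.1f tok/s\n",  // 60 / 3000 ms * 1000 = 20 tok/s
         num_generated_tokens /
             (double)(inference_end_ms - prompt_eval_end_ms) * kMsPerS);
  return 0;
}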

examples/models/qwen3_5_moe/model.py

Lines changed: 2 additions & 2 deletions
@@ -479,7 +479,7 @@ def __init__(self, config):
         self.hidden_size = config.hidden_size
         self.group_size = 32
         self.use_batched_moe = False
-        self.activation_dtype = "bf16"
+        self.moe_activation_dtype = "bf16"
 
         self.w1_weight = nn.Parameter(
             torch.empty(
@@ -498,7 +498,7 @@ def __init__(self, config):
 
     def forward(self, x, expert_weights, expert_indices, top_k):
         if self.use_batched_moe:
-            if self.activation_dtype == "int8":
+            if self.moe_activation_dtype == "int8":
                 return torch.ops.triton.fused_moe_batched_gemm_int8(
                     x,
                     self.w1,