@@ -23,32 +23,32 @@ struct ET_EXPERIMENTAL Stats {
2323 const long SCALING_FACTOR_UNITS_PER_SECOND = 1000 ;
2424 // Time stamps for the different stages of the execution
2525 // model_load_start_ms: Start of model loading.
26- long model_load_start_ms;
26+ long model_load_start_ms = 0 ;
2727 // model_load_end_ms: End of model loading.
28- long model_load_end_ms;
28+ long model_load_end_ms = 0 ;
2929 // inference_start_ms: Immediately after the model is loaded (or we check
3030 // for model load), measure the inference time.
3131 // NOTE: It's actually the tokenizer encode + model execution time.
32- long inference_start_ms;
32+ long inference_start_ms = 0 ;
3333 // End of the tokenizer encode time.
34- long token_encode_end_ms;
34+ long token_encode_end_ms = 0 ;
3535 // Start of the model execution (forward function) time.
36- long model_execution_start_ms;
36+ long model_execution_start_ms = 0 ;
3737 // End of the model execution (forward function) time.
38- long model_execution_end_ms;
38+ long model_execution_end_ms = 0 ;
3939 // prompt_eval_end_ms: Prompt array allocation and tokenization. Ends right
4040 // before the inference loop starts
41- long prompt_eval_end_ms;
41+ long prompt_eval_end_ms = 0 ;
4242 // first_token_ms: Timestamp when the first generated token is emitted
43- long first_token_ms;
43+ long first_token_ms = 0 ;
4444 // inference_end_ms: End of inference/generation.
45- long inference_end_ms;
45+ long inference_end_ms = 0 ;
4646 // Keep a running total of the time spent in sampling.
4747 long aggregate_sampling_time_ms = 0 ;
4848 // Token count from prompt
49- int64_t num_prompt_tokens;
49+ int64_t num_prompt_tokens = 0 ;
5050 // Token count from generated (total - prompt)
51- int64_t num_generated_tokens;
51+ int64_t num_generated_tokens = 0 ;
5252 // GPU memory stats (optional).
5353 // Use sentinel UINT64_MAX / -1.0 to indicate
5454 // "not available".
@@ -171,18 +171,18 @@ inline void print_report(const Stats& stats) {
171171 Info,
172172 " \t Total inference time:\t\t %f (seconds)\t\t Rate: \t %f (tokens/second)" ,
173173 inference_time_ms / stats.SCALING_FACTOR_UNITS_PER_SECOND ,
174-
175- (stats.num_generated_tokens ) /
176- (double )(stats.inference_end_ms - stats.inference_start_ms ) *
177- stats.SCALING_FACTOR_UNITS_PER_SECOND );
174+ inference_time_ms > 0 ? (stats.num_generated_tokens ) / inference_time_ms *
175+ stats.SCALING_FACTOR_UNITS_PER_SECOND
176+ : 0.0 );
178177 double prompt_eval_time =
179178 (double )(stats.prompt_eval_end_ms - stats.inference_start_ms );
180179 ET_LOG (
181180 Info,
182181 " \t\t Prompt evaluation:\t %f (seconds)\t\t Rate: \t %f (tokens/second)" ,
183182 prompt_eval_time / stats.SCALING_FACTOR_UNITS_PER_SECOND ,
184- (stats.num_prompt_tokens ) / prompt_eval_time *
185- stats.SCALING_FACTOR_UNITS_PER_SECOND );
183+ prompt_eval_time > 0 ? (stats.num_prompt_tokens ) / prompt_eval_time *
184+ stats.SCALING_FACTOR_UNITS_PER_SECOND
185+ : 0.0 );
186186
187187 double eval_time =
188188 (double )(stats.inference_end_ms - stats.prompt_eval_end_ms );
@@ -192,8 +192,9 @@ inline void print_report(const Stats& stats) {
192192 " tokens:\t %f (seconds)\t\t Rate: \t %f (tokens/second)" ,
193193 stats.num_generated_tokens ,
194194 eval_time / stats.SCALING_FACTOR_UNITS_PER_SECOND ,
195- stats.num_generated_tokens / eval_time *
196- stats.SCALING_FACTOR_UNITS_PER_SECOND );
195+ eval_time > 0 ? stats.num_generated_tokens / eval_time *
196+ stats.SCALING_FACTOR_UNITS_PER_SECOND
197+ : 0.0 );
197198
198199 // Time to first token is measured from the start of inference, excluding
199200 // model load time.
0 commit comments