Skip to content

Commit 2e0a5e1

Browse files
committed
fix(FR-2582): preserve final TPS value after LLM Playground response ends (#6707)
Resolves #6705 (FR-2582)

## Summary

In the LLM Playground, the TPS (tokens per second) indicator dropped to `0` immediately after a streaming response finished because `onFinish` cleared `startTime` to `null` and `ChatTokenCounter` returned `0` whenever `startTime` was nullish. This made the final TPS measurement disappear from the UI.

This change:

- **Aligns TPS measurement with the standard LLM inference convention** ([vLLM](https://docs.vllm.ai/en/stable/design/metrics/), [Ollama](https://github.com/ollama/ollama/blob/main/docs/api.md), [NVIDIA GenAI-Perf](https://docs.nvidia.com/nim/benchmarking/llm/latest/metrics.html), [Anyscale](https://docs.anyscale.com/llm/serving/benchmarking/metrics)): start the measurement window when the **first output token** actually arrives, not when the user presses send. This excludes file upload, network RTT, and prefill time (TTFT) from the TPS denominator (the elapsed time), so the displayed value reflects pure decode rate.
- Tracks the measurement window as `{ startTime, endTime }` in `ChatCard`:
  - `startTime` is set by a `useEffect` when `status` transitions to `'streaming'` (i.e., the first token has been received).
  - `endTime` is set by a `useEffect` when streaming ends — covers normal completion **and** abort / error paths, so TPS freezes correctly in every case instead of drifting downward indefinitely after `stop()`.
  - `handleSendMessage` resets both to `null` on every new send.
- `ChatTokenCounter` now computes elapsed as `((endTime ?? Date.now()) - startTime) / 1000` and short-circuits to `0` when elapsed is non-positive, avoiding an `Infinity` TPS display when tokens have already been counted but no measurable time has elapsed since `startTime`.

## Files changed

- `react/src/components/Chat/ChatCard.tsx`
- `react/src/components/Chat/ChatMessages.tsx`
- `react/src/components/Chat/ChatTokenCounter.tsx`

## Manual test plan

- Send a prompt to a model and confirm the TPS counter updates while the response streams in.
- After the response completes, confirm the TPS value remains visible (frozen at the last measurement) instead of resetting to `0`.
- Send a second prompt and confirm the TPS counter resets and starts measuring the new response.
- Click stop mid-stream: TPS should freeze at the partial value rather than continue drifting downward.
- Send a prompt with a large file attachment: TPS should reflect only the model's decode rate, not the upload duration.

## Verification

`bash scripts/verify.sh` -> `=== ALL PASS ===` (Relay, Lint, Format, TypeScript)
1 parent e6ed169 commit 2e0a5e1

3 files changed

Lines changed: 47 additions & 29 deletions

File tree

react/src/components/Chat/ChatCard.tsx

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,7 @@ const PureChatCard: React.FC<ChatCardProps> = ({
238238
const dropContainerRef = useRef<HTMLDivElement>(null);
239239
const [fetchKey, updateFetchKey] = useUpdatableState('first');
240240
const [startTime, setStartTime] = useState<number | null>(null);
241+
const [endTime, setEndTime] = useState<number | null>(null);
241242

242243
const { agents } = useAIAgent();
243244
const agent = agents.find((a) => a.id === chat.provider.agentId);
@@ -261,9 +262,6 @@ const PureChatCard: React.FC<ChatCardProps> = ({
261262
const { error, messages, stop, status, sendMessage, setMessages } = useChat({
262263
experimental_throttle: 100,
263264
messages: chat.messages,
264-
onFinish: () => {
265-
setStartTime(null);
266-
},
267265
// Because there is an issue(https://github.com/vercel/ai/issues/8956) with useChat that does not run a new transport without an id change,
268266
// we have to change the id and use fetch by utilizing useEventNotStable.
269267
id: `chat-${baseURL}-${modelId}-${effectiveApiKey}`,
@@ -329,9 +327,28 @@ const PureChatCard: React.FC<ChatCardProps> = ({
329327

330328
const isStreaming = status === 'streaming' || status === 'submitted';
331329

330+
// TPS measurement window follows the standard LLM inference convention:
331+
// start when the first output token arrives (status transitions to
332+
// 'streaming') and stop when streaming ends (success, abort, or error).
333+
// This excludes file upload, network RTT, and prefill (TTFT), so the
334+
// displayed TPS reflects pure decode rate — the same definition used by
335+
// vLLM, Ollama, NVIDIA GenAI-Perf, etc.
336+
useEffect(() => {
337+
if (status === 'streaming' && startTime === null) {
338+
setStartTime(Date.now());
339+
}
340+
}, [status, startTime]);
341+
342+
useEffect(() => {
343+
if (!isStreaming && startTime !== null && endTime === null) {
344+
setEndTime(Date.now());
345+
}
346+
}, [isStreaming, startTime, endTime]);
347+
332348
// Helper function to handle message sending with files
333349
const handleSendMessage = async (textContent: string, files?: File[]) => {
334-
setStartTime(Date.now());
350+
setStartTime(null);
351+
setEndTime(null);
335352

336353
const parts: Array<
337354
| { type: 'text'; text: string }
@@ -540,6 +557,7 @@ const PureChatCard: React.FC<ChatCardProps> = ({
540557
input={input}
541558
isStreaming={isStreaming}
542559
startTime={startTime}
560+
endTime={endTime}
543561
/>
544562
<ChatInput
545563
disabled={!baseURL}

react/src/components/Chat/ChatMessages.tsx

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,15 @@ interface ChatMessageProps {
1818
input: string;
1919
isStreaming: boolean;
2020
startTime: number | null;
21+
endTime: number | null;
2122
}
2223

2324
const ChatMessages: React.FC<ChatMessageProps> = ({
2425
messages,
2526
input,
2627
isStreaming,
2728
startTime,
29+
endTime,
2830
}) => {
2931
const { token } = theme.useToken();
3032
return (
@@ -44,6 +46,7 @@ const ChatMessages: React.FC<ChatMessageProps> = ({
4446
messages={messages}
4547
input={input}
4648
startTime={startTime}
49+
endTime={endTime}
4750
/>
4851
</BAIFlex>
4952
</BAIFlex>

react/src/components/Chat/ChatTokenCounter.tsx

Lines changed: 22 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -9,52 +9,49 @@ import { Typography, Tag, Divider } from 'antd';
99
import { BAIFlex } from 'backend.ai-ui';
1010
import { t } from 'i18next';
1111
import { map, last } from 'lodash-es';
12-
import React, { useMemo } from 'react';
12+
import React from 'react';
1313

1414
interface ChatTokenCounterProps {
1515
input: string;
1616
messages: UIMessage[];
1717
startTime: number | null;
18+
endTime: number | null;
1819
style?: React.CSSProperties;
1920
}
2021

2122
const ChatTokenCounter: React.FC<ChatTokenCounterProps> = ({
2223
input,
2324
messages,
2425
startTime,
26+
endTime,
2527
}) => {
28+
'use memo';
29+
2630
const inputTokenCount = useTokenCount(input);
27-
const allChatMessageString = useMemo(() => {
28-
return map(messages, (message) =>
29-
message?.parts
30-
?.filter((part) => part.type === 'text')
31-
.map((part) => part.text)
32-
.join(''),
33-
).join('');
34-
}, [messages]);
31+
const allChatMessageString = map(messages, (message) =>
32+
message?.parts
33+
?.filter((part) => part.type === 'text')
34+
.map((part) => part.text)
35+
.join(''),
36+
).join('');
3537
const chatsTokenCount = useTokenCount(allChatMessageString);
3638
const totalTokenCount = inputTokenCount + chatsTokenCount;
37-
const lastAssistantMessageString = useMemo(() => {
38-
const lastAssistantMessage = last(messages);
39-
if (lastAssistantMessage?.role === 'assistant') {
40-
return (
41-
lastAssistantMessage?.parts
39+
const lastAssistantMessage = last(messages);
40+
const lastAssistantMessageString =
41+
lastAssistantMessage?.role === 'assistant'
42+
? lastAssistantMessage?.parts
4243
?.filter((part) => part.type === 'text')
4344
.map((part) => part.text)
4445
.join('') || ''
45-
);
46-
} else {
47-
return '';
48-
}
49-
}, [messages]);
46+
: '';
5047

5148
const lastAssistantTokenCount = useTokenCount(lastAssistantMessageString);
52-
const tokenPerSecond = useMemo(() => {
53-
return lastAssistantTokenCount > 0 && startTime
54-
? // eslint-disable-next-line react-hooks/purity
55-
lastAssistantTokenCount / ((Date.now() - startTime) / 1000)
56-
: 0;
57-
}, [lastAssistantTokenCount, startTime]);
49+
let tokenPerSecond = 0;
50+
if (lastAssistantTokenCount > 0 && startTime) {
51+
// eslint-disable-next-line react-hooks/purity
52+
const elapsedSec = ((endTime ?? Date.now()) - startTime) / 1000;
53+
tokenPerSecond = elapsedSec > 0 ? lastAssistantTokenCount / elapsedSec : 0;
54+
}
5855

5956
return (
6057
<BAIFlex justify="end" align="end">

0 commit comments

Comments
 (0)