fix(FR-2582): preserve final TPS value after LLM Playground response ends (#6707)

yomybaby · yomybaby · commit 2e0a5e1cbb6e · 2026-04-23T07:33:28.000Z
Resolves #6705(FR-2582) ## Summary In the LLM Playground, the TPS (tokens per second) indicator dropped to `0` immediately after a streaming response finished because `onFinish` cleared `startTime` to `null` and `ChatTokenCounter` returned `0` whenever `startTime` was nullish. This made the final TPS measurement disappear from the UI. This change: - **Aligns TPS measurement with the standard LLM inference convention** ([vLLM](https://docs.vllm.ai/en/stable/design/metrics/), [Ollama](https://github.com/ollama/ollama/blob/main/docs/api.md), [NVIDIA GenAI-Perf](https://docs.nvidia.com/nim/benchmarking/llm/latest/metrics.html), [Anyscale](https://docs.anyscale.com/llm/serving/benchmarking/metrics)): start the measurement window when the **first output token** actually arrives, not when the user presses send. This excludes file upload, network RTT, and prefill time (TTFT) from the TPS denominator (the elapsed time), so the displayed value reflects pure decode rate. - Tracks the measurement window as `{ startTime, endTime }` in `ChatCard`: - `startTime` is set by a `useEffect` when `status` transitions to `'streaming'` (i.e., the first token has been received). - `endTime` is set by a `useEffect` when streaming ends — covers normal completion **and** abort / error paths, so TPS freezes correctly in every case instead of drifting downward indefinitely after `stop()`. - `handleSendMessage` resets both to `null` on every new send. - `ChatTokenCounter` now computes elapsed as `((endTime ?? Date.now()) - startTime) / 1000` and short-circuits to `0` when elapsed is non-positive, avoiding an `Infinity` TPS display when the computation runs before the first token chunk has been counted. ## Files changed - `react/src/components/Chat/ChatCard.tsx` - `react/src/components/Chat/ChatMessages.tsx` - `react/src/components/Chat/ChatTokenCounter.tsx` ## Manual test plan - Send a prompt to a model and confirm the TPS counter updates while the response streams in. 
- After the response completes, confirm the TPS value remains visible (frozen at the last measurement) instead of resetting to `0`. - Send a second prompt and confirm the TPS counter resets and starts measuring the new response. - Click stop mid-stream: TPS should freeze at the partial value rather than continue drifting downward. - Send a prompt with a large file attachment: TPS should reflect only the model's decode rate, not the upload duration. ## Verification `bash scripts/verify.sh` -> `=== ALL PASS ===` (Relay, Lint, Format, TypeScript)
diff --git a/react/src/components/Chat/ChatCard.tsx b/react/src/components/Chat/ChatCard.tsx
@@ -238,6 +238,7 @@ const PureChatCard: React.FC<ChatCardProps> = ({
   const dropContainerRef = useRef<HTMLDivElement>(null);
   const [fetchKey, updateFetchKey] = useUpdatableState('first');
   const [startTime, setStartTime] = useState<number | null>(null);
+  const [endTime, setEndTime] = useState<number | null>(null);
 
   const { agents } = useAIAgent();
   const agent = agents.find((a) => a.id === chat.provider.agentId);
@@ -261,9 +262,6 @@ const PureChatCard: React.FC<ChatCardProps> = ({
   const { error, messages, stop, status, sendMessage, setMessages } = useChat({
     experimental_throttle: 100,
     messages: chat.messages,
-    onFinish: () => {
-      setStartTime(null);
-    },
     // Because there is an issue(https://github.com/vercel/ai/issues/8956) with useChat that does not run a new transport without an id change,
     // we have to change the id and use fetch by utilizing useEventNotStable.
     id: `chat-${baseURL}-${modelId}-${effectiveApiKey}`,
@@ -329,9 +327,28 @@ const PureChatCard: React.FC<ChatCardProps> = ({
 
   const isStreaming = status === 'streaming' || status === 'submitted';
 
+  // TPS measurement window follows the standard LLM inference convention:
+  // start when the first output token arrives (status transitions to
+  // 'streaming') and stop when streaming ends (success, abort, or error).
+  // This excludes file upload, network RTT, and prefill (TTFT), so the
+  // displayed TPS reflects pure decode rate — the same definition used by
+  // vLLM, Ollama, NVIDIA GenAI-Perf, etc.
+  useEffect(() => {
+    if (status === 'streaming' && startTime === null) {
+      setStartTime(Date.now());
+    }
+  }, [status, startTime]);
+
+  useEffect(() => {
+    if (!isStreaming && startTime !== null && endTime === null) {
+      setEndTime(Date.now());
+    }
+  }, [isStreaming, startTime, endTime]);
+
   // Helper function to handle message sending with files
   const handleSendMessage = async (textContent: string, files?: File[]) => {
-    setStartTime(Date.now());
+    setStartTime(null);
+    setEndTime(null);
 
     const parts: Array<
       | { type: 'text'; text: string }
@@ -540,6 +557,7 @@ const PureChatCard: React.FC<ChatCardProps> = ({
         input={input}
         isStreaming={isStreaming}
         startTime={startTime}
+        endTime={endTime}
       />
       <ChatInput
         disabled={!baseURL}
diff --git a/react/src/components/Chat/ChatMessages.tsx b/react/src/components/Chat/ChatMessages.tsx
@@ -18,13 +18,15 @@ interface ChatMessageProps {
   input: string;
   isStreaming: boolean;
   startTime: number | null;
+  endTime: number | null;
 }
 
 const ChatMessages: React.FC<ChatMessageProps> = ({
   messages,
   input,
   isStreaming,
   startTime,
+  endTime,
 }) => {
   const { token } = theme.useToken();
   return (
@@ -44,6 +46,7 @@ const ChatMessages: React.FC<ChatMessageProps> = ({
           messages={messages}
           input={input}
           startTime={startTime}
+          endTime={endTime}
         />
       </BAIFlex>
     </BAIFlex>
diff --git a/react/src/components/Chat/ChatTokenCounter.tsx b/react/src/components/Chat/ChatTokenCounter.tsx
@@ -9,52 +9,49 @@ import { Typography, Tag, Divider } from 'antd';
 import { BAIFlex } from 'backend.ai-ui';
 import { t } from 'i18next';
 import { map, last } from 'lodash-es';
-import React, { useMemo } from 'react';
+import React from 'react';
 
 interface ChatTokenCounterProps {
   input: string;
   messages: UIMessage[];
   startTime: number | null;
+  endTime: number | null;
   style?: React.CSSProperties;
 }
 
 const ChatTokenCounter: React.FC<ChatTokenCounterProps> = ({
   input,
   messages,
   startTime,
+  endTime,
 }) => {
+  'use memo';
+
   const inputTokenCount = useTokenCount(input);
-  const allChatMessageString = useMemo(() => {
-    return map(messages, (message) =>
-      message?.parts
-        ?.filter((part) => part.type === 'text')
-        .map((part) => part.text)
-        .join(''),
-    ).join('');
-  }, [messages]);
+  const allChatMessageString = map(messages, (message) =>
+    message?.parts
+      ?.filter((part) => part.type === 'text')
+      .map((part) => part.text)
+      .join(''),
+  ).join('');
   const chatsTokenCount = useTokenCount(allChatMessageString);
   const totalTokenCount = inputTokenCount + chatsTokenCount;
-  const lastAssistantMessageString = useMemo(() => {
-    const lastAssistantMessage = last(messages);
-    if (lastAssistantMessage?.role === 'assistant') {
-      return (
-        lastAssistantMessage?.parts
+  const lastAssistantMessage = last(messages);
+  const lastAssistantMessageString =
+    lastAssistantMessage?.role === 'assistant'
+      ? lastAssistantMessage?.parts
           ?.filter((part) => part.type === 'text')
           .map((part) => part.text)
           .join('') || ''
-      );
-    } else {
-      return '';
-    }
-  }, [messages]);
+      : '';
 
   const lastAssistantTokenCount = useTokenCount(lastAssistantMessageString);
-  const tokenPerSecond = useMemo(() => {
-    return lastAssistantTokenCount > 0 && startTime
-      ? // eslint-disable-next-line react-hooks/purity
-        lastAssistantTokenCount / ((Date.now() - startTime) / 1000)
-      : 0;
-  }, [lastAssistantTokenCount, startTime]);
+  let tokenPerSecond = 0;
+  if (lastAssistantTokenCount > 0 && startTime) {
+    // eslint-disable-next-line react-hooks/purity
+    const elapsedSec = ((endTime ?? Date.now()) - startTime) / 1000;
+    tokenPerSecond = elapsedSec > 0 ? lastAssistantTokenCount / elapsedSec : 0;
+  }
 
   return (
     <BAIFlex justify="end" align="end">