Skip to content

Commit 6209b20

Browse files
committed
review: include tool-call argument tokens in tps generation window
- Bookmark first/last token timestamps on ResponseFunctionCallArgumentsDeltaEvent too, so the tps generation window covers all event types whose tokens land in usage.output_tokens. Previously the numerator counted argument tokens but the denominator excluded their generation time, inflating tps for tool-heavy responses.
- Lifted the bookmarking out of the text-delta branch into a single up-front check covering all four token-producing event types — cleaner than duplicating across branches.
- Documented the single-token skip case (window collapses to 0) inline at the guard. TPS is undefined for a one-token response, so emitting nothing is correct; the comment makes the intent visible to future readers.
1 parent 45733c9 commit 6209b20

1 file changed

Lines changed: 17 additions & 9 deletions

File tree

src/agentex/lib/core/temporal/plugins/openai_agents/models/temporal_streaming_model.py

Lines changed: 17 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -725,6 +725,20 @@ async def get_response(
725725
# Log event type
726726
logger.debug(f"[TemporalStreamingModel] Event {event_count}: {type(event).__name__}")
727727

728+
# Bookmark first/last token-producing events for ttft and tps.
729+
# Includes function-call argument deltas so the generation window
730+
# covers every event type whose tokens land in usage.output_tokens.
731+
if isinstance(event, (
732+
ResponseTextDeltaEvent,
733+
ResponseReasoningTextDeltaEvent,
734+
ResponseReasoningSummaryTextDeltaEvent,
735+
ResponseFunctionCallArgumentsDeltaEvent,
736+
)):
737+
now_perf = time.perf_counter()
738+
if first_token_at is None:
739+
first_token_at = now_perf
740+
last_token_at = now_perf
741+
728742
# Handle different event types using isinstance for type safety
729743
if isinstance(event, ResponseOutputItemAddedEvent):
730744
# New output item (reasoning, function call, or message)
@@ -790,15 +804,6 @@ async def get_response(
790804
# Handle text streaming
791805
delta = getattr(event, 'delta', '')
792806

793-
# Bookmark first/last content-bearing events for ttft and tps.
794-
# last_token_at is updated on every delta so tps measures only
795-
# the model-generation window, not subsequent tool-call /
796-
# event-handler time.
797-
now_perf = time.perf_counter()
798-
if first_token_at is None:
799-
first_token_at = now_perf
800-
last_token_at = now_perf
801-
802807
if isinstance(event, ResponseReasoningSummaryTextDeltaEvent) and reasoning_context:
803808
# Stream reasoning summary deltas - these are the actual reasoning tokens!
804809
try:
@@ -1075,6 +1080,9 @@ async def get_response(
10751080
m.ttft_ms.record((first_token_at - stream_start_perf) * 1000, metric_attrs)
10761081
# tps denominator is the generation window (first→last delta), not
10771082
# total stream wall time — see _StreamingMetrics for rationale.
1083+
# Note: single-token responses (where first_token_at == last_token_at,
1084+
# e.g. a one-token tool-result acknowledgement) collapse the window
1085+
# to 0 and are intentionally skipped — TPS is undefined in that case.
10781086
if (
10791087
first_token_at is not None
10801088
and last_token_at is not None

0 commit comments

Comments
 (0)