Skip to content

Commit b4b23da

Browse files
stephentoubCopilot
andcommitted
Fix flaky pending-messages-modified E2E test across SDKs
The pending-messages-modified event-fidelity test was failing intermittently in the C# SDK CI leg. The root cause was that this test was the only one in its fixture not using the standard `SendAndWaitAsync` + event-collector pattern; it went through a custom helper that did two independently-timed awaits and used an `async void` local function for backfilling existing messages. Refactor the test to the standard pattern in C#, Node, Python, and Go, and while here, replace the `async void` (and its JS analog,`new Promise(async (...) => ...)`) with proper `async Task` / `async` functions drained deterministically. The C# helper now has zero `async void` methods. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 76168a8 commit b4b23da

6 files changed

Lines changed: 113 additions & 95 deletions

File tree

dotnet/test/E2E/EventFidelityE2ETests.cs

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -136,22 +136,26 @@ await session.SendAndWaitAsync(new MessageOptions
136136
public async Task Should_Emit_Pending_Messages_Modified_Event_When_Message_Queue_Changes()
137137
{
138138
var session = await CreateSessionAsync();
139-
var pendingMessagesModified = TestHelper.GetNextEventOfTypeAsync<PendingMessagesModifiedEvent>(
140-
session,
141-
static _ => true,
142-
timeout: TimeSpan.FromSeconds(60),
143-
timeoutDescription: "pending_messages.modified event");
139+
var events = new List<SessionEvent>();
140+
session.On<SessionEvent>(evt => { lock (events) { events.Add(evt); } });
144141

145-
await session.SendAsync(new MessageOptions
142+
// Use SendAndWaitAsync + a single event collector to match the pattern
143+
// of every other test in this fixture (and the Rust E2E equivalent).
144+
// The earlier SendAsync + GetFinalAssistantMessageAsync split relied
145+
// on a custom helper with an async-void backfill and required juggling
146+
// two independently-timed awaits, which has been observed to flake in
147+
// CI.
148+
var answer = await session.SendAndWaitAsync(new MessageOptions
146149
{
147150
Prompt = "What is 9+9? Reply with just the number.",
148-
});
151+
}, timeout: TimeSpan.FromSeconds(120));
149152

150-
var answer = await TestHelper.GetFinalAssistantMessageAsync(session);
151-
var pendingEvent = await pendingMessagesModified;
153+
PendingMessagesModifiedEvent? pendingEvent;
154+
lock (events) { pendingEvent = events.OfType<PendingMessagesModifiedEvent>().FirstOrDefault(); }
152155

153156
Assert.NotNull(pendingEvent);
154-
Assert.Contains("18", answer?.Data.Content ?? string.Empty);
157+
Assert.NotNull(answer);
158+
Assert.Contains("18", answer!.Data.Content);
155159

156160
await session.DisposeAsync();
157161
}

dotnet/test/Harness/TestHelper.cs

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ public static class TestHelper
2222
using var cts = new CancellationTokenSource(timeout ?? DefaultEventTimeout);
2323

2424
// Both `finalAssistantMessage` and `sawIdle` are set from two threads — the
25-
// subscription callback (CLI read loop) and CheckExistingMessages (RPC reply).
25+
// subscription callback (CLI read loop) and CheckExistingMessagesAsync (RPC reply).
2626
// We complete only once we've observed both, regardless of which path saw which.
2727
var stateLock = new object();
2828
AssistantMessageEvent? finalAssistantMessage = null;
@@ -59,14 +59,29 @@ void TryComplete()
5959
});
6060

6161
// Backfill from already-delivered messages so we don't lose events that arrived
62-
// between SendAsync returning and the subscription being installed.
63-
CheckExistingMessages();
62+
// between SendAsync returning and the subscription being installed. Run it
63+
// concurrently with the live subscription, but keep the Task observable so any
64+
// exception is propagated through tcs (not the unobserved-task handler) and so
65+
// we can drain it deterministically below.
66+
var backfill = CheckExistingMessagesAsync();
6467

65-
cts.Token.Register(() => tcs.TrySetException(new TimeoutException("Timeout waiting for assistant message")));
68+
using var registration = cts.Token.Register(
69+
static state => ((TaskCompletionSource<AssistantMessageEvent>)state!).TrySetException(
70+
new TimeoutException("Timeout waiting for assistant message")),
71+
tcs);
6672

67-
return await tcs.Task;
73+
try
74+
{
75+
return await tcs.Task;
76+
}
77+
finally
78+
{
79+
// Drain the backfill before our `using` scopes (cts, subscription) dispose.
80+
// Any exception was already routed through tcs above, so swallow here.
81+
try { await backfill.ConfigureAwait(false); } catch { }
82+
}
6883

69-
async void CheckExistingMessages()
84+
async Task CheckExistingMessagesAsync()
7085
{
7186
try
7287
{

go/internal/e2e/event_fidelity_e2e_test.go

Lines changed: 25 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ import (
66
"strings"
77
"sync"
88
"testing"
9-
"time"
109

1110
copilot "github.com/github/copilot-sdk/go"
1211
"github.com/github/copilot-sdk/go/internal/e2e/testharness"
@@ -135,34 +134,40 @@ func TestEventFidelityE2E(t *testing.T) {
135134
}
136135
t.Cleanup(func() { _ = session.Disconnect() })
137136

138-
pendingModified := make(chan *copilot.SessionEvent, 1)
137+
var mu sync.Mutex
138+
var events []copilot.SessionEvent
139139
session.On(func(event copilot.SessionEvent) {
140-
if _, ok := event.Data.(*copilot.PendingMessagesModifiedData); ok {
141-
select {
142-
case pendingModified <- &event:
143-
default:
144-
}
145-
}
140+
mu.Lock()
141+
events = append(events, event)
142+
mu.Unlock()
146143
})
147144

148-
if _, err := session.Send(t.Context(), copilot.MessageOptions{
145+
// SendAndWait collects everything in one round trip and matches the
146+
// pattern of every other test in this file (and the Rust E2E equivalent),
147+
// avoiding the split fire-and-forget + helper pattern that previously
148+
// made this test prone to flakes.
149+
answer, err := session.SendAndWait(t.Context(), copilot.MessageOptions{
149150
Prompt: "What is 9+9? Reply with just the number.",
150-
}); err != nil {
151-
t.Fatalf("Send failed: %v", err)
151+
})
152+
if err != nil {
153+
t.Fatalf("SendAndWait failed: %v", err)
152154
}
153155

154-
select {
155-
case evt := <-pendingModified:
156-
if evt == nil {
157-
t.Error("Expected a non-nil pending_messages.modified event")
156+
snapshot := snapshotEventFidelityEvents(&mu, &events)
157+
158+
var pendingEvent *copilot.SessionEvent
159+
for i := range snapshot {
160+
if _, ok := snapshot[i].Data.(*copilot.PendingMessagesModifiedData); ok {
161+
pendingEvent = &snapshot[i]
162+
break
158163
}
159-
case <-time.After(60 * time.Second):
160-
t.Fatal("Timed out waiting for pending_messages.modified event")
164+
}
165+
if pendingEvent == nil {
166+
t.Error("Expected to observe a pending_messages.modified event")
161167
}
162168

163-
answer, err := testharness.GetFinalAssistantMessage(t.Context(), session)
164-
if err != nil {
165-
t.Fatalf("Failed to get final assistant message: %v", err)
169+
if answer == nil {
170+
t.Fatal("Expected SendAndWait to return an assistant message")
166171
}
167172
if ad, ok := answer.Data.(*copilot.AssistantMessageData); !ok || !strings.Contains(ad.Content, "18") {
168173
t.Errorf("Expected answer to contain '18', got %v", answer.Data)

nodejs/test/e2e/event_fidelity.e2e.test.ts

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ import { join } from "path";
77
import { describe, expect, it } from "vitest";
88
import { SessionEvent, approveAll } from "../../src/index.js";
99
import { createSdkTestContext } from "./harness/sdkTestContext";
10-
import { getFinalAssistantMessage, getNextEventOfType } from "./harness/sdkTestHelper.js";
1110

1211
describe("Event Fidelity", async () => {
1312
const { copilotClient: client, workDir } = await createSdkTestContext();
@@ -178,17 +177,20 @@ describe("Event Fidelity", async () => {
178177

179178
it("should emit pending messages modified event when message queue changes", async () => {
180179
const session = await client.createSession({ onPermissionRequest: approveAll });
180+
const events: SessionEvent[] = [];
181+
session.on((event) => {
182+
events.push(event);
183+
});
181184

182-
const pendingModifiedP = getNextEventOfType(session, "pending_messages.modified");
183-
184-
void session.send({
185+
// sendAndWait collects everything in one round trip and matches the
186+
// pattern of every other test in this file (and the Rust E2E equivalent),
187+
// avoiding the split fire-and-forget + helper pattern that previously
188+
// made this test prone to flakes.
189+
const answer = await session.sendAndWait({
185190
prompt: "What is 9+9? Reply with just the number.",
186191
});
187192

188-
const [pendingEvent, answer] = await Promise.all([
189-
pendingModifiedP,
190-
getFinalAssistantMessage(session),
191-
]);
193+
const pendingEvent = events.find((e) => e.type === "pending_messages.modified");
192194

193195
expect(pendingEvent).toBeDefined();
194196
expect(answer?.data.content).toContain("18");

nodejs/test/e2e/harness/sdkTestHelper.ts

Lines changed: 32 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -8,50 +8,46 @@ export async function getFinalAssistantMessage(
88
session: CopilotSession,
99
{ alreadyIdle = false }: { alreadyIdle?: boolean } = {}
1010
): Promise<AssistantMessageEvent> {
11-
// We don't know whether the answer has already arrived or not, so race both possibilities
12-
return new Promise<AssistantMessageEvent>(async (resolve, reject) => {
13-
getFutureFinalResponse(session).then(resolve).catch(reject);
14-
getExistingFinalResponse(session, alreadyIdle)
15-
.then((msg) => {
16-
if (msg) {
17-
resolve(msg);
18-
}
19-
})
20-
.catch(reject);
21-
});
11+
// Start listening for the answer immediately so we don't miss any events that arrive
12+
// between the existing-messages RPC starting and the subscription being installed.
13+
const futurePromise = getFutureFinalResponse(session);
14+
// We may end up returning from the existing-messages path; attach a noop handler so
15+
// the unawaited future-response rejection doesn't surface as an unhandled rejection.
16+
futurePromise.catch(() => {});
17+
18+
const existing = await getExistingFinalResponse(session, alreadyIdle);
19+
if (existing) {
20+
return existing;
21+
}
22+
return futurePromise;
2223
}
2324

24-
function getExistingFinalResponse(
25+
async function getExistingFinalResponse(
2526
session: CopilotSession,
2627
alreadyIdle: boolean = false
2728
): Promise<AssistantMessageEvent | undefined> {
28-
return new Promise<AssistantMessageEvent | undefined>(async (resolve, reject) => {
29-
const messages = await session.getEvents();
30-
const finalUserMessageIndex = messages.findLastIndex((m) => m.type === "user.message");
31-
const currentTurnMessages =
32-
finalUserMessageIndex < 0 ? messages : messages.slice(finalUserMessageIndex);
29+
const messages = await session.getEvents();
30+
const finalUserMessageIndex = messages.findLastIndex((m) => m.type === "user.message");
31+
const currentTurnMessages =
32+
finalUserMessageIndex < 0 ? messages : messages.slice(finalUserMessageIndex);
3333

34-
const currentTurnError = currentTurnMessages.find((m) => m.type === "session.error");
35-
if (currentTurnError) {
36-
const error = new Error(currentTurnError.data.message);
37-
error.stack = currentTurnError.data.stack;
38-
reject(error);
39-
return;
40-
}
34+
const currentTurnError = currentTurnMessages.find((m) => m.type === "session.error");
35+
if (currentTurnError) {
36+
const error = new Error(currentTurnError.data.message);
37+
error.stack = currentTurnError.data.stack;
38+
throw error;
39+
}
4140

42-
const sessionIdleMessageIndex = alreadyIdle
43-
? currentTurnMessages.length
44-
: currentTurnMessages.findIndex((m) => m.type === "session.idle");
45-
if (sessionIdleMessageIndex !== -1) {
46-
const lastAssistantMessage = currentTurnMessages
47-
.slice(0, sessionIdleMessageIndex)
48-
.findLast((m) => m.type === "assistant.message");
49-
resolve(lastAssistantMessage as AssistantMessageEvent | undefined);
50-
return;
51-
}
41+
const sessionIdleMessageIndex = alreadyIdle
42+
? currentTurnMessages.length
43+
: currentTurnMessages.findIndex((m) => m.type === "session.idle");
44+
if (sessionIdleMessageIndex !== -1) {
45+
return currentTurnMessages
46+
.slice(0, sessionIdleMessageIndex)
47+
.findLast((m) => m.type === "assistant.message") as AssistantMessageEvent | undefined;
48+
}
5249

53-
resolve(undefined);
54-
});
50+
return undefined;
5551
}
5652

5753
function getFutureFinalResponse(session: CopilotSession): Promise<AssistantMessageEvent> {

python/e2e/test_event_fidelity_e2e.py

Lines changed: 11 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
from __future__ import annotations
44

5-
import asyncio
65
from pathlib import Path
76

87
import pytest
@@ -178,22 +177,19 @@ async def test_should_emit_pending_messages_modified_event_when_message_queue_ch
178177
session = await ctx.client.create_session(
179178
on_permission_request=PermissionHandler.approve_all
180179
)
181-
pending_task: asyncio.Future = asyncio.get_event_loop().create_future()
182-
183-
def on_event(event):
184-
if isinstance(event.data, PendingMessagesModifiedData) and not pending_task.done():
185-
pending_task.set_result(event)
186-
187-
unsubscribe = session.on(on_event)
180+
events = []
181+
unsubscribe = session.on(events.append)
188182
try:
189-
# Fire-and-forget to trigger pending_messages.modified; then wait for it
190-
asyncio.ensure_future(session.send("What is 9+9? Reply with just the number."))
191-
pending_event = await asyncio.wait_for(pending_task, timeout=60.0)
183+
# send_and_wait collects everything in one round trip and matches the
184+
# pattern of every other test in this file (and the Rust E2E equivalent),
185+
# avoiding the split fire-and-forget + helper pattern that previously
186+
# made this test prone to flakes.
187+
answer = await session.send_and_wait("What is 9+9? Reply with just the number.")
188+
189+
pending_event = next(
190+
(e for e in events if isinstance(e.data, PendingMessagesModifiedData)), None
191+
)
192192
assert pending_event is not None
193-
194-
from .testharness.helper import get_final_assistant_message
195-
196-
answer = await get_final_assistant_message(session, timeout=60.0)
197193
assert answer is not None
198194
assert "18" in (answer.data.content or "")
199195
finally:

0 commit comments

Comments
 (0)