Skip to content

Commit 09134a3

Browse files
spypsyaztec-bot
authored andcommitted
fix: interrupt publisher send-at-slot sleep on sequencer stop (#23990)
## Summary - Propagate `CheckpointProposalJob.interrupt()` to its `SequencerPublisher` so the publisher's `sendRequestsAt` slot-deadline sleep is cancelled on sequencer stop. - Check `interrupted` *before* sleeping in `sendRequestsAt`, since `InterruptibleSleep.interrupt()` only resolves sleeps already in flight. Completes #23930, which made the job's own polling waits interruptible but not the publisher's. In [this `e2e_ha_full.test.ts` flake](http://ci.aztec-labs.com/1c46f3d4a226073d) a node became proposer 165ms before teardown and took the no-broadcast votes path, leaving `pendingL1Submission = publisher.sendRequestsAt(targetSlot)` sleeping until the target slot — ~22 wall-clock minutes away under the warped test clock. `Sequencer.stop()` blocked on it ("Awaiting pending L1 payload submission"), node stops were abandoned, and Jest hung on leaked handles until CI killed the run. Nothing in the shutdown path interrupts the per-job `SequencerPublisher`: `publisherFactory.stopAll()` only interrupts the underlying `L1TxUtils`. ## Tests - `checkpoint_proposal_job.test.ts`: pending L1 submission blocked in the publisher resolves promptly on `job.interrupt()` (red without the fix). - `sequencer-publisher.test.ts`: `sendRequestsAt` returns immediately when interrupted before the sleep starts (red without the fix).
1 parent a438a66 commit 09134a3

5 files changed

Lines changed: 85 additions & 6 deletions

File tree

yarn-project/end-to-end/src/composed/ha/e2e_ha_full.test.ts

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -304,24 +304,39 @@ describe('HA Full Setup', () => {
304304
});
305305

306306
afterAll(async () => {
307+
const cleanupErrors: string[] = [];
308+
309+
dateProvider?.reset();
310+
307311
// Stop all HA peer nodes in parallel with a per-node deadline. A single stuck node can otherwise
308-
// block the serial loop long enough to blow the jest hook timeout — e.g. a sequencer.stop() that
309-
// awaits an L1 publish whose tx-timeout was computed on a test-warped clock and never fires.
312+
// block the serial loop long enough to blow the jest hook timeout, so report the stuck node directly
313+
// instead of letting the suite pass and later fail with Jest open handles.
310314
if (haNodeServices) {
311315
const STOP_DEADLINE_MS = 30_000;
312-
await Promise.allSettled(
316+
const stopResults = await Promise.allSettled(
313317
haNodeServices.map((service, i) => {
314318
logger.info(`Stopping HA peer node ${i}`);
315319
return Promise.race([
316320
service.stop().catch(error => {
317-
logger.error(`Failed to stop HA peer node ${i}: ${error}`);
321+
const message = `Failed to stop HA peer node ${i}: ${error}`;
322+
logger.error(message);
323+
return message;
318324
}),
319325
sleep(STOP_DEADLINE_MS).then(() => {
320-
logger.error(`HA peer node ${i} stop did not return within ${STOP_DEADLINE_MS}ms; abandoning`);
326+
const message = `HA peer node ${i} stop did not return within ${STOP_DEADLINE_MS}ms; abandoning`;
327+
logger.error(message);
328+
return message;
321329
}),
322330
]);
323331
}),
324332
);
333+
for (const result of stopResults) {
334+
if (result.status === 'rejected') {
335+
cleanupErrors.push(`Unexpected HA node stop error: ${result.reason}`);
336+
} else if (result.value) {
337+
cleanupErrors.push(result.value);
338+
}
339+
}
325340
}
326341

327342
// Cleanup HA keystore temp directories
@@ -350,6 +365,10 @@ describe('HA Full Setup', () => {
350365

351366
// Cleanup bootstrap node and test infrastructure (this cleans up the shared data directory)
352367
await teardown();
368+
369+
if (cleanupErrors.length > 0) {
370+
throw new Error(cleanupErrors.join('\n'));
371+
}
353372
});
354373

355374
afterEach(async () => {

yarn-project/sequencer-client/src/publisher/sequencer-publisher.test.ts

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -720,6 +720,28 @@ describe('SequencerPublisher', () => {
720720
expect((publisher as any).requests.length).toEqual(0);
721721
});
722722

723+
it('does not sleep in sendRequestsAt if interrupted beforehand', async () => {
724+
// A target slot far enough in the future that sendRequestsAt would sleep for ~1 hour
725+
// (EmptyL1RollupConstants has slotDuration 1s and l1GenesisTime 0, so slot N starts at N seconds).
726+
const targetSlot = SlotNumber(Math.ceil(Date.now() / 1000) + 3600);
727+
publisher.interrupt();
728+
729+
let timeout: NodeJS.Timeout | undefined;
730+
try {
731+
const result = await Promise.race([
732+
publisher.sendRequestsAt(targetSlot),
733+
new Promise<'timed-out'>(resolve => {
734+
timeout = setTimeout(() => resolve('timed-out'), 1000);
735+
}),
736+
]);
737+
expect(result).toBeUndefined();
738+
} finally {
739+
if (timeout) {
740+
clearTimeout(timeout);
741+
}
742+
}
743+
});
744+
723745
it('does not send requests if no valid requests are found', async () => {
724746
publisher.addRequest({
725747
action: 'propose',

yarn-project/sequencer-client/src/publisher/sequencer-publisher.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -630,6 +630,9 @@ export class SequencerPublisher {
630630
// Aim to be in the mempool one L1 slot before the L2 slot starts, so we have a chance of
631631
// being picked up by the first L1 block of the L2 slot.
632632
const submitAfterMs = startOfTargetSlotMs - Number(this.ethereumSlotDuration) * 1000;
633+
if (this.interrupted) {
634+
return undefined;
635+
}
633636
const sleepMs = submitAfterMs - this.dateProvider.now();
634637
if (sleepMs > 0) {
635638
this.log.debug(`Sleeping ${sleepMs}ms before sending requests`, {

yarn-project/sequencer-client/src/sequencer/checkpoint_proposal_job.test.ts

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import { TimeoutError } from '@aztec/foundation/error';
1414
import { EthAddress } from '@aztec/foundation/eth-address';
1515
import { Signature } from '@aztec/foundation/eth-signature';
1616
import { createLogger } from '@aztec/foundation/log';
17+
import { promiseWithResolvers } from '@aztec/foundation/promise';
1718
import { TestDateProvider } from '@aztec/foundation/timer';
1819
import type { TypedEventEmitter } from '@aztec/foundation/types';
1920
import { type P2P, P2PClientState } from '@aztec/p2p';
@@ -1719,6 +1720,39 @@ describe('CheckpointProposalJob', () => {
17191720
}
17201721
});
17211722

1723+
it('interrupts a pending L1 submission sleeping in the publisher', async () => {
1724+
const { txs, block } = await setupTxsAndBlock(p2p, globalVariables, 1, chainId);
1725+
checkpointBuilder.seedBlocks([block], [txs]);
1726+
validatorClient.collectAttestations.mockResolvedValue(getAttestations(block));
1727+
1728+
// Simulate sendRequestsAt sleeping until the target slot: the promise only resolves once
1729+
// the publisher itself is interrupted.
1730+
const sendDeferred = promiseWithResolvers<undefined>();
1731+
publisher.sendRequestsAt.mockReturnValue(sendDeferred.promise);
1732+
publisher.interrupt.mockImplementation(() => sendDeferred.resolve(undefined));
1733+
1734+
const checkpoint = await job.execute();
1735+
expect(checkpoint).toBeDefined();
1736+
1737+
const pendingSubmission = job.awaitPendingSubmission().then(() => 'stopped' as const);
1738+
job.interrupt();
1739+
1740+
let timeout: NodeJS.Timeout | undefined;
1741+
try {
1742+
const result = await Promise.race([
1743+
pendingSubmission,
1744+
new Promise<'timed-out'>(resolve => {
1745+
timeout = setTimeout(() => resolve('timed-out'), 1000);
1746+
}),
1747+
]);
1748+
expect(result).toBe('stopped');
1749+
} finally {
1750+
if (timeout) {
1751+
clearTimeout(timeout);
1752+
}
1753+
}
1754+
});
1755+
17221756
it('aborts checkpoint when syncing proposed block to archiver fails', async () => {
17231757
const { txs, block } = await setupTxsAndBlock(p2p, globalVariables, 1, chainId);
17241758
checkpointBuilder.seedBlocks([block], [txs]);

yarn-project/sequencer-client/src/sequencer/checkpoint_proposal_job.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -189,10 +189,11 @@ export class CheckpointProposalJob implements Traceable {
189189
await this.pendingL1Submission;
190190
}
191191

192-
/** Interrupts job-owned waits so shutdown can finish. */
192+
/** Interrupts job-owned waits, including the publisher's send-at-slot sleep, so shutdown can finish. */
193193
public interrupt(): void {
194194
this.interrupted = true;
195195
this.interruptibleSleep.interrupt(true);
196+
this.publisher.interrupt();
196197
}
197198

198199
private async awaitInterruptibleSleep(ms: number): Promise<void> {

0 commit comments

Comments
 (0)