1- import type { ClickHouse , TaskRunV1 , RawTaskRunPayloadV1 } from "@internal/clickhouse" ;
1+ import type { ClickHouse , RawTaskRunPayloadV1 , TaskRunV1 } from "@internal/clickhouse" ;
22import { RedisOptions } from "@internal/redis" ;
33import { LogicalReplicationClient , Transaction , type PgoutputMessage } from "@internal/replication" ;
44import { Logger } from "@trigger.dev/core/logger" ;
55import { tryCatch } from "@trigger.dev/core/utils" ;
6- import { TaskRunError } from "@trigger.dev/core/v3/schemas" ;
76import { parsePacket } from "@trigger.dev/core/v3/utils/ioSerialization" ;
87import { TaskRun } from "@trigger.dev/database" ;
98import { nanoid } from "nanoid" ;
@@ -46,6 +45,8 @@ export class RunsReplicationService {
4645 private _lastReplicationLagMs : number | null = null ;
4746 private _transactionCounter ?: Counter ;
4847 private _insertStrategy : "streaming" | "batching" ;
48+ private _isShuttingDown = false ;
49+ private _isShutDownComplete = false ;
4950
5051 constructor ( private readonly options : RunsReplicationServiceOptions ) {
5152 this . logger = new Logger ( "RunsReplicationService" , "debug" ) ;
@@ -62,7 +63,7 @@ export class RunsReplicationService {
6263 table : "TaskRun" ,
6364 redisOptions : options . redisOptions ,
6465 autoAcknowledge : false ,
65- publicationActions : [ "insert" , "update" ] ,
66+ publicationActions : [ "insert" , "update" , "delete" ] ,
6667 logger : new Logger ( "RunsReplicationService" , "debug" ) ,
6768 leaderLockTimeoutMs : options . leaderLockTimeoutMs ?? 30_000 ,
6869 leaderLockExtendIntervalMs : options . leaderLockExtendIntervalMs ?? 10_000 ,
@@ -84,6 +85,9 @@ export class RunsReplicationService {
8485 } ) ;
8586
8687 this . _replicationClient . events . on ( "heartbeat" , async ( { lsn, shouldRespond } ) => {
88+ if ( this . _isShuttingDown ) return ;
89+ if ( this . _isShutDownComplete ) return ;
90+
8791 if ( shouldRespond ) {
8892 await this . _replicationClient . acknowledge ( lsn ) ;
8993 }
@@ -130,6 +134,11 @@ export class RunsReplicationService {
130134 }
131135 }
132136
137+ public shutdown ( ) {
138+ this . logger . info ( "Initiating shutdown of runs replication service" ) ;
139+ this . _isShuttingDown = true ;
140+ }
141+
133142 async start ( insertStrategy ?: "streaming" | "batching" ) {
134143 this . _insertStrategy = insertStrategy ?? this . _insertStrategy ;
135144
@@ -201,11 +210,27 @@ export class RunsReplicationService {
201210 }
202211
203212 async #handleTransaction( transaction : Transaction < TaskRun > ) {
213+ if ( this . _isShutDownComplete ) return ;
214+
215+ let alreadyAcknowledged = false ;
216+
217+ if ( this . _isShuttingDown ) {
218+ // We need to immediately acknowledge the transaction
219+ // And then try and handle this transaction
220+ if ( transaction . commitEndLsn ) {
221+ await this . _replicationClient . acknowledge ( transaction . commitEndLsn ) ;
222+ alreadyAcknowledged = true ;
223+ }
224+
225+ await this . _replicationClient . stop ( ) ;
226+ this . _isShutDownComplete = true ;
227+ }
228+
204229 this . _lastReplicationLagMs = transaction . replicationLagMs ;
205230
206231 // If there are no events, do nothing
207232 if ( transaction . events . length === 0 ) {
208- if ( transaction . commitEndLsn ) {
233+ if ( transaction . commitEndLsn && ! alreadyAcknowledged ) {
209234 await this . _replicationClient . acknowledge ( transaction . commitEndLsn ) ;
210235 }
211236
@@ -222,6 +247,7 @@ export class RunsReplicationService {
222247
223248 this . logger . debug ( "Handling transaction" , {
224249 transaction,
250+ alreadyAcknowledged,
225251 } ) ;
226252
227253 // If there are events, we need to handle them
@@ -230,13 +256,19 @@ export class RunsReplicationService {
230256 this . _transactionCounter ?. inc ( ) ;
231257
232258 if ( this . _insertStrategy === "streaming" ) {
233- await this . _concurrentFlushScheduler . addToBatch (
234- transaction . events . map ( ( event ) => ( {
235- _version,
236- run : event . data ,
237- event : event . tag ,
238- } ) )
239- ) ;
259+ this . _concurrentFlushScheduler
260+ . addToBatch (
261+ transaction . events . map ( ( event ) => ( {
262+ _version,
263+ run : event . data ,
264+ event : event . tag ,
265+ } ) )
266+ )
267+ . catch ( ( error ) => {
268+ this . logger . error ( "Error adding to batch" , {
269+ error,
270+ } ) ;
271+ } ) ;
240272 } else {
241273 const [ flushError ] = await tryCatch (
242274 this . #flushBatch(
@@ -256,7 +288,9 @@ export class RunsReplicationService {
256288 }
257289 }
258290
259- await this . _replicationClient . acknowledge ( transaction . commitEndLsn ) ;
291+ if ( ! alreadyAcknowledged ) {
292+ await this . _replicationClient . acknowledge ( transaction . commitEndLsn ) ;
293+ }
260294 }
261295
262296 async #flushBatch( flushId : string , batch : Array < TaskRunInsert > ) {
@@ -497,7 +531,6 @@ export class ConcurrentFlushScheduler<T> {
497531 private readonly MAX_CONCURRENCY : number ;
498532 private readonly concurrencyLimiter : ReturnType < typeof pLimit > ;
499533 private flushTimer : NodeJS . Timeout | null ;
500- private isShuttingDown ;
501534 private failedBatchCount ;
502535 private metricsRegister ?: MetricsRegister ;
503536 private logger : Logger ;
@@ -510,7 +543,6 @@ export class ConcurrentFlushScheduler<T> {
510543 this . MAX_CONCURRENCY = config . maxConcurrency || 1 ;
511544 this . concurrencyLimiter = pLimit ( this . MAX_CONCURRENCY ) ;
512545 this . flushTimer = null ;
513- this . isShuttingDown = false ;
514546 this . failedBatchCount = 0 ;
515547
516548 this . logger . info ( "Initializing ConcurrentFlushScheduler" , {
@@ -520,7 +552,6 @@ export class ConcurrentFlushScheduler<T> {
520552 } ) ;
521553
522554 this . startFlushTimer ( ) ;
523- this . setupShutdownHandlers ( ) ;
524555
525556 if ( ! process . env . VITEST && config . metricsRegister ) {
526557 this . metricsRegister = config . metricsRegister ;
@@ -592,27 +623,6 @@ export class ConcurrentFlushScheduler<T> {
592623 this . logger . debug ( "Started flush timer" , { interval : this . FLUSH_INTERVAL } ) ;
593624 }
594625
595- private setupShutdownHandlers ( ) {
596- process . on ( "SIGTERM" , this . shutdown . bind ( this ) ) ;
597- process . on ( "SIGINT" , this . shutdown . bind ( this ) ) ;
598- this . logger . debug ( "Shutdown handlers configured" ) ;
599- }
600-
601- private async shutdown ( ) : Promise < void > {
602- if ( this . isShuttingDown ) return ;
603- this . isShuttingDown = true ;
604- this . logger . info ( "Initiating shutdown of dynamic flush scheduler" , {
605- remainingItems : this . currentBatch . length ,
606- } ) ;
607-
608- await this . checkAndFlush ( ) ;
609- this . clearTimer ( ) ;
610-
611- this . logger . info ( "Dynamic flush scheduler shutdown complete" , {
612- totalFailedBatches : this . failedBatchCount ,
613- } ) ;
614- }
615-
616626 private clearTimer ( ) : void {
617627 if ( this . flushTimer ) {
618628 clearInterval ( this . flushTimer ) ;
0 commit comments