@@ -664,32 +664,27 @@ describe("createEmbedPool", () => {
664664 pool . shutdown ( ) ;
665665 } ) ;
666666
// Regression test: the worker factory throws on its second invocation, i.e.
// on the first restart attempt after the initial worker exits. The test
// expects at least one further factory call and a live pool afterwards.
it("BUG: workerFactory() throwing in scheduleRestart retries with backoff ", async () => {
  let callCount = 0;
  const workers = [];
  // Succeeds on every call except the second one.
  const factory = () => {
    callCount++;
    if (callCount === 2) throw new Error("Worker constructor exploded");
    const w = new MockWorker();
    workers.push(w);
    return w;
  };
  const pool = createEmbedPool(factory, { restartDelay: 50, workerReadyTimeout: 2000, maxRestartDelay: 200 });
  pool.initWorker();

  try {
    // Worker exits cleanly — first restart throws, second should succeed via backoff
    workers[0].emit("exit", 0);

    // Wait for retry
    await new Promise((r) => setTimeout(r, 500));

    assert.ok(callCount >= 3, `Expected at least 3 factory calls, got ${callCount} `);
    assert.equal(pool.isAlive(), true, " pool should recover via backoff retry");
  } finally {
    // Shut down even when an assertion throws so the pool is never leaked
    // into later tests.
    pool.shutdown();
  }
});
695690
@@ -730,4 +725,194 @@ describe("createEmbedPool", () => {
730725 assert . ok ( result instanceof Error ) ;
731726 assert . match ( result . message , / n o t r u n n i n g | s h u t t i n g d o w n / i) ;
732727 } ) ;
728+
// === BUG: code-0 exit orphans in-flight embeds ===

// An embed that is still pending when the worker emits a clean (code 0) exit
// must be rejected promptly instead of lingering until embedTimeout
// (5000 ms in this test).
it("BUG: code-0 exit rejects in-flight embeds (not orphaned for 60s)", async () => {
  const factory = mockWorkerFactory();
  const pool = createEmbedPool(factory, { embedTimeout: 5000, restartDelay: 50 });
  pool.initWorker();

  try {
    // Start an embed (worker won't respond). A rejection is captured as a
    // value so it can be inspected below.
    const embedPromise = pool.embed("hello").catch((e) => e);
    await new Promise((r) => setTimeout(r, 10));

    // Worker exits cleanly — in-flight embed should be rejected promptly
    const start = Date.now();
    factory.workers[0].emit("exit", 0);

    const result = await embedPromise;
    const elapsed = Date.now() - start;

    assert.ok(result instanceof Error, "in-flight embed should have been rejected");
    assert.ok(elapsed < 1000,
      `embed took ${elapsed} ms to reject — orphaned until embedTimeout instead of rejected on exit`);
  } finally {
    // Shut down even when an assertion throws so the pool and any pending
    // embed are not left behind.
    pool.shutdown();
  }
});
752+
// === BUG: failed restart = permanent death (no retry) ===

// The factory fails on its 2nd and 3rd invocations (the first two restart
// attempts) and succeeds on the 4th. The test expects the pool to keep
// retrying and to be alive once the wait is over.
it("BUG: failed restart retries with backoff instead of dying permanently", async () => {
  let callCount = 0;
  const workers = [];
  const factory = () => {
    callCount++;
    // Fail on attempts 2 and 3, succeed on attempt 4
    if (callCount >= 2 && callCount <= 3) throw new Error("ONNX load failed");
    const w = new MockWorker();
    workers.push(w);
    return w;
  };
  const pool = createEmbedPool(factory, {
    restartDelay: 30,
    workerReadyTimeout: 5000,
    maxRestartDelay: 200,
  });
  pool.initWorker();

  try {
    // Worker exits — first restart attempt will fail, second will fail, third should succeed
    workers[0].emit("exit", 0);

    // Wait for backoff retries to play out
    await new Promise((r) => setTimeout(r, 1500));

    assert.ok(callCount >= 4,
      `Expected at least 4 factory calls (1 init + 2 failures + 1 success), got ${callCount} `);
    assert.equal(pool.isAlive(), true, "pool should have recovered after transient failures");
  } finally {
    // Shut down even when an assertion throws so the pool is never leaked.
    pool.shutdown();
  }
});
784+
// Records a timestamp for every factory invocation while the factory fails
// on every call after the first, then checks that the gap between
// consecutive restart attempts never shrinks (within a 20% tolerance).
it("POSITIVE: backoff delay increases on consecutive failures", async () => {
  let callCount = 0;
  const timestamps = [];
  const workers = [];
  const factory = () => {
    callCount++;
    timestamps.push(Date.now());
    if (callCount >= 2) throw new Error("still broken");
    const w = new MockWorker();
    workers.push(w);
    return w;
  };
  const pool = createEmbedPool(factory, {
    restartDelay: 50,
    workerReadyTimeout: 5000,
    maxRestartDelay: 400,
  });
  pool.initWorker();

  try {
    workers[0].emit("exit", 0);
    // Wait for several backoff attempts
    await new Promise((r) => setTimeout(r, 2000));

    // Should have multiple attempts with increasing gaps
    assert.ok(callCount >= 4, `Expected at least 4 attempts, got ${callCount} `);

    // Verify delays are increasing (backoff). The loop starts at i = 2 so
    // that timestamps[0] (the initial initWorker call) is never part of a
    // gap; only gaps between restart attempts are compared.
    for (let i = 2; i < timestamps.length - 1; i++) {
      const gap1 = timestamps[i] - timestamps[i - 1];
      const gap2 = timestamps[i + 1] - timestamps[i];
      assert.ok(gap2 >= gap1 * 0.8, // allow 20% timing jitter
        `Expected increasing delays but gap ${i} : ${gap1} ms, gap ${i + 1} : ${gap2} ms`);
    }
  } finally {
    // The factory never recovers in this test, so shut down even when an
    // assertion throws; otherwise the pool is left un-shut-down and may
    // keep retrying.
    pool.shutdown();
  }
});
820+
// The factory fails only on its second invocation. After the pool has
// recovered from that failure, a second clean exit is triggered and the
// pool is expected to be alive again 200 ms later.
// NOTE(review): the second phase only checks isAlive() after 200 ms; it does
// not measure the delay actually used, so it may not distinguish a reset
// delay from a slightly longer one — confirm against the pool implementation.
it("POSITIVE: backoff resets after a successful restart", async () => {
  let callCount = 0;
  const workers = [];
  const factory = () => {
    callCount++;
    // Fail on second call, succeed on all others
    if (callCount === 2) throw new Error("transient failure");
    const w = new MockWorker();
    workers.push(w);
    return w;
  };
  const pool = createEmbedPool(factory, {
    restartDelay: 30,
    workerReadyTimeout: 5000,
    maxRestartDelay: 500,
  });
  pool.initWorker();

  try {
    // First exit → restart fails → retries → succeeds
    workers[0].emit("exit", 0);
    await new Promise((r) => setTimeout(r, 500));

    assert.equal(pool.isAlive(), true, "pool should have recovered");
    const secondWorkerIdx = workers.length - 1;

    // Second exit → should restart with initial delay (not backoff from previous failure)
    workers[secondWorkerIdx].emit("exit", 0);
    await new Promise((r) => setTimeout(r, 200));

    assert.equal(pool.isAlive(), true, "pool should restart at base delay after prior success");
  } finally {
    // Shut down even when an assertion throws so the pool is never leaked.
    pool.shutdown();
  }
});
853+
// With a factory that fails on every call after the first, records the time
// of each factory invocation and checks that the most recent gaps between
// attempts stay at or below maxRestartDelay (150 ms) plus a jitter allowance.
it("POSITIVE: backoff caps at maxRestartDelay", async () => {
  let callCount = 0;
  const timestamps = [];
  const workers = [];
  const factory = () => {
    callCount++;
    timestamps.push(Date.now());
    if (callCount >= 2) throw new Error("permanently broken");
    const w = new MockWorker();
    workers.push(w);
    return w;
  };
  const pool = createEmbedPool(factory, {
    restartDelay: 50,
    maxRestartDelay: 150,
  });
  pool.initWorker();

  try {
    workers[0].emit("exit", 0);

    await new Promise((r) => setTimeout(r, 2000));

    // Without this guard the gap checks below pass vacuously when the pool
    // never retries, because there would be too few gaps to inspect.
    assert.ok(callCount >= 4, `Expected at least 4 attempts, got ${callCount}`);

    // All gaps after the first few should be capped at ~150ms
    const gaps = [];
    for (let i = 1; i < timestamps.length; i++) {
      gaps.push(timestamps[i] - timestamps[i - 1]);
    }
    // The last few gaps should all be ≤ maxRestartDelay + jitter
    const lastGaps = gaps.slice(-3);
    for (const gap of lastGaps) {
      assert.ok(gap <= 250,
        `Gap ${gap} ms exceeds maxRestartDelay (150ms) + reasonable jitter`);
    }
  } finally {
    // The factory never recovers in this test, so shut down even when an
    // assertion throws; otherwise the pool is left un-shut-down and may
    // keep retrying.
    pool.shutdown();
  }
});
888+
// Emits a non-zero exit on the only worker, shuts the pool down right away,
// waits longer than restartDelay (50 ms), and verifies that the factory was
// never asked for a second worker.
it("shutdown cancels pending restart timer", async () => {
  const workerFactory = mockWorkerFactory();
  const embedPool = createEmbedPool(workerFactory, { restartDelay: 50 });
  embedPool.initWorker();

  // Exit with code 1, then shut down before the restart delay can elapse.
  workerFactory.workers[0].emit("exit", 1);
  embedPool.shutdown();

  // Give a restart timer, if one were still pending, ample time to fire.
  await new Promise((resolve) => setTimeout(resolve, 100));

  // Exactly one worker must ever have been created.
  const createdCount = workerFactory.workers.length;
  assert.equal(createdCount, 1, "shutdown should cancel pending restart");
});
906+
// Creates a pool without an options argument and checks that a basic embed
// round-trip still completes: the test answers the first message posted to
// the mock worker and awaits the embed promise.
it("uses default options when none provided", async () => {
  const factory = mockWorkerFactory();
  const pool = createEmbedPool(factory);
  pool.initWorker();

  try {
    const p = pool.embed("test");
    // Reply to the request using the id taken from the posted message.
    const msg = factory.workers[0].messages[0];
    factory.workers[0].emit("message", { id: msg.id, embedding: Buffer.from([1]) });
    await p;
  } finally {
    // Shut down even if the message lookup or the embed fails, so a pending
    // embed is not left behind.
    pool.shutdown();
  }
});
733918} ) ;
0 commit comments