@@ -16,7 +16,7 @@ const DEFAULT_ENCODER = new TextEncoder()
1616
/** Default history cap. NOTE(review): presumably the number of retained route records — confirm at the use site. */
export const DEFAULT_HISTORY_LIMIT = 200

/** Default SSE retry interval in milliseconds. NOTE(review): presumably emitted as the SSE `retry:` hint — confirm at the use site. */
export const DEFAULT_SSE_RETRY_MS = 2000

/**
 * Default cooldown, in milliseconds, for an exhausted instance: 60 minutes.
 *
 * Applies when an upstream signals exhaustion (402/429) without a usable
 * Retry-After, so the instance stays out of rotation long enough to skip
 * dead quota.
 */
export const DEFAULT_INSTANCE_COOLDOWN_MS = 60 * 60 * 1000
2020
2121export interface ProxyContext {
2222 body : string
@@ -91,6 +91,7 @@ export interface ProxyToOptions {
9191 logger : ( line : string ) => void
9292 fetchImpl ?: typeof fetch
9393 onQuotaSnapshots ?: ( quotaSnapshots : unknown ) => void
94+ onQuotaExceeded ?: ( ) => void
9495}
9596
9697export interface DashboardHandlerOptions {
@@ -539,8 +540,9 @@ export function getInstanceName(
539540function observeResponsesSseQuotaSnapshots (
540541 body : ReadableStream < Uint8Array > | null ,
541542 onQuotaSnapshots ?: ( quotaSnapshots : unknown ) => void ,
543+ onQuotaExceeded ?: ( ) => void ,
542544) : ReadableStream < Uint8Array > | null {
543- if ( ! body || ! onQuotaSnapshots ) {
545+ if ( ! body || ( ! onQuotaSnapshots && ! onQuotaExceeded ) ) {
544546 return body
545547 }
546548
@@ -559,10 +561,20 @@ function observeResponsesSseQuotaSnapshots(
559561 }
560562
561563 try {
562- const parsed = JSON . parse ( data ) as { copilot_quota_snapshots ?: unknown }
563- if ( parsed . copilot_quota_snapshots ) {
564+ const parsed = JSON . parse ( data ) as {
565+ code ?: unknown
566+ error ?: { code ?: unknown }
567+ copilot_quota_snapshots ?: unknown
568+ }
569+ if ( parsed . copilot_quota_snapshots && onQuotaSnapshots ) {
564570 onQuotaSnapshots ( parsed . copilot_quota_snapshots )
565571 }
572+ if (
573+ parsed . code === "quota_exceeded"
574+ || parsed . error ?. code === "quota_exceeded"
575+ ) {
576+ onQuotaExceeded ?.( )
577+ }
566578 } catch {
567579 return
568580 }
@@ -619,6 +631,7 @@ export async function proxyTo(options: ProxyToOptions): Promise<Response> {
619631 observeResponsesSseQuotaSnapshots (
620632 upstream . body ,
621633 options . onQuotaSnapshots ,
634+ options . onQuotaExceeded ,
622635 )
623636 : upstream . body
624637
@@ -724,24 +737,58 @@ function applyCooldownOnExhaustion(
724737 model : string
725738 requestNowMs : number
726739 } ,
727- ) {
740+ ) : boolean {
728741 if ( ! COOLDOWN_STATUSES . has ( proxied . status ) ) {
729- return
742+ return false
730743 }
731744
732745 // 402 has no Retry-After; falls back to defaultCooldownMs below.
733- const retryAfter = proxied . headers . get ( "Retry-After" )
734- const retryAfterMs = parseRetryAfterMs ( retryAfter , params . requestNowMs )
746+ applyCooldown ( runtime , {
747+ ...params ,
748+ status : proxied . status ,
749+ retryAfter : proxied . headers . get ( "Retry-After" ) ,
750+ } )
751+ return true
752+ }
753+
/**
 * Puts a port into cooldown and logs the decision.
 *
 * The cooldown length is whatever `parseRetryAfterMs` derives from
 * `params.retryAfter`; when that yields null/undefined the runtime's
 * `defaultCooldownMs` is used instead. The deadline is measured from
 * `params.requestNowMs`.
 *
 * Side effects:
 * - `runtime.state.portCooldownUntil[port]` is set to the cooldown deadline
 *   (epoch milliseconds).
 * - `runtime.state.portCooldownRetryAfter[port]` is set to the raw
 *   Retry-After value (may be null).
 * - one "cooldown set ..." line is written through `runtime.logger`.
 *
 * @param runtime Router runtime holding the shared state, logger and default cooldown.
 * @param params  Port/instance/model being cooled, the request timestamp, the
 *                status that triggered the cooldown (used for logging only) and
 *                the raw Retry-After header value, if any.
 */
function applyCooldown(
  runtime: RouterRuntime,
  params: {
    port: number
    instanceName: string
    model: string
    requestNowMs: number
    status: number
    retryAfter: string | null
  },
) {
  const { port, instanceName, model, requestNowMs, status, retryAfter } = params

  // Prefer the upstream-provided delay; otherwise fall back to the configured default.
  const cooldownUntilMs =
    requestNowMs +
    (parseRetryAfterMs(retryAfter, requestNowMs) ?? runtime.defaultCooldownMs)

  const { portCooldownUntil, portCooldownRetryAfter } = runtime.state
  portCooldownUntil.set(port, cooldownUntilMs)
  portCooldownRetryAfter.set(port, retryAfter)

  const untilIso = new Date(cooldownUntilMs).toISOString()
  runtime.logger(
    `cooldown set instance=${instanceName}:${port} model=${model} status=${status} until=${untilIso} retry-after=${retryAfter || "_"}`,
  )
}
744775
/**
 * Cools down a port after a quota-exceeded signal was observed inside a
 * streamed response body rather than in the HTTP status line.
 *
 * There is no real exhaustion status or Retry-After header in that case, so
 * the cooldown is recorded with a synthetic status of 402 and a null
 * Retry-After; `applyCooldown` then decides the duration (presumably the
 * runtime default, since no Retry-After is available).
 *
 * @param runtime Router runtime whose cooldown state is updated.
 * @param params  Port/instance/model to cool down and the request timestamp
 *                the cooldown is measured from.
 */
function applyCooldownOnStreamQuotaExceeded(
  runtime: RouterRuntime,
  params: {
    port: number
    instanceName: string
    model: string
    requestNowMs: number
  },
) {
  const { port, instanceName, model, requestNowMs } = params
  applyCooldown(runtime, {
    port,
    instanceName,
    model,
    requestNowMs,
    status: 402,
    retryAfter: null,
  })
}
791+
745792function createAllCoolingResponse (
746793 runtime : RouterRuntime ,
747794 params : {
@@ -830,6 +877,13 @@ async function handleNoModelRequest(
830877 fetchImpl : runtime . fetchImpl ,
831878 onQuotaSnapshots : ( quotaSnapshots ) =>
832879 updateUpstreamQuotaSnapshot ( runtime . state , port , quotaSnapshots ) ,
880+ onQuotaExceeded : ( ) =>
881+ applyCooldownOnStreamQuotaExceeded ( runtime , {
882+ port,
883+ instanceName,
884+ model : "_" ,
885+ requestNowMs : request . requestNowMs ,
886+ } ) ,
833887 } )
834888 applyCooldownOnExhaustion ( runtime , proxied , {
835889 port,
/**
 * Routes a request that names a model to an upstream instance, failing over
 * to another instance when the chosen one reports exhaustion.
 *
 * Flow:
 * 1. Up to one attempt per port registered for the model (at least one
 *    attempt even when no ports are registered): pick a port, record and log
 *    the route, then proxy the request to it.
 * 2. After each attempt the upstream header snapshot is refreshed and, if the
 *    response status is an exhaustion status, the port is put into cooldown.
 * 3. A non-exhausted response is returned to the caller as-is.
 * 4. An exhausted response is discarded (its body is cancelled so the
 *    upstream connection is released) and the next attempt runs.
 * 5. When no port can be picked, or every attempt was exhausted, the
 *    "all cooling" response is returned if `createAllCoolingResponse`
 *    produces one; otherwise a 502 "no instance serves model" JSON response.
 *
 * @param runtime Router runtime (state, logger, fetch implementation, clock).
 * @param request Parsed request context (session, agent, model, body, timestamps).
 * @returns The upstream response of the first non-exhausted attempt, or a
 *          synthesized error response when no instance could serve the request.
 */
async function handleModelRequest(
  runtime: RouterRuntime,
  request: RouterRequestContext,
): Promise<Response> {
  const modelPorts = runtime.state.modelToPorts.get(request.model) ?? []
  // Bound the failover loop by the number of candidate ports, but always try once.
  const maxAttempts = Math.max(modelPorts.length, 1)

  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    const result = pickPort(runtime.state, {
      sessionId: request.sessionId,
      agent: request.agent,
      model: request.model,
      nowMs: request.requestNowMs,
    })

    if (!result) {
      break
    }

    const instanceName = getInstanceName(runtime.state, result.port)
    const routeRecord: RouteRecord = {
      ts: runtime.now(),
      sid: request.sessionId || "-",
      agent: request.agent,
      model: request.model,
      provider: request.provider,
      port: result.port,
      reason: result.reason,
      instanceName,
    }
    recordRoute(runtime.state, routeRecord)
    runtime.logger(
      `sid=${routeRecord.sid} agent=${request.agent} provider=${request.provider} → ${instanceName}:${result.port} model=${request.model} reason=${result.reason}`,
    )

    const proxied = await proxyTo({
      port: result.port,
      context: { body: request.bodyText, req: request.req, url: request.url },
      logger: runtime.logger,
      fetchImpl: runtime.fetchImpl,
      onQuotaSnapshots: (quotaSnapshots) =>
        updateUpstreamQuotaSnapshot(runtime.state, result.port, quotaSnapshots),
      // Quota errors can also arrive inside the streamed body; cool the port down then too.
      onQuotaExceeded: () =>
        applyCooldownOnStreamQuotaExceeded(runtime, {
          port: result.port,
          instanceName,
          model: request.model,
          requestNowMs: request.requestNowMs,
        }),
    })
    const exhausted = applyCooldownOnExhaustion(runtime, proxied, {
      port: result.port,
      instanceName,
      model: request.model,
      requestNowMs: request.requestNowMs,
    })
    updateUpstreamHeaderSnapshot(runtime.state, result.port, proxied.headers)

    if (!exhausted) {
      return proxied
    }

    runtime.logger(
      `retry model=${request.model} after exhausted instance=${instanceName}:${result.port} status=${proxied.status}`,
    )

    // The exhausted response is never handed to the client. Cancel its body so
    // the upstream connection/stream is released now instead of lingering
    // until garbage collection.
    try {
      await proxied.body?.cancel()
    } catch {
      // Best effort: a failed cancel must not prevent failing over to the next instance.
    }
  }

  const allCoolingResponse = createAllCoolingResponse(runtime, {
    sessionId: request.sessionId,
    agent: request.agent,
    provider: request.provider,
    model: request.model,
    ports: modelPorts,
    requestNowMs: request.requestNowMs,
    error: `all upstream instances are cooling down for model: ${request.model}`,
  })
  if (allCoolingResponse) {
    return allCoolingResponse
  }

  runtime.logger(
    `NO PORT sid=${request.sessionId || "-"} agent=${request.agent} model=${request.model} provider=${request.provider}`,
  )
  return Response.json(
    { error: `no instance serves model: ${request.model}` },
    { status: 502 },
  )
}
914987
915988export function createRouterHandler ( options : RouterHandlerOptions ) {
0 commit comments