-
Notifications
You must be signed in to change notification settings - Fork 733
fix(lambda): return partial CreateFleet instances instead of discarding them #5131
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
b8c45d8
e8cdc4e
d4f3be6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -513,7 +513,7 @@ describe('create runner with errors', () => { | |
|
|
||
| await expect(createRunner(createRunnerConfig(defaultRunnerConfig))).rejects.toMatchObject({ | ||
| name: 'ScaleError', | ||
| failedInstanceCount: 2, | ||
| failedInstanceCount: 1, // numberOfRunners when zero instances created | ||
| }); | ||
| expect(mockEC2Client).toHaveReceivedCommandWith( | ||
| CreateFleetCommand, | ||
|
|
@@ -543,6 +543,16 @@ describe('create runner with errors', () => { | |
| ); | ||
| }); | ||
|
|
||
| it('returns partial instances on recognized scale error instead of throwing', async () => { | ||
| createFleetMockWithErrors(['UnfulfillableCapacity'], ['i-partial']); | ||
|
|
||
| await expect(createRunner(createRunnerConfig(defaultRunnerConfig))).resolves.toEqual(['i-partial']); | ||
| expect(mockEC2Client).toHaveReceivedCommandWith( | ||
| CreateFleetCommand, | ||
| expectedCreateFleetRequest(defaultExpectedFleetRequestValues), | ||
| ); | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agreed. Fixed in e8cdc4e — test now requests 3 runners and gets 1 instance back, making it a genuine partial-success scenario. |
||
| }); | ||
|
|
||
| it('test error by create fleet call is thrown.', async () => { | ||
| mockEC2Client.on(CreateFleetCommand).rejects(new Error('Some error')); | ||
|
|
||
|
|
@@ -688,12 +698,14 @@ describe('create runner with errors fail over to OnDemand', () => { | |
| // fallback to on demand for UnfulfillableCapacity but InsufficientInstanceCapacity is thrown | ||
| createFleetMockWithWithOnDemandFallback(['UnfulfillableCapacity'], instancesIds); | ||
|
|
||
| // Partial success: 1 instance created, unrecognized error for the rest. | ||
| // Returns partial instances instead of throwing to prevent orphans. | ||
| await expect( | ||
| createRunner({ | ||
| ...createRunnerConfig(defaultRunnerConfig), | ||
| numberOfRunners: 2, | ||
| }), | ||
| ).rejects.toBeInstanceOf(Error); | ||
| ).resolves.toEqual(['i-123']); | ||
|
|
||
| expect(mockEC2Client).toHaveReceivedCommandTimes(CreateFleetCommand, 1); | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -201,9 +201,26 @@ async function processFleetResult( | |
|
|
||
| const failedCount = countScaleErrors(errors, scaleErrors); | ||
| if (failedCount > 0) { | ||
| logger.warn('Create fleet failed, ScaleError will be thrown to trigger retry for ephemeral runners.'); | ||
| if (instances.length > 0) { | ||
| logger.warn( | ||
| `Partial fleet success: ${instances.length}/${runnerParameters.numberOfRunners} instances created. ` + | ||
| `Returning partial results; caller will retry the shortfall via SQS.`, | ||
| { data: fleet.Errors }, | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point. Fixed in e8cdc4e — removed SQS reference, reworded to mechanism-agnostic, and added |
||
| ); | ||
| return instances; | ||
| } | ||
|
Comment on lines
+223
to
+230
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The caller ( Introducing a structured return type would be a larger refactor that touches every callsite of |
||
| logger.warn('Create fleet failed with zero instances, ScaleError will be thrown to trigger retry.'); | ||
| logger.debug('Create fleet failed.', { data: fleet.Errors }); | ||
| throw new ScaleError(failedCount); | ||
| throw new ScaleError(runnerParameters.numberOfRunners); | ||
| } | ||
|
|
||
| if (instances.length > 0) { | ||
| logger.warn( | ||
| `Partial fleet success: ${instances.length}/${runnerParameters.numberOfRunners} instances created. ` + | ||
| `Error not recognized as scaling error; returning partial results.`, | ||
| { data: fleet.Errors }, | ||
| ); | ||
| return instances; | ||
| } | ||
|
|
||
| logger.warn('Create fleet failed, error not recognized as scaling error.', { data: fleet.Errors }); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agreed. Fixed in e8cdc4e — test now uses
numberOfRunners: 3so the assertion actually validates the value.