diff --git a/dist/index.js b/dist/index.js index 6dc8b45a..b72ad5e8 100644 --- a/dist/index.js +++ b/dist/index.js @@ -263,11 +263,104 @@ async function updateEcsService(ecs, clusterName, service, taskDefArn, waitForSe services: [service], cluster: clusterName }); + + await verifyServiceDeployment(ecs, clusterName, service, taskDefArn); } else { core.debug('Not waiting for the service to become stable'); } } +async function verifyServiceDeployment(ecs, clusterName, serviceName, expectedTaskDefArn) { + core.debug( + `Verifying that service '${serviceName}' stabilized on expected task definition '${expectedTaskDefArn}'` + ); + + // Describe the service after the waiter reports "stable". + // This extra check is necessary because ECS can become stable again + // by rolling back to the previous deployment if circuit breaker + // rollback is enabled. + const describeResponse = await ecs.describeServices({ + cluster: clusterName, + services: [serviceName] + }); + + // Surface any ECS-level lookup failures explicitly. + const failures = describeResponse.failures || []; + if (failures.length > 0) { + const failure = failures[0]; + throw new Error( + `Failed to describe service '${serviceName}': ${failure.reason || 'unknown error'}` + ); + } + + // We expect exactly one service back because we queried by name. + const service = describeResponse.services && describeResponse.services[0]; + if (!service) { + throw new Error(`Service '${serviceName}' was not returned by DescribeServices`); + } + + const deployments = service.deployments || []; + + // Find the deployment created from the task definition revision + // we just deployed. + const expectedDeployment = deployments.find( + deployment => deployment.taskDefinition === expectedTaskDefArn + ); + + // Find the deployment ECS considers PRIMARY after stabilization. + // This is the deployment currently serving traffic / considered active. + const primaryDeployment = deployments.find( + deployment => deployment.status === 'PRIMARY' + ); + + // If ECS explicitly marks the expected deployment as FAILED, + // fail immediately and include the AWS reason when available. + if (expectedDeployment && expectedDeployment.rolloutState === 'FAILED') { + const reason = expectedDeployment.rolloutStateReason + ? ` Reason: ${expectedDeployment.rolloutStateReason}` + : ''; + throw new Error( + `ECS deployment failed for task definition '${expectedTaskDefArn}'.${reason}` + ); + } + + // PRIMARY should always exist for a healthy service state. + if (!primaryDeployment) { + throw new Error(`No PRIMARY deployment found for service '${serviceName}'`); + } + + // This is the key rollback check: + // even if the service is "stable", ECS may have rolled back to the + // previous task definition. In that case, the PRIMARY deployment + // will not match the task definition we expected to promote. + if (primaryDeployment.taskDefinition !== expectedTaskDefArn) { + throw new Error( + `ECS deployment did not complete on the expected task definition. ` + + `Expected PRIMARY task definition '${expectedTaskDefArn}', but found ` + + `'${primaryDeployment.taskDefinition}'. This usually means ECS rolled back ` + + `after the new deployment failed.` + ); + } + + // When rolloutState is available, require the expected deployment + // to have fully completed, not merely exist. + // This is an additional safeguard on top of the PRIMARY check. + if ( + expectedDeployment && + expectedDeployment.rolloutState && + expectedDeployment.rolloutState !== 'COMPLETED' + ) { + throw new Error( + `ECS deployment for task definition '${expectedTaskDefArn}' did not reach ` + + `COMPLETED. Current rolloutState: '${expectedDeployment.rolloutState}'.` + ); + } + + core.info( + `Deployment verified: service '${serviceName}' is PRIMARY on expected task definition.` + ); +} + // Find value in a CodeDeploy AppSpec file with a case-insensitive key function findAppSpecValue(obj, keyName) { return obj[findAppSpecKey(obj, keyName)]; diff --git a/index.js b/index.js index bc580559..dafd45cf 100644 --- a/index.js +++ b/index.js @@ -257,11 +257,104 @@ async function updateEcsService(ecs, clusterName, service, taskDefArn, waitForSe services: [service], cluster: clusterName }); + + await verifyServiceDeployment(ecs, clusterName, service, taskDefArn); } else { core.debug('Not waiting for the service to become stable'); } } +async function verifyServiceDeployment(ecs, clusterName, serviceName, expectedTaskDefArn) { + core.debug( + `Verifying that service '${serviceName}' stabilized on expected task definition '${expectedTaskDefArn}'` + ); + + // Describe the service after the waiter reports "stable". + // This extra check is necessary because ECS can become stable again + // by rolling back to the previous deployment if circuit breaker + // rollback is enabled. + const describeResponse = await ecs.describeServices({ + cluster: clusterName, + services: [serviceName] + }); + + // Surface any ECS-level lookup failures explicitly. + const failures = describeResponse.failures || []; + if (failures.length > 0) { + const failure = failures[0]; + throw new Error( + `Failed to describe service '${serviceName}': ${failure.reason || 'unknown error'}` + ); + } + + // We expect exactly one service back because we queried by name. + const service = describeResponse.services && describeResponse.services[0]; + if (!service) { + throw new Error(`Service '${serviceName}' was not returned by DescribeServices`); + } + + const deployments = service.deployments || []; + + // Find the deployment created from the task definition revision + // we just deployed. + const expectedDeployment = deployments.find( + deployment => deployment.taskDefinition === expectedTaskDefArn + ); + + // Find the deployment ECS considers PRIMARY after stabilization. + // This is the deployment currently serving traffic / considered active. + const primaryDeployment = deployments.find( + deployment => deployment.status === 'PRIMARY' + ); + + // If ECS explicitly marks the expected deployment as FAILED, + // fail immediately and include the AWS reason when available. + if (expectedDeployment && expectedDeployment.rolloutState === 'FAILED') { + const reason = expectedDeployment.rolloutStateReason + ? ` Reason: ${expectedDeployment.rolloutStateReason}` + : ''; + throw new Error( + `ECS deployment failed for task definition '${expectedTaskDefArn}'.${reason}` + ); + } + + // PRIMARY should always exist for a healthy service state. + if (!primaryDeployment) { + throw new Error(`No PRIMARY deployment found for service '${serviceName}'`); + } + + // This is the key rollback check: + // even if the service is "stable", ECS may have rolled back to the + // previous task definition. In that case, the PRIMARY deployment + // will not match the task definition we expected to promote. + if (primaryDeployment.taskDefinition !== expectedTaskDefArn) { + throw new Error( + `ECS deployment did not complete on the expected task definition. ` + + `Expected PRIMARY task definition '${expectedTaskDefArn}', but found ` + + `'${primaryDeployment.taskDefinition}'. This usually means ECS rolled back ` + + `after the new deployment failed.` + ); + } + + // When rolloutState is available, require the expected deployment + // to have fully completed, not merely exist. + // This is an additional safeguard on top of the PRIMARY check. + if ( + expectedDeployment && + expectedDeployment.rolloutState && + expectedDeployment.rolloutState !== 'COMPLETED' + ) { + throw new Error( + `ECS deployment for task definition '${expectedTaskDefArn}' did not reach ` + + `COMPLETED. Current rolloutState: '${expectedDeployment.rolloutState}'.` + ); + } + + core.info( + `Deployment verified: service '${serviceName}' is PRIMARY on expected task definition.` + ); +} + // Find value in a CodeDeploy AppSpec file with a case-insensitive key function findAppSpecValue(obj, keyName) { return obj[findAppSpecKey(obj, keyName)]; diff --git a/index.test.js b/index.test.js index 690d2f1e..083cb8db 100644 --- a/index.test.js +++ b/index.test.js @@ -93,13 +93,21 @@ describe('Deploy to ECS', () => { mockEcsUpdateService.mockImplementation(() => Promise.resolve({})); - mockEcsDescribeServices.mockImplementation( - () => Promise.resolve({ - failures: [], - services: [{ - status: 'ACTIVE' - }] - }) + mockEcsDescribeServices.mockImplementation(() => + Promise.resolve({ + failures: [], + services: [{ + status: 'ACTIVE', + deploymentController: { type: 'ECS' }, + deployments: [ + { + status: 'PRIMARY', + taskDefinition: 'task:def:arn', + rolloutState: 'COMPLETED' + } + ] + }] + }) ); mockCodeDeployCreateDeployment.mockImplementation( @@ -1117,6 +1125,141 @@ describe('Deploy to ECS', () => { expect(core.setOutput).toHaveBeenNthCalledWith(1, 'task-definition-arn', 'task:def:arn'); }); + test('verifies expected task definition becomes PRIMARY after service stability wait', async () => { + core.getInput = jest + .fn() + .mockReturnValueOnce('task-definition.json') // task-definition + .mockReturnValueOnce('service-456') // service + .mockReturnValueOnce('cluster-789') // cluster + .mockReturnValueOnce('3') // max-retries + .mockReturnValueOnce('true'); // wait-for-service-stability + + let describeCount = 0; + mockEcsDescribeServices.mockImplementation(() => { + describeCount += 1; + + if (describeCount === 1) { + return Promise.resolve({ + failures: [], + services: [{ status: 'ACTIVE', deploymentController: { type: 'ECS' } }] + }); + } + + return Promise.resolve({ + failures: [], + services: [{ + status: 'ACTIVE', + deployments: [ + { + status: 'PRIMARY', + taskDefinition: 'task:def:arn', + rolloutState: 'COMPLETED' + } + ] + }] + }); + }); + + await run(); + + expect(waitUntilServicesStable).toHaveBeenCalledTimes(1); + expect(core.setFailed).toHaveBeenCalledTimes(0); + }); + + test('fails when expected deployment rolloutState is FAILED', async () => { + core.getInput = jest + .fn() + .mockReturnValueOnce('task-definition.json') + .mockReturnValueOnce('service-456') + .mockReturnValueOnce('cluster-789') + .mockReturnValueOnce('3') + .mockReturnValueOnce('true'); + + let describeCount = 0; + mockEcsDescribeServices.mockImplementation(() => { + describeCount += 1; + + if (describeCount === 1) { + return Promise.resolve({ + failures: [], + services: [{ status: 'ACTIVE', deploymentController: { type: 'ECS' } }] + }); + } + + return Promise.resolve({ + failures: [], + services: [{ + status: 'ACTIVE', + deployments: [ + { + status: 'ACTIVE', + taskDefinition: 'task:def:arn', + rolloutState: 'FAILED', + rolloutStateReason: 'ECS deployment circuit breaker: task failed health checks' + }, + { + status: 'PRIMARY', + taskDefinition: 'task:def:old', + rolloutState: 'COMPLETED' + } + ] + }] + }); + }); + + await run(); + + expect(waitUntilServicesStable).toHaveBeenCalledTimes(1); + expect(core.setFailed).toHaveBeenCalledTimes(1); + expect(core.setFailed.mock.calls[0][0]).toContain("ECS deployment failed for task definition 'task:def:arn'"); + }); + + test('fails when service stabilizes on a different PRIMARY task definition after rollback', async () => { + core.getInput = jest + .fn() + .mockReturnValueOnce('task-definition.json') + .mockReturnValueOnce('service-456') + .mockReturnValueOnce('cluster-789') + .mockReturnValueOnce('3') + .mockReturnValueOnce('true'); + + let describeCount = 0; + mockEcsDescribeServices.mockImplementation(() => { + describeCount += 1; + + if (describeCount === 1) { + return Promise.resolve({ + failures: [], + services: [{ status: 'ACTIVE', deploymentController: { type: 'ECS' } }] + }); + } + + return Promise.resolve({ + failures: [], + services: [{ + status: 'ACTIVE', + deployments: [ + { + status: 'PRIMARY', + taskDefinition: 'task:def:old', + rolloutState: 'COMPLETED' + }, + { + status: 'ACTIVE', + taskDefinition: 'task:def:arn' + } + ] + }] + }); + }); + + await run(); + + expect(waitUntilServicesStable).toHaveBeenCalledTimes(1); + expect(core.setFailed).toHaveBeenCalledTimes(1); + expect(core.setFailed.mock.calls[0][0]).toContain('did not complete on the expected task definition'); + }); + test('waits for the service to be stable', async () => { core.getInput = jest .fn()