Skip to content

Commit a37ff24

Browse files
CopilotlpcoxCopilot
authored
refactor: reduce pid-tracker API surface and harden api-proxy startup retry handling (#2895)
* Initial plan * fix: reduce pid-tracker public API surface * docs: fix stale pid tracker jsdoc * fix: retry api-proxy startup when compose reports exited (1) Agent-Logs-Url: https://github.com/github/gh-aw-firewall/sessions/fcf2b573-5256-4c69-8f65-0a79431bd838 * fix(api-proxy): add missing JS modules to Dockerfile COPY The OIDC refactoring PRs (#2811, #2772, #2887) added new JS modules (github-oidc.js, aws-oidc-token-provider.js, gcp-oidc-token-provider.js, oidc-refresh-utils.js) but did not update the Dockerfile COPY command. This caused the api-proxy container to crash immediately on startup with exit code 1 (Cannot find module './github-oidc'), breaking all integration tests since commit 7c25298. Fixes the api-proxy container startup crash that has been failing all integration test runs on main since 2026-05-11T15:45Z. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> * test: restore missing pid-tracker test cases for coverage Add back tests for malformed /proc/net/tcp rows and non-symlink file descriptors that were removed when the async trackPidForPort was dropped. These paths are still exercised in production via trackPidForPortSync and need coverage. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: Landon Cox <landon.cox@microsoft.com> Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 869b0c3 commit a37ff24

4 files changed

Lines changed: 88 additions & 168 deletions

File tree

src/container-lifecycle.ts

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -376,13 +376,16 @@ async function checkSquidLogs(workDir: string, proxyLogsDir?: string): Promise<{
376376

377377
/**
378378
* Returns true when the Docker Compose error message indicates that the
379-
* api-proxy container specifically failed its health check.
379+
* api-proxy container specifically failed to start.
380380
* Docker emits "dependency failed to start: container <name> is unhealthy"
381-
* when a dependent container's health check does not pass.
381+
* for healthcheck failures, and may emit "dependency failed to start:
382+
* container <name> exited (1)" for startup-time process exits.
382383
*/
383-
function isApiProxyUnhealthyError(errorMsg: string): boolean {
384-
return errorMsg.includes('is unhealthy') &&
385-
errorMsg.includes(API_PROXY_CONTAINER_NAME);
384+
function isApiProxyStartupFailureError(errorMsg: string): boolean {
385+
if (!errorMsg.includes(API_PROXY_CONTAINER_NAME)) {
386+
return false;
387+
}
388+
return errorMsg.includes('is unhealthy') || errorMsg.includes('exited (1)');
386389
}
387390

388391
/**
@@ -461,11 +464,11 @@ export async function startContainers(workDir: string, allowedDomains: string[],
461464
} catch (firstError) {
462465
const firstErrorMsg = firstError instanceof Error ? firstError.message : String(firstError);
463466

464-
// When api-proxy specifically fails its health check, retry once.
467+
// When api-proxy specifically fails to start, retry once.
465468
// Transient failures are common on slow or busy runners (e.g. Azure-hosted runners)
466469
// where the Node.js process inside the container takes longer to bind its port.
467-
if (isApiProxyUnhealthyError(firstErrorMsg)) {
468-
logger.warn(`${API_PROXY_CONTAINER_NAME} failed its health check — this may be a transient startup failure, retrying once...`);
470+
if (isApiProxyStartupFailureError(firstErrorMsg)) {
471+
logger.warn(`${API_PROXY_CONTAINER_NAME} failed to start — this may be a transient startup failure, retrying once...`);
469472
await logContainerLogsToStderr(API_PROXY_CONTAINER_NAME);
470473

471474
// Tear down before retry so Docker Compose starts fresh
@@ -488,12 +491,12 @@ export async function startContainers(workDir: string, allowedDomains: string[],
488491
return;
489492
} catch (retryError) {
490493
const retryErrorMsg = retryError instanceof Error ? retryError.message : String(retryError);
491-
if (isApiProxyUnhealthyError(retryErrorMsg)) {
494+
if (isApiProxyStartupFailureError(retryErrorMsg)) {
492495
// Surface api-proxy logs and emit a clear, unambiguous error so
493496
// downstream parse steps don't blame the model for never running.
494497
await logContainerLogsToStderr(API_PROXY_CONTAINER_NAME);
495498
throw new Error(
496-
`AWF firewall failed to start: ${API_PROXY_CONTAINER_NAME} failed its health check on both attempts. ` +
499+
`AWF firewall failed to start: ${API_PROXY_CONTAINER_NAME} failed to start on both attempts. ` +
497500
`The agent was never invoked. ` +
498501
`See ${API_PROXY_CONTAINER_NAME} container logs above for details.`
499502
);

src/docker-manager-lifecycle.test.ts

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,10 +152,31 @@ describe('docker-manager lifecycle', () => {
152152
mockExecaFn.mockResolvedValueOnce({ stdout: 'api-proxy logs', stderr: '', exitCode: 0 } as any);
153153

154154
await expect(startContainers(testDir, ['github.com'])).rejects.toThrow(
155-
'AWF firewall failed to start: awf-api-proxy failed its health check on both attempts'
155+
'AWF firewall failed to start: awf-api-proxy failed to start on both attempts'
156156
);
157157
});
158158

159+
it('should retry once when awf-api-proxy exits (1) during startup', async () => {
160+
// 1. docker rm (initial cleanup)
161+
mockExecaFn.mockResolvedValueOnce({ stdout: '', stderr: '', exitCode: 0 } as any);
162+
// 2. docker compose up (first attempt - fails with api-proxy exited)
163+
mockExecaFn.mockRejectedValueOnce(new Error('dependency failed to start: container awf-api-proxy exited (1)'));
164+
// 3. docker logs --tail 50 awf-api-proxy (get logs for diagnosis)
165+
mockExecaFn.mockResolvedValueOnce({ stdout: 'api-proxy startup logs', stderr: '', exitCode: 0 } as any);
166+
// 4. docker compose down (cleanup before retry)
167+
mockExecaFn.mockResolvedValueOnce({ stdout: '', stderr: '', exitCode: 0 } as any);
168+
// 5. docker compose up (retry - succeeds)
169+
mockExecaFn.mockResolvedValueOnce({ stdout: '', stderr: '', exitCode: 0 } as any);
170+
171+
await expect(startContainers(testDir, ['github.com'])).resolves.toBeUndefined();
172+
173+
// Verify retry happened: compose up was called twice
174+
const upCalls = mockExecaFn.mock.calls.filter((call: any[]) =>
175+
call[0] === 'docker' && Array.isArray(call[1]) && call[1].includes('up')
176+
);
177+
expect(upCalls).toHaveLength(2);
178+
});
179+
159180
it('should not retry for non-api-proxy healthcheck failures', async () => {
160181
// 1. docker rm (initial cleanup)
161182
mockExecaFn.mockResolvedValueOnce({ stdout: '', stderr: '', exitCode: 0 } as any);

src/pid-tracker.test.ts

Lines changed: 47 additions & 116 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,24 @@
11
/**
22
* Unit tests for pid-tracker.ts
33
*
4-
* These tests use mock /proc filesystem data to test the parsing
5-
* and tracking logic without requiring actual system access.
4+
* These tests use mock /proc filesystem data to validate behavior
5+
* through the module's public API.
66
*/
77

88
import * as fs from 'fs';
99
import * as path from 'path';
1010
import * as os from 'os';
11-
import {
12-
trackPidForPort,
13-
trackPidForPortSync,
14-
isPidTrackingAvailable,
15-
} from './pid-tracker';
11+
import { trackPidForPortSync, isPidTrackingAvailable } from './pid-tracker';
1612

1713
describe('pid-tracker', () => {
1814
describe('Mock /proc filesystem tests', () => {
1915
let mockProcPath: string;
2016

2117
beforeEach(() => {
22-
// Create a temporary mock /proc directory
2318
mockProcPath = fs.mkdtempSync(path.join(os.tmpdir(), 'mock-proc-'));
2419
});
2520

2621
afterEach(() => {
27-
// Clean up
2822
fs.rmSync(mockProcPath, { recursive: true, force: true });
2923
});
3024

@@ -34,18 +28,6 @@ describe('pid-tracker', () => {
3428
fs.writeFileSync(path.join(netDir, 'tcp'), entries);
3529
};
3630

37-
describe('isPidTrackingAvailable', () => {
38-
it('should return true when /proc/net/tcp exists', () => {
39-
createMockNetTcp('header\n');
40-
expect(isPidTrackingAvailable(mockProcPath)).toBe(true);
41-
});
42-
43-
it('should return false when /proc/net/tcp does not exist', () => {
44-
expect(isPidTrackingAvailable(mockProcPath)).toBe(false);
45-
});
46-
});
47-
48-
// Helper to create mock proc with actual symlinks (for socket fd testing)
4931
const createMockProcWithSymlinks = (
5032
pid: number,
5133
cmdline: string,
@@ -55,196 +37,145 @@ describe('pid-tracker', () => {
5537
const pidDir = path.join(mockProcPath, pid.toString());
5638
fs.mkdirSync(pidDir, { recursive: true });
5739

58-
// Write cmdline (null-separated)
5940
fs.writeFileSync(path.join(pidDir, 'cmdline'), cmdline.replace(/ /g, '\0'));
60-
61-
// Write comm
6241
fs.writeFileSync(path.join(pidDir, 'comm'), comm);
6342

64-
// Create fd directory and socket symlinks
6543
const fdDir = path.join(pidDir, 'fd');
6644
fs.mkdirSync(fdDir, { recursive: true });
6745

6846
socketInodes.forEach((inode, index) => {
6947
const fdPath = path.join(fdDir, (index + 3).toString());
70-
// Create actual symlink to socket:[inode]
7148
fs.symlinkSync(`socket:[${inode}]`, fdPath);
7249
});
7350
};
7451

75-
describe('trackPidForPort', () => {
76-
it('should ignore malformed /proc/net/tcp rows', async () => {
77-
const netTcpContent = ` sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode
78-
0: malformed`;
79-
createMockNetTcp(netTcpContent);
52+
describe('isPidTrackingAvailable', () => {
53+
it('should return true when /proc/net/tcp exists', () => {
54+
createMockNetTcp('header\n');
55+
expect(isPidTrackingAvailable(mockProcPath)).toBe(true);
56+
});
8057

81-
const result = await trackPidForPort(3306, mockProcPath);
82-
expect(result.pid).toBe(-1);
83-
expect(result.error).toContain('No socket found');
58+
it('should return false when /proc/net/tcp does not exist', () => {
59+
expect(isPidTrackingAvailable(mockProcPath)).toBe(false);
8460
});
61+
});
8562

86-
it('should return error when /proc/net/tcp does not exist', async () => {
87-
const result = await trackPidForPort(45678, mockProcPath);
63+
describe('trackPidForPortSync', () => {
64+
it('should return error when /proc/net/tcp does not exist', () => {
65+
const result = trackPidForPortSync(45678, mockProcPath);
8866
expect(result.pid).toBe(-1);
8967
expect(result.error).toContain('Failed to read');
9068
});
9169

92-
it('should return error when port not found in tcp table', async () => {
70+
it('should return error when tcp table content is empty', () => {
71+
createMockNetTcp('');
72+
73+
const result = trackPidForPortSync(3306, mockProcPath);
74+
expect(result.pid).toBe(-1);
75+
expect(result.error).toContain('No socket found');
76+
});
77+
78+
it('should return error when port not found in tcp table', () => {
9379
const netTcpContent = ` sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode
9480
0: 0100007F:0CEA 00000000:0000 0A 00000000:00000000 00:00000000 00000000 1000 0 123456 1 0000000000000000 100 0 0 10 0`;
9581
createMockNetTcp(netTcpContent);
9682

97-
const result = await trackPidForPort(99999, mockProcPath);
83+
const result = trackPidForPortSync(99999, mockProcPath);
9884
expect(result.pid).toBe(-1);
9985
expect(result.error).toContain('No socket found');
10086
});
10187

102-
it('should return error when inode is 0', async () => {
103-
// Inode 0 indicates no socket assigned
88+
it('should return error when inode is 0', () => {
10489
const netTcpContent = ` sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode
10590
0: 0100007F:0CEA 00000000:0000 0A 00000000:00000000 00:00000000 00000000 1000 0 0 1 0000000000000000 100 0 0 10 0`;
10691
createMockNetTcp(netTcpContent);
10792

108-
const result = await trackPidForPort(3306, mockProcPath);
93+
const result = trackPidForPortSync(3306, mockProcPath);
10994
expect(result.pid).toBe(-1);
11095
expect(result.error).toContain('No socket found');
11196
});
11297

113-
it('should successfully track process for port', async () => {
98+
it('should successfully track process for parsed hex port and inode', () => {
11499
const netTcpContent = ` sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode
115100
0: 0100007F:B278 00000000:0000 0A 00000000:00000000 00:00000000 00000000 1000 0 123456 1 0000000000000000 100 0 0 10 0`;
116101
createMockNetTcp(netTcpContent);
117102
createMockProcWithSymlinks(1234, 'curl https://github.com', 'curl', ['123456']);
118103

119-
const result = await trackPidForPort(45688, mockProcPath); // B278 in hex
104+
const result = trackPidForPortSync(45688, mockProcPath); // B278 in hex
120105
expect(result.pid).toBe(1234);
121106
expect(result.cmdline).toBe('curl https://github.com');
122107
expect(result.comm).toBe('curl');
123108
expect(result.inode).toBe('123456');
124109
expect(result.error).toBeUndefined();
125110
});
126111

127-
it('should return error when no process owns the socket', async () => {
128-
const netTcpContent = ` sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode
129-
0: 0100007F:B278 00000000:0000 0A 00000000:00000000 00:00000000 00000000 1000 0 123456 1 0000000000000000 100 0 0 10 0`;
130-
createMockNetTcp(netTcpContent);
131-
// Create a process but with different inode
132-
createMockProcWithSymlinks(1234, 'curl', 'curl', ['999999']);
133-
expect(fs.existsSync(path.join(mockProcPath, '1234'))).toBe(true);
134-
135-
const result = await trackPidForPort(45688, mockProcPath);
136-
expect(result.pid).toBe(-1);
137-
expect(result.inode).toBe('123456');
138-
expect(result.error).toContain('Socket inode 123456 found but no process owns it');
139-
});
140-
141-
it('should return unknown metadata when cmdline and comm are unreadable', async () => {
112+
it('should return unknown labels when process info files are missing', () => {
142113
const netTcpContent = ` sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode
143114
0: 0100007F:B278 00000000:0000 0A 00000000:00000000 00:00000000 00000000 1000 0 123456 1 0000000000000000 100 0 0 10 0`;
144115
createMockNetTcp(netTcpContent);
145116

146-
const pidDir = path.join(mockProcPath, '4321');
147-
const fdDir = path.join(pidDir, 'fd');
148-
fs.mkdirSync(fdDir, { recursive: true });
149-
fs.symlinkSync('socket:[123456]', path.join(fdDir, '3'));
150-
// Intentionally omit /proc/[pid]/cmdline and /proc/[pid]/comm
117+
const pidDir = path.join(mockProcPath, '1234');
118+
fs.mkdirSync(path.join(pidDir, 'fd'), { recursive: true });
119+
fs.symlinkSync('socket:[123456]', path.join(pidDir, 'fd', '3'));
151120

152-
const result = await trackPidForPort(45688, mockProcPath);
153-
expect(result.pid).toBe(4321);
154-
expect(result.inode).toBe('123456');
121+
const result = trackPidForPortSync(45688, mockProcPath);
122+
expect(result.pid).toBe(1234);
155123
expect(result.cmdline).toBe('unknown');
156124
expect(result.comm).toBe('unknown');
157125
});
158-
});
159126

160-
describe('trackPidForPortSync', () => {
161-
it('should return error when /proc/net/tcp does not exist', () => {
162-
const result = trackPidForPortSync(45678, mockProcPath);
163-
expect(result.pid).toBe(-1);
164-
expect(result.error).toContain('Failed to read');
165-
});
166-
167-
it('should return error when port not found in tcp table', () => {
127+
it('should return error when no process owns the socket', () => {
168128
const netTcpContent = ` sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode
169-
0: 0100007F:0CEA 00000000:0000 0A 00000000:00000000 00:00000000 00000000 1000 0 123456 1 0000000000000000 100 0 0 10 0`;
129+
0: 0100007F:B278 00000000:0000 0A 00000000:00000000 00:00000000 00000000 1000 0 123456 1 0000000000000000 100 0 0 10 0`;
170130
createMockNetTcp(netTcpContent);
131+
createMockProcWithSymlinks(1234, 'curl', 'curl', ['999999']);
171132

172-
const result = trackPidForPortSync(99999, mockProcPath);
133+
const result = trackPidForPortSync(45688, mockProcPath);
173134
expect(result.pid).toBe(-1);
174-
expect(result.error).toContain('No socket found');
135+
expect(result.inode).toBe('123456');
136+
expect(result.error).toContain('Socket inode 123456 found but no process owns it');
175137
});
176138

177-
it('should return error when inode is 0', () => {
139+
it('should ignore malformed /proc/net/tcp rows', () => {
178140
const netTcpContent = ` sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode
179-
0: 0100007F:0CEA 00000000:0000 0A 00000000:00000000 00:00000000 00000000 1000 0 0 1 0000000000000000 100 0 0 10 0`;
141+
0: malformed`;
180142
createMockNetTcp(netTcpContent);
181143

182144
const result = trackPidForPortSync(3306, mockProcPath);
183145
expect(result.pid).toBe(-1);
184146
expect(result.error).toContain('No socket found');
185147
});
186148

187-
it('should successfully track process for port synchronously', () => {
188-
const netTcpContent = ` sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode
189-
0: 0100007F:B278 00000000:0000 0A 00000000:00000000 00:00000000 00000000 1000 0 123456 1 0000000000000000 100 0 0 10 0`;
190-
createMockNetTcp(netTcpContent);
191-
createMockProcWithSymlinks(1234, 'curl https://github.com', 'curl', ['123456']);
192-
193-
const result = trackPidForPortSync(45688, mockProcPath); // B278 in hex
194-
expect(result.pid).toBe(1234);
195-
expect(result.cmdline).toBe('curl https://github.com');
196-
expect(result.comm).toBe('curl');
197-
expect(result.inode).toBe('123456');
198-
expect(result.error).toBeUndefined();
199-
});
200-
201-
it('should return error when no process owns the socket synchronously', () => {
202-
const netTcpContent = ` sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode
203-
0: 0100007F:B278 00000000:0000 0A 00000000:00000000 00:00000000 00000000 1000 0 123456 1 0000000000000000 100 0 0 10 0`;
204-
createMockNetTcp(netTcpContent);
205-
// Create a process but with different inode
206-
createMockProcWithSymlinks(1234, 'curl', 'curl', ['999999']);
207-
expect(fs.existsSync(path.join(mockProcPath, '1234'))).toBe(true);
208-
209-
const result = trackPidForPortSync(45688, mockProcPath);
210-
expect(result.pid).toBe(-1);
211-
expect(result.inode).toBe('123456');
212-
expect(result.error).toContain('Socket inode 123456 found but no process owns it');
213-
});
214-
215149
it('should ignore non-symlink file descriptors', () => {
216150
const netTcpContent = ` sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode
217151
0: 0100007F:B278 00000000:0000 0A 00000000:00000000 00:00000000 00000000 1000 0 123456 1 0000000000000000 100 0 0 10 0`;
218152
createMockNetTcp(netTcpContent);
219153

220-
const pidDir = path.join(mockProcPath, '5678');
154+
// Create a process with a regular file fd instead of a socket symlink
155+
const pidDir = path.join(mockProcPath, '1234');
221156
const fdDir = path.join(pidDir, 'fd');
222157
fs.mkdirSync(fdDir, { recursive: true });
223-
fs.writeFileSync(path.join(pidDir, 'cmdline'), 'curl');
224-
fs.writeFileSync(path.join(pidDir, 'comm'), 'curl');
225-
fs.writeFileSync(path.join(fdDir, '3'), 'not-a-symlink');
158+
fs.writeFileSync(path.join(pidDir, 'cmdline'), 'test');
159+
fs.writeFileSync(path.join(pidDir, 'comm'), 'test');
160+
fs.writeFileSync(path.join(fdDir, '3'), 'regular-file');
226161

227162
const result = trackPidForPortSync(45688, mockProcPath);
228163
expect(result.pid).toBe(-1);
229-
expect(result.inode).toBe('123456');
230164
expect(result.error).toContain('Socket inode 123456 found but no process owns it');
231165
});
232166
});
233167
});
234168

235169
describe('Real /proc filesystem (integration)', () => {
236-
// These tests only run if /proc is available (Linux only)
237170
const isLinux = process.platform === 'linux';
238171

239172
it('should check if PID tracking is available', () => {
240173
const result = isPidTrackingAvailable();
241-
// On Linux, this should be true; on other platforms, false
242174
if (isLinux) {
243175
expect(result).toBe(true);
244176
} else {
245177
expect(result).toBe(false);
246178
}
247179
});
248-
249180
});
250181
});

0 commit comments

Comments
 (0)