Skip to content

Commit 82b7c49

Browse files
authored
fix(kiloclaw): omit ord from Fly region list due to provisioning issues (#1090)
* fix(kiloclaw): omit ord from Fly region list due to provisioning issues * revert(kiloclaw): restore shuffle+deprioritize in capacity recovery (#1059) With explicit region codes instead of the 'us' alias, deprioritizeRegion can now correctly match concrete regions (e.g. 'iad') against the list. The original bug — meta-region 'us' never matching concrete region codes — no longer applies.
1 parent 4bea3f8 commit 82b7c49

4 files changed

Lines changed: 39 additions & 15 deletions

File tree

kiloclaw/src/config.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,9 @@ export const DEFAULT_MACHINE_GUEST = {
3838
export const DEFAULT_VOLUME_SIZE_GB = 10;
3939

4040
/** Default Fly region priority list when FLY_REGION env var is not set.
41-
* Callers shuffle before selecting so order here doesn't matter. */
42-
export const DEFAULT_FLY_REGION = 'us,eu';
41+
* Callers shuffle before selecting so order here doesn't matter.
42+
* ord omitted due to provisioning issues. */
43+
export const DEFAULT_FLY_REGION = 'dfw,ewr,iad,lax,sjc,eu';
4344

4445
// Alarm cadence by instance status
4546
/** Running machines: fast health checks */

kiloclaw/src/durable-objects/kiloclaw-instance.test.ts

Lines changed: 32 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ function createFakeEnv() {
156156
return {
157157
FLY_API_TOKEN: 'test-token',
158158
FLY_APP_NAME: 'test-app',
159-
FLY_REGION: 'us,eu',
159+
FLY_REGION: 'dfw,ewr,iad,lax,sjc,eu',
160160
GATEWAY_TOKEN_SECRET: 'test-secret',
161161
NEXTAUTH_SECRET: 'test-nextauth-secret-at-least-32-chars',
162162
WORKER_ENV: 'development',
@@ -2128,7 +2128,7 @@ describe('start: volume region validation', () => {
21282128
// ============================================================================
21292129

21302130
describe('start: 412 insufficient resources recovery', () => {
2131-
it('fresh provision (never started): deletes volume and creates fresh with reversed regions', async () => {
2131+
it('fresh provision (never started): deletes volume and creates fresh with deprioritized regions', async () => {
21322132
const { instance, storage } = createInstance();
21332133
await seedProvisioned(storage, { flyMachineId: null, lastStartedAt: null });
21342134

@@ -2150,15 +2150,23 @@ describe('start: 412 insufficient resources recovery', () => {
21502150

21512151
// Old volume was deleted
21522152
expect(flyClient.deleteVolume).toHaveBeenCalledWith(expect.anything(), 'vol-1');
2153-
// New volume created via fallback with reversed regions and compute hint
2153+
// New volume created via fallback with deprioritized regions and compute hint
21542154
const regions412Call = (flyClient.createVolumeWithFallback as Mock).mock.calls[0];
21552155
expect(regions412Call[1]).toEqual(
21562156
expect.objectContaining({
21572157
compute: expect.objectContaining({ cpus: 2, memory_mb: 3072 }) as unknown,
21582158
})
21592159
);
2160-
// Regions are reversed from FLY_REGION='us,eu' so EU is tried first on recovery
2161-
expect(regions412Call[2]).toEqual(['eu', 'us']);
2160+
// Regions are shuffled and then deprioritized, so the order is not deterministic —
2161+
// just check the set (all of FLY_REGION='dfw,ewr,iad,lax,sjc,eu' must still be present)
2162+
expect((regions412Call[2] as string[]).sort()).toEqual([
2163+
'dfw',
2164+
'eu',
2165+
'ewr',
2166+
'iad',
2167+
'lax',
2168+
'sjc',
2169+
]);
21622170
// source_volume_id should NOT be set for fresh provision
21632171
const createVolumeCall = (flyClient.createVolumeWithFallback as Mock).mock
21642172
.calls[0][1] as Record<string, unknown>;
@@ -2196,16 +2204,23 @@ describe('start: 412 insufficient resources recovery', () => {
21962204

21972205
await instance.start('user-1');
21982206

2199-
// Volume was forked (source_volume_id set) with compute hint and reversed regions
2207+
// Volume was forked (source_volume_id set) with compute hint and deprioritized regions
22002208
const regionsForkCall = (flyClient.createVolumeWithFallback as Mock).mock.calls[0];
22012209
expect(regionsForkCall[1]).toEqual(
22022210
expect.objectContaining({
22032211
source_volume_id: 'vol-1',
22042212
compute: expect.objectContaining({ cpus: 2, memory_mb: 3072 }) as unknown,
22052213
})
22062214
);
2207-
// Regions are reversed from FLY_REGION='us,eu' so EU is tried first on recovery
2208-
expect(regionsForkCall[2]).toEqual(['eu', 'us']);
2215+
// Regions are shuffled — check the set
2216+
expect((regionsForkCall[2] as string[]).sort()).toEqual([
2217+
'dfw',
2218+
'eu',
2219+
'ewr',
2220+
'iad',
2221+
'lax',
2222+
'sjc',
2223+
]);
22092224
// Old volume was deleted
22102225
expect(flyClient.deleteVolume).toHaveBeenCalledWith(expect.anything(), 'vol-1');
22112226
// Machine was retried
@@ -2269,8 +2284,15 @@ describe('start: 412 insufficient resources recovery', () => {
22692284
compute: expect.objectContaining({ cpus: 2, memory_mb: 3072 }) as unknown,
22702285
})
22712286
);
2272-
// Regions are reversed from FLY_REGION='us,eu' so EU is tried first on recovery
2273-
expect(regionsUpdateCall[2]).toEqual(['eu', 'us']);
2287+
// Regions are shuffled then deprioritized — check the set
2288+
expect((regionsUpdateCall[2] as string[]).sort()).toEqual([
2289+
'dfw',
2290+
'eu',
2291+
'ewr',
2292+
'iad',
2293+
'lax',
2294+
'sjc',
2295+
]);
22742296
// New machine was created
22752297
expect(storage._store.get('flyMachineId')).toBe('machine-new');
22762298
expect(storage._store.get('flyVolumeId')).toBe('vol-new');

kiloclaw/src/durable-objects/kiloclaw-instance/fly-machines.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ import {
88
DEFAULT_FLY_REGION,
99
STALE_PROVISION_THRESHOLD_MS,
1010
} from '../../config';
11-
import { parseRegions, shuffleRegions } from '../regions';
11+
import { parseRegions, shuffleRegions, deprioritizeRegion } from '../regions';
1212
import { guestFromSize, volumeNameFromSandboxId } from '../machine-config';
1313
import type { InstanceMutableState } from './types';
1414
import { storageUpdate } from './state';
@@ -65,7 +65,8 @@ export async function replaceStrandedVolume(
6565
const oldVolumeId = state.flyVolumeId;
6666
const oldRegion = state.flyRegion;
6767
const hasUserData = state.lastStartedAt !== null;
68-
const regions = parseRegions(env.FLY_REGION ?? DEFAULT_FLY_REGION).reverse();
68+
const allRegions = shuffleRegions(parseRegions(env.FLY_REGION ?? DEFAULT_FLY_REGION));
69+
const regions = deprioritizeRegion(allRegions, oldRegion);
6970
const compute = guestFromSize(state.machineSize);
7071

7172
// Destroy existing machine if any — it's stuck on the constrained host.

kiloclaw/wrangler.jsonc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@
7070
"FLY_REGISTRY_APP": "kiloclaw-machines", // Shared Docker image registry
7171
"FLY_ORG_SLUG": "kilo-679", // Org for creating per-user Fly apps
7272
"FLY_IMAGE_TAG": "latest",
73-
"FLY_REGION": "us,eu",
73+
"FLY_REGION": "dfw,ewr,iad,lax,sjc,eu",
7474
"OPENCLAW_ALLOWED_ORIGINS": "https://claw.kilosessions.ai,https://kilo.ai,https://www.kilo.ai",
7575
"REQUIRE_PROXY_TOKEN": "true",
7676
// Defaults to "production". For local dev, override to "development" in .dev.vars.

0 commit comments

Comments
 (0)