Skip to content

Commit 0423e5c

Browse files
fix(backend): add filesystem-first scan to GC to handle orphaned disk resources
Adds cleanupOrphanedDiskResources() which walks the repos/ and index/ directories on disk and removes any entries with no corresponding Repo record in the database. This handles desyncs caused by DB resets or cascade deletes that bypass the normal cleanup job flow. Also adds getRepoIdFromPath() to @sourcebot/shared and getRepoIdFromShardFileName() to the backend utils as inverse helpers to the existing getRepoPath() and getShardPrefix() functions. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 4004933 commit 0423e5c

File tree

4 files changed

+61
-3
lines changed

4 files changed

+61
-3
lines changed

packages/backend/src/repoIndexManager.ts

Lines changed: 47 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,18 @@
11
import * as Sentry from '@sentry/node';
22
import { PrismaClient, Repo, RepoIndexingJobStatus, RepoIndexingJobType } from "@sourcebot/db";
3-
import { createLogger, env, getRepoPath, Logger, RepoIndexingJobMetadata, repoIndexingJobMetadataSchema, RepoMetadata, repoMetadataSchema } from "@sourcebot/shared";
3+
import { createLogger, env, getRepoPath, Logger, getRepoIdFromPath, RepoIndexingJobMetadata, repoIndexingJobMetadataSchema, RepoMetadata, repoMetadataSchema } from "@sourcebot/shared";
44
import { DelayedError, Job, Queue, Worker } from "bullmq";
55
import { existsSync } from 'fs';
66
import { readdir, rm } from 'fs/promises';
77
import { Redis } from 'ioredis';
88
import micromatch from 'micromatch';
99
import Redlock, { ExecutionError } from 'redlock';
10-
import { INDEX_CACHE_DIR, WORKER_STOP_GRACEFUL_TIMEOUT_MS } from './constants.js';
10+
import { INDEX_CACHE_DIR, REPOS_CACHE_DIR, WORKER_STOP_GRACEFUL_TIMEOUT_MS } from './constants.js';
1111
import { cloneRepository, fetchRepository, getBranches, getCommitHashForRefName, getLatestCommitTimestamp, getLocalDefaultBranch, getTags, isPathAValidGitRepoRoot, isRepoEmpty, unsetGitConfig, upsertGitConfig } from './git.js';
1212
import { captureEvent } from './posthog.js';
1313
import { PromClient } from './promClient.js';
1414
import { RepoWithConnections, Settings } from "./types.js";
15-
import { getAuthCredentialsForRepo, getShardPrefix, measure, setIntervalAsync } from './utils.js';
15+
import { getAuthCredentialsForRepo, getRepoIdFromShardFileName, getShardPrefix, measure, setIntervalAsync } from './utils.js';
1616
import { cleanupTempShards, indexGitRepository } from './zoekt.js';
1717

1818
const LOG_TAG = 'repo-index-manager';
@@ -170,6 +170,8 @@ export class RepoIndexManager {
170170
}
171171

172172
private async scheduleCleanupJobs() {
173+
await this.cleanupOrphanedDiskResources();
174+
173175
const gcGracePeriodMs = new Date(Date.now() - this.settings.repoGarbageCollectionGracePeriodMs);
174176
const timeoutDate = new Date(Date.now() - this.settings.repoIndexTimeoutMs);
175177

@@ -637,6 +639,48 @@ export class RepoIndexManager {
637639
}
638640
}
639641

642+
/**
 * Scans the repos and index directories on disk and removes any entries
 * that have no corresponding Repo record in the database. This handles
 * edge cases where the DB and disk resources are out of sync.
 *
 * Repo ids found in each directory are resolved with a single batched
 * query rather than one lookup per entry, so a repo with many shards
 * (or a cache dir with many repos) costs one DB round-trip per directory.
 *
 * Errors from `readdir`, the DB query, or `rm` are not caught here and
 * propagate to the caller.
 */
private async cleanupOrphanedDiskResources() {
    // Returns the subset of `repoIds` that has no Repo row in the database.
    const findOrphanedRepoIds = async (repoIds: Iterable<number>): Promise<Set<number>> => {
        const orphaned = new Set<number>(repoIds);
        if (orphaned.size === 0) {
            // Nothing to look up; skip the DB round-trip entirely.
            return orphaned;
        }

        const existingRepos = await this.db.repo.findMany({
            where: { id: { in: Array.from(orphaned) } },
            select: { id: true },
        });
        for (const { id } of existingRepos) {
            orphaned.delete(id);
        }
        return orphaned;
    };

    // --- Repo directories ---
    // Dirs are named by repoId: DATA_CACHE_DIR/repos/<repoId>/
    if (existsSync(REPOS_CACHE_DIR)) {
        // Maps each candidate directory path to the repo id parsed from it.
        const repoIdByPath = new Map<string, number>();
        for (const entry of await readdir(REPOS_CACHE_DIR)) {
            const repoPath = `${REPOS_CACHE_DIR}/${entry}`;
            const repoId = getRepoIdFromPath(repoPath);
            if (repoId !== undefined) {
                repoIdByPath.set(repoPath, repoId);
            }
        }

        const orphanedRepoIds = await findOrphanedRepoIds(repoIdByPath.values());
        for (const [repoPath, repoId] of repoIdByPath) {
            if (!orphanedRepoIds.has(repoId)) {
                continue;
            }
            logger.info(`Removing orphaned repo directory with no DB record: ${repoPath}`);
            await rm(repoPath, { recursive: true, force: true });
        }
    }

    // --- Index shards ---
    // Shard files are prefixed with <orgId>_<repoId>: DATA_CACHE_DIR/index/<orgId>_<repoId>_*.zoekt
    if (existsSync(INDEX_CACHE_DIR)) {
        // Maps each shard file name to the repo id parsed from its prefix.
        // Several shards commonly share one repo id.
        const repoIdByShard = new Map<string, number>();
        for (const entry of await readdir(INDEX_CACHE_DIR)) {
            const repoId = getRepoIdFromShardFileName(entry);
            if (repoId !== undefined) {
                repoIdByShard.set(entry, repoId);
            }
        }

        const orphanedRepoIds = await findOrphanedRepoIds(repoIdByShard.values());
        for (const [entry, repoId] of repoIdByShard) {
            if (!orphanedRepoIds.has(repoId)) {
                continue;
            }
            const shardPath = `${INDEX_CACHE_DIR}/${entry}`;
            logger.info(`Removing orphaned index shard with no DB record: ${shardPath}`);
            await rm(shardPath, { force: true });
        }
    }
}
683+
640684
public async dispose() {
641685
if (this.interval) {
642686
clearInterval(this.interval);

packages/backend/src/utils.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,14 @@ export const getShardPrefix = (orgId: number, repoId: number) => {
5757
return `${orgId}_${repoId}`;
5858
}
5959

60+
/**
 * Extracts the repo id from a shard file name of the form
 * `<orgId>_<repoId>_...`, i.e. a name that starts with the prefix
 * produced by `getShardPrefix` followed by an underscore.
 *
 * @param fileName - The shard file name (not a full path).
 * @returns The parsed repo id, or `undefined` when the name does not
 * start with two underscore-terminated digit groups.
 */
export const getRepoIdFromShardFileName = (fileName: string): number | undefined => {
    const shardPrefixPattern = /^(\d+)_(\d+)_/;
    const captured = shardPrefixPattern.exec(fileName);
    // Group 1 is the org id; group 2 is the repo id.
    return captured === null ? undefined : parseInt(captured[2], 10);
}
67+
6068
export const fetchWithRetry = async <T>(
6169
fetchFn: () => Promise<T>,
6270
identifier: string,

packages/shared/src/index.server.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ export {
2424
loadJsonFile,
2525
getConfigSettings,
2626
getRepoPath,
27+
getRepoIdFromPath,
2728
} from "./utils.js";
2829
export * from "./constants.js";
2930
export {

packages/shared/src/utils.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,11 @@ export const getConfigSettings = async (configPath?: string): Promise<ConfigSett
8787
}
8888
}
8989

90+
/**
 * Extracts the repo id from a path whose final segment is the repo id
 * (e.g. `<cache dir>/repos/<repoId>`).
 *
 * The final segment must consist solely of decimal digits. `parseInt` on
 * its own would accept names such as `42.bak`, `42-old` or `-5`, which
 * would make unrelated directories look like they belong to a repo.
 *
 * @param repoPath - Path to a repo directory.
 * @returns The repo id, or `undefined` when the final path segment is not
 * a non-negative safe integer written in plain digits.
 */
export const getRepoIdFromPath = (repoPath: string): number | undefined => {
    const dirName = path.basename(repoPath);
    if (!/^\d+$/.test(dirName)) {
        return undefined;
    }

    const id = parseInt(dirName, 10);
    // Reject digit strings too large to be represented exactly.
    return Number.isSafeInteger(id) ? id : undefined;
}
94+
9095
export const getRepoPath = (repo: Repo): { path: string, isReadOnly: boolean } => {
9196
// If we are dealing with a local repository, then use that as the path.
9297
// Mark as read-only since we aren't guaranteed to have write access to the local filesystem.

0 commit comments

Comments
 (0)