Skip to content

Commit fde1280

Browse files
fix(backend): add filesystem-first scan to GC to handle orphaned disk resources (#973)
* fix(backend): add filesystem-first scan to GC to handle orphaned disk resources

  Adds cleanupOrphanedDiskResources(), which walks the repos/ and index/ directories on disk and removes any entries with no corresponding Repo record in the database. This handles desyncs caused by DB resets or cascade deletes that bypass the normal cleanup job flow.

  Also adds getRepoIdFromPath() to @sourcebot/shared and getRepoIdFromShardFileName() to the backend utils as inverse helpers to the existing getRepoPath() and getShardPrefix() functions.

  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* wip

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 65cceec commit fde1280

File tree

6 files changed

+88
-5
lines changed

6 files changed

+88
-5
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1515
### Fixed
1616
- Preserve regex and case sensitivity query parameters when loading more search results. [#972](https://github.com/sourcebot-dev/sourcebot/pull/972)
1717
- Fixed page navigation failing after Next.js 16 upgrade by removing `router.refresh()` calls immediately following `router.push()`. [#974](https://github.com/sourcebot-dev/sourcebot/pull/974)
18+
- Add filesystem-first GC scan to remove orphaned repo directories and index shards that have no corresponding database record. [#973](https://github.com/sourcebot-dev/sourcebot/pull/973)
19+
1820

1921
## [4.13.1] - 2026-02-28
2022

packages/backend/src/index.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ const repoIndexManager = new RepoIndexManager(prisma, settings, redis, promClien
6666
const configManager = new ConfigManager(prisma, connectionManager, env.CONFIG_PATH);
6767

6868
connectionManager.startScheduler();
69-
repoIndexManager.startScheduler();
69+
await repoIndexManager.startScheduler();
7070

7171
if (env.EXPERIMENT_EE_PERMISSION_SYNC_ENABLED === 'true' && !hasEntitlement('permission-syncing')) {
7272
logger.error('Permission syncing is not supported in current plan. Please contact team@sourcebot.dev for assistance.');

packages/backend/src/repoIndexManager.ts

Lines changed: 71 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,18 @@
11
import * as Sentry from '@sentry/node';
22
import { PrismaClient, Repo, RepoIndexingJobStatus, RepoIndexingJobType } from "@sourcebot/db";
3-
import { createLogger, env, getRepoPath, Logger, RepoIndexingJobMetadata, repoIndexingJobMetadataSchema, RepoMetadata, repoMetadataSchema } from "@sourcebot/shared";
3+
import { createLogger, env, getRepoPath, Logger, getRepoIdFromPath, RepoIndexingJobMetadata, repoIndexingJobMetadataSchema, RepoMetadata, repoMetadataSchema } from "@sourcebot/shared";
44
import { DelayedError, Job, Queue, Worker } from "bullmq";
55
import { existsSync } from 'fs';
66
import { readdir, rm } from 'fs/promises';
77
import { Redis } from 'ioredis';
88
import micromatch from 'micromatch';
99
import Redlock, { ExecutionError } from 'redlock';
10-
import { INDEX_CACHE_DIR, WORKER_STOP_GRACEFUL_TIMEOUT_MS } from './constants.js';
10+
import { INDEX_CACHE_DIR, REPOS_CACHE_DIR, WORKER_STOP_GRACEFUL_TIMEOUT_MS } from './constants.js';
1111
import { cloneRepository, fetchRepository, getBranches, getCommitHashForRefName, getLatestCommitTimestamp, getLocalDefaultBranch, getTags, isPathAValidGitRepoRoot, isRepoEmpty, unsetGitConfig, upsertGitConfig } from './git.js';
1212
import { captureEvent } from './posthog.js';
1313
import { PromClient } from './promClient.js';
1414
import { RepoWithConnections, Settings } from "./types.js";
15-
import { getAuthCredentialsForRepo, getShardPrefix, measure, setIntervalAsync } from './utils.js';
15+
import { getAuthCredentialsForRepo, getRepoIdFromShardFileName, getShardPrefix, measure, setIntervalAsync } from './utils.js';
1616
import { cleanupTempShards, indexGitRepository } from './zoekt.js';
1717

1818
const LOG_TAG = 'repo-index-manager';
@@ -96,8 +96,10 @@ export class RepoIndexManager {
9696
});
9797
}
9898

99-
public startScheduler() {
99+
public async startScheduler() {
100100
logger.debug('Starting scheduler');
101+
// Cleanup any orphaned disk resources on startup
102+
await this.cleanupOrphanedDiskResources();
101103
this.interval = setIntervalAsync(async () => {
102104
await this.scheduleIndexJobs();
103105
await this.scheduleCleanupJobs();
@@ -637,6 +639,71 @@ export class RepoIndexManager {
637639
}
638640
}
639641

642+
/**
 * Filesystem-first garbage collection pass.
 *
 * Walks the repo clone directory and the index shard directory and deletes
 * every entry whose repo id has no matching Repo row in the database. This
 * covers cases where disk state and DB state have drifted apart.
 *
 * Layout assumed (per the helpers used below):
 *   - clones: REPOS_CACHE_DIR/<repoId>/
 *   - shards: INDEX_CACHE_DIR/<orgId>_<repoId>_*
 *
 * Entries whose name does not yield a repo id are left untouched.
 */
private async cleanupOrphanedDiskResources() {
    // Returns the subset of `candidateIds` that still has a Repo record.
    const findExistingRepoIds = async (candidateIds: number[]): Promise<Set<number>> => {
        const rows = await this.db.repo.findMany({
            where: { id: { in: candidateIds } },
            select: { id: true },
        });
        return new Set(rows.map(row => row.id));
    };

    // ---- Pass 1: cloned repo directories ----
    if (existsSync(REPOS_CACHE_DIR)) {
        const pathByRepoId = new Map<number, string>();
        const dirNames = await readdir(REPOS_CACHE_DIR);
        for (const dirName of dirNames) {
            const candidatePath = `${REPOS_CACHE_DIR}/${dirName}`;
            const parsedId = getRepoIdFromPath(candidatePath);
            if (parsedId === undefined) {
                continue;
            }
            pathByRepoId.set(parsedId, candidatePath);
        }

        // Skip the DB round trip entirely when nothing on disk looks like a repo dir.
        if (pathByRepoId.size > 0) {
            const knownIds = await findExistingRepoIds([...pathByRepoId.keys()]);
            for (const [repoId, orphanPath] of pathByRepoId) {
                if (knownIds.has(repoId)) {
                    continue;
                }
                logger.info(`Removing orphaned repo directory with no DB record: ${orphanPath}`);
                await rm(orphanPath, { recursive: true, force: true });
            }
        }
    }

    // ---- Pass 2: index shard files ----
    if (existsSync(INDEX_CACHE_DIR)) {
        // A single repo may own several shard files, so group them by repo id.
        const shardsByRepoId = new Map<number, string[]>();
        const fileNames = await readdir(INDEX_CACHE_DIR);
        for (const fileName of fileNames) {
            const parsedId = getRepoIdFromShardFileName(fileName);
            if (parsedId === undefined) {
                continue;
            }
            const bucket = shardsByRepoId.get(parsedId);
            if (bucket === undefined) {
                shardsByRepoId.set(parsedId, [fileName]);
            } else {
                bucket.push(fileName);
            }
        }

        if (shardsByRepoId.size > 0) {
            const knownIds = await findExistingRepoIds([...shardsByRepoId.keys()]);
            for (const [repoId, shardNames] of shardsByRepoId) {
                if (knownIds.has(repoId)) {
                    continue;
                }
                for (const shardName of shardNames) {
                    const shardPath = `${INDEX_CACHE_DIR}/${shardName}`;
                    logger.info(`Removing orphaned index shard with no DB record: ${shardPath}`);
                    await rm(shardPath, { force: true });
                }
            }
        }
    }
}
706+
640707
public async dispose() {
641708
if (this.interval) {
642709
clearInterval(this.interval);

packages/backend/src/utils.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,14 @@ export const getShardPrefix = (orgId: number, repoId: number) => {
5757
return `${orgId}_${repoId}`;
5858
}
5959

60+
/**
 * Extracts the repo id from an index shard file name.
 *
 * Recognises names that begin with `<digits>_<digits>_`, i.e. the
 * `${orgId}_${repoId}` prefix built by getShardPrefix followed by an
 * underscore, and returns the second number.
 *
 * @param fileName - Bare file name of the shard (no directory component).
 * @returns The parsed repo id, or `undefined` when the name does not start
 *          with the expected prefix.
 */
export const getRepoIdFromShardFileName = (fileName: string): number | undefined => {
    const parsed = /^(\d+)_(\d+)_/.exec(fileName);
    // Capture group 1 is the org id; group 2 is the repo id we want.
    return parsed === null ? undefined : Number.parseInt(parsed[2], 10);
}
67+
6068
export const fetchWithRetry = async <T>(
6169
fetchFn: () => Promise<T>,
6270
identifier: string,

packages/shared/src/index.server.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ export {
2424
loadJsonFile,
2525
getConfigSettings,
2626
getRepoPath,
27+
getRepoIdFromPath,
2728
} from "./utils.js";
2829
export * from "./constants.js";
2930
export {

packages/shared/src/utils.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,11 @@ export const getConfigSettings = async (configPath?: string): Promise<ConfigSett
8787
}
8888
}
8989

90+
/**
 * Derives a repo id from a repo directory path by reading its final path
 * segment (e.g. `<cache>/repos/123` -> `123`).
 *
 * The segment must consist solely of decimal digits. `parseInt` on its own
 * accepts any numeric prefix, so names such as `12.bak` or `7abc` would be
 * mistaken for repo 12 / repo 7; callers that delete directories based on
 * this id must not see those as matches.
 *
 * @param repoPath - Path whose basename is expected to be a repo id.
 * @returns The repo id, or `undefined` when the basename is not a plain
 *          non-negative integer that fits in a safe integer.
 */
export const getRepoIdFromPath = (repoPath: string): number | undefined => {
    const baseName = path.basename(repoPath);
    if (!/^\d+$/.test(baseName)) {
        return undefined;
    }

    const id = Number.parseInt(baseName, 10);
    // Reject absurdly long digit strings that cannot round-trip as an id.
    return Number.isSafeInteger(id) ? id : undefined;
}
94+
9095
export const getRepoPath = (repo: Repo): { path: string, isReadOnly: boolean } => {
9196
// If we are dealing with a local repository, then use that as the path.
9297
// Mark as read-only since we aren't guaranteed to have write access to the local filesystem.

0 commit comments

Comments
 (0)