Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Fixed
- Preserve regex and case sensitivity query parameters when loading more search results. [#972](https://github.com/sourcebot-dev/sourcebot/pull/972)
- Add filesystem-first GC scan to remove orphaned repo directories and index shards that have no corresponding database record. [#973](https://github.com/sourcebot-dev/sourcebot/pull/973)

## [4.13.1] - 2026-02-28

Expand Down
50 changes: 47 additions & 3 deletions packages/backend/src/repoIndexManager.ts
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
import * as Sentry from '@sentry/node';
import { PrismaClient, Repo, RepoIndexingJobStatus, RepoIndexingJobType } from "@sourcebot/db";
import { createLogger, env, getRepoPath, Logger, RepoIndexingJobMetadata, repoIndexingJobMetadataSchema, RepoMetadata, repoMetadataSchema } from "@sourcebot/shared";
import { createLogger, env, getRepoPath, Logger, getRepoIdFromPath, RepoIndexingJobMetadata, repoIndexingJobMetadataSchema, RepoMetadata, repoMetadataSchema } from "@sourcebot/shared";
import { DelayedError, Job, Queue, Worker } from "bullmq";
import { existsSync } from 'fs';
import { readdir, rm } from 'fs/promises';
import { Redis } from 'ioredis';
import micromatch from 'micromatch';
import Redlock, { ExecutionError } from 'redlock';
import { INDEX_CACHE_DIR, WORKER_STOP_GRACEFUL_TIMEOUT_MS } from './constants.js';
import { INDEX_CACHE_DIR, REPOS_CACHE_DIR, WORKER_STOP_GRACEFUL_TIMEOUT_MS } from './constants.js';
import { cloneRepository, fetchRepository, getBranches, getCommitHashForRefName, getLatestCommitTimestamp, getLocalDefaultBranch, getTags, isPathAValidGitRepoRoot, isRepoEmpty, unsetGitConfig, upsertGitConfig } from './git.js';
import { captureEvent } from './posthog.js';
import { PromClient } from './promClient.js';
import { RepoWithConnections, Settings } from "./types.js";
import { getAuthCredentialsForRepo, getShardPrefix, measure, setIntervalAsync } from './utils.js';
import { getAuthCredentialsForRepo, getRepoIdFromShardFileName, getShardPrefix, measure, setIntervalAsync } from './utils.js';
import { cleanupTempShards, indexGitRepository } from './zoekt.js';

const LOG_TAG = 'repo-index-manager';
Expand Down Expand Up @@ -170,6 +170,8 @@ export class RepoIndexManager {
}

private async scheduleCleanupJobs() {
await this.cleanupOrphanedDiskResources();

const gcGracePeriodMs = new Date(Date.now() - this.settings.repoGarbageCollectionGracePeriodMs);
const timeoutDate = new Date(Date.now() - this.settings.repoIndexTimeoutMs);

Expand Down Expand Up @@ -637,6 +639,48 @@ export class RepoIndexManager {
}
}

// Scans the repos and index directories on disk and removes any entries
// that have no corresponding Repo record in the database. This handles
// edge cases where the DB and disk resources are out of sync.
//
// Entries whose name does not yield a repo id are left untouched.
private async cleanupOrphanedDiskResources() {
    // Memoize the DB lookup per repo id for the duration of this scan. The
    // same id can show up on a repo directory and on multiple shard files
    // (<orgId>_<repoId>_*.zoekt), so without this the same row would be
    // queried once per directory entry.
    const repoExistsById = new Map<number, boolean>();
    const repoExists = async (repoId: number): Promise<boolean> => {
        const cached = repoExistsById.get(repoId);
        if (cached !== undefined) {
            return cached;
        }

        const repo = await this.db.repo.findUnique({ where: { id: repoId } });
        const exists = Boolean(repo);
        repoExistsById.set(repoId, exists);
        return exists;
    };

    // --- Repo directories ---
    // Dirs are named by repoId: DATA_CACHE_DIR/repos/<repoId>/
    if (existsSync(REPOS_CACHE_DIR)) {
        const entries = await readdir(REPOS_CACHE_DIR);
        for (const entry of entries) {
            const repoPath = `${REPOS_CACHE_DIR}/${entry}`;
            const repoId = getRepoIdFromPath(repoPath);
            if (repoId === undefined) {
                continue;
            }

            if (!(await repoExists(repoId))) {
                logger.info(`Removing orphaned repo directory with no DB record: ${repoPath}`);
                await rm(repoPath, { recursive: true, force: true });
            }
        }
    }

    // --- Index shards ---
    // Shard files are prefixed with <orgId>_<repoId>: DATA_CACHE_DIR/index/<orgId>_<repoId>_*.zoekt
    if (existsSync(INDEX_CACHE_DIR)) {
        const entries = await readdir(INDEX_CACHE_DIR);
        for (const entry of entries) {
            const repoId = getRepoIdFromShardFileName(entry);
            if (repoId === undefined) {
                continue;
            }

            if (!(await repoExists(repoId))) {
                const shardPath = `${INDEX_CACHE_DIR}/${entry}`;
                logger.info(`Removing orphaned index shard with no DB record: ${shardPath}`);
                await rm(shardPath, { force: true });
            }
        }
    }
}

public async dispose() {
if (this.interval) {
clearInterval(this.interval);
Expand Down
8 changes: 8 additions & 0 deletions packages/backend/src/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,14 @@ export const getShardPrefix = (orgId: number, repoId: number) => {
return `${orgId}_${repoId}`;
}

/**
 * Extracts the repo id from an index shard file name.
 *
 * The name must begin with two underscore-terminated runs of digits
 * (`<digits>_<digits>_`); the second run is parsed as a base-10 integer.
 *
 * @param fileName - Shard file name (not a full path).
 * @returns The parsed second numeric segment, or `undefined` when the
 * name does not start with that prefix.
 */
export const getRepoIdFromShardFileName = (fileName: string): number | undefined => {
    const prefixMatch = /^(\d+)_(\d+)_/.exec(fileName);
    return prefixMatch === null ? undefined : parseInt(prefixMatch[2], 10);
}

export const fetchWithRetry = async <T>(
fetchFn: () => Promise<T>,
identifier: string,
Expand Down
1 change: 1 addition & 0 deletions packages/shared/src/index.server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ export {
loadJsonFile,
getConfigSettings,
getRepoPath,
getRepoIdFromPath,
} from "./utils.js";
export * from "./constants.js";
export {
Expand Down
5 changes: 5 additions & 0 deletions packages/shared/src/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,11 @@ export const getConfigSettings = async (configPath?: string): Promise<ConfigSett
}
}

/**
 * Extracts the numeric repo id from a path whose final segment is the id.
 *
 * The final segment must consist solely of decimal digits. `parseInt` alone
 * is too lenient here: it stops at the first non-digit, so names such as
 * `42.bak`, `7-old` or `1e5` would resolve to ids 42, 7 and 1. Callers use
 * the result to decide whether a directory may be deleted, so a partial
 * match must not be treated as an id.
 *
 * @param repoPath - Path to a directory named after a repo id.
 * @returns The id, or `undefined` when the final segment is not a
 * non-negative safe integer written in plain digits.
 */
export const getRepoIdFromPath = (repoPath: string): number | undefined => {
    const dirName = path.basename(repoPath);
    if (!/^\d+$/.test(dirName)) {
        return undefined;
    }

    const id = parseInt(dirName, 10);
    // Guard against digit strings too long to be represented exactly.
    return Number.isSafeInteger(id) ? id : undefined;
}
Comment thread
brendan-kellam marked this conversation as resolved.

export const getRepoPath = (repo: Repo): { path: string, isReadOnly: boolean } => {
// If we are dealing with a local repository, then use that as the path.
// Mark as read-only since we aren't guaranteed to have write access to the local filesystem.
Expand Down