Skip to content

Commit 278172a

Browse files
msukkaricursoragentbrendan-kellam
authored
fix(worker): Add tmp shard cleanup on indexing failure (#805)
* fix: cleanup temporary shard files on failed indexing When zoekt-git-index fails during repository indexing, it can leave behind .tmp shard files that accumulate over time and fill up disk space. This is especially problematic for large repos that repeatedly fail to index. Changes: - Add cleanupTempShards() function to zoekt.ts that removes temporary shard files (files with .tmp in their name) for a specific repository - Call cleanupTempShards() in repoIndexManager.ts when indexGitRepository fails, before re-throwing the error This ensures that even if a repository consistently fails to index, the temporary files created during each attempt are cleaned up. Co-authored-by: michael <michael@sourcebot.dev> * fix merge issue * changelog * changelog --------- Co-authored-by: Cursor Agent <cursoragent@cursor.com> Co-authored-by: Brendan Kellam <bshizzle1234@gmail.com>
1 parent f14c52f commit 278172a

File tree

3 files changed

+49
-9
lines changed

3 files changed

+49
-9
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1616
### Fixed
1717
- Fixed issue where certain file and folder names would cause type errors. [#862](https://github.com/sourcebot-dev/sourcebot/pull/862)
1818
- Fixed token refresh error "Provider config not found or invalid for: x" when an SSO is configured using deprecated env vars. [#841](https://github.com/sourcebot-dev/sourcebot/pull/841)
19+
- Fixed issue where temporary shard files created on index failure were not being cleaned up. [#805](https://github.com/sourcebot-dev/sourcebot/pull/805)
1920

2021
## [4.10.27] - 2026-02-05
2122

packages/backend/src/repoIndexManager.ts

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,19 @@
11
import * as Sentry from '@sentry/node';
22
import { PrismaClient, Repo, RepoIndexingJobStatus, RepoIndexingJobType } from "@sourcebot/db";
3-
import { createLogger, Logger } from "@sourcebot/shared";
4-
import { env, RepoIndexingJobMetadata, repoIndexingJobMetadataSchema, RepoMetadata, repoMetadataSchema, getRepoPath } from '@sourcebot/shared';
3+
import { createLogger, env, getRepoPath, Logger, RepoIndexingJobMetadata, repoIndexingJobMetadataSchema, RepoMetadata, repoMetadataSchema } from "@sourcebot/shared";
4+
import { DelayedError, Job, Queue, Worker } from "bullmq";
55
import { existsSync } from 'fs';
66
import { readdir, rm } from 'fs/promises';
7-
import { DelayedError, Job, Queue, Worker } from "bullmq";
87
import { Redis } from 'ioredis';
9-
import Redlock, { ExecutionError } from 'redlock';
108
import micromatch from 'micromatch';
11-
import { WORKER_STOP_GRACEFUL_TIMEOUT_MS, INDEX_CACHE_DIR } from './constants.js';
9+
import Redlock, { ExecutionError } from 'redlock';
10+
import { INDEX_CACHE_DIR, WORKER_STOP_GRACEFUL_TIMEOUT_MS } from './constants.js';
1211
import { cloneRepository, fetchRepository, getBranches, getCommitHashForRefName, getLatestCommitTimestamp, getLocalDefaultBranch, getTags, isPathAValidGitRepoRoot, unsetGitConfig, upsertGitConfig } from './git.js';
1312
import { captureEvent } from './posthog.js';
1413
import { PromClient } from './promClient.js';
1514
import { RepoWithConnections, Settings } from "./types.js";
1615
import { getAuthCredentialsForRepo, getShardPrefix, measure, setIntervalAsync } from './utils.js';
17-
import { indexGitRepository } from './zoekt.js';
16+
import { cleanupTempShards, indexGitRepository } from './zoekt.js';
1817

1918
const LOG_TAG = 'repo-index-manager';
2019
const logger = createLogger(LOG_TAG);
@@ -478,9 +477,17 @@ export class RepoIndexManager {
478477
}
479478

480479
logger.info(`Indexing ${repo.name} (id: ${repo.id})...`);
481-
const { durationMs } = await measure(() => indexGitRepository(repo, this.settings, revisions, signal));
482-
const indexDuration_s = durationMs / 1000;
483-
logger.info(`Indexed ${repo.name} (id: ${repo.id}) in ${indexDuration_s}s`);
480+
try {
481+
const { durationMs } = await measure(() => indexGitRepository(repo, this.settings, revisions, signal));
482+
const indexDuration_s = durationMs / 1000;
483+
logger.info(`Indexed ${repo.name} (id: ${repo.id}) in ${indexDuration_s}s`);
484+
} catch (error) {
485+
// Clean up any temporary shard files left behind by the failed indexing operation.
486+
// Zoekt creates .tmp files during indexing which can accumulate if indexing fails repeatedly.
487+
logger.warn(`Indexing failed for ${repo.name} (id: ${repo.id}), cleaning up temp shard files...`);
488+
await cleanupTempShards(repo);
489+
throw error;
490+
}
484491

485492
return revisions;
486493
}

packages/backend/src/zoekt.ts

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import { Repo } from "@sourcebot/db";
22
import { createLogger, env, getRepoPath } from "@sourcebot/shared";
33
import { exec } from "child_process";
4+
import { readdir, rm } from "fs/promises";
45
import { INDEX_CACHE_DIR } from "./constants.js";
56
import { Settings } from "./types.js";
67
import { getShardPrefix } from "./utils.js";
@@ -54,3 +55,34 @@ export const indexGitRepository = async (repo: Repo, settings: Settings, revisio
5455
})
5556
});
5657
}
58+
59+
/**
 * Cleans up temporary shard files left behind by a failed indexing operation.
 * Zoekt creates temporary files (with `.tmp` in their name) during indexing,
 * which can be left behind if the indexing process fails or is interrupted.
 *
 * Cleanup is best effort: failures to list the index directory or to remove
 * an individual file are logged as warnings and never re-thrown, and a
 * failure on one file does not prevent the remaining files from being removed.
 *
 * @param repo - The repository whose temp shards should be cleaned up
 */
export const cleanupTempShards = async (repo: Repo) => {
    // NOTE(review): the `startsWith` match below assumes one repo's shard prefix
    // can never be a leading substring of another repo's prefix - confirm
    // against `getShardPrefix`.
    const shardPrefix = getShardPrefix(repo.orgId, repo.id);

    let files: string[];
    try {
        files = await readdir(INDEX_CACHE_DIR);
    } catch (error) {
        // Log but don't throw - cleanup is best effort
        logger.warn(`Failed to cleanup temp shards for repo ${repo.id}:`, error);
        return;
    }

    // A temp shard is any entry for this repo whose name contains `.tmp`.
    const tempFiles = files.filter(file =>
        file.startsWith(shardPrefix) && file.includes('.tmp')
    );

    let removedCount = 0;
    for (const file of tempFiles) {
        const filePath = `${INDEX_CACHE_DIR}/${file}`;
        try {
            logger.info(`Cleaning up temp shard file: ${filePath}`);
            // `force: true` makes a file that has already disappeared a no-op.
            await rm(filePath, { force: true });
            removedCount++;
        } catch (error) {
            // Keep going: one undeletable file must not leave the rest behind.
            logger.warn(`Failed to remove temp shard file ${filePath} for repo ${repo.id}:`, error);
        }
    }

    if (removedCount > 0) {
        logger.info(`Cleaned up ${removedCount} temp shard file(s) for repo ${repo.id}`);
    }
}

0 commit comments

Comments
 (0)