From 74ba58abb545d930594329c7e48fa4820e481760 Mon Sep 17 00:00:00 2001 From: Brendan Kellam Date: Wed, 10 Jun 2026 16:11:53 -0700 Subject: [PATCH 1/2] fix(backend): prevent duplicate index jobs from indexedAt race onJobCompleted marked the index job COMPLETED in one write, then ran several git reads (isRepoEmpty, getCommitHashForRefName, getLatestCommitTimestamp, getLocalDefaultBranch), then updated repo.indexedAt in a separate write. During the git-read window the job was already COMPLETED but indexedAt was still stale, so the scheduler (scheduleIndexJobs) saw no active job and an out-of-date indexedAt and scheduled a duplicate index job. On large repos the window is long enough to be hit routinely by the 1s scheduler poll. Run the git reads first, then write status=COMPLETED and indexedAt together in a single repoIndexingJob.update (nested repo update). Now the job stays IN_PROGRESS until the moment indexedAt becomes fresh, so the scheduler's two guards can never both pass at once. Fixes SOU-1150 Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/backend/src/repoIndexManager.test.ts | 39 +++++------ packages/backend/src/repoIndexManager.ts | 65 +++++++++++-------- 2 files changed, 55 insertions(+), 49 deletions(-) diff --git a/packages/backend/src/repoIndexManager.test.ts b/packages/backend/src/repoIndexManager.test.ts index deb2ebdfd..684f1a826 100644 --- a/packages/backend/src/repoIndexManager.test.ts +++ b/packages/backend/src/repoIndexManager.test.ts @@ -525,17 +525,16 @@ describe('RepoIndexManager', () => { manager = new RepoIndexManager(mockPrisma, mockSettings, mockRedis, mockPromClient as any); + // The onJobCompleted handler reads the job via findUniqueOrThrow, then marks it + // COMPLETED and updates the repo (indexedAt, etc.) in a single repoIndexingJob.update + // with a nested repo update. (mockPrisma.repoIndexingJob.findUniqueOrThrow as Mock).mockResolvedValue({ - status: RepoIndexingJobStatus.PENDING, - }); - // The onJobCompleted handler calls repoIndexingJob.update once and then repo.update - (mockPrisma.repoIndexingJob.update as Mock).mockResolvedValue({ type: RepoIndexingJobType.INDEX, repoId: repo.id, repo, metadata: {}, }); - (mockPrisma.repo.update as Mock).mockResolvedValue(repo); + (mockPrisma.repoIndexingJob.update as Mock).mockResolvedValue({ repo }); // Get the onCompleted handler const onCompletedHandler = mockWorkerOn.mock.calls.find((call: unknown[]) => call[0] === 'completed')?.[1]; @@ -552,22 +551,20 @@ describe('RepoIndexManager', () => { await onCompletedHandler(mockJob); + // The job status and indexedAt must be written together (single transaction) to + // close the race where the scheduler sees a completed job but a stale indexedAt. expect(mockPrisma.repoIndexingJob.update).toHaveBeenCalledWith( expect.objectContaining({ where: { id: 'job-1' }, data: expect.objectContaining({ status: RepoIndexingJobStatus.COMPLETED, completedAt: expect.any(Date), - }), - }) - ); - - expect(mockPrisma.repo.update).toHaveBeenCalledWith( - expect.objectContaining({ - where: { id: repo.id }, - data: expect.objectContaining({ - indexedAt: expect.any(Date), - indexedCommitHash: 'abc123', + repo: { + update: expect.objectContaining({ + indexedAt: expect.any(Date), + indexedCommitHash: 'abc123', + }), + }, }), }) ); @@ -754,14 +751,12 @@ describe('RepoIndexManager', () => { manager = new RepoIndexManager(mockPrisma, mockSettings, mockRedis, mockPromClient as any); (mockPrisma.repoIndexingJob.findUniqueOrThrow as Mock).mockResolvedValue({ - status: RepoIndexingJobStatus.PENDING, - }); - (mockPrisma.repoIndexingJob.update as Mock).mockResolvedValue({ type: RepoIndexingJobType.CLEANUP, repoId: repo.id, repo, metadata: {}, }); + (mockPrisma.repoIndexingJob.update as Mock).mockResolvedValue({ repo }); (mockPrisma.repo.delete as Mock).mockResolvedValue(repo); const onCompletedHandler = mockWorkerOn.mock.calls.find((call: unknown[]) => call[0] === 'completed')?.[1]; @@ -840,13 +835,13 @@ describe('RepoIndexManager', () => { manager = new RepoIndexManager(mockPrisma, mockSettings, mockRedis, mockPromClient as any); - (mockPrisma.repoIndexingJob.update as Mock).mockResolvedValue({ + (mockPrisma.repoIndexingJob.findUniqueOrThrow as Mock).mockResolvedValue({ type: RepoIndexingJobType.INDEX, repoId: repo.id, repo, metadata: {}, }); - (mockPrisma.repo.update as Mock).mockResolvedValue(repo); + (mockPrisma.repoIndexingJob.update as Mock).mockResolvedValue({ repo }); const onCompletedHandler = mockWorkerOn.mock.calls.find((call: unknown[]) => call[0] === 'completed')?.[1]; @@ -867,9 +862,9 @@ describe('RepoIndexManager', () => { data: expect.objectContaining({ status: RepoIndexingJobStatus.COMPLETED, repo: { - update: { + update: expect.objectContaining({ latestIndexingJobStatus: RepoIndexingJobStatus.COMPLETED, - }, + }), }, }), }) diff --git a/packages/backend/src/repoIndexManager.ts b/packages/backend/src/repoIndexManager.ts index 0fb055a31..fdd51af9b 100644 --- a/packages/backend/src/repoIndexManager.ts +++ b/packages/backend/src/repoIndexManager.ts @@ -545,23 +545,16 @@ export class RepoIndexManager { private async onJobCompleted(job: Job) { try { const logger = createJobLogger(job.data.jobId); - const jobData = await this.db.repoIndexingJob.update({ + const jobData = await this.db.repoIndexingJob.findUniqueOrThrow({ where: { id: job.data.jobId }, - data: { - status: RepoIndexingJobStatus.COMPLETED, - completedAt: new Date(), - repo: { - update: { - latestIndexingJobStatus: RepoIndexingJobStatus.COMPLETED, - } - } - }, include: { repo: true, } }); const jobTypeLabel = getJobTypePrometheusLabel(jobData.type); + // @note: capture this before the update below, since the update sets indexedAt. + const isFirstIndex = jobData.repo.indexedAt === null; if (jobData.type === RepoIndexingJobType.INDEX) { const { path: repoPath } = getRepoPath(jobData.repo); @@ -576,29 +569,47 @@ export class RepoIndexManager { const jobMetadata = repoIndexingJobMetadataSchema.parse(jobData.metadata); - const repo = await this.db.repo.update({ - where: { id: jobData.repoId }, + const { repo } = await this.db.repoIndexingJob.update({ + where: { id: job.data.jobId }, data: { - indexedAt: new Date(), - indexedCommitHash: commitHash, - pushedAt: pushedAt, - metadata: { - ...(jobData.repo.metadata as RepoMetadata), - indexedRevisions: jobMetadata.indexedRevisions, - } satisfies RepoMetadata, - // @note: always update the default branch. While this field can be set - // during connection syncing, by setting it here we ensure that a) the - // default branch is as up to date as possible (since repo indexing happens - // more frequently than connection syncing) and b) for hosts where it is - // impossible to determine the default branch from the host's API - // (e.g., generic git url), we still set the default branch here. - defaultBranch: defaultBranch, + status: RepoIndexingJobStatus.COMPLETED, + completedAt: new Date(), + repo: { + update: { + latestIndexingJobStatus: RepoIndexingJobStatus.COMPLETED, + indexedAt: new Date(), + indexedCommitHash: commitHash, + pushedAt: pushedAt, + metadata: { + ...(jobData.repo.metadata as RepoMetadata), + indexedRevisions: jobMetadata.indexedRevisions, + } satisfies RepoMetadata, + // @note: always update the default branch. While this field can be set + // during connection syncing, by setting it here we ensure that a) the + // default branch is as up to date as possible (since repo indexing happens + // more frequently than connection syncing) and b) for hosts where it is + // impossible to determine the default branch from the host's API + // (e.g., generic git url), we still set the default branch here. + defaultBranch: defaultBranch, + } + } + }, + include: { + repo: true, } }); logger.debug(`Completed index job ${job.data.jobId} for repo ${repo.name} (id: ${repo.id})`); } else if (jobData.type === RepoIndexingJobType.CLEANUP) { + await this.db.repoIndexingJob.update({ + where: { id: job.data.jobId }, + data: { + status: RepoIndexingJobStatus.COMPLETED, + completedAt: new Date(), + } + }); + const repo = await this.db.repo.delete({ where: { id: jobData.repoId }, }); @@ -610,7 +621,7 @@ export class RepoIndexManager { this.promClient.activeRepoIndexJobs.dec({ repo: job.data.repoName, type: jobTypeLabel }); this.promClient.repoIndexJobSuccessTotal.inc({ repo: job.data.repoName, type: jobTypeLabel }); - if (jobData.type === RepoIndexingJobType.INDEX && jobData.repo.indexedAt === null) { + if (jobData.type === RepoIndexingJobType.INDEX && isFirstIndex) { captureEvent('backend_repo_first_indexed', { repoId: job.data.repoId, type: jobData.repo.external_codeHostType, From 6b2b9ea6e85938f87938ec7d3275d3916f29a53f Mon Sep 17 00:00:00 2001 From: Brendan Kellam Date: Wed, 10 Jun 2026 16:12:26 -0700 Subject: [PATCH 2/2] docs: add CHANGELOG entry for #1298 Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 957cd8ee4..b9e840082 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Surfaced an actionable error when the Lighthouse licensing service is unreachable, instead of a generic "unexpected error". [#1293](https://github.com/sourcebot-dev/sourcebot/pull/1293) - Fixed the selected language model rapidly flipping in local storage after a language model was removed. [#1295](https://github.com/sourcebot-dev/sourcebot/pull/1295) - Fixed issue where using multiple identity providers of the same type (e.g., gitlab) would result in unexpected behaviours. [#1177](https://github.com/sourcebot-dev/sourcebot/pull/1177) +- Fixed a race condition where large repositories could be indexed twice within a single reindex interval. [#1298](https://github.com/sourcebot-dev/sourcebot/pull/1298) ## [5.0.1] - 2026-06-04