Skip to content

Commit 5a15a7e

Browse files
fix(backend): prevent duplicate index jobs from indexedAt race (#1298)
* fix(backend): prevent duplicate index jobs from indexedAt race onJobCompleted marked the index job COMPLETED in one write, then ran several git reads (isRepoEmpty, getCommitHashForRefName, getLatestCommitTimestamp, getLocalDefaultBranch), then updated repo.indexedAt in a separate write. During the git-read window the job was already COMPLETED but indexedAt was still stale, so the scheduler (scheduleIndexJobs) saw no active job and an out-of-date indexedAt and scheduled a duplicate index job. On large repos the window is long enough to be hit routinely by the 1s scheduler poll. Run the git reads first, then write status=COMPLETED and indexedAt together in a single repoIndexingJob.update (nested repo update). Now the job stays IN_PROGRESS until the moment indexedAt becomes fresh, so the scheduler's two guards can never both pass at once. Fixes SOU-1150 Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> * docs: add CHANGELOG entry for #1298 Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent e668c3f commit 5a15a7e

3 files changed

Lines changed: 56 additions & 49 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
2525
- Surfaced an actionable error when the Lighthouse licensing service is unreachable, instead of a generic "unexpected error". [#1293](https://github.com/sourcebot-dev/sourcebot/pull/1293)
2626
- Fixed the selected language model rapidly flipping in local storage after a language model was removed. [#1295](https://github.com/sourcebot-dev/sourcebot/pull/1295)
2727
- Fixed issue where using multiple identity providers of the same type (e.g., gitlab) would result in unexpected behaviours. [#1177](https://github.com/sourcebot-dev/sourcebot/pull/1177)
28+
- Fixed a race condition where large repositories could be indexed twice within a single reindex interval. [#1298](https://github.com/sourcebot-dev/sourcebot/pull/1298)
2829

2930
## [5.0.1] - 2026-06-04
3031

packages/backend/src/repoIndexManager.test.ts

Lines changed: 17 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -525,17 +525,16 @@ describe('RepoIndexManager', () => {
525525

526526
manager = new RepoIndexManager(mockPrisma, mockSettings, mockRedis, mockPromClient as any);
527527

528+
// The onJobCompleted handler reads the job via findUniqueOrThrow, then marks it
529+
// COMPLETED and updates the repo (indexedAt, etc.) in a single repoIndexingJob.update
530+
// with a nested repo update.
528531
(mockPrisma.repoIndexingJob.findUniqueOrThrow as Mock).mockResolvedValue({
529-
status: RepoIndexingJobStatus.PENDING,
530-
});
531-
// The onJobCompleted handler calls repoIndexingJob.update once and then repo.update
532-
(mockPrisma.repoIndexingJob.update as Mock).mockResolvedValue({
533532
type: RepoIndexingJobType.INDEX,
534533
repoId: repo.id,
535534
repo,
536535
metadata: {},
537536
});
538-
(mockPrisma.repo.update as Mock).mockResolvedValue(repo);
537+
(mockPrisma.repoIndexingJob.update as Mock).mockResolvedValue({ repo });
539538

540539
// Get the onCompleted handler
541540
const onCompletedHandler = mockWorkerOn.mock.calls.find((call: unknown[]) => call[0] === 'completed')?.[1];
@@ -552,22 +551,20 @@ describe('RepoIndexManager', () => {
552551

553552
await onCompletedHandler(mockJob);
554553

554+
// The job status and indexedAt must be written together (single transaction) to
555+
// close the race where the scheduler sees a completed job but a stale indexedAt.
555556
expect(mockPrisma.repoIndexingJob.update).toHaveBeenCalledWith(
556557
expect.objectContaining({
557558
where: { id: 'job-1' },
558559
data: expect.objectContaining({
559560
status: RepoIndexingJobStatus.COMPLETED,
560561
completedAt: expect.any(Date),
561-
}),
562-
})
563-
);
564-
565-
expect(mockPrisma.repo.update).toHaveBeenCalledWith(
566-
expect.objectContaining({
567-
where: { id: repo.id },
568-
data: expect.objectContaining({
569-
indexedAt: expect.any(Date),
570-
indexedCommitHash: 'abc123',
562+
repo: {
563+
update: expect.objectContaining({
564+
indexedAt: expect.any(Date),
565+
indexedCommitHash: 'abc123',
566+
}),
567+
},
571568
}),
572569
})
573570
);
@@ -754,14 +751,12 @@ describe('RepoIndexManager', () => {
754751
manager = new RepoIndexManager(mockPrisma, mockSettings, mockRedis, mockPromClient as any);
755752

756753
(mockPrisma.repoIndexingJob.findUniqueOrThrow as Mock).mockResolvedValue({
757-
status: RepoIndexingJobStatus.PENDING,
758-
});
759-
(mockPrisma.repoIndexingJob.update as Mock).mockResolvedValue({
760754
type: RepoIndexingJobType.CLEANUP,
761755
repoId: repo.id,
762756
repo,
763757
metadata: {},
764758
});
759+
(mockPrisma.repoIndexingJob.update as Mock).mockResolvedValue({ repo });
765760
(mockPrisma.repo.delete as Mock).mockResolvedValue(repo);
766761

767762
const onCompletedHandler = mockWorkerOn.mock.calls.find((call: unknown[]) => call[0] === 'completed')?.[1];
@@ -840,13 +835,13 @@ describe('RepoIndexManager', () => {
840835

841836
manager = new RepoIndexManager(mockPrisma, mockSettings, mockRedis, mockPromClient as any);
842837

843-
(mockPrisma.repoIndexingJob.update as Mock).mockResolvedValue({
838+
(mockPrisma.repoIndexingJob.findUniqueOrThrow as Mock).mockResolvedValue({
844839
type: RepoIndexingJobType.INDEX,
845840
repoId: repo.id,
846841
repo,
847842
metadata: {},
848843
});
849-
(mockPrisma.repo.update as Mock).mockResolvedValue(repo);
844+
(mockPrisma.repoIndexingJob.update as Mock).mockResolvedValue({ repo });
850845

851846
const onCompletedHandler = mockWorkerOn.mock.calls.find((call: unknown[]) => call[0] === 'completed')?.[1];
852847

@@ -867,9 +862,9 @@ describe('RepoIndexManager', () => {
867862
data: expect.objectContaining({
868863
status: RepoIndexingJobStatus.COMPLETED,
869864
repo: {
870-
update: {
865+
update: expect.objectContaining({
871866
latestIndexingJobStatus: RepoIndexingJobStatus.COMPLETED,
872-
},
867+
}),
873868
},
874869
}),
875870
})

packages/backend/src/repoIndexManager.ts

Lines changed: 38 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -545,23 +545,16 @@ export class RepoIndexManager {
545545
private async onJobCompleted(job: Job<JobPayload>) {
546546
try {
547547
const logger = createJobLogger(job.data.jobId);
548-
const jobData = await this.db.repoIndexingJob.update({
548+
const jobData = await this.db.repoIndexingJob.findUniqueOrThrow({
549549
where: { id: job.data.jobId },
550-
data: {
551-
status: RepoIndexingJobStatus.COMPLETED,
552-
completedAt: new Date(),
553-
repo: {
554-
update: {
555-
latestIndexingJobStatus: RepoIndexingJobStatus.COMPLETED,
556-
}
557-
}
558-
},
559550
include: {
560551
repo: true,
561552
}
562553
});
563554

564555
const jobTypeLabel = getJobTypePrometheusLabel(jobData.type);
556+
// @note: capture this before the update below, since the update sets indexedAt.
557+
const isFirstIndex = jobData.repo.indexedAt === null;
565558

566559
if (jobData.type === RepoIndexingJobType.INDEX) {
567560
const { path: repoPath } = getRepoPath(jobData.repo);
@@ -576,29 +569,47 @@ export class RepoIndexManager {
576569

577570
const jobMetadata = repoIndexingJobMetadataSchema.parse(jobData.metadata);
578571

579-
const repo = await this.db.repo.update({
580-
where: { id: jobData.repoId },
572+
const { repo } = await this.db.repoIndexingJob.update({
573+
where: { id: job.data.jobId },
581574
data: {
582-
indexedAt: new Date(),
583-
indexedCommitHash: commitHash,
584-
pushedAt: pushedAt,
585-
metadata: {
586-
...(jobData.repo.metadata as RepoMetadata),
587-
indexedRevisions: jobMetadata.indexedRevisions,
588-
} satisfies RepoMetadata,
589-
// @note: always update the default branch. While this field can be set
590-
// during connection syncing, by setting it here we ensure that a) the
591-
// default branch is as up to date as possible (since repo indexing happens
592-
// more frequently than connection syncing) and b) for hosts where it is
593-
// impossible to determine the default branch from the host's API
594-
// (e.g., generic git url), we still set the default branch here.
595-
defaultBranch: defaultBranch,
575+
status: RepoIndexingJobStatus.COMPLETED,
576+
completedAt: new Date(),
577+
repo: {
578+
update: {
579+
latestIndexingJobStatus: RepoIndexingJobStatus.COMPLETED,
580+
indexedAt: new Date(),
581+
indexedCommitHash: commitHash,
582+
pushedAt: pushedAt,
583+
metadata: {
584+
...(jobData.repo.metadata as RepoMetadata),
585+
indexedRevisions: jobMetadata.indexedRevisions,
586+
} satisfies RepoMetadata,
587+
// @note: always update the default branch. While this field can be set
588+
// during connection syncing, by setting it here we ensure that a) the
589+
// default branch is as up to date as possible (since repo indexing happens
590+
// more frequently than connection syncing) and b) for hosts where it is
591+
// impossible to determine the default branch from the host's API
592+
// (e.g., generic git url), we still set the default branch here.
593+
defaultBranch: defaultBranch,
594+
}
595+
}
596+
},
597+
include: {
598+
repo: true,
596599
}
597600
});
598601

599602
logger.debug(`Completed index job ${job.data.jobId} for repo ${repo.name} (id: ${repo.id})`);
600603
}
601604
else if (jobData.type === RepoIndexingJobType.CLEANUP) {
605+
await this.db.repoIndexingJob.update({
606+
where: { id: job.data.jobId },
607+
data: {
608+
status: RepoIndexingJobStatus.COMPLETED,
609+
completedAt: new Date(),
610+
}
611+
});
612+
602613
const repo = await this.db.repo.delete({
603614
where: { id: jobData.repoId },
604615
});
@@ -610,7 +621,7 @@ export class RepoIndexManager {
610621
this.promClient.activeRepoIndexJobs.dec({ repo: job.data.repoName, type: jobTypeLabel });
611622
this.promClient.repoIndexJobSuccessTotal.inc({ repo: job.data.repoName, type: jobTypeLabel });
612623

613-
if (jobData.type === RepoIndexingJobType.INDEX && jobData.repo.indexedAt === null) {
624+
if (jobData.type === RepoIndexingJobType.INDEX && isFirstIndex) {
614625
captureEvent('backend_repo_first_indexed', {
615626
repoId: job.data.repoId,
616627
type: jobData.repo.external_codeHostType,

0 commit comments

Comments
 (0)