From dd64739c9e06b2dee9ac22d1802cf474264c0d61 Mon Sep 17 00:00:00 2001 From: Rudraraj Sakariya <68952069+rudraasakariya@users.noreply.github.com> Date: Fri, 12 Jun 2026 08:33:53 -0400 Subject: [PATCH 1/2] fix(worker): re-index repos whose zoekt shards are missing from disk When the index directory is lost (e.g., it lives on ephemeral storage in a k8s deployment) while the database still marks repos as indexed, search silently returns empty results and nothing triggers a rebuild until the reindex interval elapses. Add a reconciliation step that runs on scheduler startup and on every scheduler poll: repos marked as indexed in the DB that have no index shards on disk get their indexedAt cleared, so the existing scheduler re-indexes them with its usual dedup and backoff guards. --- packages/backend/src/repoIndexManager.test.ts | 131 ++++++++++++++++++ packages/backend/src/repoIndexManager.ts | 62 +++++++++ 2 files changed, 193 insertions(+) diff --git a/packages/backend/src/repoIndexManager.test.ts b/packages/backend/src/repoIndexManager.test.ts index 684f1a826..c72f9fe67 100644 --- a/packages/backend/src/repoIndexManager.test.ts +++ b/packages/backend/src/repoIndexManager.test.ts @@ -37,6 +37,7 @@ vi.mock('@sourcebot/shared', () => ({ vi.mock('./constants.js', () => ({ WORKER_STOP_GRACEFUL_TIMEOUT_MS: 5000, INDEX_CACHE_DIR: 'test-data/index', + REPOS_CACHE_DIR: 'test-data/repos', })); vi.mock('./git.js', () => ({ @@ -65,6 +66,13 @@ vi.mock('./posthog.js', () => ({ vi.mock('./utils.js', () => ({ getAuthCredentialsForRepo: vi.fn().mockResolvedValue(null), getShardPrefix: vi.fn((orgId: number, repoId: number) => `${orgId}_${repoId}`), + getRepoIdFromShardFileName: vi.fn((fileName: string) => { + const match = fileName.match(/^(\d+)_(\d+)_/); + if (!match) { + return undefined; + } + return parseInt(match[2], 10); + }), measure: vi.fn(async (cb: () => Promise) => { const data = await cb(); return { data, durationMs: 100 }; @@ -148,6 +156,7 @@ const createMockPrisma = () => { repo: { findMany: vi.fn().mockResolvedValue([]), update: vi.fn(), + updateMany: vi.fn(), delete: vi.fn(), }, repoIndexingJob: { @@ -783,6 +792,128 @@ describe('RepoIndexManager', () => { }); }); + describe('Missing Shard Reconciliation', () => { + const indexedRepo = (id: number, name: string) => createMockRepo({ + id, + name, + indexedAt: new Date(), + indexedCommitHash: 'abc123', + }); + + test('clears indexedAt for indexed repos whose shard files are missing on startup', async () => { + (existsSync as Mock).mockImplementation((path: string) => path === 'test-data/index'); + // Repo 1 has a shard on disk; repo 2 does not. + (readdir as Mock).mockResolvedValue(['1_1_v16.00000.zoekt']); + (mockPrisma.repo.findMany as Mock).mockResolvedValue([ + indexedRepo(1, 'repo-with-shard'), + indexedRepo(2, 'repo-missing-shard'), + ]); + + manager = new RepoIndexManager(mockPrisma, mockSettings, mockRedis, mockPromClient as any); + await manager.startScheduler(); + + expect(mockPrisma.repo.updateMany).toHaveBeenCalledWith({ + where: { id: { in: [2] } }, + data: { indexedAt: null }, + }); + }); + + test('does not touch repos when all shards are present', async () => { + (existsSync as Mock).mockImplementation((path: string) => path === 'test-data/index'); + (readdir as Mock).mockResolvedValue(['1_1_v16.00000.zoekt', '1_2_v16.00000.zoekt']); + (mockPrisma.repo.findMany as Mock).mockResolvedValue([ + indexedRepo(1, 'repo-1'), + indexedRepo(2, 'repo-2'), + ]); + + manager = new RepoIndexManager(mockPrisma, mockSettings, mockRedis, mockPromClient as any); + await manager.startScheduler(); + + expect(mockPrisma.repo.updateMany).not.toHaveBeenCalled(); + }); + + test('marks all indexed repos as stale when the index directory is missing', async () => { + (existsSync as Mock).mockReturnValue(false); + (mockPrisma.repo.findMany as Mock).mockResolvedValue([ + indexedRepo(1, 'repo-1'), + indexedRepo(2, 'repo-2'), + ]); + + manager = new RepoIndexManager(mockPrisma, mockSettings, mockRedis, mockPromClient as any); + await manager.startScheduler(); + + expect(mockPrisma.repo.updateMany).toHaveBeenCalledWith({ + where: { id: { in: [1, 2] } }, + data: { indexedAt: null }, + }); + }); + + test('does not count temporary shard files as valid shards', async () => { + (existsSync as Mock).mockImplementation((path: string) => path === 'test-data/index'); + (readdir as Mock).mockResolvedValue(['1_2_v16.00000.zoekt123.tmp']); + (mockPrisma.repo.findMany as Mock).mockResolvedValue([ + indexedRepo(2, 'repo-with-only-tmp-shard'), + ]); + + manager = new RepoIndexManager(mockPrisma, mockSettings, mockRedis, mockPromClient as any); + await manager.startScheduler(); + + expect(mockPrisma.repo.updateMany).toHaveBeenCalledWith({ + where: { id: { in: [2] } }, + data: { indexedAt: null }, + }); + }); + + test('only considers repos that are indexed, non-empty, and connected', async () => { + (existsSync as Mock).mockImplementation((path: string) => path === 'test-data/index'); + (readdir as Mock).mockResolvedValue([]); + (mockPrisma.repo.findMany as Mock).mockResolvedValue([]); + + manager = new RepoIndexManager(mockPrisma, mockSettings, mockRedis, mockPromClient as any); + await manager.startScheduler(); + + // The reconciliation query must exclude unindexed repos (nothing to mark), + // empty repos (indexing completes without producing a shard), and + // unconnected repos (clearing indexedAt would bypass the GC grace period). + expect(mockPrisma.repo.findMany).toHaveBeenCalledWith( + expect.objectContaining({ + where: expect.objectContaining({ + indexedAt: { not: null }, + indexedCommitHash: { not: null }, + connections: { some: {} }, + }), + }) + ); + + expect(mockPrisma.repo.updateMany).not.toHaveBeenCalled(); + }); + + test('reconciles on every scheduler poll, not just startup', async () => { + (existsSync as Mock).mockImplementation((path: string) => path === 'test-data/index'); + (readdir as Mock).mockResolvedValue(['1_1_v16.00000.zoekt']); + (mockPrisma.repo.findMany as Mock).mockResolvedValue([]); + + manager = new RepoIndexManager(mockPrisma, mockSettings, mockRedis, mockPromClient as any); + await manager.startScheduler(); + + // Simulate the index directory being wiped while the worker is running, + // with repo 1 still marked as indexed in the DB. + (readdir as Mock).mockResolvedValue([]); + (mockPrisma.repo.findMany as Mock).mockResolvedValue([ + indexedRepo(1, 'repo-1'), + ]); + + const { setIntervalAsync } = await import('./utils.js'); + const tick = (setIntervalAsync as Mock).mock.calls[0][0]; + await tick(); + + expect(mockPrisma.repo.updateMany).toHaveBeenCalledWith({ + where: { id: { in: [1] } }, + data: { indexedAt: null }, + }); + }); + }); + describe('latestIndexingJobStatus Updates', () => { test('sets latestIndexingJobStatus to IN_PROGRESS when job starts', async () => { const repo = createMockRepoWithConnections(); diff --git a/packages/backend/src/repoIndexManager.ts b/packages/backend/src/repoIndexManager.ts index aea1291dc..80a17ed09 100644 --- a/packages/backend/src/repoIndexManager.ts +++ b/packages/backend/src/repoIndexManager.ts @@ -100,7 +100,9 @@ export class RepoIndexManager { logger.debug('Starting scheduler'); // Cleanup any orphaned disk resources on startup await this.cleanupOrphanedDiskResources(); + await this.markReposWithMissingShardsAsStale(); this.interval = setIntervalAsync(async () => { + await this.markReposWithMissingShardsAsStale(); await this.scheduleIndexJobs(); await this.scheduleCleanupJobs(); }, this.settings.reindexRepoPollingIntervalMs); @@ -682,6 +684,66 @@ export class RepoIndexManager { } } + // Detects repos that are marked as indexed in the database but have no + // index shards on disk (e.g., because the index directory lives on + // ephemeral storage and was lost), and clears their `indexedAt` so the + // scheduler re-indexes them. This is the inverse of + // `cleanupOrphanedDiskResources`. + private async markReposWithMissingShardsAsStale() { + // @note: the DB is queried *before* the disk is scanned so that a repo + // whose first index job completes between the two reads is not falsely + // marked as stale (its shard is guaranteed to be visible by the time it + // appears in the query result). + // + // Empty repositories are excluded (via `indexedCommitHash`) since they + // complete indexing without producing a shard. Unconnected repos are + // excluded since clearing `indexedAt` would bypass the garbage + // collection grace period in `scheduleCleanupJobs`. + const indexedRepos = await this.db.repo.findMany({ + where: { + indexedAt: { not: null }, + indexedCommitHash: { not: null }, + connections: { some: {} }, + }, + select: { + id: true, + name: true, + }, + }); + + if (indexedRepos.length === 0) { + return; + } + + const repoIdsWithShards = new Set(); + if (existsSync(INDEX_CACHE_DIR)) { + const entries = await readdir(INDEX_CACHE_DIR); + for (const entry of entries) { + // Ignore temporary files (e.g., `.tmp` files from in-flight or + // failed indexing operations) - only completed shards count. + if (!entry.endsWith('.zoekt')) { + continue; + } + const repoId = getRepoIdFromShardFileName(entry); + if (repoId !== undefined) { + repoIdsWithShards.add(repoId); + } + } + } + + const staleRepos = indexedRepos.filter(repo => !repoIdsWithShards.has(repo.id)); + if (staleRepos.length === 0) { + return; + } + + logger.warn(`Found ${staleRepos.length} repo(s) marked as indexed but with no index shards on disk. Marking as stale for re-indexing: ${staleRepos.map(repo => repo.name).join(', ')}`); + + await this.db.repo.updateMany({ + where: { id: { in: staleRepos.map(repo => repo.id) } }, + data: { indexedAt: null }, + }); + } + // Scans the repos and index directories on disk and removes any entries // that have no corresponding Repo record in the database. This handles // edge cases where the DB and disk resources are out of sync. From 5e1624e4ebaad6f01a6ddf611edbb65d4cfb8b4a Mon Sep 17 00:00:00 2001 From: Rudraraj Sakariya <68952069+rudraasakariya@users.noreply.github.com> Date: Fri, 12 Jun 2026 10:11:52 -0400 Subject: [PATCH 2/2] chore: add CHANGELOG entry for #1304 --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9fadcf22d..70fd493e0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Fixed +- Fixed repos not being re-indexed when their zoekt index shards are missing from disk (e.g., when the index directory is stored on ephemeral storage) while the database still marks them as indexed. [#1304](https://github.com/sourcebot-dev/sourcebot/pull/1304) + ## [5.0.2] - 2026-06-11 ### Changed