Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 12 additions & 9 deletions packages/opencode/src/cli/cmd/serve.ts
Original file line number Diff line number Diff line change
Expand Up @@ -171,16 +171,19 @@ export const ServeCommand = effectCmd({

// S7 — sweep orphan workspace directories on EFS (dirs with no live
// session row, older than the 24 h safety floor). Reclaims space left
// by failed cleanups / drift. Fire-and-forget; per-dir failures log.
// by failed cleanups / drift. DEFERRED ~90 s after boot (and unref'd) so
// it never runs during the ALB's startup health-check window — the sweep
// does slow EFS deletes, and even though it's now async (non-blocking),
// keeping it out of the boot path entirely is belt-and-suspenders against
// the 2026-06-14 crash-loop where a synchronous version blocked /healthz.
// Fire-and-forget; per-dir failures log.
if (isCollabMode) {
yield* Effect.promise(async () => {
try {
const Workspace = await import("../../collab/workspace")
void Workspace.cleanupOrphanWorkspaces()
} catch (err) {
console.warn("[collab] orphan-workspace sweep skipped:", err)
}
})
const orphanSweepTimer = setTimeout(() => {
void import("../../collab/workspace")
.then((Workspace) => Workspace.cleanupOrphanWorkspaces())
.catch((err) => console.warn("[collab] orphan-workspace sweep skipped:", err))
}, 90_000)
if (typeof orphanSweepTimer.unref === "function") orphanSweepTimer.unref()
}

yield* Effect.never
Expand Down
28 changes: 26 additions & 2 deletions packages/opencode/src/collab/workspace.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
*/

import { spawn } from "child_process"
import { mkdirSync, rmSync, existsSync, writeFileSync, renameSync, readdirSync, statSync } from "fs"

Check warning on line 13 in packages/opencode/src/collab/workspace.ts

View workflow job for this annotation

GitHub Actions / Lint changed files (oxlint)

eslint(no-unused-vars)

Identifier 'statSync' is imported but never used.
import { rm as rmAsync, stat as statAsync } from "fs/promises"
import { join } from "path"
import type { Participant } from "@opencode-ai/collab"

Expand Down Expand Up @@ -63,7 +64,7 @@
* directory up and reachable by an explicit cd.
*/
export function nativeSessionDirectory(collabSessionId: string, repos: string[]): string {
if (repos.length > 0) return repoWorkspacePath(collabSessionId, repos[0]!)

Check warning on line 67 in packages/opencode/src/collab/workspace.ts

View workflow job for this annotation

GitHub Actions / Lint changed files (oxlint)

typescript-eslint(no-unnecessary-type-assertion)

This assertion is unnecessary since it does not change the type of the expression.
return sessionWorkspacePath(collabSessionId)
}

Expand Down Expand Up @@ -627,8 +628,23 @@
* 24 h, eliminating any boot-time TOCTOU against an in-progress init.
*
* Best-effort: per-dir failures log and continue. Never throws.
*
* CRITICAL — must NOT block the event loop. Each orphan is a ~1.5 GB tree on
* EFS (a network filesystem); the original synchronous `rmSync(recursive)`
* blocked the loop for minutes while deleting several of them on boot, so
* /healthz couldn't respond and the ALB health check timed out → the task was
* killed mid-sweep and crash-looped (observed 2026-06-14, right after this
* sweep first shipped). We now use the async `fs/promises` `rm`/`stat` and
* `await` each deletion: libuv does the filesystem work off-thread, so the
* loop stays free to answer /healthz between deletions. The caller also
* defers the sweep until well after boot (see serve.ts), and we cap how many
* we remove per boot so a large backlog drains over a few restarts instead of
* one marathon run.
*/
const ORPHAN_WORKSPACE_MIN_AGE_MS = 24 * 60 * 60 * 1000
/** Max orphan dirs removed per boot — bounds a single sweep's wall-clock /
* EFS load. A backlog larger than this drains over subsequent boots. */
const ORPHAN_WORKSPACE_MAX_PER_SWEEP = 10

export async function cleanupOrphanWorkspaces(): Promise<void> {
const root = workspaceRoot()
Expand Down Expand Up @@ -656,13 +672,21 @@
const now = Date.now()
let removed = 0
for (const name of entries) {
if (removed >= ORPHAN_WORKSPACE_MAX_PER_SWEEP) {
console.log(
`[collab.workspace] cleanupOrphanWorkspaces: hit per-sweep cap (${ORPHAN_WORKSPACE_MAX_PER_SWEEP}); remaining orphans drain next boot`,
)
break
}
if (liveIds.has(name)) continue // live session β€” leave it
const dir = join(root, name)
try {
const st = statSync(dir)
// Async stat + rm — each `await` yields the event loop so /healthz keeps
// answering while EFS does the (slow) recursive delete off-thread.
const st = await statAsync(dir)
if (!st.isDirectory()) continue
if (now - st.mtimeMs < ORPHAN_WORKSPACE_MIN_AGE_MS) continue // too fresh β€” protect against init races
rmSync(dir, { recursive: true, force: true })
await rmAsync(dir, { recursive: true, force: true })
removed++
console.log(`[collab.workspace] cleanupOrphanWorkspaces: removed orphan workspace ${name}`)
} catch (err) {
Expand Down
Loading