From a7f90e0e23f8f5284169319281719a89ded57326 Mon Sep 17 00:00:00 2001 From: Hanno Blankenstein Date: Sun, 14 Jun 2026 21:42:00 +1000 Subject: [PATCH] fix(collab): orphan-workspace sweep must not block the boot event loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The S7 sweep shipped in #42 used synchronous rmSync(recursive) to delete orphan workspace dirs on boot. Each orphan is a ~1.5 GB tree on EFS (a network FS); deleting several synchronously blocked the event loop for minutes, so /healthz couldn't respond, the ALB health check timed out ("Request timed out"), ECS killed the task mid-sweep, and it crash-looped (observed 2026-06-14: server reached "listening on :4096" three times, each killed ~4 min later by failed ELB health checks; exit code null, not OOM). Fixes: - Use fs/promises rm + stat with `await` per deletion — libuv does the slow EFS work off-thread, so the loop stays free to answer /healthz while each deletion runs, not just between deletions. - Cap removals at 10 per boot (ORPHAN_WORKSPACE_MAX_PER_SWEEP); a larger backlog drains over subsequent boots instead of one marathon. - Defer the sweep ~90 s after boot (unref'd timer) in serve.ts, so it can't run during the ALB startup health-check window at all — belt-and-suspenders on top of the non-blocking rewrite. cleanupSessionWorkspace (explicit DELETE path) keeps sync rmSync — it's a single dir on a user action, not the boot path. This is a hotfix for the crash-loop that took the site down after the #37-#46 batch deploy; merge + Deploy collab to recover (or roll the service back to the pre-#42 task-def revision in the meantime). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/opencode/src/cli/cmd/serve.ts | 21 +++++++++-------- packages/opencode/src/collab/workspace.ts | 28 +++++++++++++++++++++-- 2 files changed, 38 insertions(+), 11 deletions(-) diff --git a/packages/opencode/src/cli/cmd/serve.ts b/packages/opencode/src/cli/cmd/serve.ts index 186b6afd6276..ac8e69f5a119 100644 --- a/packages/opencode/src/cli/cmd/serve.ts +++ b/packages/opencode/src/cli/cmd/serve.ts @@ -171,16 +171,19 @@ export const ServeCommand = effectCmd({ // S7 — sweep orphan workspace directories on EFS (dirs with no live // session row, older than the 24 h safety floor). Reclaims space left - // by failed cleanups / drift. Fire-and-forget; per-dir failures log. + // by failed cleanups / drift. DEFERRED ~90 s after boot (and unref'd) so + // it never runs during the ALB's startup health-check window — the sweep + // does slow EFS deletes, and even though it's now async (non-blocking), + // keeping it out of the boot path entirely is belt-and-suspenders against + // the 2026-06-14 crash-loop where a synchronous version blocked /healthz. + // Fire-and-forget; per-dir failures log. 
if (isCollabMode) { - yield* Effect.promise(async () => { - try { - const Workspace = await import("../../collab/workspace") - void Workspace.cleanupOrphanWorkspaces() - } catch (err) { - console.warn("[collab] orphan-workspace sweep skipped:", err) - } - }) + const orphanSweepTimer = setTimeout(() => { + void import("../../collab/workspace") + .then((Workspace) => Workspace.cleanupOrphanWorkspaces()) + .catch((err) => console.warn("[collab] orphan-workspace sweep skipped:", err)) + }, 90_000) + if (typeof orphanSweepTimer.unref === "function") orphanSweepTimer.unref() } yield* Effect.never diff --git a/packages/opencode/src/collab/workspace.ts b/packages/opencode/src/collab/workspace.ts index 879f86bd37d8..6f116aa7d31f 100644 --- a/packages/opencode/src/collab/workspace.ts +++ b/packages/opencode/src/collab/workspace.ts @@ -11,6 +11,7 @@ import { spawn } from "child_process" import { mkdirSync, rmSync, existsSync, writeFileSync, renameSync, readdirSync, statSync } from "fs" +import { rm as rmAsync, stat as statAsync } from "fs/promises" import { join } from "path" import type { Participant } from "@opencode-ai/collab" @@ -627,8 +628,23 @@ export function cleanupSessionWorkspace(collabSessionId: string): void { * 24 h, eliminating any boot-time TOCTOU against an in-progress init. * * Best-effort: per-dir failures log and continue. Never throws. + * + * CRITICAL — must NOT block the event loop. Each orphan is a ~1.5 GB tree on + * EFS (a network filesystem); the original synchronous `rmSync(recursive)` + * blocked the loop for minutes while deleting several of them on boot, so + * /healthz couldn't respond and the ALB health check timed out → the task was + * killed mid-sweep and crash-looped (observed 2026-06-14, right after this + * sweep first shipped). We now use the async `fs/promises` `rm`/`stat` and + * `await` each deletion: libuv does the filesystem work off-thread, so the + * loop stays free to answer /healthz between deletions. 
The caller also + * defers the sweep until well after boot (see serve.ts), and we cap how many + * we remove per boot so a large backlog drains over a few restarts instead of + * one marathon run. */ const ORPHAN_WORKSPACE_MIN_AGE_MS = 24 * 60 * 60 * 1000 +/** Max orphan dirs removed per boot — bounds a single sweep's wall-clock / + * EFS load. A backlog larger than this drains over subsequent boots. */ +const ORPHAN_WORKSPACE_MAX_PER_SWEEP = 10 export async function cleanupOrphanWorkspaces(): Promise { const root = workspaceRoot() @@ -656,13 +672,21 @@ export async function cleanupOrphanWorkspaces(): Promise { const now = Date.now() let removed = 0 for (const name of entries) { + if (removed >= ORPHAN_WORKSPACE_MAX_PER_SWEEP) { + console.log( + `[collab.workspace] cleanupOrphanWorkspaces: hit per-sweep cap (${ORPHAN_WORKSPACE_MAX_PER_SWEEP}); remaining orphans drain next boot`, + ) + break + } if (liveIds.has(name)) continue // live session — leave it const dir = join(root, name) try { - const st = statSync(dir) + // Async stat + rm — each `await` yields the event loop so /healthz keeps + // answering while EFS does the (slow) recursive delete off-thread. + const st = await statAsync(dir) if (!st.isDirectory()) continue if (now - st.mtimeMs < ORPHAN_WORKSPACE_MIN_AGE_MS) continue // too fresh — protect against init races - rmSync(dir, { recursive: true, force: true }) + await rmAsync(dir, { recursive: true, force: true }) removed++ console.log(`[collab.workspace] cleanupOrphanWorkspaces: removed orphan workspace ${name}`) } catch (err) {