From 3dd8d683a8b2bc69646fd9580050c21cb597995d Mon Sep 17 00:00:00 2001 From: Hanno Blankenstein Date: Sat, 13 Jun 2026 10:57:30 +1000 Subject: [PATCH] fix(collab): preview install-hang watchdog + crash-loop breaker + cache telemetry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three stability guardrails for the preview lifecycle, all targeting the frontend's heavy install/compile path on the 4 vCPU / 16 GB task. S1 — install-hang watchdog The 30-min idle cap only counts request traffic, so an install wedged at minute 5 (dead registry, stuck native build, OOMing dep) sat holding memory for 25 more minutes before anything noticed. Track _lastOutput per active state (reset on every stdout/stderr line); the 60-s sweep now stops any preview still in the `installing` phase that has emitted nothing for INSTALL_SILENCE_TIMEOUT_MS (5 min). A healthy pnpm install / ng compile emits progress constantly, so prolonged silence is a strong wedge signal. S2 — crash-loop breaker on auto-resume resumePreviewsOnBoot's 24-h freshness cap (PR #32) stops STALE intents from looping, but a FRESH intent whose workspace is deterministically broken (bad lockfile, missing dep, OOM during native build) re-crashed on every boot. New collab_session columns preview_crash_count + preview_crash_at: incremented when a preview's child exits non-zero while still installing (and when the watchdog kills a hung install), reset to 0 on a successful install→running transition. Boot-resume now skips any session with >= 3 install crashes in the last hour. A Driver pressing Launch clears the counter (explicit human retry overrides the breaker). Only INSTALL-phase crashes feed the counter — a crash after reaching "running" is a dev-server runtime error, a different class that shouldn't suppress resume (the workspace installed fine). V2 — framework-cache telemetry Log whether /.angular/cache survived the previous container at each launch. 
That cache lives on EFS and should persist across deploys; when it does, second-launch compile drops from ~2 min to ~20 s. If it logs "absent" on a previously-launched session, the cache is being wiped and that's a regression to chase. Shallow, bounded, never throws. Schema: two nullable/defaulted columns added via the existing PRAGMA-probe ALTER pattern in migrate.ts; legacy rows backfill to crash_count=0 (eligible for resume), so no special handling. Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/opencode/src/collab/migrate.ts | 13 ++ .../opencode/src/collab/preview-launcher.ts | 142 +++++++++++++++++- packages/opencode/src/collab/router.ts | 6 + packages/opencode/src/collab/schema.sql.ts | 12 ++ packages/opencode/src/collab/session.ts | 47 +++++- 5 files changed, 214 insertions(+), 6 deletions(-) diff --git a/packages/opencode/src/collab/migrate.ts b/packages/opencode/src/collab/migrate.ts index e48f62466c85..4583ff84ea72 100644 --- a/packages/opencode/src/collab/migrate.ts +++ b/packages/opencode/src/collab/migrate.ts @@ -19,6 +19,8 @@ const SQL = ` init_error TEXT, preview_intent TEXT, preview_intent_at INTEGER, + preview_crash_count INTEGER NOT NULL DEFAULT 0, + preview_crash_at INTEGER, created_at INTEGER NOT NULL, deleted_at INTEGER ); @@ -158,6 +160,17 @@ export function runCollabMigrations() { db.$client.exec("ALTER TABLE collab_session ADD COLUMN preview_intent_at INTEGER") } + // preview_crash_count / preview_crash_at — the auto-resume crash-loop + // breaker (S2). Legacy rows backfill to 0 (no crashes recorded), which + // is exactly the "eligible for resume" state, so no special handling + // needed. See resumePreviewsOnBoot's breaker check in preview-launcher.ts. 
+ if (!cols.some((c) => c.name === "preview_crash_count")) { + db.$client.exec("ALTER TABLE collab_session ADD COLUMN preview_crash_count INTEGER NOT NULL DEFAULT 0") + } + if (!cols.some((c) => c.name === "preview_crash_at")) { + db.$client.exec("ALTER TABLE collab_session ADD COLUMN preview_crash_at INTEGER") + } + // Boot sweep: revert mid-flight LLM dispatches back to `approved` so the // newly-booted task's queue executor picks them up and re-runs them. A // row sits in `in_flight` ONLY while the previous container had an open diff --git a/packages/opencode/src/collab/preview-launcher.ts b/packages/opencode/src/collab/preview-launcher.ts index f9b348e41aa8..1da78c378496 100644 --- a/packages/opencode/src/collab/preview-launcher.ts +++ b/packages/opencode/src/collab/preview-launcher.ts @@ -34,7 +34,7 @@ */ import { spawn, type ChildProcess } from "child_process" -import { existsSync, readFileSync } from "fs" +import { existsSync, readFileSync, statSync, readdirSync } from "fs" import { join } from "path" import { repoWorkspacePath } from "./workspace" import type { CollabEvent } from "@opencode-ai/collab" @@ -73,7 +73,23 @@ const MAX_LIFETIME_MS = 2 * 60 * 60 * 1000 * of the 16 GB ceiling where the kernel takes the WHOLE task down. */ const MEMORY_CAP_BYTES = 12 * 1024 * 1024 * 1024 -/** How often the sweep runs (idle / lifetime / memory checks). */ +/** Install-hang watchdog (S1) — if a preview is still in the `installing` + * phase and has produced no stdout/stderr for this long, presume the + * install is wedged and stop it. pnpm emits progress lines constantly + * during a healthy install, so 5 min of total silence is a strong wedge + * signal (dead registry, stuck native build, OOMing dep). Distinct from + * IDLE_TIMEOUT_MS, which only counts request traffic and so never fires + * during install. 
*/ +const INSTALL_SILENCE_TIMEOUT_MS = 5 * 60 * 1000 + +/** Crash-loop breaker (S2) — refuse to auto-resume a session on boot once it + * has crashed during install this many times within BREAKER_WINDOW_MS. A + * Driver pressing Launch manually overrides the breaker (and resets the + * count); a successful "ready" transition also resets it. */ +const BREAKER_CRASH_THRESHOLD = 3 +const BREAKER_WINDOW_MS = 60 * 60 * 1000 + +/** How often the sweep runs (idle / lifetime / memory / install-hang checks). */ const SWEEP_INTERVAL_MS = 60 * 1000 /** Cap on retained install / run log lines (so memory is bounded across a @@ -167,6 +183,14 @@ interface ActiveState extends PreviewStateSnapshot { config: PreviewConfig // Mutable accumulators (not snapshot-able directly) _log: Array<{ stream: "stdout" | "stderr"; line: string; ts: number }> + /** Epoch-ms of the most-recent stdout/stderr line from the child. The + * install-hang watchdog (S1) reads this every sweep: if the preview is + * still in the `installing` phase and has emitted nothing for + * INSTALL_SILENCE_TIMEOUT_MS, the install is presumed wedged (dead + * registry, stuck native build, OOMing dep) and gets stopped so memory + * is freed and the Driver can retry — instead of sitting until the + * 30-minute idle cap, which only counts request traffic, not output. */ + _lastOutput: number /** True iff stopPreview was called for THIS state (vs the process exiting * on its own). Lets the exit handler decide between firing * collab:preview_stopped (clean exit we triggered) vs collab:preview_failed @@ -551,6 +575,7 @@ export function launchPreview( startedAt: now, lastTraffic: now, _log: [], + _lastOutput: now, recentLog: [], errorMessage: undefined, child, @@ -577,6 +602,14 @@ export function launchPreview( // spurious "branch changed" → auto-restart loop on the first LLM turn. lastKnownHead = null + // V2 telemetry — log whether the framework dep-optimization cache survived + // the previous container. 
Angular CLI / Vite write to `/.angular/cache`, + // which lives on EFS and SHOULD persist across deploys; when it does, the + // second-launch compile drops from ~2 min to ~20 s. If this logs "absent" + // on a session that's been launched before, the cache is getting wiped and + // that's a regression worth chasing. Cheap + best-effort; never throws. + logPreviewCacheState(cwd) + wireChildStreams(state) startSweepLoop() @@ -756,6 +789,30 @@ export async function maybeRestartOnBranchChange(): Promise { // ── Internal wiring ──────────────────────────────────────────────────────── +/** + * Best-effort V2 telemetry: log the framework dep-optimization cache state so + * we can confirm it persists across container restarts (the thing that makes + * a second-launch compile fast). Shallow + bounded — counts top-level + * entries under `.angular/cache`, never walks the whole tree, never throws. + */ +function logPreviewCacheState(cwd: string): void { + try { + const cacheDir = join(cwd, ".angular", "cache") + if (!existsSync(cacheDir)) { + console.log(`[collab.preview] framework cache: absent at ${cacheDir} (cold compile expected)`) + return + } + const entries = readdirSync(cacheDir) + const mtime = statSync(cacheDir).mtimeMs + const ageMin = Math.round((Date.now() - mtime) / 60_000) + console.log( + `[collab.preview] framework cache: present (${entries.length} top-level entr${entries.length === 1 ? "y" : "ies"}, last-modified ${ageMin}m ago) — warm compile expected`, + ) + } catch (err) { + console.warn("[collab.preview] framework cache probe failed (non-fatal):", err) + } +} + function wireChildStreams(state: ActiveState): void { const onLine = (stream: "stdout" | "stderr") => (chunk: Buffer) => { // Stop emitting log/state events for a child whose state has been @@ -766,6 +823,10 @@ function wireChildStreams(state: ActiveState): void { // "Preview stopped". 
if (active !== state) return + // Feed the install-hang watchdog (S1): any output — progress, warning, + // error — counts as liveness. Reset the clock before processing lines. + state._lastOutput = Date.now() + const lines = chunk.toString("utf8").split("\n").filter(Boolean) for (const line of lines) { state._log.push({ stream, line, ts: Date.now() }) @@ -821,6 +882,13 @@ function wireChildStreams(state: ActiveState): void { } if (ready) { ;(state as { status: PreviewStatus }).status = "running" + // S2: a clean install → running transition means this workspace is + // healthy; reset its crash-loop counter so a future transient + // failure starts from zero and the breaker doesn't fire on a + // session that's actually fine. Fire-and-forget DB write. + void import("./session") + .then((Session) => Session.clearPreviewCrashCount(state.collabSessionId)) + .catch((err) => console.warn("[collab.preview] clearPreviewCrashCount failed:", err)) broadcast(state.collabSessionId, { type: "collab:preview_started", state: getPreviewState()!, @@ -874,6 +942,20 @@ function wireChildStreams(state: ActiveState): void { // as a failure so the user can read the tail of the log and Retry. const msg = `Preview process exited with code ${code} ${signal ? `(signal ${signal})` : ""}` console.error(`[collab.preview] ${msg}`) + + // S2 crash-loop breaker: only an INSTALL-phase crash feeds the counter. + // A crash after reaching "running" is a different failure class (dev + // server runtime error) and shouldn't suppress boot-resume — the + // workspace installed fine, so resuming it on the next boot is + // reasonable. An install crash, by contrast, tends to be deterministic + // (broken lockfile, missing dep, OOM during native build) and WILL + // recur on every boot — that's exactly what the breaker guards against. 
+ if (state.status === "installing") { + void import("./session") + .then((Session) => Session.recordPreviewCrash(state.collabSessionId)) + .catch((err) => console.warn("[collab.preview] recordPreviewCrash failed:", err)) + } + ;(state as { status: PreviewStatus }).status = "failed" ;(state as { errorMessage?: string }).errorMessage = msg broadcast(state.collabSessionId, { @@ -922,6 +1004,29 @@ function startSweepLoop(): void { return } + // 1b. Install-hang watchdog (S1) — a preview still in the `installing` + // phase that has emitted zero output for INSTALL_SILENCE_TIMEOUT_MS + // is presumed wedged. A healthy pnpm install / ng compile emits + // progress constantly, so prolonged silence means a dead registry, + // stuck native build, or an OOMing dep holding memory with no + // forward progress. Stop it now rather than waiting out the 30 min + // idle cap (which never fires here — no request traffic during + // install). We record the crash explicitly here (rather than + // relying on the exit handler, which skips crash-recording when WE + // initiated the stop) so a workspace that hangs install on every + // boot eventually trips the crash-loop breaker instead of wasting + // 5 min per boot indefinitely. + if (active.status === "installing" && now - active._lastOutput > INSTALL_SILENCE_TIMEOUT_MS) { + const hungSession = active.collabSessionId + void import("./session") + .then((Session) => Session.recordPreviewCrash(hungSession)) + .catch((err) => console.warn("[collab.preview] recordPreviewCrash (hang) failed:", err)) + stopPreview( + `install hung — no output for ${Math.round(INSTALL_SILENCE_TIMEOUT_MS / 60_000)}m — Driver can re-Launch`, + ) + return + } + // 2. Lifetime cap — preview has been alive for MAX_LIFETIME_MS, // regardless of traffic. Forces a clean restart before the // ng-serve / Vite heap leak overflows the task's memory. 
Driver @@ -989,7 +1094,13 @@ export async function resumePreviewsOnBoot(): Promise { return } - let intents: Array<{ collabSessionId: string; repoFullName: string; at: number }> + let intents: Array<{ + collabSessionId: string + repoFullName: string + at: number + crashCount: number + crashAt: number + }> try { intents = session.listPreviewIntents() } catch (err) { @@ -1037,12 +1148,35 @@ export async function resumePreviewsOnBoot(): Promise { } if (fresh.length === 0) return + // Crash-loop breaker (S2). A fresh intent whose workspace has crashed + // during install BREAKER_CRASH_THRESHOLD+ times within the recent window + // is almost certainly deterministically broken (bad lockfile, missing + // dep, OOM during native build) — auto-resuming it just burns another + // install attempt + memory every boot. Skip those; the Driver can press + // Launch manually (which clears the counter and overrides the breaker) + // once they've fixed the underlying workspace/config issue. The freshness + // cap above handles age; this handles repeated failure within the window. + const eligible: typeof fresh = [] + for (const i of fresh) { + const recentlyTripped = i.crashAt > 0 && now - i.crashAt < BREAKER_WINDOW_MS + if (i.crashCount >= BREAKER_CRASH_THRESHOLD && recentlyTripped) { + console.warn( + `[collab.preview] resumePreviewsOnBoot: session=${i.collabSessionId} repo=${i.repoFullName} ` + + `skipped — crash-loop breaker (${i.crashCount} install crashes within ${Math.round(BREAKER_WINDOW_MS / 60_000)}m). ` + + `Driver must Launch manually to retry.`, + ) + continue + } + eligible.push(i) + } + if (eligible.length === 0) return + // First-launch-wins constraint (one preview per container) means we pick // the most-recently active intent and ignore the rest. If multiple // intents survived to disk, the rest will sit clear in the DB until a // Driver explicitly Launches one — we never auto-stomp a more-recent // wish in favour of a stale one. 
- const pick = fresh[0] + const pick = eligible[0] console.log( `[collab.preview] resumePreviewsOnBoot: ${intents.length} intent(s) on disk; picking session=${pick.collabSessionId} repo=${pick.repoFullName} (most-recent)`, ) diff --git a/packages/opencode/src/collab/router.ts b/packages/opencode/src/collab/router.ts index 0cc8bea4a8e2..8dc75d4edce5 100644 --- a/packages/opencode/src/collab/router.ts +++ b/packages/opencode/src/collab/router.ts @@ -1464,6 +1464,12 @@ async function handleSessionRoutes(req: Request, url: URL, path: string): Promis // GIT_ASKPASS helper for the consumption side, and preview-launcher's // ActiveState._gitAccessToken comment for the in-memory caching // semantics across restart. + // A manual Driver launch is an explicit human retry — clear any + // crash-loop breaker state (S2) BEFORE launching so a session that the + // breaker previously locked out of auto-resume becomes eligible again, + // and so this attempt starts from a clean counter. If the workspace is + // still broken it'll re-accumulate crashes and the breaker re-engages. + Session.clearPreviewCrashCount(sessionId) const result = Preview.launchPreview(sessionId, repoFullName, sess.githubAccessToken) if (!result.ok) return json({ error: result.error, ...("existing" in result ? { existing: result.existing } : {}) }, result.status) // Persist the Driver's intent so an ECS task replacement re-spawns the diff --git a/packages/opencode/src/collab/schema.sql.ts b/packages/opencode/src/collab/schema.sql.ts index e4bdea996d76..9afbff5b5b33 100644 --- a/packages/opencode/src/collab/schema.sql.ts +++ b/packages/opencode/src/collab/schema.sql.ts @@ -34,6 +34,18 @@ export const CollabSessionTable = sqliteTable("collab_session", { * shutdown (rare — single-launcher constraint usually prevents this), the * more recently active one wins re-spawn priority on boot. 
*/ preview_intent_at: integer({ mode: "timestamp_ms" }), + /** + * Consecutive preview-install crash count + timestamp of the last crash. + * Incremented when a resumed-or-launched preview's child process exits + * non-zero while still in the `installing` phase, or when the install-hang + * watchdog stops a silent install. `resumePreviewsOnBoot()` refuses to + * auto-resume a session whose count has reached the breaker threshold and + * whose last crash is within the recent window — stops a broken workspace + * from OOM-looping the task across boots. A successful "ready" transition + * resets the count to 0; a manual Driver Launch also clears it (explicit human retry overrides the breaker). + */ + preview_crash_count: integer().notNull().default(0), + preview_crash_at: integer({ mode: "timestamp_ms" }), created_at: integer({ mode: "timestamp_ms" }).notNull(), deleted_at: integer({ mode: "timestamp_ms" }), }) diff --git a/packages/opencode/src/collab/session.ts b/packages/opencode/src/collab/session.ts index 6f84007c09ea..9c76877db7fc 100644 --- a/packages/opencode/src/collab/session.ts +++ b/packages/opencode/src/collab/session.ts @@ -1,4 +1,4 @@ -import { Database, eq, and, isNull } from "@/storage/db" +import { Database, eq, and, isNull, sql } from "@/storage/db" import { CollabSessionTable, CollabParticipantTable, @@ -229,13 +229,21 @@ export function setPreviewIntent(collabSessionId: string, repoFullName: string | * pick the most-recently-active one. Soft-deleted sessions are excluded — * we never resurrect a preview for a session the Driver tore down. 
*/ -export function listPreviewIntents(): Array<{ collabSessionId: string; repoFullName: string; at: number }> { +export function listPreviewIntents(): Array<{ + collabSessionId: string + repoFullName: string + at: number + crashCount: number + crashAt: number +}> { return Database.use((db) => { const rows = db .select({ id: CollabSessionTable.id, repo: CollabSessionTable.preview_intent, at: CollabSessionTable.preview_intent_at, + crashCount: CollabSessionTable.preview_crash_count, + crashAt: CollabSessionTable.preview_crash_at, deleted_at: CollabSessionTable.deleted_at, }) .from(CollabSessionTable) @@ -246,11 +254,46 @@ export function listPreviewIntents(): Array<{ collabSessionId: string; repoFullN collabSessionId: r.id, repoFullName: r.repo as string, at: r.at ? r.at.getTime() : 0, + crashCount: r.crashCount ?? 0, + crashAt: r.crashAt ? r.crashAt.getTime() : 0, })) .sort((a, b) => b.at - a.at) }) } +/** + * Crash-loop breaker bookkeeping (S2). Two operations: + * + * recordPreviewCrash — a preview's child process died non-zero while still + * installing, or the install-hang watchdog stopped a silent install. Bump the counter + stamp now. + * clearPreviewCrashCount — the preview reached "ready", OR a Driver pressed + * Launch manually. Reset to 0 so a later transient + * failure starts fresh and a human retry overrides + * the breaker. + * + * Kept tiny; both are a no-op when no row matches the session id, and clearing is idempotent. 
+ */ +export function recordPreviewCrash(collabSessionId: string): void { + Database.use((db) => { + db.update(CollabSessionTable) + .set({ + preview_crash_count: sql`${CollabSessionTable.preview_crash_count} + 1`, + preview_crash_at: new Date(), + }) + .where(eq(CollabSessionTable.id, collabSessionId)) + .run() + }) +} + +export function clearPreviewCrashCount(collabSessionId: string): void { + Database.use((db) => { + db.update(CollabSessionTable) + .set({ preview_crash_count: 0, preview_crash_at: null }) + .where(eq(CollabSessionTable.id, collabSessionId)) + .run() + }) +} + /** * Append `repos` to the collab session's linked-repo list. *