Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions packages/opencode/src/collab/migrate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ const SQL = `
init_error TEXT,
preview_intent TEXT,
preview_intent_at INTEGER,
preview_crash_count INTEGER NOT NULL DEFAULT 0,
preview_crash_at INTEGER,
created_at INTEGER NOT NULL,
deleted_at INTEGER
);
Expand Down Expand Up @@ -158,6 +160,17 @@ export function runCollabMigrations() {
db.$client.exec("ALTER TABLE collab_session ADD COLUMN preview_intent_at INTEGER")
}

// preview_crash_count / preview_crash_at — the auto-resume crash-loop
// breaker (S2). Legacy rows backfill to 0 (no crashes recorded), which
// is exactly the "eligible for resume" state, so no special handling
// needed. See resumePreviewsOnBoot's breaker check in preview-launcher.ts.
if (!cols.some((c) => c.name === "preview_crash_count")) {
db.$client.exec("ALTER TABLE collab_session ADD COLUMN preview_crash_count INTEGER NOT NULL DEFAULT 0")
}
if (!cols.some((c) => c.name === "preview_crash_at")) {
db.$client.exec("ALTER TABLE collab_session ADD COLUMN preview_crash_at INTEGER")
}

// Boot sweep: revert mid-flight LLM dispatches back to `approved` so the
// newly-booted task's queue executor picks them up and re-runs them. A
// row sits in `in_flight` ONLY while the previous container had an open
Expand Down
142 changes: 138 additions & 4 deletions packages/opencode/src/collab/preview-launcher.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
*/

import { spawn, type ChildProcess } from "child_process"
import { existsSync, readFileSync } from "fs"
import { existsSync, readFileSync, statSync, readdirSync } from "fs"
import { join } from "path"
import { repoWorkspacePath } from "./workspace"
import type { CollabEvent } from "@opencode-ai/collab"
Expand Down Expand Up @@ -73,7 +73,23 @@ const MAX_LIFETIME_MS = 2 * 60 * 60 * 1000
* of the 16 GB ceiling where the kernel takes the WHOLE task down. */
const MEMORY_CAP_BYTES = 12 * 1024 * 1024 * 1024

/** How often the sweep runs (idle / lifetime / memory checks). */
/**
 * Install-hang watchdog (S1): maximum stdout/stderr silence, in milliseconds,
 * tolerated while a preview is still in the `installing` phase. Once the
 * child has written nothing for longer than this, the sweep presumes the
 * install is wedged (dead registry, stuck native build, OOMing dep) and
 * stops the preview. The rationale is that a healthy pnpm install prints
 * progress continuously, so five silent minutes is a strong wedge signal.
 * This is separate from IDLE_TIMEOUT_MS, which tracks request traffic only
 * and therefore never fires during install.
 */
const INSTALL_SILENCE_TIMEOUT_MS = 5 * 60_000

/**
 * Crash-loop breaker (S2): boot-time auto-resume is skipped for a session
 * whose recorded install-crash count has reached BREAKER_CRASH_THRESHOLD
 * and whose most recent recorded crash is less than BREAKER_WINDOW_MS old.
 * A manual Launch by a Driver overrides the breaker and resets the count;
 * so does a successful transition to "ready".
 */
const BREAKER_CRASH_THRESHOLD = 3
const BREAKER_WINDOW_MS = 60 * 60_000

/** Period of the sweep loop (idle / lifetime / memory / install-hang checks). */
const SWEEP_INTERVAL_MS = 60_000

/** Cap on retained install / run log lines (so memory is bounded across a
Expand Down Expand Up @@ -167,6 +183,14 @@ interface ActiveState extends PreviewStateSnapshot {
config: PreviewConfig
// Mutable accumulators (not snapshot-able directly)
_log: Array<{ stream: "stdout" | "stderr"; line: string; ts: number }>
/** Epoch-ms of the most-recent stdout/stderr line from the child. The
* install-hang watchdog (S1) reads this every sweep: if the preview is
* still in the `installing` phase and has emitted nothing for
* INSTALL_SILENCE_TIMEOUT_MS, the install is presumed wedged (dead
* registry, stuck native build, OOMing dep) and gets stopped so memory
* is freed and the Driver can retry — instead of sitting until the
* 30-minute idle cap, which only counts request traffic, not output. */
_lastOutput: number
/** True iff stopPreview was called for THIS state (vs the process exiting
* on its own). Lets the exit handler decide between firing
* collab:preview_stopped (clean exit we triggered) vs collab:preview_failed
Expand Down Expand Up @@ -551,6 +575,7 @@ export function launchPreview(
startedAt: now,
lastTraffic: now,
_log: [],
_lastOutput: now,
recentLog: [],
errorMessage: undefined,
child,
Expand All @@ -577,6 +602,14 @@ export function launchPreview(
// spurious "branch changed" → auto-restart loop on the first LLM turn.
lastKnownHead = null

// V2 telemetry — log whether the framework dep-optimization cache survived
// the previous container. Angular CLI / Vite write to `<repo>/.angular/cache`,
// which lives on EFS and SHOULD persist across deploys; when it does, the
// second-launch compile drops from ~2 min to ~20 s. If this logs "absent"
// on a session that's been launched before, the cache is getting wiped and
// that's a regression worth chasing. Cheap + best-effort; never throws.
logPreviewCacheState(cwd)

wireChildStreams(state)
startSweepLoop()

Expand Down Expand Up @@ -756,6 +789,30 @@ export async function maybeRestartOnBranchChange(): Promise<void> {

// ── Internal wiring ────────────────────────────────────────────────────────

/**
 * V2 telemetry probe for the framework dep-optimization cache.
 *
 * Looks for `<cwd>/.angular/cache` and logs one line saying whether it is
 * absent (cold compile expected) or present (warm compile expected). When
 * present, the line includes the number of top-level entries and how many
 * minutes ago the directory itself was last modified. The probe is shallow:
 * it lists only the cache directory and never descends into it.
 *
 * Best-effort by design: any error raised while probing is logged as a
 * warning and swallowed, so this never throws to the caller.
 *
 * @param cwd Workspace directory expected to contain `.angular/cache`.
 */
function logPreviewCacheState(cwd: string): void {
  try {
    const cachePath = join(cwd, ".angular", "cache")
    if (existsSync(cachePath)) {
      // List first, then stat, so a failure surfaces from the same call it
      // would have in a straight-line read of the directory.
      const topLevelCount = readdirSync(cachePath).length
      const modifiedAtMs = statSync(cachePath).mtimeMs
      const minutesOld = Math.round((Date.now() - modifiedAtMs) / 60_000)
      const pluralTail = topLevelCount === 1 ? "y" : "ies"
      console.log(
        `[collab.preview] framework cache: present (${topLevelCount} top-level entr${pluralTail}, last-modified ${minutesOld}m ago) — warm compile expected`,
      )
    } else {
      console.log(`[collab.preview] framework cache: absent at ${cachePath} (cold compile expected)`)
    }
  } catch (probeError) {
    console.warn("[collab.preview] framework cache probe failed (non-fatal):", probeError)
  }
}

function wireChildStreams(state: ActiveState): void {
const onLine = (stream: "stdout" | "stderr") => (chunk: Buffer) => {
// Stop emitting log/state events for a child whose state has been
Expand All @@ -766,6 +823,10 @@ function wireChildStreams(state: ActiveState): void {
// "Preview stopped".
if (active !== state) return

// Feed the install-hang watchdog (S1): any output — progress, warning,
// error — counts as liveness. Reset the clock before processing lines.
state._lastOutput = Date.now()

const lines = chunk.toString("utf8").split("\n").filter(Boolean)
for (const line of lines) {
state._log.push({ stream, line, ts: Date.now() })
Expand Down Expand Up @@ -821,6 +882,13 @@ function wireChildStreams(state: ActiveState): void {
}
if (ready) {
;(state as { status: PreviewStatus }).status = "running"
// S2: a clean install → running transition means this workspace is
// healthy; reset its crash-loop counter so a future transient
// failure starts from zero and the breaker doesn't fire on a
// session that's actually fine. Fire-and-forget DB write.
void import("./session")
.then((Session) => Session.clearPreviewCrashCount(state.collabSessionId))
.catch((err) => console.warn("[collab.preview] clearPreviewCrashCount failed:", err))
broadcast(state.collabSessionId, {
type: "collab:preview_started",
state: getPreviewState()!,
Expand Down Expand Up @@ -874,6 +942,20 @@ function wireChildStreams(state: ActiveState): void {
// as a failure so the user can read the tail of the log and Retry.
const msg = `Preview process exited with code ${code} ${signal ? `(signal ${signal})` : ""}`
console.error(`[collab.preview] ${msg}`)

// S2 crash-loop breaker: only an INSTALL-phase crash feeds the counter.
// A crash after reaching "running" is a different failure class (dev
// server runtime error) and shouldn't suppress boot-resume — the
// workspace installed fine, so resuming it on the next boot is
// reasonable. An install crash, by contrast, tends to be deterministic
// (broken lockfile, missing dep, OOM during native build) and WILL
// recur on every boot — that's exactly what the breaker guards against.
if (state.status === "installing") {
void import("./session")
.then((Session) => Session.recordPreviewCrash(state.collabSessionId))
.catch((err) => console.warn("[collab.preview] recordPreviewCrash failed:", err))
}

;(state as { status: PreviewStatus }).status = "failed"
;(state as { errorMessage?: string }).errorMessage = msg
broadcast(state.collabSessionId, {
Expand Down Expand Up @@ -922,6 +1004,29 @@ function startSweepLoop(): void {
return
}

// 1b. Install-hang watchdog (S1) — a preview still in the `installing`
// phase that has emitted zero output for INSTALL_SILENCE_TIMEOUT_MS
// is presumed wedged. A healthy pnpm install / ng compile emits
// progress constantly, so prolonged silence means a dead registry,
// stuck native build, or an OOMing dep holding memory with no
// forward progress. Stop it now rather than waiting out the 30 min
// idle cap (which never fires here — no request traffic during
// install). We record the crash explicitly here (rather than
// relying on the exit handler, which skips crash-recording when WE
// initiated the stop) so a workspace that hangs install on every
// boot eventually trips the crash-loop breaker instead of wasting
// 5 min per boot indefinitely.
if (active.status === "installing" && now - active._lastOutput > INSTALL_SILENCE_TIMEOUT_MS) {
const hungSession = active.collabSessionId
void import("./session")
.then((Session) => Session.recordPreviewCrash(hungSession))
.catch((err) => console.warn("[collab.preview] recordPreviewCrash (hang) failed:", err))
stopPreview(
`install hung — no output for ${Math.round(INSTALL_SILENCE_TIMEOUT_MS / 60_000)}m — Driver can re-Launch`,
)
return
}

// 2. Lifetime cap — preview has been alive for MAX_LIFETIME_MS,
// regardless of traffic. Forces a clean restart before the
// ng-serve / Vite heap leak overflows the task's memory. Driver
Expand Down Expand Up @@ -989,7 +1094,13 @@ export async function resumePreviewsOnBoot(): Promise<void> {
return
}

let intents: Array<{ collabSessionId: string; repoFullName: string; at: number }>
let intents: Array<{
collabSessionId: string
repoFullName: string
at: number
crashCount: number
crashAt: number
}>
try {
intents = session.listPreviewIntents()
} catch (err) {
Expand Down Expand Up @@ -1037,12 +1148,35 @@ export async function resumePreviewsOnBoot(): Promise<void> {
}
if (fresh.length === 0) return

// Crash-loop breaker (S2). A fresh intent whose workspace has crashed
// during install BREAKER_CRASH_THRESHOLD+ times within the recent window
// is almost certainly deterministically broken (bad lockfile, missing
// dep, OOM during native build) — auto-resuming it just burns another
// install attempt + memory every boot. Skip those; the Driver can press
// Launch manually (which clears the counter and overrides the breaker)
// once they've fixed the underlying workspace/config issue. The freshness
// cap above handles age; this handles repeated failure within the window.
const eligible: typeof fresh = []
for (const i of fresh) {
const recentlyTripped = i.crashAt > 0 && now - i.crashAt < BREAKER_WINDOW_MS
if (i.crashCount >= BREAKER_CRASH_THRESHOLD && recentlyTripped) {
console.warn(
`[collab.preview] resumePreviewsOnBoot: session=${i.collabSessionId} repo=${i.repoFullName} ` +
`skipped — crash-loop breaker (${i.crashCount} install crashes within ${Math.round(BREAKER_WINDOW_MS / 60_000)}m). ` +
`Driver must Launch manually to retry.`,
)
continue
}
eligible.push(i)
}
if (eligible.length === 0) return

// First-launch-wins constraint (one preview per container) means we pick
// the most-recently active intent and ignore the rest. If multiple
// intents survived to disk, the rest will sit clear in the DB until a
// Driver explicitly Launches one — we never auto-stomp a more-recent
// wish in favour of a stale one.
const pick = fresh[0]
const pick = eligible[0]
console.log(
`[collab.preview] resumePreviewsOnBoot: ${intents.length} intent(s) on disk; picking session=${pick.collabSessionId} repo=${pick.repoFullName} (most-recent)`,
)
Expand Down
6 changes: 6 additions & 0 deletions packages/opencode/src/collab/router.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1464,6 +1464,12 @@ async function handleSessionRoutes(req: Request, url: URL, path: string): Promis
// GIT_ASKPASS helper for the consumption side, and preview-launcher's
// ActiveState._gitAccessToken comment for the in-memory caching
// semantics across restart.
// A manual Driver launch is an explicit human retry — clear any
// crash-loop breaker state (S2) BEFORE launching so a session that the
// breaker previously locked out of auto-resume becomes eligible again,
// and so this attempt starts from a clean counter. If the workspace is
// still broken it'll re-accumulate crashes and the breaker re-engages.
Session.clearPreviewCrashCount(sessionId)
const result = Preview.launchPreview(sessionId, repoFullName, sess.githubAccessToken)
if (!result.ok) return json({ error: result.error, ...("existing" in result ? { existing: result.existing } : {}) }, result.status)
// Persist the Driver's intent so an ECS task replacement re-spawns the
Expand Down
12 changes: 12 additions & 0 deletions packages/opencode/src/collab/schema.sql.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,18 @@ export const CollabSessionTable = sqliteTable("collab_session", {
* shutdown (rare — single-launcher constraint usually prevents this), the
* more recently active one wins re-spawn priority on boot. */
preview_intent_at: integer({ mode: "timestamp_ms" }),
/**
* Consecutive preview-install crash count + timestamp of the last crash.
* Incremented when a resumed-or-launched preview's child process exits
* non-zero while still in the `installing` phase, and also when the
* install-hang watchdog stops an install that has gone silent.
* `resumePreviewsOnBoot()` refuses to auto-resume a session whose count has
* reached the breaker threshold and whose last recorded crash falls within
* the recent window — stops a broken workspace from OOM-looping the task
* across boots. A successful "ready" transition resets the count to 0. A
* Driver pressing Launch manually also clears it (explicit human retry
* overrides the breaker).
*/
preview_crash_count: integer().notNull().default(0),
preview_crash_at: integer({ mode: "timestamp_ms" }),
created_at: integer({ mode: "timestamp_ms" }).notNull(),
deleted_at: integer({ mode: "timestamp_ms" }),
})
Expand Down
47 changes: 45 additions & 2 deletions packages/opencode/src/collab/session.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { Database, eq, and, isNull } from "@/storage/db"
import { Database, eq, and, isNull, sql } from "@/storage/db"
import {
CollabSessionTable,
CollabParticipantTable,
Expand Down Expand Up @@ -229,13 +229,21 @@ export function setPreviewIntent(collabSessionId: string, repoFullName: string |
* pick the most-recently-active one. Soft-deleted sessions are excluded —
* we never resurrect a preview for a session the Driver tore down.
*/
export function listPreviewIntents(): Array<{ collabSessionId: string; repoFullName: string; at: number }> {
export function listPreviewIntents(): Array<{
collabSessionId: string
repoFullName: string
at: number
crashCount: number
crashAt: number
}> {
return Database.use((db) => {
const rows = db
.select({
id: CollabSessionTable.id,
repo: CollabSessionTable.preview_intent,
at: CollabSessionTable.preview_intent_at,
crashCount: CollabSessionTable.preview_crash_count,
crashAt: CollabSessionTable.preview_crash_at,
deleted_at: CollabSessionTable.deleted_at,
})
.from(CollabSessionTable)
Expand All @@ -246,11 +254,46 @@ export function listPreviewIntents(): Array<{ collabSessionId: string; repoFullN
collabSessionId: r.id,
repoFullName: r.repo as string,
at: r.at ? r.at.getTime() : 0,
crashCount: r.crashCount ?? 0,
crashAt: r.crashAt ? r.crashAt.getTime() : 0,
}))
.sort((a, b) => b.at - a.at)
})
}

/**
 * Crash-loop breaker bookkeeping (S2). Two operations:
 *
 *   recordPreviewCrash     — a preview failed while still installing. Adds
 *                            one to the counter and stamps the current time.
 *   clearPreviewCrashCount — the preview reached "ready", OR a Driver pressed
 *                            Launch manually. Resets the counter to 0 and
 *                            clears the timestamp so a later transient
 *                            failure starts fresh and a human retry overrides
 *                            the breaker.
 *
 * Both are single UPDATE statements keyed on the session id, so an id that
 * matches no row changes nothing. Neither filters on `deleted_at`.
 */

/**
 * Record one install-phase preview crash for a collab session.
 *
 * The increment is written as a SQL expression (`preview_crash_count + 1`),
 * so the addition happens inside the UPDATE rather than as a read followed
 * by a write. Not idempotent: every call adds one and refreshes
 * `preview_crash_at`.
 *
 * @param collabSessionId Id of the collab session whose preview crashed.
 */
export function recordPreviewCrash(collabSessionId: string): void {
  Database.use((db) => {
    const crashBump = {
      preview_crash_count: sql`${CollabSessionTable.preview_crash_count} + 1`,
      preview_crash_at: new Date(),
    }
    const matchesSession = eq(CollabSessionTable.id, collabSessionId)
    db.update(CollabSessionTable).set(crashBump).where(matchesSession).run()
  })
}

/**
 * Reset a collab session's crash-loop breaker state: sets
 * `preview_crash_count` to 0 and `preview_crash_at` to null for the row
 * whose id matches. Repeated calls leave the row in the same state, and an
 * id that matches no row changes nothing.
 *
 * @param collabSessionId Id of the collab session whose counter is reset.
 */
export function clearPreviewCrashCount(collabSessionId: string): void {
  Database.use((db) => {
    const cleanSlate = { preview_crash_count: 0, preview_crash_at: null }
    const matchesSession = eq(CollabSessionTable.id, collabSessionId)
    db.update(CollabSessionTable).set(cleanSlate).where(matchesSession).run()
  })
}

/**
* Append `repos` to the collab session's linked-repo list.
*
Expand Down
Loading