diff --git a/packages/opencode/src/cli/cmd/serve.ts b/packages/opencode/src/cli/cmd/serve.ts index 3fb775852431..186b6afd6276 100644 --- a/packages/opencode/src/cli/cmd/serve.ts +++ b/packages/opencode/src/cli/cmd/serve.ts @@ -154,6 +154,35 @@ export const ServeCommand = effectCmd({ }) } + // S6 — start the container-RSS monitor. Pure telemetry: logs a WARNING + // when total memory crosses 13 GB (leading indicator for the 16 GB task + // ceiling) so an operator can correlate it with a later OOM. Never kills + // anything. Self-disables on platforms without the cgroup file. + if (isCollabMode) { + yield* Effect.promise(async () => { + try { + const { startMemoryMonitor } = await import("../../collab/cgroup-memory") + startMemoryMonitor() + } catch (err) { + console.warn("[collab] memory monitor skipped:", err) + } + }) + } + + // S7 — sweep orphan workspace directories on EFS (dirs with no live + // session row, older than the 24 h safety floor). Reclaims space left + // by failed cleanups / drift. Fire-and-forget; per-dir failures log. + if (isCollabMode) { + yield* Effect.promise(async () => { + try { + const Workspace = await import("../../collab/workspace") + void Workspace.cleanupOrphanWorkspaces() + } catch (err) { + console.warn("[collab] orphan-workspace sweep skipped:", err) + } + }) + } + yield* Effect.never }), }) diff --git a/packages/opencode/src/collab/cgroup-memory.ts b/packages/opencode/src/collab/cgroup-memory.ts new file mode 100644 index 000000000000..daa3f1a4a840 --- /dev/null +++ b/packages/opencode/src/collab/cgroup-memory.ts @@ -0,0 +1,83 @@ +/** + * Container memory introspection via the cgroups v2 interface that AWS + * Fargate (platform version 1.4+) exposes at /sys/fs/cgroup/memory.current. + * + * The value is the WHOLE container's current memory usage in bytes — + * opencode + any spawned preview dev-server + everything else in the task. 
/**
 * That's exactly the figure the kernel OOM-killer accounts against, so it's
 * the right number to watch when the goal is "stop gracefully BEFORE the
 * kernel takes the whole task down."
 *
 * Returns null on platforms where the file isn't present (macOS dev, older
 * kernels, cgroups v1) — callers treat null as "skip the memory check".
 *
 * NOTE: preview-launcher.ts carries a private copy of this read for its own
 * 12 GB preview memory cap (shipped in PR #34, before this util existed).
 * Consolidating the two onto this shared util is a deferred cleanup — kept
 * separate for now so this telemetry PR doesn't conflict with the in-flight
 * preview-launcher hardening PR. Both read the same file; no behavioural
 * difference.
 */
import { readFileSync } from "fs"

/** cgroups v2 file holding the container's current memory usage in bytes. */
const CGROUP_MEMORY_CURRENT_PATH = "/sys/fs/cgroup/memory.current"

/**
 * Total container RSS in bytes, or null when the cgroup file is unreadable
 * or its content is not a plain non-negative decimal integer.
 *
 * @param path - File to read. Defaults to the cgroups v2 location; the
 *   parameter exists so tests can point the reader at a fixture.
 * @returns The parsed byte count, or null when there is nothing trustworthy
 *   to report.
 */
export function readContainerMemoryBytes(path: string = CGROUP_MEMORY_CURRENT_PATH): number | null {
  let raw: string
  try {
    raw = readFileSync(path, "utf8").trim()
  } catch {
    return null
  }
  // Number("") is 0 and Number("0x10") is 16, so a bare Number() parse would
  // report an empty or odd file as a real reading. Accept decimal digits only.
  if (!/^\d+$/.test(raw)) return null
  const n = Number(raw)
  // An absurdly long digit string still parses to Infinity.
  return Number.isFinite(n) ? n : null
}

/** Soft warning threshold — leading indicator logged before the kernel
 *  OOM-killer's hard ceiling (16 GB task). 13 GB leaves ~3 GB of headroom;
 *  crossing it means something (a preview compile, a runaway plugin, SSE
 *  broadcaster accumulation) is trending toward the danger zone and is worth
 *  a CloudWatch breadcrumb so an operator can correlate it with a later OOM. */
const MEMORY_WARN_BYTES = 13 * 1024 * 1024 * 1024

/** How often the monitor samples. 60 s matches the preview sweep cadence;
 *  memory pressure builds over minutes, not milliseconds. */
const MONITOR_INTERVAL_MS = 60 * 1000

const BYTES_PER_MB = 1024 * 1024

/** Tuning knobs for startMemoryMonitor. Every field falls back to the production default. */
export interface MemoryMonitorOptions {
  /** Usage above this many bytes triggers the warning. Default: MEMORY_WARN_BYTES. */
  warnBytes?: number
  /** Sampling period in milliseconds. Default: MONITOR_INTERVAL_MS. */
  intervalMs?: number
  /** Source of the current usage. Default: readContainerMemoryBytes. */
  read?: () => number | null
}

/**
 * Start a best-effort background monitor that logs one WARNING each time
 * total container RSS rises above the warn threshold, and one INFO line when
 * a later sample is back at or below it. Pure telemetry — it never kills
 * anything.
 *
 * The first reading is taken synchronously. If it is null the monitor logs
 * once that it is disabled and returns a no-op, so platforms without the
 * cgroup file don't get a timer at all. Otherwise samples are taken every
 * `intervalMs`; a null sample is skipped without changing the warned state.
 *
 * @param options - Optional overrides; omit for production behaviour.
 * @returns A stop function that clears the interval. The interval is also
 *   unref'd so it never keeps the event loop alive on shutdown.
 */
export function startMemoryMonitor(options: MemoryMonitorOptions = {}): () => void {
  const warnBytes = options.warnBytes ?? MEMORY_WARN_BYTES
  const intervalMs = options.intervalMs ?? MONITOR_INTERVAL_MS
  const read = options.read ?? (() => readContainerMemoryBytes())

  if (read() === null) {
    console.log("[collab.memory] cgroup memory file unavailable — RSS monitor disabled (non-Linux/cgroups-v1)")
    return () => {}
  }

  // Loop-invariant, so computed once rather than on every tick.
  const warnMB = Math.round(warnBytes / BYTES_PER_MB)
  let warned = false

  const timer = setInterval(() => {
    const used = read()
    if (used === null) return
    const usedMB = Math.round(used / BYTES_PER_MB)

    if (used > warnBytes) {
      // Log only on the upward crossing, not on every sample above the line.
      if (warned) return
      warned = true
      console.warn(
        `[collab.memory] WARNING container RSS ${usedMB}MB crossed ${warnMB}MB — ` +
          `approaching the 16 GB task ceiling; watch for OOM. Leading indicator only.`,
      )
      return
    }

    if (warned) {
      warned = false
      console.log(`[collab.memory] container RSS recovered to ${usedMB}MB (below ${warnMB}MB)`)
    }
  }, intervalMs)
  if (typeof timer.unref === "function") timer.unref()
  return () => clearInterval(timer)
}

// ── Remainder of these source lines: the start of the workspace.ts patch, kept verbatim ──
// diff --git a/packages/opencode/src/collab/workspace.ts b/packages/opencode/src/collab/workspace.ts
// index 81a0af2b77e0..879f86bd37d8 100644
// --- a/packages/opencode/src/collab/workspace.ts
// +++ b/packages/opencode/src/collab/workspace.ts
// @@ -10,7 +10,7 @@
//   */
//  import { spawn } from "child_process"
// -import { mkdirSync, rmSync, existsSync, writeFileSync, renameSync } from "fs"
// +import { mkdirSync, rmSync, existsSync, writeFileSync, renameSync, readdirSync, statSync } from "fs"
//  import { join } from "path"
//  import type { Participant } from "@opencode-ai/collab"
// @@ -608,6 +608,72 @@ export function cleanupSessionWorkspace(collabSessionId: string): void {
//    }
//  }
// +/**
// + * Orphan-workspace sweep (S7). Runs once on container boot.
// + *
// + * Explicit session deletion already wipes the workspace synchronously
// + * (router.ts DELETE → cleanupSessionWorkspace), so the steady state has no
// + * orphans. But drift accumulates: an rmSync that threw on an EFS hiccup, a
// + * task killed between soft-delete and cleanup, a manual DB edit. Each
// + * frontend-sized orphan is ~1.5 GB on EFS, so left unchecked this grows the
// + * filesystem (and the bill) indefinitely.
// + *
// + * This sweep lists the workspace-root subdirectories (each named by a
// + * collabSessionId) and removes any that have NO corresponding live
// + * (non-soft-deleted) session row AND whose directory mtime is older than the
// + * safety floor. The mtime floor is belt-and-suspenders: a session inserts
// + * its DB row BEFORE cloning, so a live dir always has a live row — but the
// + * floor guarantees we never touch anything that was written in the last
// + * 24 h, eliminating any boot-time TOCTOU against an in-progress init.
/**
 * (closing part of the cleanupOrphanWorkspaces doc comment)
 *
 * Best-effort: per-dir failures log and continue. Never throws — and never
 * rejects: the boot path fires this with `void`, so a rejection would
 * surface as an unhandled rejection. Anything unexpected is caught at the
 * outermost level and logged.
 *
 * After the initial existence check, all filesystem work goes through
 * fs/promises. The function is started fire-and-forget, and a synchronous
 * recursive delete of a multi-GB directory would hold the event loop for the
 * whole removal.
 *
 * Entries are inspected with lstat, so a symlink inside the workspace root is
 * never treated as a directory and is left alone.
 *
 * Note: a directory's mtime changes when its direct children are added or
 * removed, not when files deeper in the tree are written. The live-session
 * check is therefore the primary protection; the age floor is secondary.
 */
const ORPHAN_WORKSPACE_MIN_AGE_MS = 24 * 60 * 60 * 1000

export async function cleanupOrphanWorkspaces(): Promise<void> {
  try {
    const root = workspaceRoot()
    if (!existsSync(root)) return

    let liveIds: Set<string>
    try {
      const session = await import("./session")
      // listCollabSessions() already excludes soft-deleted rows — exactly the
      // "live" set we want to protect.
      liveIds = new Set(session.listCollabSessions().map((s) => s.id))
    } catch (err) {
      console.warn("[collab.workspace] cleanupOrphanWorkspaces: session list failed; skipping:", err)
      return
    }

    // Loaded lazily, like ./session above, so the block needs no new
    // top-of-file import.
    const { readdir, lstat, rm } = await import("fs/promises")

    let entries: string[]
    try {
      entries = await readdir(root)
    } catch (err) {
      console.warn("[collab.workspace] cleanupOrphanWorkspaces: readdir failed; skipping:", err)
      return
    }

    const now = Date.now()
    let removed = 0
    // Deliberately sequential: one recursive delete at a time.
    for (const name of entries) {
      if (liveIds.has(name)) continue // live session — leave it
      const dir = join(root, name)
      try {
        const st = await lstat(dir)
        if (!st.isDirectory()) continue // plain files and symlinks are not workspaces
        if (now - st.mtimeMs < ORPHAN_WORKSPACE_MIN_AGE_MS) continue // too fresh — protect against init races
        await rm(dir, { recursive: true, force: true })
        removed++
        console.log(`[collab.workspace] cleanupOrphanWorkspaces: removed orphan workspace ${name}`)
      } catch (err) {
        console.warn(`[collab.workspace] cleanupOrphanWorkspaces: failed to remove ${name}:`, err)
      }
    }
    if (removed > 0) {
      console.log(`[collab.workspace] cleanupOrphanWorkspaces: reclaimed ${removed} orphan workspace dir(s)`)
    }
  } catch (err) {
    // Last line of defence for the "never rejects" contract.
    console.warn("[collab.workspace] cleanupOrphanWorkspaces: unexpected failure; skipping:", err)
  }
}

// ── Remainder of this source line: trailing workspace.ts hunk context and the start of the server.ts patch, kept verbatim ──
//  /**
//   * Build git commit trailers for co-authorship attribution.
//   *
// diff --git a/packages/opencode/src/server/server.ts b/packages/opencode/src/server/server.ts
// index 169182ea222c..49d70162c0aa 100644
// --- a/packages/opencode/src/server/server.ts
// +++ b/packages/opencode/src/server/server.ts
// @@ -23,6 +23,23 @@ import { lazy } from "@/util/lazy"
//  // Close enough for an ALB health probe; not used for SLA reporting.
const serverStartedAt = Date.now() +// ── Event-loop liveness heartbeat (S5) ────────────────────────────────────── +// A 5-s interval stamps `lastEventLoopTick`. /healthz compares it against +// now: if the loop has been blocked long enough that the tick is >30 s stale, +// the server is wedged (a long synchronous operation, a tight loop in a +// plugin, a giant JSON.parse on a runaway preview log) even though the HTTP +// listener might still technically accept the connection. Returning 503 in +// that window lets the ALB pull the task ~1 min sooner than waiting for the +// request to time out. Unref'd so it never holds the loop open on shutdown. +let lastEventLoopTick = Date.now() +const EVENT_LOOP_STALL_THRESHOLD_MS = 30_000 +{ + const tick = setInterval(() => { + lastEventLoopTick = Date.now() + }, 5_000) + if (typeof tick.unref === "function") tick.unref() +} + // ── Collab middleware ────────────────────────────────────────────────────────── // Intercepts /collab/* requests before the Effect HTTP router's catch-all UI // route can serve index.html for them. Bridges the standard Web Request/Response @@ -127,13 +144,21 @@ const serveHealthz = () => Effect.sync(() => { const dbOk = pingDatabase() const githubStatus = cachedGitHubStatus() - // db is the only check that can flip overall ok; github + native_api are - // informational so a degraded external dep doesn't pull the ALB out from - // under us (we'd be DoS-ing ourselves if GitHub's HEAD ever 5xx'd). + // S5 — event-loop liveness. Stale tick = the loop was blocked long + // enough to miss several 5-s heartbeats, i.e. the server is wedged. + const eventLoopLagMs = Date.now() - lastEventLoopTick + const eventLoopOk = eventLoopLagMs <= EVENT_LOOP_STALL_THRESHOLD_MS + // db + event-loop are the checks that can flip overall ok; github + + // native_api are informational so a degraded external dep doesn't pull + // the ALB out from under us (we'd be DoS-ing ourselves if GitHub's HEAD + // ever 5xx'd). 
+ const ok = dbOk && eventLoopOk const body = { - ok: dbOk, + ok, checks: { db: dbOk ? "ok" : "fail", + event_loop: eventLoopOk ? "ok" : "stalled", + event_loop_lag_ms: eventLoopLagMs, github: githubStatus, // native_api is the server itself; if Bun is up enough to answer /healthz // then the native API is up too — we just record it for the dashboard. @@ -142,8 +167,11 @@ const serveHealthz = () => version: process.env["OPENCODE_VERSION"] ?? "unknown", uptime_s: Math.floor((Date.now() - serverStartedAt) / 1000), } + if (!eventLoopOk) { + log.error("/healthz event-loop stall detected", { lagMs: eventLoopLagMs }) + } return HttpServerResponse.jsonUnsafe(body, { - status: dbOk ? 200 : 503, + status: ok ? 200 : 503, headers: { "cache-control": "no-store" }, }) })