Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions packages/opencode/src/cli/cmd/serve.ts
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,35 @@ export const ServeCommand = effectCmd({
})
}

// S6 — start the container-RSS monitor. Pure telemetry: logs a WARNING
// when total memory crosses 13 GB (leading indicator for the 16 GB task
// ceiling) so an operator can correlate it with a later OOM. Never kills
// anything. Self-disables on platforms without the cgroup file.
if (isCollabMode) {
yield* Effect.promise(async () => {
try {
const { startMemoryMonitor } = await import("../../collab/cgroup-memory")
startMemoryMonitor()
} catch (err) {
console.warn("[collab] memory monitor skipped:", err)
}
})
}

// S7 — sweep orphan workspace directories on EFS (dirs with no live
// session row, older than the 24 h safety floor). Reclaims space left
// by failed cleanups / drift. Fire-and-forget; per-dir failures log.
if (isCollabMode) {
yield* Effect.promise(async () => {
try {
const Workspace = await import("../../collab/workspace")
void Workspace.cleanupOrphanWorkspaces()
} catch (err) {
console.warn("[collab] orphan-workspace sweep skipped:", err)
}
})
}

yield* Effect.never
}),
})
83 changes: 83 additions & 0 deletions packages/opencode/src/collab/cgroup-memory.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
/**
* Container memory introspection via the cgroups v2 interface that AWS
* Fargate (platform version 1.4+) exposes at /sys/fs/cgroup/memory.current.
*
* The value is the WHOLE container's current memory usage in bytes —
* opencode + any spawned preview dev-server + everything else in the task.
* Note that this is the cgroup's total charge (anonymous memory plus page
* cache and kernel allocations), so it is not strictly RSS. It is the figure
* the cgroup memory limit is enforced against, so it's the right number to
* watch when the goal is "stop gracefully BEFORE the kernel takes the whole
* task down."
*
* Returns null on platforms where the file isn't present (macOS dev, older
* kernels, cgroups v1) — callers treat null as "skip the memory check".
*
* NOTE: preview-launcher.ts carries a private copy of this read for its own
* 12 GB preview memory cap (shipped in PR #34, before this util existed).
* Consolidating the two onto this shared util is a deferred cleanup — kept
* separate for now so this telemetry PR doesn't conflict with the in-flight
* preview-launcher hardening PR. Both read the same file; no behavioural
* difference.
*/
import { readFileSync } from "fs"

/**
 * Read the container's current memory usage, in bytes, from a cgroup file.
 *
 * @param path - File to read. Defaults to the cgroups v2 location
 *   `/sys/fs/cgroup/memory.current`; overridable so the parser can be
 *   exercised against a fixture.
 * @returns The byte count, or null when the file cannot be read or its
 *   content is not a plain non-negative decimal integer. Never throws.
 */
export function readContainerMemoryBytes(path = "/sys/fs/cgroup/memory.current"): number | null {
  let raw: string
  try {
    raw = readFileSync(path, "utf8").trim()
  } catch {
    // Missing or unreadable file — callers treat null as "skip the check".
    return null
  }
  // Number() is too lenient for this: Number("") is 0 and Number("0x10") is
  // 16, so an empty or malformed read would be reported as a real sample.
  // Accept only a bare run of decimal digits.
  if (!/^\d+$/.test(raw)) return null
  const bytes = Number(raw)
  return Number.isFinite(bytes) ? bytes : null
}

/**
 * Soft warning threshold: 13 GiB, expressed in bytes. Chosen to sit ~3 GB
 * below the 16 GB task ceiling so that crossing it is a leading indicator
 * (a preview compile, a runaway plugin, SSE broadcaster accumulation) that
 * an operator can later correlate with an OOM via the CloudWatch log line.
 */
const MEMORY_WARN_BYTES = 13 * 1024 ** 3

/**
 * Sampling period of the monitor, in milliseconds (one minute). Matches the
 * preview sweep cadence; memory pressure builds over minutes, not
 * milliseconds.
 */
const MONITOR_INTERVAL_MS = 60_000

/**
 * Launch a best-effort background sampler for total container memory.
 *
 * Every MONITOR_INTERVAL_MS it reads the container memory figure and logs:
 *   - one WARNING when usage first rises above MEMORY_WARN_BYTES, and
 *   - one INFO line when usage later drops back to or below it.
 * It is telemetry only: nothing is ever killed or throttled from here.
 *
 * If the very first read returns null (no cgroup file), a single line is
 * logged and no timer is created, so non-Linux dev machines stay quiet.
 *
 * @returns A function that stops the sampler. The timer is unref'd, so it
 *   does not keep the process alive on shutdown.
 */
export function startMemoryMonitor(): () => void {
  const initialSample = readContainerMemoryBytes()
  if (initialSample === null) {
    console.log("[collab.memory] cgroup memory file unavailable — RSS monitor disabled (non-Linux/cgroups-v1)")
    return () => {}
  }

  const BYTES_PER_MB = 1024 * 1024
  const toMB = (bytes: number): number => Math.round(bytes / BYTES_PER_MB)

  // True while we are in the "already warned, waiting for recovery" state,
  // so each excursion above the threshold produces exactly one WARNING.
  let aboveThreshold = false

  const sample = (): void => {
    const current = readContainerMemoryBytes()
    if (current === null) return // transient read failure — skip this tick

    const currentMB = toMB(current)
    const thresholdMB = toMB(MEMORY_WARN_BYTES)
    const isOver = current > MEMORY_WARN_BYTES

    if (isOver && !aboveThreshold) {
      aboveThreshold = true
      console.warn(
        `[collab.memory] WARNING container RSS ${currentMB}MB crossed ${thresholdMB}MB — ` +
          `approaching the 16 GB task ceiling; watch for OOM. Leading indicator only.`,
      )
      return
    }

    if (!isOver && aboveThreshold) {
      aboveThreshold = false
      console.log(`[collab.memory] container RSS recovered to ${currentMB}MB (below ${thresholdMB}MB)`)
    }
  }

  const handle = setInterval(sample, MONITOR_INTERVAL_MS)
  if (typeof handle.unref === "function") handle.unref()
  return () => clearInterval(handle)
}
68 changes: 67 additions & 1 deletion packages/opencode/src/collab/workspace.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
*/

import { spawn } from "child_process"
import { mkdirSync, rmSync, existsSync, writeFileSync, renameSync } from "fs"
import { mkdirSync, rmSync, existsSync, writeFileSync, renameSync, readdirSync, statSync } from "fs"
import { join } from "path"
import type { Participant } from "@opencode-ai/collab"

Expand Down Expand Up @@ -608,6 +608,72 @@ export function cleanupSessionWorkspace(collabSessionId: string): void {
}
}

/**
 * Minimum age (24 h) a workspace directory must have, judged by its own
 * mtime, before the orphan sweep is allowed to delete it.
 */
const ORPHAN_WORKSPACE_MIN_AGE_MS = 24 * 60 * 60 * 1000

/**
 * Orphan-workspace sweep (S7). Runs once on container boot.
 *
 * Explicit session deletion already wipes the workspace synchronously
 * (router.ts DELETE → cleanupSessionWorkspace), so the steady state has no
 * orphans. But drift accumulates: an rmSync that threw on an EFS hiccup, a
 * task killed between soft-delete and cleanup, a manual DB edit. Each
 * frontend-sized orphan is ~1.5 GB on EFS, so left unchecked this grows the
 * filesystem (and the bill) indefinitely.
 *
 * The sweep lists the workspace-root entries (each named by a
 * collabSessionId) and removes every real directory that has NO
 * corresponding live (non-soft-deleted) session row AND whose own mtime is
 * older than ORPHAN_WORKSPACE_MIN_AGE_MS. The age floor is
 * belt-and-suspenders against a boot-time race with an in-progress init.
 * Note that a directory's mtime only moves when its direct children are
 * added, removed or renamed, so the floor protects freshly created
 * workspaces rather than every recently written file.
 *
 * All filesystem work after the initial existence check goes through
 * `fs/promises`, so removing large trees on EFS does not block the event
 * loop (the caller fires this without awaiting it while the server is
 * already serving requests).
 *
 * Best-effort: per-dir failures log and continue.
 *
 * @returns A promise that resolves when the sweep finishes; it never rejects.
 */
export async function cleanupOrphanWorkspaces(): Promise<void> {
  const root = workspaceRoot()
  if (!existsSync(root)) return

  let liveIds: Set<string>
  try {
    const session = await import("./session")
    // listCollabSessions() already excludes soft-deleted rows — exactly the
    // "live" set we want to protect.
    liveIds = new Set(session.listCollabSessions().map((s) => s.id))
  } catch (err) {
    console.warn("[collab.workspace] cleanupOrphanWorkspaces: session list failed; skipping:", err)
    return
  }

  let fsp: typeof import("fs/promises")
  let entries: string[]
  try {
    // Loaded locally so this function does not depend on the file's
    // top-level (sync-only) fs import list.
    fsp = await import("fs/promises")
    entries = await fsp.readdir(root)
  } catch (err) {
    console.warn("[collab.workspace] cleanupOrphanWorkspaces: readdir failed; skipping:", err)
    return
  }

  // Captured once: directories only get older while the sweep runs, so a
  // stale "now" can only make the age check more conservative.
  const now = Date.now()
  let removed = 0
  // Deliberately sequential: one recursive delete at a time keeps the load
  // on the shared filesystem bounded.
  for (const name of entries) {
    if (liveIds.has(name)) continue // live session — leave it
    const dir = join(root, name)
    try {
      // lstat (not stat) so a symlink is never mistaken for a workspace
      // directory and its target's mtime is never consulted.
      const st = await fsp.lstat(dir)
      if (!st.isDirectory()) continue
      if (now - st.mtimeMs < ORPHAN_WORKSPACE_MIN_AGE_MS) continue // too fresh — protect against init races
      await fsp.rm(dir, { recursive: true, force: true })
      removed++
      console.log(`[collab.workspace] cleanupOrphanWorkspaces: removed orphan workspace ${name}`)
    } catch (err) {
      console.warn(`[collab.workspace] cleanupOrphanWorkspaces: failed to remove ${name}:`, err)
    }
  }
  if (removed > 0) {
    console.log(`[collab.workspace] cleanupOrphanWorkspaces: reclaimed ${removed} orphan workspace dir(s)`)
  }
}

/**
* Build git commit trailers for co-authorship attribution.
*
Expand Down
38 changes: 33 additions & 5 deletions packages/opencode/src/server/server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,23 @@ import { lazy } from "@/util/lazy"
// Close enough for an ALB health probe; not used for SLA reporting.
const serverStartedAt = Date.now()

// ── Event-loop liveness heartbeat (S5) ──────────────────────────────────────
// `lastEventLoopTick` is re-stamped with the current time every 5 s by an
// interval timer. /healthz compares that stamp against now: if it is more
// than EVENT_LOOP_STALL_THRESHOLD_MS (30 s) old, the loop has been blocked
// long enough to miss several heartbeats — a long synchronous operation, a
// tight loop in a plugin, a giant JSON.parse on a runaway preview log — even
// though the HTTP listener might still technically accept the connection.
// Answering 503 in that window lets the ALB pull the task ~1 min sooner than
// waiting for the request to time out. The timer is unref'd so it never
// holds the loop open on shutdown.
const EVENT_LOOP_STALL_THRESHOLD_MS = 30_000
let lastEventLoopTick = Date.now()
{
  const stamp = (): void => {
    lastEventLoopTick = Date.now()
  }
  const heartbeat = setInterval(stamp, 5_000)
  if (typeof heartbeat.unref === "function") heartbeat.unref()
}

// ── Collab middleware ──────────────────────────────────────────────────────────
// Intercepts /collab/* requests before the Effect HTTP router's catch-all UI
// route can serve index.html for them. Bridges the standard Web Request/Response
Expand Down Expand Up @@ -127,13 +144,21 @@ const serveHealthz = () =>
Effect.sync(() => {
const dbOk = pingDatabase()
const githubStatus = cachedGitHubStatus()
// db is the only check that can flip overall ok; github + native_api are
// informational so a degraded external dep doesn't pull the ALB out from
// under us (we'd be DoS-ing ourselves if GitHub's HEAD ever 5xx'd).
// S5 — event-loop liveness. Stale tick = the loop was blocked long
// enough to miss several 5-s heartbeats, i.e. the server is wedged.
const eventLoopLagMs = Date.now() - lastEventLoopTick
const eventLoopOk = eventLoopLagMs <= EVENT_LOOP_STALL_THRESHOLD_MS
// db + event-loop are the checks that can flip overall ok; github +
// native_api are informational so a degraded external dep doesn't pull
// the ALB out from under us (we'd be DoS-ing ourselves if GitHub's HEAD
// ever 5xx'd).
const ok = dbOk && eventLoopOk
const body = {
ok: dbOk,
ok,
checks: {
db: dbOk ? "ok" : "fail",
event_loop: eventLoopOk ? "ok" : "stalled",
event_loop_lag_ms: eventLoopLagMs,
github: githubStatus,
// native_api is the server itself; if Bun is up enough to answer /healthz
// then the native API is up too — we just record it for the dashboard.
Expand All @@ -142,8 +167,11 @@ const serveHealthz = () =>
version: process.env["OPENCODE_VERSION"] ?? "unknown",
uptime_s: Math.floor((Date.now() - serverStartedAt) / 1000),
}
if (!eventLoopOk) {
log.error("/healthz event-loop stall detected", { lagMs: eventLoopLagMs })
}
return HttpServerResponse.jsonUnsafe(body, {
status: dbOk ? 200 : 503,
status: ok ? 200 : 503,
headers: { "cache-control": "no-store" },
})
})
Expand Down
Loading