diff --git a/packages/opencode/src/cli/cmd/serve.ts b/packages/opencode/src/cli/cmd/serve.ts
index 3fb775852431..186b6afd6276 100644
--- a/packages/opencode/src/cli/cmd/serve.ts
+++ b/packages/opencode/src/cli/cmd/serve.ts
@@ -154,6 +154,35 @@ export const ServeCommand = effectCmd({
       })
     }
 
+    // S6 — kick off the container-memory monitor.  Telemetry only: it logs a
+    // WARNING past 13 GB (early signal ahead of the 16 GB task ceiling), never
+    // kills anything, and disables itself where the cgroup file is missing.
+    if (isCollabMode) {
+      yield* Effect.promise(() =>
+        import("../../collab/cgroup-memory")
+          .then((memory) => {
+            memory.startMemoryMonitor()
+          })
+          .catch((err) => {
+            console.warn("[collab] memory monitor skipped:", err)
+          }),
+      )
+    }
+
+    // S7 — sweep orphan workspace dirs on EFS (no live session row, older than
+    // the 24 h floor).  Fire-and-forget: not awaited, so rejections go to .catch.
+    if (isCollabMode) {
+      yield* Effect.promise(async () => {
+        const logSkipped = (err: unknown) => console.warn("[collab] orphan-workspace sweep skipped:", err)
+        try {
+          const Workspace = await import("../../collab/workspace")
+          void Workspace.cleanupOrphanWorkspaces().catch(logSkipped)
+        } catch (err) {
+          logSkipped(err)
+        }
+      })
+    }
+
     yield* Effect.never
   }),
 })
diff --git a/packages/opencode/src/collab/cgroup-memory.ts b/packages/opencode/src/collab/cgroup-memory.ts
new file mode 100644
index 000000000000..daa3f1a4a840
--- /dev/null
+++ b/packages/opencode/src/collab/cgroup-memory.ts
@@ -0,0 +1,83 @@
+/**
+ * Container memory introspection via the cgroups v2 interface that AWS
+ * Fargate (platform version 1.4+) exposes at /sys/fs/cgroup/memory.current.
+ *
+ * The value is the WHOLE container's current memory usage in bytes —
+ * opencode + any spawned preview dev-server + everything else in the task.
+ * That's exactly the figure the kernel OOM-killer accounts against, so it's
+ * the right number to watch when the goal is "stop gracefully BEFORE the
+ * kernel takes the whole task down."
+ *
+ * Returns null on platforms where the file isn't present (macOS dev, older
+ * kernels, cgroups v1) — callers treat null as "skip the memory check".
+ *
+ * NOTE: preview-launcher.ts carries a private copy of this read for its own
+ * 12 GB preview memory cap (shipped in PR #34, before this util existed).
+ * Consolidating the two onto this shared util is a deferred cleanup — kept
+ * separate for now so this telemetry PR doesn't conflict with the in-flight
+ * preview-launcher hardening PR.  Both read the same file; no behavioural
+ * difference.
+ */
+import { readFileSync } from "fs"
+
+/** Container memory usage in bytes from cgroup v2 memory.current, or null when the
+ *  file is unreadable or not plain decimal digits (Number("") would read as 0). */
+export function readContainerMemoryBytes(): number | null {
+  try {
+    const raw = readFileSync("/sys/fs/cgroup/memory.current", "utf8").trim()
+    return /^\d+$/.test(raw) ? Number(raw) : null
+  } catch {
+    return null
+  }
+}
+
+/** Soft warning threshold (13 GiB).  Sits ~3 GB under the 16 GB task
+ *  ceiling enforced by the kernel OOM-killer, so crossing it signals that
+ *  something (a preview compile, a runaway plugin, SSE broadcaster
+ *  accumulation) is trending toward the danger zone and deserves a
+ *  CloudWatch breadcrumb an operator can line up with a later OOM. */
+const MEMORY_WARN_BYTES = 13 * 1024 ** 3
+
+/** Sampling period of the monitor: 60 s, the same cadence as the preview
+ *  sweep.  Memory pressure builds over minutes, not milliseconds. */
+const MONITOR_INTERVAL_MS = 60_000
+
+/**
+ * Launch the best-effort background sampler for whole-container memory.
+ *
+ * Every MONITOR_INTERVAL_MS the cgroup figure is re-read.  The first sample
+ * above MEMORY_WARN_BYTES logs one WARNING; the first sample back at or below
+ * it logs one recovery line.  Logging is all it does — nothing is killed.
+ *
+ * @returns a function that stops the sampler.  If the cgroup file cannot be
+ *   read at start-up, one notice is logged and the returned function is a
+ *   no-op.  The interval is unref'd so it never keeps the event loop alive.
+ */
+export function startMemoryMonitor(): () => void {
+  const toMB = (bytes: number): number => Math.round(bytes / (1024 * 1024))
+  if (readContainerMemoryBytes() === null) {
+    console.log("[collab.memory] cgroup memory file unavailable — RSS monitor disabled (non-Linux/cgroups-v1)")
+    return () => {}
+  }
+
+  let overThreshold = false
+  const sample = (): void => {
+    const current = readContainerMemoryBytes()
+    if (current === null) return
+    const isOver = current > MEMORY_WARN_BYTES
+    if (isOver === overThreshold) return // no edge crossed — stay quiet
+    overThreshold = isOver
+    if (isOver) {
+      console.warn(
+        `[collab.memory] WARNING container RSS ${toMB(current)}MB crossed ${toMB(MEMORY_WARN_BYTES)}MB — ` +
+          `approaching the 16 GB task ceiling; watch for OOM. Leading indicator only.`,
+      )
+    } else {
+      console.log(`[collab.memory] container RSS recovered to ${toMB(current)}MB (below ${toMB(MEMORY_WARN_BYTES)}MB)`)
+    }
+  }
+
+  const handle = setInterval(sample, MONITOR_INTERVAL_MS)
+  if (typeof handle.unref === "function") handle.unref()
+  return () => clearInterval(handle)
+}
diff --git a/packages/opencode/src/collab/workspace.ts b/packages/opencode/src/collab/workspace.ts
index 81a0af2b77e0..879f86bd37d8 100644
--- a/packages/opencode/src/collab/workspace.ts
+++ b/packages/opencode/src/collab/workspace.ts
@@ -10,7 +10,7 @@
  */
 
 import { spawn } from "child_process"
-import { mkdirSync, rmSync, existsSync, writeFileSync, renameSync } from "fs"
+import { mkdirSync, rmSync, existsSync, writeFileSync, renameSync, readdirSync, statSync } from "fs"
 import { join } from "path"
 import type { Participant } from "@opencode-ai/collab"
 
@@ -608,6 +608,72 @@ export function cleanupSessionWorkspace(collabSessionId: string): void {
   }
 }
 
+/**
+ * Orphan-workspace sweep (S7).  Runs once on container boot.
+ *
+ * Explicit session deletion already wipes the workspace synchronously
+ * (router.ts DELETE → cleanupSessionWorkspace), so the steady state has no
+ * orphans.  But drift accumulates: an rmSync that threw on an EFS hiccup, a
+ * task killed between soft-delete and cleanup, a manual DB edit.  Each
+ * frontend-sized orphan is ~1.5 GB on EFS, so left unchecked this grows the
+ * filesystem (and the bill) indefinitely.
+ *
+ * The sweep removes every workspace-root directory (each named by a
+ * collabSessionId) that has NO live (non-soft-deleted) session row AND whose
+ * own mtime is older than the safety floor.  The floor guards against races
+ * with an in-progress init; a directory's mtime only moves when its direct
+ * entries change, so it is a heuristic rather than proof of idleness.
+ *
+ * Best-effort: per-dir failures log and continue.  Deletion is async so a
+ * multi-GB recursive remove does not block the event loop.
+ */
+const ORPHAN_WORKSPACE_MIN_AGE_MS = 24 * 60 * 60 * 1000
+
+export async function cleanupOrphanWorkspaces(): Promise<void> {
+  const root = workspaceRoot()
+  if (!existsSync(root)) return
+
+  let liveIds: Set<string>
+  try {
+    const session = await import("./session")
+    // listCollabSessions() already excludes soft-deleted rows — the "live" set to protect.
+    liveIds = new Set(session.listCollabSessions().map((s) => s.id))
+  } catch (err) {
+    console.warn("[collab.workspace] cleanupOrphanWorkspaces: session list failed; skipping:", err)
+    return
+  }
+
+  let entries: string[]
+  try {
+    entries = readdirSync(root)
+  } catch (err) {
+    console.warn("[collab.workspace] cleanupOrphanWorkspaces: readdir failed; skipping:", err)
+    return
+  }
+
+  // Async rm: rmSync on a multi-GB EFS tree would stall the event loop for the whole delete.
+  const { rm } = await import("fs/promises")
+  const now = Date.now()
+  let removed = 0
+  for (const name of entries) {
+    if (liveIds.has(name)) continue // live session — leave it
+    const dir = join(root, name)
+    try {
+      const st = statSync(dir)
+      if (!st.isDirectory()) continue
+      if (now - st.mtimeMs < ORPHAN_WORKSPACE_MIN_AGE_MS) continue // too fresh — protect against init races
+      await rm(dir, { recursive: true, force: true })
+      removed++
+      console.log(`[collab.workspace] cleanupOrphanWorkspaces: removed orphan workspace ${name}`)
+    } catch (err) {
+      console.warn(`[collab.workspace] cleanupOrphanWorkspaces: failed to remove ${name}:`, err)
+    }
+  }
+  if (removed > 0) {
+    console.log(`[collab.workspace] cleanupOrphanWorkspaces: reclaimed ${removed} orphan workspace dir(s)`)
+  }
+}
+
 /**
  * Build git commit trailers for co-authorship attribution.
  *
diff --git a/packages/opencode/src/server/server.ts b/packages/opencode/src/server/server.ts
index 169182ea222c..49d70162c0aa 100644
--- a/packages/opencode/src/server/server.ts
+++ b/packages/opencode/src/server/server.ts
@@ -23,6 +23,23 @@ import { lazy } from "@/util/lazy"
 // Close enough for an ALB health probe; not used for SLA reporting.
 const serverStartedAt = Date.now()
 
+// ── Event-loop liveness heartbeat (S5) ──────────────────────────────────────
+// Every 5 s a timer stamps `lastEventLoopTick`; /healthz compares that stamp
+// with the current time.  A stamp more than 30 s old means the loop was
+// blocked (a long synchronous operation, a tight loop in a plugin, a giant
+// JSON.parse on a runaway preview log) even if the HTTP listener still
+// accepts connections, so /healthz answers 503 and the ALB can pull the
+// task.  The timer is unref'd so it never holds the loop open on shutdown.
+const EVENT_LOOP_STALL_THRESHOLD_MS = 30_000
+let lastEventLoopTick = Date.now()
+{
+  const stampTick = (): void => {
+    lastEventLoopTick = Date.now()
+  }
+  const heartbeat = setInterval(stampTick, 5_000)
+  if (typeof heartbeat.unref === "function") heartbeat.unref()
+}
+
 // ── Collab middleware ──────────────────────────────────────────────────────────
 // Intercepts /collab/* requests before the Effect HTTP router's catch-all UI
 // route can serve index.html for them. Bridges the standard Web Request/Response
@@ -127,13 +144,21 @@ const serveHealthz = () =>
   Effect.sync(() => {
     const dbOk = pingDatabase()
     const githubStatus = cachedGitHubStatus()
-    // db is the only check that can flip overall ok; github + native_api are
-    // informational so a degraded external dep doesn't pull the ALB out from
-    // under us (we'd be DoS-ing ourselves if GitHub's HEAD ever 5xx'd).
+    // S5 — event-loop liveness.  Stale tick = the loop was blocked long
+    // enough to miss several 5-s heartbeats, i.e. the server is wedged.
+    const eventLoopLagMs = Date.now() - lastEventLoopTick
+    const eventLoopOk = eventLoopLagMs <= EVENT_LOOP_STALL_THRESHOLD_MS
+    // db + event-loop are the checks that can flip overall ok; github +
+    // native_api are informational so a degraded external dep doesn't pull
+    // the ALB out from under us (we'd be DoS-ing ourselves if GitHub's HEAD
+    // ever 5xx'd).
+    const ok = dbOk && eventLoopOk
     const body = {
-      ok: dbOk,
+      ok,
       checks: {
         db: dbOk ? "ok" : "fail",
+        event_loop: eventLoopOk ? "ok" : "stalled",
+        event_loop_lag_ms: eventLoopLagMs,
         github: githubStatus,
         // native_api is the server itself; if Bun is up enough to answer /healthz
         // then the native API is up too — we just record it for the dashboard.
@@ -142,8 +167,11 @@ const serveHealthz = () =>
       version: process.env["OPENCODE_VERSION"] ?? "unknown",
       uptime_s: Math.floor((Date.now() - serverStartedAt) / 1000),
     }
+    if (!eventLoopOk) {
+      log.error("/healthz event-loop stall detected", { lagMs: eventLoopLagMs })
+    }
     return HttpServerResponse.jsonUnsafe(body, {
-      status: dbOk ? 200 : 503,
+      status: ok ? 200 : 503,
       headers: { "cache-control": "no-store" },
     })
   })