From 7d9a9f6609fe006c1c39e3fc27f72e6a8168972a Mon Sep 17 00:00:00 2001 From: truffle Date: Sat, 27 Jun 2026 21:09:03 +0000 Subject: [PATCH] fix(scheduler): honor a pauseJob() that lands while a job is in flight executeJob captured newStatus from the job snapshot taken at pickup, then wrote it back unconditionally after handleMessage returned. For an agent task that runs for minutes, a pauseJob() landing in that window set status='paused' in the row, but the write-back stomped it back to 'active' and the next fire ran anyway. Re-read the live status just before the write-back and honor an external pause. Only the 'active' continuation case is at risk; 'completed' and 'failed' are this run's terminal outcomes and must win. resumeJob() recomputes next_run_at, so clearing it here is safe. Fixes #148 --- .../__tests__/scheduler-hardening.test.ts | 34 +++++++++++++++++++ src/scheduler/executor.ts | 17 ++++++++++ 2 files changed, 51 insertions(+) diff --git a/src/scheduler/__tests__/scheduler-hardening.test.ts b/src/scheduler/__tests__/scheduler-hardening.test.ts index 5448b2b9..189e07d3 100644 --- a/src/scheduler/__tests__/scheduler-hardening.test.ts +++ b/src/scheduler/__tests__/scheduler-hardening.test.ts @@ -506,6 +506,40 @@ describe("Phase 2.5 scheduler fixes", () => { expect(after?.lastDeliveryStatus).toContain("ECONNREFUSED"); expect(after?.lastRunStatus).toBe("ok"); }); + + test("a pauseJob() that lands mid-run is honored, not stomped by the write-back (#148)", async () => { + const runtime = createMockRuntime(); + const scheduler = new Scheduler({ db, runtime: runtime as never }); + + const job = scheduler.createJob({ + name: "Pause Mid Run", + schedule: { kind: "every", intervalMs: 60_000 }, + task: "x", + }); + + // Simulate a pauseJob() landing while handleMessage is in flight: the + // status flips to 'paused' after executeJob captured its 'active' + // snapshot at pickup. Before the fix, the unconditional write-back + // stomped this back to 'active' and the next fire ran anyway. + runtime.handleMessage.mockImplementation(async () => { + db.run("UPDATE scheduled_jobs SET status = 'paused' WHERE id = ?", [job.id]); + return { + text: "Mock response", + sessionId: "mock-session", + cost: { totalUsd: 0, inputTokens: 0, outputTokens: 0, modelUsage: {} }, + durationMs: 10, + }; + }); + + await scheduler.runJobNow(job.id); + + const after = scheduler.getJob(job.id); + // The concurrent pause must survive: status stays 'paused' and the + // recurring job gets no next fire scheduled. resumeJob() recomputes + // next_run_at when the operator un-pauses. + expect(after?.status).toBe("paused"); + expect(after?.nextRunAt).toBeNull(); + }); }); // ---------- M1: non-blocking missed-job recovery ---------- diff --git a/src/scheduler/executor.ts b/src/scheduler/executor.ts index 80a3f7f4..87cf1c38 100644 --- a/src/scheduler/executor.ts +++ b/src/scheduler/executor.ts @@ -100,6 +100,23 @@ export async function executeJob(job: ScheduledJob, ctx: ExecutorContext): Promi }); } + // handleMessage for an agent task runs for minutes, and pauseJob() can land + // during that window. We computed newStatus from the job snapshot taken at + // pickup, so an unconditional write-back would silently stomp a concurrent + // pause. Re-read the live status and honor an external pause. Only the + // 'active' continuation case is at risk: 'completed' and 'failed' are this + // run's terminal outcomes and must win. resumeJob() recomputes next_run_at, + // so clearing it here is safe. + if (newStatus === "active") { + const live = ctx.db.query("SELECT status FROM scheduled_jobs WHERE id = ?").get(job.id) as { + status: string; + } | null; + if (live?.status === "paused") { + newStatus = "paused"; + nextRunAt = null; + } + } + // Runtime safety net for OOS#4. if (!JOB_STATUS_VALUES.includes(newStatus)) { throw new Error(`refusing to write invalid status '${newStatus}' for job ${job.id}`);