From 5b2a915edd6a3dc3b315994b0591ffdc5c816289 Mon Sep 17 00:00:00 2001
From: Dmitriy Zhuk
Date: Tue, 23 Jun 2026 18:44:01 +0300
Subject: [PATCH] feat(agent): add stop/start to free cluster resources

Restart only redeployed the pod, so there was no way to free a slot on a
full cluster. Add a true Stop (cancel workflow + delete pod, keep the agent
as `stopped`) and a paired Start (deploy a fresh pod).

Backend:
- POST /agents/:id/stop and /agents/:id/start (Owner/Admin)
- AgentDeployService.stopAgent() with deploy-tracker guard so the dying
  pod's Failed event can't flip the row to `failed`
- reconciler ignores a stale Running event for a stopped/restarting agent
- syncStatus skips `stopped`; updateStatus can clear workflowId via null

Admin UI:
- Stop/Start in the agents table row dropdown
- Stop/Start in the agent detail header (top-right) + a "stopped" chat
  overlay with a Start button

Co-Authored-By: Claude Opus 4.8 (1M context)
---
 .../agent/agent/components/agent/Provider.vue | 101 +++++++++++++++++-
 .../agent/components/agentList/Provider.vue   |  60 ++++++++++-
 admin/slices/agent/agent/stores/agent.ts      |  52 +++++++++
 .../slices/agent/agent/agent.controller.ts    |  29 +++++
 .../slices/agent/agent/data/agent.gateway.ts  |   6 +-
 .../agent/agent/domain/agent.gateway.ts       |   2 +-
 .../agent/agent/domain/agentDeploy.service.ts |  38 +++++++
 .../agent/agent/domain/agentStatus.service.ts |   7 +-
 8 files changed, 286 insertions(+), 9 deletions(-)

diff --git a/admin/slices/agent/agent/components/agent/Provider.vue b/admin/slices/agent/agent/components/agent/Provider.vue
index e964810..cb40f68 100644
--- a/admin/slices/agent/agent/components/agent/Provider.vue
+++ b/admin/slices/agent/agent/components/agent/Provider.vue
@@ -21,7 +21,7 @@ import {
   TableRow,
 } from '#theme/components/ui/table';
 import { Tabs, TabsContent, TabsList, TabsTrigger } from '#theme/components/ui/tabs';
-import { IconAlertTriangle, IconArrowLeft, IconEye, IconEyeOff, IconLoader2, IconRefresh, IconShield, IconX } from '@tabler/icons-vue';
+import { IconAlertTriangle, IconArrowLeft, IconEye, IconEyeOff, IconLoader2, IconPlayerPlay, IconPlayerStop, IconRefresh, IconShield, IconX } from '@tabler/icons-vue';
 import { FileText, X } from 'lucide-vue-next';
 import type { IPaddockScenario } from '#paddock/stores/paddockScenario';
 
@@ -236,6 +236,50 @@ function dismissRestartBanner() {
   agentStore.clearPendingRestart(props.id);
 }
 
+// ── Stop / Start ─────────────────────────────────────────────────────────
+// Stop cancels the workflow and deletes the pod to free cluster CPU/memory
+// (so another agent can start when the cluster is full); Start deploys a fresh
+// pod. Which one we show depends on whether the agent currently holds a pod.
+const RESOURCE_HOLDING: ReadonlySet<string> = new Set([
+  'running',
+  'deploying',
+  'pending',
+]);
+const canStop = computed(() =>
+  agent.value ? RESOURCE_HOLDING.has(agent.value.status) : false,
+);
+const toggling = ref(false);
+const toggleError = ref<string | null>(null);
+
+// Stop a pod-holding agent, or start an idle one; reverts the UI on failure.
+async function onToggleRunning() {
+  if (!agent.value || toggling.value) return;
+  toggling.value = true;
+  toggleError.value = null;
+  // Snapshot id/status up front so mark/clear and the rollback use one key.
+  const { id: agentId, status: previousStatus } = agent.value;
+  const stopping = canStop.value;
+  // Optimistic flip so the badge reacts before the API resolves.
+  agent.value = { ...agent.value, status: stopping ? 'stopped' : 'deploying' };
+  try {
+    if (stopping) {
+      await agentStore.stop(agentId);
+    } else {
+      agentStore.markRestartInFlight(agentId);
+      await agentStore.start(agentId);
+    }
+    await refresh();
+  } catch (err) {
+    if (agent.value) agent.value = { ...agent.value, status: previousStatus };
+    if (!stopping) agentStore.clearRestartInFlight(agentId);
+    // A rejection is not guaranteed to be an Error; never read `.message` blind.
+    const fallback = stopping ? 'Stop failed' : 'Start failed';
+    toggleError.value = (err instanceof Error && err.message) || fallback;
+  } finally {
+    toggling.value = false;
+  }
+}
+
 // ── Live pod state from SSE ─────────────────────────────────────────────
 // Lets the user watch sub-second pod transitions (Pending → ContainerCreating
 // → Running) instead of waiting on the 5s status poll below.
@@ -261,6 +305,7 @@ const podPhaseLabel = computed(() => {
 type ChatOverlay =
   | { kind: 'starting'; title: string; detail: string }
   | { kind: 'failed'; title: string; detail: string }
+  | { kind: 'stopped'; title: string; detail: string }
   | null;
 
 const chatOverlay = computed(() => {
@@ -277,6 +322,15 @@ const chatOverlay = computed(() => {
 
   if (chatLive) return null;
 
+  if (s === 'stopped') {
+    return {
+      kind: 'stopped',
+      title: 'Agent stopped',
+      detail:
+        'The pod was deleted to free cluster resources. Start it to chat again.',
+    };
+  }
+
   if (s === 'failed') {
     return {
       kind: 'failed',
@@ -579,7 +633,29 @@ onBeforeUnmount(stopMetricsPolling);
+