From 2da8971b0e04ebde45704b9577fb68b6d769d041 Mon Sep 17 00:00:00 2001 From: Matee ullah Malik Date: Tue, 9 Jun 2026 18:42:11 +0000 Subject: [PATCH] fix(devnet): supervise lumerad lifecycle in start.sh + add restart policy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When lumerad inside a devnet validator container dies (crash, OOM, or a host-side `pkill -f 'lumerad start'` that matches container processes via the shared PID namespace), `start.sh` previously kept running because its final `exec tail -F …` replaced bash as PID 1. `docker ps` reported "Up" indefinitely while the chain process was a defunct zombie and the container never restarted. On 2026-06-02 this exact misfire — an operator's `sudo pkill -9 -f 'lumerad start'` aimed at the host cosmovisor process matched the in-container lumerads through the shared PID namespace — silently killed all 5 lumera-devnet-1 validators at h=301810 and left the chain dead for ~6 days before anyone noticed. Fix: 1. `devnet/scripts/start.sh`: capture `LUMERAD_PID=$!`, demote `tail -F` from `exec` to background, then `wait "$LUMERAD_PID"` and propagate the exit code via `exit "$rc"`. PID 1 now dies with lumerad. 2. `devnet/generators/docker-compose.go`: add `Restart string` field to `DockerComposeService` and set `Restart: "unless-stopped"` on validator services. Combined with (1), a non-zero lumerad exit triggers docker to restart the container and rejoin the chain. End-to-end validated against a real docker container with the new supervisor logic: - iteration=1 fake-lumerad SIGKILLed at +3s - start.sh propagated rc=137 to PID 1 - docker restart policy fired - iteration=2 started cleanly with fresh fake-lumerad - state preserved through the restart `run`, `auto`, and bootstrap/wait modes all updated consistently so the behaviour is identical for the modes that actually launch lumerad. Risks / non-goals: - This is a devnet-only change (devnet/* paths). 
No chain state machine impact. - `docker compose stop` continues to win over the restart policy (unless-stopped semantics), so intentional shutdowns are unchanged. - Existing live devnet containers still need to be recreated (`docker compose up -d --force-recreate`) for the new restart policy to apply; an in-place `compose restart` is not enough. --- devnet/generators/docker-compose.go | 5 ++++ devnet/scripts/start.sh | 38 ++++++++++++++++++++++++++++- 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/devnet/generators/docker-compose.go b/devnet/generators/docker-compose.go index 579b0940..866d3c47 100644 --- a/devnet/generators/docker-compose.go +++ b/devnet/generators/docker-compose.go @@ -44,6 +44,7 @@ type DockerComposeService struct { CapAdd []string `yaml:"cap_add,omitempty"` SecurityOpt []string `yaml:"security_opt,omitempty"` Logging *DockerComposeLogging `yaml:"logging,omitempty"` + Restart string `yaml:"restart,omitempty"` } type DockerComposeNetwork struct { @@ -226,6 +227,10 @@ func GenerateDockerCompose(config *confg.ChainConfig, validators []confg.Validat service := DockerComposeService{ Build: ".", ContainerName: serviceName, + // Auto-restart on lumerad crashes / host pkill mishaps. + // start.sh wait_for_lumera() propagates lumerad exit code to PID 1, + // so a non-zero exit triggers docker restart and rejoins the chain. + Restart: "unless-stopped", Ports: []string{ fmt.Sprintf("%d:%d", validator.Port, DefaultP2PPort), fmt.Sprintf("%d:%d", validator.RPCPort, DefaultRPCPort), diff --git a/devnet/scripts/start.sh b/devnet/scripts/start.sh index 8c4e6a7a..41ed3f9e 100755 --- a/devnet/scripts/start.sh +++ b/devnet/scripts/start.sh @@ -338,6 +338,8 @@ start_lumera() { fi # shellcheck disable=SC2086 run "${DAEMON}" start --home "${DAEMON_HOME}" ${EXTRA_START_FLAGS} >"${VALIDATOR_LOG}" 2>&1 & + LUMERAD_PID=$! 
+	echo "[BOOT] ${MONIKER}: lumerad started, pid=${LUMERAD_PID}"
 
 	if [ "${MONIKER}" = "${PRIMARY_MONIKER}" ]; then
 		mkdir -p "$(dirname "${PRIMARY_STARTED_FLAG}")"
@@ -348,7 +350,39 @@ start_lumera() {
 
 tail_logs() {
 	touch "${VALIDATOR_LOG}" "${SUPERNODE_LOG}" "${SUPERNODE_SETUP_OUT}" "${VALIDATOR_SETUP_OUT}" "${UPLOADER_SETUP_OUT}" "${TEST_ACCOUNTS_SETUP_OUT}"
-exec tail -F "${VALIDATOR_LOG}" "${SUPERNODE_LOG}" "${SUPERNODE_SETUP_OUT}" "${VALIDATOR_SETUP_OUT}" "${UPLOADER_SETUP_OUT}" "${TEST_ACCOUNTS_SETUP_OUT}"
+tail -F "${VALIDATOR_LOG}" "${SUPERNODE_LOG}" "${SUPERNODE_SETUP_OUT}" "${VALIDATOR_SETUP_OUT}" "${UPLOADER_SETUP_OUT}" "${TEST_ACCOUNTS_SETUP_OUT}" &
+	TAIL_PID=$!
+}
+
+# Wait on the lumerad process and propagate its exit code as the container's
+# exit code. If lumerad dies (crash, SIGKILL on host that matches `pkill -f
+# 'lumerad start'`, OOM, etc.) the container exits with lumerad's status. The
+# docker-compose `restart: unless-stopped` policy then restarts it (on any exit
+# code), recovering from silent zombification and surfacing crashes to docker.
+#
+# History: 2026-06-02 — a host `pkill -9 -f 'lumerad start'` matched lumerad
+# inside the 5 validator containers. PID 1 was bash + tail -F, so containers
+# stayed "Up" for 6 days while chain was dead. See PR description.
+wait_for_lumera() {
+	if [ -z "${LUMERAD_PID:-}" ]; then
+		echo "[BOOT] ${MONIKER}: lumerad pid not set; cannot supervise."
+		# Fall back to old tail-forever behaviour rather than exit 0 silently.
+		wait "${TAIL_PID:-}" 2>/dev/null || true
+		return 0
+	fi
+	# `wait <pid>` returns the exit status of that process. errexit is switched
+	# off around it so a non-zero status is captured instead of aborting here.
+	set +e
+	wait "${LUMERAD_PID}"
+	local rc=$?
+	set -e
+	echo "[BOOT] ${MONIKER}: lumerad exited rc=${rc} — terminating container so docker restart policy can recover."
+	# Keep tail -F alive for a moment so it emits lumerad's final log lines, THEN stop it.
+	sleep 1
+	if [ -n "${TAIL_PID:-}" ]; then
+		kill "${TAIL_PID}" 2>/dev/null || true
+	fi
+	exit "${rc}"
 }
 
 run_auto_flow() {
@@ -361,6 +395,7 @@ run_auto_flow() {
 	launch_test_accounts_setup
 	start_nm_ui_if_present
 	tail_logs
+	wait_for_lumera
 }
 
 case "${START_MODE}" in
@@ -388,6 +423,7 @@ run)
 	launch_test_accounts_setup
 	start_nm_ui_if_present
 	tail_logs
+	wait_for_lumera
 	;;
 
 wait)