diff --git a/src/kernel_ci_cloud_labs/launch_vm.py b/src/kernel_ci_cloud_labs/launch_vm.py index 2110f2f..ac4add2 100644 --- a/src/kernel_ci_cloud_labs/launch_vm.py +++ b/src/kernel_ci_cloud_labs/launch_vm.py @@ -7,9 +7,12 @@ import base64 import json +import os import shlex import sys +import threading import time +import traceback import uuid import boto3 @@ -18,6 +21,12 @@ from kernel_ci_cloud_labs.core.log_scrub import scrub_text +# Enable verbose diagnostics by setting KCI_DEBUG=1 (or true/yes/on) on the +# ECS task. log_debug() lines land in the container log group alongside +# INFO/ERROR. Defaults to off so production logs stay quiet. +_DEBUG_ENABLED = os.environ.get("KCI_DEBUG", "").lower() in ("1", "true", "yes", "on") + + def log_error(msg): """Print error to stderr (shows in container log).""" sys.stderr.write(f"ERROR: {msg}\n") @@ -43,6 +52,30 @@ def log_not(msg): sys.stdout.flush() +def log_debug(msg): + """Print debug diagnostic only when KCI_DEBUG is set in the environment. + + Used to dump intermediate state (response shapes, retry timing, buffer + lengths) without polluting normal production runs. + """ + if _DEBUG_ENABLED: + sys.stdout.write(f"DEBUG: {msg}\n") + sys.stdout.flush() + + +def log_exception(prefix, exc): + """Log an exception with full traceback to stderr. + + Use on any catch site where a bare ``log_error(str(e))`` would hide + the call stack — boto3 errors in particular need the traceback to + distinguish a NoCredentialsError from a ClientError from a transient + network blip. + """ + sys.stderr.write(f"ERROR: {prefix}: {exc!r}\n") + sys.stderr.write(traceback.format_exc()) + sys.stderr.flush() + + # Kernel-side fatal/near-fatal markers we scan captured console buffers for. # Hit on any of these gets logged loudly and stamped into the S3 object # metadata so downstream consumers (KCIDB submitter, triage tooling) can @@ -437,10 +470,15 @@ def capture_console_output(self, reason="cleanup"): # because post-terminate covers the lag. output_b64 = "" if reason == "post-terminate": - poll_budget = 240 # 4 min; EC2 mirror typically settles in 1–3 min + # AWS documents the serial-console mirror update cadence as + # "approximately every 5 minutes" — arm64/Graviton tends to be + # at the slow end. 9 min gives the mirror two full cycles to + # show up; 15 s polling keeps API cost bounded. + poll_budget = 540 poll_interval = 15 start = time.time() attempt = 0 + last_exc_repr = None while time.time() - start < poll_budget: attempt += 1 try: @@ -448,13 +486,22 @@ def capture_console_output(self, reason="cleanup"): InstanceId=self.instance_id, Latest=True ) output_b64 = resp.get("Output", "") + log_debug( + f"get_console_output attempt={attempt} " + f"output_len_b64={len(output_b64)} " + f"keys={sorted(resp.keys())}" + ) except Exception as e: - log_not(f" get_console_output error (attempt {attempt}): {e}") + last_exc_repr = repr(e) + log_exception( + f"get_console_output failed (attempt {attempt})", e + ) output_b64 = "" if output_b64: log_not( f" Console buffer available on attempt {attempt} " - f"(after {int(time.time() - start)}s)" + f"(after {int(time.time() - start)}s, " + f"b64_len={len(output_b64)})" ) break log_not( @@ -463,9 +510,11 @@ def capture_console_output(self, reason="cleanup"): ) time.sleep(poll_interval) if not output_b64: - log_not( - f" No console output after {attempt} attempts " - f"({int(time.time() - start)}s) — EC2 mirror never flushed" + log_error( + f"No console output after {attempt} attempts " + f"({int(time.time() - start)}s, budget {poll_budget}s) — " + f"EC2 mirror never flushed for instance {self.instance_id}" + + (f"; last exception: {last_exc_repr}" if last_exc_repr else "") ) return else: @@ -473,10 +522,15 @@ def capture_console_output(self, reason="cleanup"): resp = self.ec2.get_console_output( InstanceId=self.instance_id, Latest=True ) + output_b64 = resp.get("Output", "") + log_debug( + f"get_console_output (reason={reason}) " + f"output_len_b64={len(output_b64)} " + f"keys={sorted(resp.keys())}" + ) except Exception as e: - log_not(f" Failed to fetch console output: {e}") + log_exception(f"get_console_output failed (reason={reason})", e) return - output_b64 = resp.get("Output", "") if not output_b64: log_not(" No console output available yet") return @@ -523,7 +577,10 @@ def capture_console_output(self, reason="cleanup"): log_not(f"✓ Console output uploaded ({len(output)} bytes) to s3://{self.s3_bucket}/{s3_key}") self._console_captured = True except Exception as e: - log_not(f" Failed to upload console output: {e}") + log_exception( + f"Failed to upload console output to s3://{self.s3_bucket}/{s3_key}", + e, + ) def _wait_for_terminated(self, timeout=90): """Poll describe_instances until the VM reaches a terminal state. @@ -577,9 +634,14 @@ def cleanup(self): The capture path scans for panic markers and stamps metadata on whichever upload wins (post-terminate overwrites cleanup). """ - # Best-effort live capture. Skipped silently inside the helper if a - # prior ssm-failure capture already uploaded. - self.capture_console_output(reason="cleanup") + # Each stage is wrapped so one failure doesn't silently abort the + # remaining cleanup work. Previously a boto3 error in terminate or + # an unhandled exception in capture would short-circuit cleanup and + # leave no traceback in the container log. + try: + self.capture_console_output(reason="cleanup") + except Exception as e: + log_exception("capture_console_output(cleanup) raised", e) if not self.instance_id: return @@ -589,20 +651,30 @@ def cleanup(self): self.ec2.terminate_instances(InstanceIds=[self.instance_id]) log_not("✓ Termination requested") except Exception as e: - log_not(f"Error terminating instance: {e}") - return + log_exception( + f"terminate_instances failed for {self.instance_id}", e + ) + # Still attempt post-terminate capture: instance may already be + # in a terminal state, in which case GetConsoleOutput still + # works against the preserved buffer. # Wait for the VM to actually wind down, then grab the flushed # buffer. This is where an early-boot panic that never made it into # the live mirror finally becomes visible. - self._wait_for_terminated(timeout=90) - self.capture_console_output(reason="post-terminate") + try: + self._wait_for_terminated(timeout=90) + except Exception as e: + log_exception("_wait_for_terminated raised", e) + + try: + self.capture_console_output(reason="post-terminate") + except Exception as e: + log_exception("capture_console_output(post-terminate) raised", e) def launch_vms_from_config(): """Launch and test multiple VMs from environment variables (no config file needed).""" - import os - import threading + log_info(f"launch_vm startup: KCI_DEBUG={'on' if _DEBUG_ENABLED else 'off'}") # Read config from environment variables run_prefix = os.getenv("RUN_PREFIX") @@ -730,10 +802,17 @@ def launch_and_test_vm(vm_config, instance_num, results_list): results_list.append({"vm_name": vm_name, "success": False}) except Exception as e: - log_error(f"FAILED: {vm_name} - {e}") + log_exception(f"FAILED: {vm_name}", e) results_list.append({"vm_name": vm_name, "success": False}) finally: - launcher.cleanup() + # cleanup() is already internally guarded per-stage, but a thread + # that escapes its target with an unhandled exception is silent + # by default — wrap once more so any surviving error reaches + # the container log with a traceback. + try: + launcher.cleanup() + except Exception as e: + log_exception(f"cleanup raised for {vm_name}", e) # Launch all VMs in parallel using threads threads = [] diff --git a/src/kernel_ci_cloud_labs/providers/aws_provider.py b/src/kernel_ci_cloud_labs/providers/aws_provider.py index 0c72c02..8ae665d 100644 --- a/src/kernel_ci_cloud_labs/providers/aws_provider.py +++ b/src/kernel_ci_cloud_labs/providers/aws_provider.py @@ -6,6 +6,7 @@ # providers/aws_provider.py +import json import os import re import time @@ -97,9 +98,6 @@ def spawn_container(self): logger.debug("Network config: %s", network_config) # Build environment overrides with runtime config - import json - import time - env_vars = [] # Pass run prefix @@ -124,6 +122,14 @@ def spawn_container(self): env_vars.append({"name": "EC2_LOG_GROUP", "value": ec2_log_group}) logger.debug("Passing EC2_LOG_GROUP to container: %s", ec2_log_group) + # Forward KCI_DEBUG to the container so launch_vm.log_debug() lines + # show up in the ECS task log group. Set on the host running the + # poller: KCI_DEBUG=1 ./prod-amd64.sh + kci_debug = os.environ.get("KCI_DEBUG") + if kci_debug: + env_vars.append({"name": "KCI_DEBUG", "value": kci_debug}) + logger.debug("Passing KCI_DEBUG to container: %s", kci_debug) + # Always pass test config via S3 in results bucket if self.config.get("test_config"): test_config_json = json.dumps(self.config["test_config"])