Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 118 additions & 0 deletions .github/scripts/check-suite-tooling-floor.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
#!/usr/bin/env bash
# check-suite-tooling-floor.sh - fail when an example repo's scenario suite
# pins the cascade tooling below the feature floor.
#
# Each cascade-example repo runs its own scenario-suite.yaml, which bootstraps
# a cascade CLI through the setup-cli action to drive its scenarios. That
# bootstrap version is pinned by hand in the suite. Nothing keeps it moving
# forward, so a suite can sit on a release that predates a command the suite
# now invokes, producing an "unknown command" failure deep inside a live fleet
# run. This check compares every suite's pin against the floor and fails fast
# when one has drifted below it, so the drift is caught before a fleet run.
#
# The floor is the latest published stable cascade release: the newest version
# every released command is guaranteed to exist in. A suite pinned at or above
# the floor passes; only a strictly-lower pin fails. A suite that tracks a
# moving ref (for example @main) carries no semver pin and is treated as
# current, so it is never flagged.
#
# Usage:
# check-suite-tooling-floor.sh # floor = latest release
# FLOOR=v0.7.0 check-suite-tooling-floor.sh # explicit floor override
# REPOS="4env 3env" check-suite-tooling-floor.sh # subset of the roster
#
# Environment:
# FLOOR Override the floor version (vX.Y.Z). Empty resolves to the
# latest stable release of the cascade repo.
# REPOS Space-separated example-repo short names to check. Empty uses
# the full roster below.
# FLEET_OWNER GitHub owner of the cascade and example repos (default
# stablekernel).
#
# Requires: gh (authenticated), base64, sort -V.
set -euo pipefail

OWNER="${FLEET_OWNER:-stablekernel}"

# Canonical floor-check roster. This mirrors the fleet-e2e repin roster: every
# example repo whose suite installs a pinned cascade CLI belongs here. Keep it
# in sync with the roster in .github/workflows/fleet-e2e.yaml when a repo is
# added or removed.
DEFAULT_REPOS="primary artifact-a artifact-b 4env 3env 2env single-env release-only no-env callbacks rollback-dispatch"

SUITE_PATH=".github/workflows/scenario-suite.yaml"

# ver_lt A B: exit 0 when version A sorts strictly before version B, else 1.
# Ordering is delegated to `sort -V`, so v0.9.0 is lower than v0.10.0.
ver_lt() {
  local candidate="$1" bound="$2"

  # Identical versions are never "strictly lower".
  if [ "$candidate" = "$bound" ]; then
    return 1
  fi

  # After a version sort the first line is the lower of the two; A is lower
  # exactly when it is that line.
  [ "$(printf '%s\n' "$candidate" "$bound" | sort -V | head -n 1)" = "$candidate" ]
}

# resolve_floor: print the highest plain vX.Y.Z tag among the 50 most recent
# releases of ${OWNER}/cascade that are neither prereleases nor drafts.
# Prints nothing when no such tag is found; the caller validates the result.
resolve_floor() {
  # jq filter applied by gh: keep only published, stable releases.
  local -r stable_only='.[] | select(.isPrerelease == false and .isDraft == false) | .tagName'
  # Only plain three-part tags are eligible to be the floor.
  local -r plain_semver='^v[0-9]+\.[0-9]+\.[0-9]+$'

  gh release list --repo "${OWNER}/cascade" -L 50 \
    --json tagName,isPrerelease,isDraft \
    --jq "$stable_only" \
    | grep -E "$plain_semver" \
    | sort -V \
    | tail -n 1
}

# An explicit FLOOR wins; otherwise fall back to the latest stable release.
# A failed lookup is tolerated here because the validation below rejects the
# resulting empty value with a clear message.
floor="${FLOOR:-}"
[ -n "$floor" ] || floor="$(resolve_floor || true)"

# Whatever the source, the floor must be a plain vX.Y.Z tag.
if ! grep -qE '^v[0-9]+\.[0-9]+\.[0-9]+$' <<< "$floor"; then
  printf "::error::could not resolve a stable floor version (got '%s')\n" "${floor:-<empty>}"
  exit 1
fi

printf 'Feature floor (latest stable cascade release): %s\n' "$floor"
printf '\n'

# Roster to check: REPOS when set, otherwise the full default roster.
read -ra repos <<< "${REPOS:-$DEFAULT_REPOS}"

# Accumulators read by the final report: " name:pin" entries for pins below
# the floor, and " name(reason)" entries for repos that were not pin-checked.
stale=""
skipped=""

# extract_pins: read a suite's YAML on stdin and print its unique semver
# tooling pins (vX.Y.Z), one per line. A pin is taken from a setup-cli action
# ref (setup-cli@vX.Y.Z) or from a version input, whose value may be bare or
# wrapped in single or double quotes. Prints nothing when there is no pin.
extract_pins() {
  local -r pin_pattern="setup-cli@v[0-9]+\.[0-9]+\.[0-9]+|version:[[:space:]]*[\"']?v[0-9]+\.[0-9]+\.[0-9]+"

  # "No match" is a normal outcome (moving ref), not an error.
  grep -oE "$pin_pattern" \
    | grep -oE 'v[0-9]+\.[0-9]+\.[0-9]+' \
    | sort -u || true
}

# check_repo NAME: compare one example repo's suite pins against the floor.
# Globals:   OWNER, SUITE_PATH, floor (read); stale, skipped (appended to)
# Arguments: $1 - example-repo short name (the part after cascade-example-)
# Outputs:   one status line per pin, or a skip/warning line, on stdout
# Returns:   always 0; the verdict is carried by the stale/skipped globals
check_repo() {
  local name="$1"
  local slug="${OWNER}/cascade-example-${name}"
  local response content="" pins pin

  # stderr is folded into the capture so that, when gh fails, its
  # "gh: <reason> (HTTP <code>)" line is available to classify the failure.
  if response="$(gh api "repos/${slug}/contents/${SUITE_PATH}" --jq '.content' 2>&1)"; then
    content="$(printf '%s' "$response" | base64 -d 2>/dev/null || true)"
  elif ! grep -q 'HTTP 404' <<< "$response"; then
    # Anything but "not found" (bad token, rate limit, outage) means the suite
    # may well exist but could not be read; say so instead of calling it
    # missing. The last captured line is gh's own error message.
    echo "::warning::${name}: could not read ${SUITE_PATH} (${response##*$'\n'}); pin not checked"
    skipped="${skipped} ${name}(fetch-error)"
    return 0
  fi

  # HTTP 404, an empty file, or an undecodable payload: no suite to check.
  if [ -z "$content" ]; then
    skipped="${skipped} ${name}(no-suite)"
    return 0
  fi

  # A moving ref (for example setup-cli@main) yields no semver pin and is
  # skipped rather than flagged.
  pins="$(printf '%s' "$content" | extract_pins)"
  if [ -z "$pins" ]; then
    echo " ${name}: no semver tooling pin (tracks a moving ref); skipped"
    skipped="${skipped} ${name}(moving-ref)"
    return 0
  fi

  while IFS= read -r pin; do
    [ -n "$pin" ] || continue
    if ver_lt "$pin" "$floor"; then
      echo " ${name}: ${pin} is BELOW floor ${floor}"
      stale="${stale} ${name}:${pin}"
    else
      echo " ${name}: ${pin} >= floor ${floor}"
    fi
  done <<< "$pins"
}

for name in "${repos[@]}"; do
  check_repo "$name"
done

# Final report: list what was not pin-checked, then fail with an actionable
# message when any pin is below the floor, or confirm success otherwise.
printf '\n'
[ -z "$skipped" ] || printf 'Not pin-checked:%s\n' "$skipped"

if [ -n "$stale" ]; then
  # The first line is a workflow error annotation; the rest tells the reader
  # exactly what to bump.
  cat <<EOF
::error::example-suite tooling pins below floor ${floor}:${stale}
Bump each listed repo's ${SUITE_PATH} setup-cli pin (both the action
ref and the version input) to at least ${floor}, then rerun this check.
EOF
  exit 1
fi

printf 'All example-repo suite tooling pins are at or above floor %s.\n' "$floor"
42 changes: 37 additions & 5 deletions .github/workflows/fleet-e2e.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,29 @@ jobs:
echo "| remainder | $RUN_REMAINDER |"
} >> "$GITHUB_STEP_SUMMARY"

# Floor check: the repin below points each repo's manifest cli_version at the
# rc under test, but it deliberately leaves each suite's OWN setup-cli
# bootstrap pin alone (it only rewrites prerelease refs). A suite pinned to a
# stable release that predates a command the suite now invokes fails a live
# lane with a cryptic "unknown command" mid-fan-out. This job runs the shared
# floor check before any repin or fan-out so that drift reds the run up front
# with a clear message instead. The floor is the latest stable release: a
# suite pin at or above it (including one ahead of it) passes, and only a
# strictly-lower pin fails. The daily Suite Tooling Floor workflow runs the
# same check off the release cadence so drift is usually caught before an rc
# run reaches here.
# floor-check: runs the shared suite-tooling floor script once per fleet run.
floor-check:
name: Check suite tooling pins
# Starts only after the resolve job.
needs: resolve
runs-on: ubuntu-latest
permissions:
contents: read
env:
# Token picked up by the gh CLI calls inside the floor script, which reads the
# cascade releases and each example repo's scenario suite.
GH_TOKEN: ${{ secrets.CASCADE_STATE_TOKEN }}
steps:
# The script lives in this repository, so it must be checked out first.
- uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
- name: Check example-suite tooling pins against the floor
# FLOOR and REPOS are not set here, so the script uses the latest stable
# release and its full default roster.
run: ./.github/scripts/check-suite-tooling-floor.sh

# Repin: pin every example repo to the rc UNDER TEST before any suite fans
# out. Without this the suites would install whatever version each repo's
# manifest is statically pinned to, so a fresh rc would never actually run -
Expand All @@ -296,10 +319,11 @@ jobs:
# job gates on this job so none can start against a stale pin. Repin always
# covers the full roster regardless of the repos selector: pinning is cheap,
# idempotent, and sequential (one repo at a time), so it does not add to live
# fan-out concurrency.
# fan-out concurrency. Gated on floor-check so a below-floor suite pin reds the
# run before any live dispatch.
repin:
name: Repin fleet to rc
needs: resolve
needs: [resolve, floor-check]
runs-on: ubuntu-latest
permissions:
contents: read
Expand Down Expand Up @@ -624,7 +648,7 @@ jobs:
# over exactly the lanes that ran. A real fan-out failure still reds the run.
aggregate:
name: Fleet gate
needs: [resolve, plan, repin, primary, dependents, heavy, remainder]
needs: [resolve, plan, floor-check, repin, primary, dependents, heavy, remainder]
# Only render a verdict when the fleet actually fanned out. On filtered-out
# completions (merge_group, non-rc tags, dispatch with no rc) resolve is
# skipped, so this job is skipped too and the run is a clean no-op rather
Expand All @@ -637,6 +661,7 @@ jobs:
steps:
- name: Aggregate fleet result
env:
R_FLOOR: ${{ needs.floor-check.result }}
R_REPIN: ${{ needs.repin.result }}
R_PRIMARY: ${{ needs.primary.result }}
R_DEPENDENTS: ${{ needs.dependents.result }}
Expand All @@ -652,6 +677,7 @@ jobs:
echo ""
echo "| Lane | Result |"
echo "|---|---|"
echo "| floor-check (suite tooling pins) | $R_FLOOR |"
echo "| repin (all 10 repos to rc) | $R_REPIN |"
echo "| primary | $R_PRIMARY |"
echo "| dependents (artifact-a, artifact-b) | $R_DEPENDENTS |"
Expand All @@ -670,13 +696,19 @@ jobs:
# A lane passes when it succeeded OR was skipped (filtered out by the
# repos selector, or - for dependents - skipped because primary was
# not selected). Only an actual failure or cancellation reds the gate.
# repin is never selector-gated, so a non-success repin always reds.
# floor-check and repin are never selector-gated, so a non-success
# result from either always reds. A failed floor-check also skips
# repin, so it must be checked directly here or the skipped repin would
# read as a pass.
fail=0
for r in "$R_REPIN" "$R_PRIMARY" "$R_DEPENDENTS" "$R_HEAVY" "$R_REMAINDER"; do
for r in "$R_FLOOR" "$R_REPIN" "$R_PRIMARY" "$R_DEPENDENTS" "$R_HEAVY" "$R_REMAINDER"; do
if [ "$r" != "success" ] && [ "$r" != "skipped" ]; then
fail=1
fi
done
if [ "$R_FLOOR" != "success" ]; then
fail=1
fi
if [ "$fail" -ne 0 ]; then
echo "::error::Fleet E2E failed: one or more lanes did not pass"
exit 1
Expand Down
50 changes: 50 additions & 0 deletions .github/workflows/suite-tooling-floor.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Suite Tooling Floor - guards the cascade-example fleet against tooling drift.
#
# Every example repo's scenario-suite.yaml bootstraps a cascade CLI through the
# setup-cli action, pinned by hand to a fixed release. Nothing keeps that pin
# moving forward, so a suite can drift onto a release that predates a command it
# now invokes and fail a live fleet run with a cryptic "unknown command". This
# job runs the floor check daily so the drift surfaces on its own schedule, well
# before an rc fleet run trips over it. The same check also gates fleet-e2e.yaml
# before fan-out, so a stale pin is caught at release time too.
#
# The floor is the latest published stable cascade release. A suite pinned at or
# above the floor passes; only a strictly-lower pin fails. A suite that tracks a
# moving ref (for example @main) carries no semver pin and is never flagged.
name: Suite Tooling Floor

# Triggers: a daily schedule, plus manual dispatch with an optional floor
# override.
on:
schedule:
# Daily, offset from other scheduled jobs to spread live API load.
- cron: '17 6 * * *'
workflow_dispatch:
inputs:
floor:
description: >-
Override the floor version (e.g. v0.7.0). Empty resolves to the latest
stable cascade release.
required: false
default: ''

# Workflow-wide default: read-only access to repository contents.
permissions:
contents: read

# All runs share one concurrency group, and a newly started run cancels one
# that is still in progress.
concurrency:
group: suite-tooling-floor
cancel-in-progress: true

jobs:
# Single job: check out this repository and run the floor-check script.
check:
name: Check example-suite tooling pins
runs-on: ubuntu-latest
permissions:
contents: read
env:
# Cross-repo reads of the example repos' suites use the fleet token, the
# same credential the fleet itself reads and writes those repos with.
GH_TOKEN: ${{ secrets.CASCADE_STATE_TOKEN }}
# Carries the manual-dispatch floor input to the script. When it is empty
# the script resolves the latest stable release itself.
FLOOR: ${{ github.event.inputs.floor }}
steps:
# The script lives in this repository, so it must be checked out first.
- uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
- name: Check suite tooling pins against the floor
run: ./.github/scripts/check-suite-tooling-floor.sh
Loading