From 663cec6520a72680609c0d803494fb0bde4ce765 Mon Sep 17 00:00:00 2001 From: Craig Jennings Date: Sun, 17 May 2026 14:38:40 -0500 Subject: fix(testing): cleanup traps, arg validation, and two real bugs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two real bugs and a sweep of hygiene across the harness. `make test` passed cleanly on this branch with the same 52/0/5 profile as the 2026-05-11 run, so the wiring is verified end-to-end. Real bugs: - `lib/vm-utils.sh` `snapshot_exists` was running `qemu-img snapshot -l | grep -q "$snapshot_name"`, which matches the name as a substring anywhere in the output — including inside dates or filenames in other fields. Replaced with an awk field extraction on the TAG column plus `grep -Fxq` for a whole-line literal match. - `run-test-baremetal.sh` was setting `VALIDATION_PASSED=true|false` after validation, but `validation.sh` already uses `VALIDATION_PASSED` as a pass counter. The test report then referenced `$VALIDATION_PASSED_COUNT`, which is defined nowhere. Renamed the boolean to `TEST_PASSED` (matching run-test.sh's pattern) and report the actual counter. Cleanup traps and arg validation: - `run-test.sh` now installs a top-level EXIT trap that, on abort, kills QEMU and restores the clean-install snapshot. A `CLEANUP_DONE=1` sentinel keeps the existing normal-path cleanup from double-firing. This is the recurring pain from 2026-05-11 where two failed runs left orphaned QEMU processes and dirty base disks behind. - `create-base-vm.sh` and `debug-vm.sh` got the same kind of trap, plus `debug-vm.sh` now rejects non-`.qcow2` paths up front instead of letting QEMU fail later. - `run-test.sh`, `run-test-baremetal.sh`, and `cleanup-tests.sh` now validate that options with required values actually receive one (`${var:?msg}` for `--script`/`--snapshot`/`--host`/`--password`, numeric check for `--keep`). - `run-test-baremetal.sh` traps the temp git bundle for cleanup if the script aborts before its explicit `rm`. The ZFS rollback loop now uses `while IFS= read -r ds` and quotes `$ds` inside the ssh_cmd so dataset names with whitespace wouldn't break it. Smaller hygiene: - `vm-utils.sh` `check_ovmf` also checks `OVMF_VARS_TEMPLATE`; `start_qemu` validates disk and ISO paths before building the QEMU command; numeric tests quoted. - `cleanup-tests.sh` find expression for temp disks wrapped in `\( ... -o ... \)`, all `while read` loops use `IFS= read -r`, orphaned QEMU cleanup tries SIGTERM with a 2s sleep before SIGKILL. - `create-base-vm.sh` moved the "Copy an archangel-*.iso" info line before its `fatal` instead of after (unreachable), and added the serial-log path to the final summary. - `lib/logging.sh` `stop_timer` no longer produces `$((end - ))` when the named timer was never started. - `lib/network-diagnostics.sh` `read` → `IFS= read -r`. - `setup-testing-env.sh` now installs all missing pacman packages in one transaction instead of one-at-a-time (avoids half-installed state if package N fails). KVM check also verifies the user has read+write on `/dev/kvm` and prints the `gpasswd -a $(id -un) kvm` fix if not. A few items from the review I deliberately skipped: replacing the codebase-wide unquoted `$SSH_OPTS` string with an array (cosmetic, would need to be done everywhere at once), `set -e` adds where the existing fall-through-on-failure is intentional, and a `--force` gate on `create-base-vm.sh` (would break the expected workflow). --- scripts/testing/run-test.sh | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) (limited to 'scripts/testing/run-test.sh') diff --git a/scripts/testing/run-test.sh b/scripts/testing/run-test.sh index c0eaf50..18f4fdf 100755 --- a/scripts/testing/run-test.sh +++ b/scripts/testing/run-test.sh @@ -36,11 +36,11 @@ while [[ $# -gt 0 ]]; do shift ;; --script) - ARCHSETUP_SCRIPT="$2" + ARCHSETUP_SCRIPT="${2:?--script requires a value}" shift 2 ;; --snapshot) - SNAPSHOT_NAME="$2" + SNAPSHOT_NAME="${2:?--snapshot requires a value}" shift 2 ;; *) @@ -53,6 +53,28 @@ while [[ $# -gt 0 ]]; do esac done +# Failure-path cleanup. Normal completion (further down) sets CLEANUP_DONE=1 +# so the trap becomes a no-op. If we abort partway through, the trap is the +# safety net that stops a leaked QEMU and reverts the base disk. +CLEANUP_DONE=0 +BUNDLE_FILE="" +cleanup_run_test() { + [ "$CLEANUP_DONE" = "1" ] && return 0 + CLEANUP_DONE=1 + [ -n "$BUNDLE_FILE" ] && [ -f "$BUNDLE_FILE" ] && rm -f "$BUNDLE_FILE" + if [ "$KEEP_VM" = "true" ]; then + return 0 + fi + if declare -f stop_qemu >/dev/null 2>&1; then + stop_qemu 2>/dev/null || true + fi + if [ -n "${DISK_PATH:-}" ] && [ -n "${SNAPSHOT_NAME:-}" ] \ + && declare -f restore_snapshot >/dev/null 2>&1; then + restore_snapshot "$DISK_PATH" "$SNAPSHOT_NAME" 2>/dev/null || true + fi +} +trap cleanup_run_test EXIT + # Configuration TIMESTAMP=$(date +'%Y%m%d-%H%M%S') VM_IMAGES_DIR="$PROJECT_ROOT/vm-images" @@ -79,8 +101,8 @@ fi # Check disk exists if [ ! -f "$DISK_PATH" ]; then - fatal "Base disk not found: $DISK_PATH" info "Create it first: ./scripts/testing/create-base-vm.sh" + fatal "Base disk not found: $DISK_PATH" fi # Check if snapshot exists @@ -88,11 +110,11 @@ section "Preparing Test Environment" step "Checking for snapshot: $SNAPSHOT_NAME" if ! snapshot_exists "$DISK_PATH" "$SNAPSHOT_NAME"; then - fatal "Snapshot '$SNAPSHOT_NAME' not found on $DISK_PATH" info "Available snapshots:" list_snapshots "$DISK_PATH" info "" info "Create base VM with: ./scripts/testing/create-base-vm.sh" + fatal "Snapshot '$SNAPSHOT_NAME' not found on $DISK_PATH" fi success "Snapshot $SNAPSHOT_NAME exists" @@ -127,7 +149,6 @@ section "Simulating Git Clone" step "Creating shallow git clone on VM" info "This simulates: git clone --depth 1 /home/cjennings/code/archsetup" -# Create a temporary git bundle from current repo BUNDLE_FILE=$(mktemp) git -C "$PROJECT_ROOT" bundle create "$BUNDLE_FILE" HEAD >> "$LOGFILE" 2>&1 @@ -276,7 +297,7 @@ analyze_log_diff "$TEST_RESULTS_DIR" generate_issue_report "$TEST_RESULTS_DIR" "$ARCHZFS_INBOX" # Set validation result based on failure count -if [ $VALIDATION_FAILED -eq 0 ]; then +if [ "$VALIDATION_FAILED" -eq 0 ]; then TEST_PASSED=true else TEST_PASSED=false @@ -350,6 +371,7 @@ else warn "Failed to revert snapshot - VM may be in modified state" fi fi +CLEANUP_DONE=1 # Final summary section "Test Complete" -- cgit v1.2.3