From 663cec6520a72680609c0d803494fb0bde4ce765 Mon Sep 17 00:00:00 2001 From: Craig Jennings Date: Sun, 17 May 2026 14:38:40 -0500 Subject: fix(testing): cleanup traps, arg validation, and two real bugs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two real bugs and a sweep of hygiene across the harness. `make test` passed cleanly on this branch with the same 52/0/5 profile as the 2026-05-11 run, so the wiring is verified end-to-end. Real bugs: - `lib/vm-utils.sh` `snapshot_exists` was running `qemu-img snapshot -l | grep -q "$snapshot_name"`, which matches the name as a substring anywhere in the output — including inside dates or filenames in other fields. Replaced with an awk field extraction on the TAG column plus `grep -Fxq` for a whole-line literal match. - `run-test-baremetal.sh` was setting `VALIDATION_PASSED=true|false` after validation, but `validation.sh` already uses `VALIDATION_PASSED` as a pass counter. The test report then referenced `$VALIDATION_PASSED_COUNT`, which is defined nowhere. Renamed the boolean to `TEST_PASSED` (matching run-test.sh's pattern) and report the actual counter. Cleanup traps and arg validation: - `run-test.sh` now installs a top-level EXIT trap that, on abort, kills QEMU and restores the clean-install snapshot. A `CLEANUP_DONE=1` sentinel keeps the existing normal-path cleanup from double-firing. This is the recurring pain from 2026-05-11 where two failed runs left orphaned QEMU processes and dirty base disks behind. - `create-base-vm.sh` and `debug-vm.sh` got the same kind of trap, plus `debug-vm.sh` now rejects non-`.qcow2` paths up front instead of letting QEMU fail later. - `run-test.sh`, `run-test-baremetal.sh`, and `cleanup-tests.sh` now validate that options with required values actually receive one (`${var:?msg}` for `--script`/`--snapshot`/`--host`/`--password`, numeric check for `--keep`). - `run-test-baremetal.sh` traps the temp git bundle for cleanup if the script aborts before its explicit `rm`. The ZFS rollback loop now uses `while IFS= read -r ds` and quotes `$ds` inside the ssh_cmd so dataset names with whitespace wouldn't break it. Smaller hygiene: - `vm-utils.sh` `check_ovmf` also checks `OVMF_VARS_TEMPLATE`; `start_qemu` validates disk and ISO paths before building the QEMU command; numeric tests quoted. - `cleanup-tests.sh` find expression for temp disks wrapped in `\( ... -o ... \)`, all `while read` loops use `IFS= read -r`, orphaned QEMU cleanup tries SIGTERM with a 2s sleep before SIGKILL. - `create-base-vm.sh` moved the "Copy an archangel-*.iso" info line before its `fatal` instead of after (unreachable), and added the serial-log path to the final summary. - `lib/logging.sh` `stop_timer` no longer produces `$((end - ))` when the named timer was never started. - `lib/network-diagnostics.sh` `read` → `IFS= read -r`. - `setup-testing-env.sh` now installs all missing pacman packages in one transaction instead of one-at-a-time (avoids half-installed state if package N fails). KVM check also verifies the user has read+write on `/dev/kvm` and prints the `gpasswd -a $(id -un) kvm` fix if not. A few items from the review I deliberately skipped: replacing the codebase-wide unquoted `$SSH_OPTS` string with an array (cosmetic, would need to be done everywhere at once), `set -e` adds where the existing fall-through-on-failure is intentional, and a `--force` gate on `create-base-vm.sh` (would break the expected workflow). --- scripts/testing/lib/logging.sh | 9 +++++++-- scripts/testing/lib/network-diagnostics.sh | 2 +- scripts/testing/lib/vm-utils.sh | 24 +++++++++++++++++++++--- 3 files changed, 29 insertions(+), 6 deletions(-) (limited to 'scripts/testing/lib') diff --git a/scripts/testing/lib/logging.sh b/scripts/testing/lib/logging.sh index eda9eb1..ed20707 100755 --- a/scripts/testing/lib/logging.sh +++ b/scripts/testing/lib/logging.sh @@ -135,8 +135,13 @@ start_timer() { stop_timer() { local name="${1:-default}" - local start=${TIMERS[$name]} - local end=$(date +%s) + local start="${TIMERS[$name]:-}" + if [ -z "$start" ]; then + log "TIMER STOP: $name (never started, skipping)" + return 0 + fi + local end + end=$(date +%s) local duration=$((end - start)) local mins=$((duration / 60)) local secs=$((duration % 60)) diff --git a/scripts/testing/lib/network-diagnostics.sh b/scripts/testing/lib/network-diagnostics.sh index d73ffe5..674aeba 100644 --- a/scripts/testing/lib/network-diagnostics.sh +++ b/scripts/testing/lib/network-diagnostics.sh @@ -53,7 +53,7 @@ run_network_diagnostics() { # Show network info info "Network configuration:" - $ssh_base "ip addr show | grep 'inet ' | grep -v '127.0.0.1'" 2>/dev/null | while read line; do + $ssh_base "ip addr show | grep 'inet ' | grep -v '127.0.0.1'" 2>/dev/null | while IFS= read -r line; do info " $line" done diff --git a/scripts/testing/lib/vm-utils.sh b/scripts/testing/lib/vm-utils.sh index 47bd391..a8736a3 100755 --- a/scripts/testing/lib/vm-utils.sh +++ b/scripts/testing/lib/vm-utils.sh @@ -72,6 +72,11 @@ check_ovmf() { info "Install with: sudo pacman -S edk2-ovmf" return 1 fi + if [ ! -f "$OVMF_VARS_TEMPLATE" ]; then + error "OVMF vars template not found: $OVMF_VARS_TEMPLATE" + info "Install with: sudo pacman -S edk2-ovmf" + return 1 + fi return 0 } @@ -132,6 +137,15 @@ start_qemu() { local iso_path="${3:-}" local display="${4:-none}" + if [ -z "$disk" ] || [ ! -f "$disk" ]; then + error "Disk image not found: ${disk:-}" + return 1 + fi + if [ "$mode" = "iso" ] && { [ -z "$iso_path" ] || [ ! -f "$iso_path" ]; }; then + error "ISO not found: ${iso_path:-}" + return 1 + fi + # Stop any existing instance stop_qemu 2>/dev/null || true @@ -223,7 +237,7 @@ stop_qemu() { # Wait for graceful shutdown local elapsed=0 - while [ $elapsed -lt $timeout ]; do + while [ "$elapsed" -lt "$timeout" ]; do if ! vm_is_running; then success "VM stopped gracefully" _cleanup_qemu_files @@ -319,7 +333,11 @@ list_snapshots() { snapshot_exists() { local disk="${1:-$DISK_PATH}" local snapshot_name="${2:-clean-install}" - qemu-img snapshot -l "$disk" 2>/dev/null | grep -q "$snapshot_name" + # Match on the TAG column (field 2) so a name appearing inside a timestamp + # or filename elsewhere in the output can't false-positive the check. + qemu-img snapshot -l "$disk" 2>/dev/null \ + | awk 'NR > 2 { print $2 }' \ + | grep -Fxq "$snapshot_name" } # ─── SSH Operations ─────────────────────────────────────────────────── @@ -331,7 +349,7 @@ wait_for_ssh() { local elapsed=0 progress "Waiting for SSH on localhost:$SSH_PORT..." - while [ $elapsed -lt $timeout ]; do + while [ "$elapsed" -lt "$timeout" ]; do if sshpass -p "$password" ssh $SSH_OPTS -p "$SSH_PORT" root@localhost true 2>/dev/null; then success "SSH is available" return 0 -- cgit v1.2.3