diff options
| author | Craig Jennings <c@cjennings.net> | 2026-05-20 09:58:01 -0500 |
|---|---|---|
| committer | Craig Jennings <c@cjennings.net> | 2026-05-20 09:58:01 -0500 |
| commit | 4ef30e5c84ab22ba1724608009093d6725a1ceda (patch) | |
| tree | a52907d5929d1bee2aae04b92ece44d51b34cc6a | |
| parent | 21b745d7634cf8e743020b591df101b439883511 (diff) | |
| download | archangel-4ef30e5c84ab22ba1724608009093d6725a1ceda.tar.gz archangel-4ef30e5c84ab22ba1724608009093d6725a1ceda.zip | |
feat(test): retry pacstrap through transient mirror flakes
test-install.sh aborts a whole 5-minute VM run when pacstrap hits a transient mirror blip, and the suite reports a failure indistinguishable from a real install regression. run_test now retries the install up to twice, but only when the in-VM log shows both pacstrap's "Failed to install packages to new root" marker and a download/network indicator. A deterministic failure like "target not found" carries the marker without a network indicator, so it still fails fast. archangel's failure trap exports the pool and unmounts on abort, so each retry re-partitions and re-pacstraps from a clean state.
Wiring the predicate up needed a source-guard so bats can source the harness, which had none. With that in place I unit-covered the pure helpers — is_transient_install_failure, char_to_qemu_key, get_disk_count, get_disk_args — and lifted char_to_qemu_key out of monitor_sendkeys so the QEMU keymap is testable on its own.
The keymap test found a dead branch. The backslash case pattern was '\\', which never matches a lone backslash because bash matches one against '\', so a passphrase containing a backslash would have sent an invalid QEMU keyname instead of "backslash". No test passphrase uses one, so it never bit. I fixed the pattern.
| -rwxr-xr-x | scripts/test-install.sh | 108 | ||||
| -rw-r--r-- | tests/unit/test_test_install.bats | 216 |
2 files changed, 298 insertions, 26 deletions
diff --git a/scripts/test-install.sh b/scripts/test-install.sh index 7f2bcda..cf933a5 100755 --- a/scripts/test-install.sh +++ b/scripts/test-install.sh @@ -289,6 +289,35 @@ wait_for_boot_console() { done } +# Map a single character to its QEMU `sendkey` key name. Pure: char in, +# key name out, no side effects. Covers the character set test +# passphrases use; unknown characters pass through unchanged (QEMU +# rejects an unmapped name at sendkey time rather than here). +char_to_qemu_key() { + local char="$1" + case "$char" in + [a-z]) printf '%s' "$char" ;; + [A-Z]) printf 'shift-%s' "$(printf '%s' "$char" | tr '[:upper:]' '[:lower:]')" ;; + [0-9]) printf '%s' "$char" ;; + ' ') printf 'spc' ;; + '-') printf 'minus' ;; + '=') printf 'equal' ;; + '.') printf 'dot' ;; + ',') printf 'comma' ;; + '/') printf 'slash' ;; + '\') printf 'backslash' ;; + ';') printf 'semicolon' ;; + "'") printf 'apostrophe' ;; + '[') printf 'bracket_left' ;; + ']') printf 'bracket_right' ;; + '!') printf 'shift-1' ;; + '@') printf 'shift-2' ;; + '#') printf 'shift-3' ;; + '$') printf 'shift-4' ;; + *) printf '%s' "$char" ;; + esac +} + # Send a string as QEMU monitor sendkey commands # Converts each character to a QEMU key name and sends via monitor socket monitor_sendkeys() { @@ -297,28 +326,8 @@ monitor_sendkeys() { for ((ci=0; ci<${#text}; ci++)); do local char="${text:$ci:1}" - local key="" - case "$char" in - [a-z]) key="$char" ;; - [A-Z]) key="shift-$(echo "$char" | tr '[:upper:]' '[:lower:]')" ;; - [0-9]) key="$char" ;; - ' ') key="spc" ;; - '-') key="minus" ;; - '=') key="equal" ;; - '.') key="dot" ;; - ',') key="comma" ;; - '/') key="slash" ;; - '\\') key="backslash" ;; - ';') key="semicolon" ;; - "'") key="apostrophe" ;; - '[') key="bracket_left" ;; - ']') key="bracket_right" ;; - '!') key="shift-1" ;; - '@') key="shift-2" ;; - '#') key="shift-3" ;; - '$') key="shift-4" ;; - *) key="$char" ;; - esac + local key + key=$(char_to_qemu_key "$char") echo "sendkey $key" | socat - UNIX-CONNECT:"$monitor_sock" >/dev/null 2>&1 sleep 0.05 # Small delay between keystrokes done @@ -414,6 +423,25 @@ ssh_cmd() { -p "$SSH_PORT" root@localhost "$@" 2>/dev/null } +# Decide whether a failed install is a transient pacstrap/network flake +# (worth retrying) or a deterministic regression (fail fast). Returns 0 +# only when the install log shows BOTH pacstrap's own base-install +# failure marker AND a download/network indicator. Requiring both keeps +# deterministic pacstrap failures — e.g. "target not found" for a real +# missing package — from being retried, which is exactly the masking +# risk the retry loop has to avoid. +# +# Usage: is_transient_install_failure "$install_log" +is_transient_install_failure() { + local log="$1" + # Scope the match to the base-install step. A blip during some later + # pacman call shouldn't be read as a pacstrap flake. + grep -q "Failed to install packages to new root" <<<"$log" || return 1 + grep -Eqi \ + 'failed retrieving file|failed to retrieve|could not resolve host|connection timed out|connection refused|operation too slow|temporary failure in name resolution|failed to synchronize all databases' \ + <<<"$log" +} + # Copy config to VM and run install run_install() { local config="$1" @@ -863,10 +891,36 @@ run_test() { fi info "VM is accessible via SSH" - # Run install + # Run install. Retry up to twice on a transient pacstrap/network + # flake — first-run downloads (even through pacoloco, during cache + # population) can hit a mirror blip that aborts pacstrap with the + # same shape as a real install regression. archangel's failure trap + # exports the pool and unmounts on abort, so each retry re-partitions + # and re-pacstraps from a clean state. step "Running installation (timeout: ${INSTALL_TIMEOUT}s)..." - if timeout "$INSTALL_TIMEOUT" bash -c "$(declare -f ssh_cmd run_install); run_install '$config'"; then - info "Installation completed" + local install_ok=0 attempt install_log + for attempt in 1 2 3; do + if timeout "$INSTALL_TIMEOUT" bash -c "$(declare -f ssh_cmd run_install); run_install '$config'"; then + install_ok=1 + break + fi + # ssh_cmd swallows stderr, so the in-VM log is the only place + # pacstrap's failure text survives. Read just the latest log — + # a retry leaves a second timestamped log behind. + install_log=$(ssh_cmd "cat \"\$(ls -t /tmp/archangel-*.log 2>/dev/null | head -1)\"" 2>/dev/null) || true + if [[ "$attempt" -lt 3 ]] && is_transient_install_failure "$install_log"; then + warn "Install attempt $attempt hit a transient pacstrap flake — retrying ($((attempt + 1))/3)" + continue + fi + break + done + + if [[ "$install_ok" -eq 1 ]]; then + if [[ "$attempt" -gt 1 ]]; then + info "Installation completed (passed on attempt $attempt)" + else + info "Installation completed" + fi else error "Installation failed or timed out" stop_vm "$config_name" @@ -1116,4 +1170,6 @@ main() { fi } -main "$@" +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/tests/unit/test_test_install.bats b/tests/unit/test_test_install.bats new file mode 100644 index 0000000..9cbb1aa --- /dev/null +++ b/tests/unit/test_test_install.bats @@ -0,0 +1,216 @@ +#!/usr/bin/env bats +# Unit tests for scripts/test-install.sh +# +# Coverage scope: pure-logic helpers. The VM lifecycle (start_vm, +# run_install, verify_install, run_test) shells out to qemu / ssh / +# archangel and is exercised by the integration run itself, not bats. +# +# Sourcing test-install.sh relies on the source-guard at the bottom of +# the script: when sourced, function definitions load but main is not +# called. + +setup() { + # shellcheck disable=SC1091 + source "${BATS_TEST_DIRNAME}/../../scripts/test-install.sh" +} + +############################# +# is_transient_install_failure +############################# + +# Normal: a flaky-mirror failure (pacstrap marker + download error) retries. +@test "is_transient_install_failure matches a mirror download flake" { + local log="==> Installing base system +error: failed retrieving file 'core.db' from mirror.example.org : Operation too slow +error: failed to synchronize all databases +==> ERROR: Failed to install packages to new root" + run is_transient_install_failure "$log" + [ "$status" -eq 0 ] +} + +@test "is_transient_install_failure matches a name-resolution flake" { + local log="error: could not resolve host: mirror.archlinux.org +==> ERROR: Failed to install packages to new root" + run is_transient_install_failure "$log" + [ "$status" -eq 0 ] +} + +@test "is_transient_install_failure matches a connection timeout" { + local log="error: failed retrieving file: Connection timed out +==> ERROR: Failed to install packages to new root" + run is_transient_install_failure "$log" + [ "$status" -eq 0 ] +} + +# Error/deterministic: a real regression must NOT retry. +@test "is_transient_install_failure does not match a missing-package failure" { + local log="error: target not found: bogus-package +==> ERROR: Failed to install packages to new root" + run is_transient_install_failure "$log" + [ "$status" -ne 0 ] +} + +@test "is_transient_install_failure does not match a network error without the pacstrap marker" { + # A transient blip somewhere other than base install (e.g. a later + # pacman step) should not be treated as a pacstrap flake. + local log="error: failed retrieving file 'extra.db' : Connection timed out +==> Configuring system" + run is_transient_install_failure "$log" + [ "$status" -ne 0 ] +} + +@test "is_transient_install_failure does not match a clean log" { + local log="==> Installing base system +info: Base system installed. +==> Installation complete" + run is_transient_install_failure "$log" + [ "$status" -ne 0 ] +} + +# Boundary: empty input must not match (a timeout can leave an empty log). +@test "is_transient_install_failure does not match empty input" { + run is_transient_install_failure "" + [ "$status" -ne 0 ] +} + +# Boundary: matching is case-insensitive on the transient indicator. +@test "is_transient_install_failure matches indicator regardless of case" { + local log="ERROR: Failed Retrieving File from mirror : CONNECTION REFUSED +==> ERROR: Failed to install packages to new root" + run is_transient_install_failure "$log" + [ "$status" -eq 0 ] +} + +############################# +# char_to_qemu_key +############################# + +# Normal: alphanumerics map to themselves; uppercase gains a shift- prefix. +@test "char_to_qemu_key passes lowercase letters through unchanged" { + [ "$(char_to_qemu_key a)" = "a" ] + [ "$(char_to_qemu_key z)" = "z" ] +} + +@test "char_to_qemu_key prefixes uppercase letters with shift-" { + [ "$(char_to_qemu_key A)" = "shift-a" ] + [ "$(char_to_qemu_key Z)" = "shift-z" ] +} + +@test "char_to_qemu_key passes digits through unchanged" { + [ "$(char_to_qemu_key 0)" = "0" ] + [ "$(char_to_qemu_key 9)" = "9" ] +} + +# Boundary: every special character in the mapping table. +@test "char_to_qemu_key maps each special character to its QEMU name" { + while IFS='|' read -r ch want; do + run char_to_qemu_key "$ch" + [ "$status" -eq 0 ] + [ "$output" = "$want" ] || { + echo "char '$ch' => '$output', want '$want'" + false + } + done <<'EOF' + |spc +-|minus +=|equal +.|dot +,|comma +/|slash +\|backslash +;|semicolon +'|apostrophe +[|bracket_left +]|bracket_right +!|shift-1 +@|shift-2 +#|shift-3 +$|shift-4 +EOF +} + +# Error/passthrough: an unmapped character comes back verbatim. +@test "char_to_qemu_key passes unmapped characters through unchanged" { + [ "$(char_to_qemu_key '%')" = "%" ] + [ "$(char_to_qemu_key '*')" = "*" ] +} + +@test "char_to_qemu_key returns empty for empty input" { + run char_to_qemu_key "" + [ "$status" -eq 0 ] + [ "$output" = "" ] +} + +############################# +# get_disk_count +############################# + +@test "get_disk_count returns 1 for a single-disk config" { + local cfg="$BATS_TEST_TMPDIR/single.conf" + printf 'DISKS=/dev/vda\n' > "$cfg" + run get_disk_count "$cfg" + [ "$status" -eq 0 ] + [ "$output" = "1" ] +} + +@test "get_disk_count returns 2 for a two-disk config" { + local cfg="$BATS_TEST_TMPDIR/mirror.conf" + printf 'DISKS=/dev/vda,/dev/vdb\n' > "$cfg" + run get_disk_count "$cfg" + [ "$status" -eq 0 ] + [ "$output" = "2" ] +} + +@test "get_disk_count returns 3 for a three-disk config" { + local cfg="$BATS_TEST_TMPDIR/raidz1.conf" + printf 'DISKS=/dev/vda,/dev/vdb,/dev/vdc\n' > "$cfg" + run get_disk_count "$cfg" + [ "$status" -eq 0 ] + [ "$output" = "3" ] +} + +# Boundary: the ^DISKS= anchor must not match a decoy line. +@test "get_disk_count ignores a non-anchored decoy line" { + local cfg="$BATS_TEST_TMPDIR/decoy.conf" + printf 'ROOT_DISKS=/dev/sda,/dev/sdb,/dev/sdc\nDISKS=/dev/vda\n' > "$cfg" + run get_disk_count "$cfg" + [ "$status" -eq 0 ] + [ "$output" = "1" ] +} + +# Error/characterization: a config with no DISKS= line counts as 0. +@test "get_disk_count returns 0 when no DISKS line is present" { + local cfg="$BATS_TEST_TMPDIR/nodisks.conf" + printf 'HOSTNAME=test\nFILESYSTEM=zfs\n' > "$cfg" + run get_disk_count "$cfg" + [ "$status" -eq 0 ] + [ "$output" = "0" ] +} + +############################# +# get_disk_args +############################# + +@test "get_disk_args builds one -drive block for a single disk" { + run get_disk_args 1 single + [ "$status" -eq 0 ] + [ "$(grep -o -- '-drive' <<<"$output" | wc -l)" -eq 1 ] + [[ "$output" == *"test-single-disk1.qcow2"* ]] + [[ "$output" == *"format=qcow2"* ]] + [[ "$output" == *"if=virtio"* ]] +} + +@test "get_disk_args builds one -drive block per disk for multiple disks" { + run get_disk_args 2 mirror + [ "$status" -eq 0 ] + [ "$(grep -o -- '-drive' <<<"$output" | wc -l)" -eq 2 ] + [[ "$output" == *"test-mirror-disk1.qcow2"* ]] + [[ "$output" == *"test-mirror-disk2.qcow2"* ]] +} + +# Boundary: zero disks yields no arguments. +@test "get_disk_args returns empty for a zero count" { + run get_disk_args 0 empty + [ "$status" -eq 0 ] + [ "$output" = "" ] +} |
