aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCraig Jennings <c@cjennings.net>2026-05-20 09:58:01 -0500
committerCraig Jennings <c@cjennings.net>2026-05-20 09:58:01 -0500
commit4ef30e5c84ab22ba1724608009093d6725a1ceda (patch)
treea52907d5929d1bee2aae04b92ece44d51b34cc6a
parent21b745d7634cf8e743020b591df101b439883511 (diff)
downloadarchangel-4ef30e5c84ab22ba1724608009093d6725a1ceda.tar.gz
archangel-4ef30e5c84ab22ba1724608009093d6725a1ceda.zip
feat(test): retry pacstrap through transient mirror flakes
test-install.sh aborts a whole 5-minute VM run when pacstrap hits a transient mirror blip, and the suite reports a failure indistinguishable from a real install regression. run_test now retries the install up to twice, but only when the in-VM log shows both pacstrap's "Failed to install packages to new root" marker and a download/network indicator. A deterministic failure like "target not found" carries the marker without a network indicator, so it still fails fast. archangel's failure trap exports the pool and unmounts on abort, so each retry re-partitions and re-pacstraps from a clean state. Wiring the predicate up needed a source-guard so bats can source the harness, which had none. With that in place I unit-covered the pure helpers — is_transient_install_failure, char_to_qemu_key, get_disk_count, get_disk_args — and lifted char_to_qemu_key out of monitor_sendkeys so the QEMU keymap is testable on its own. The keymap test found a dead branch. The backslash case pattern was '\\', which never matches a lone backslash because bash matches one against '\', so a passphrase containing a backslash would have sent an invalid QEMU keyname instead of "backslash". No test passphrase uses one, so it never bit. I fixed the pattern.
-rwxr-xr-xscripts/test-install.sh108
-rw-r--r--tests/unit/test_test_install.bats216
2 files changed, 298 insertions, 26 deletions
diff --git a/scripts/test-install.sh b/scripts/test-install.sh
index 7f2bcda..cf933a5 100755
--- a/scripts/test-install.sh
+++ b/scripts/test-install.sh
@@ -289,6 +289,35 @@ wait_for_boot_console() {
done
}
+# Map a single character to its QEMU `sendkey` key name. Pure: char in,
+# key name out, no side effects. Covers the character set test
+# passphrases use; unknown characters pass through unchanged (QEMU
+# rejects an unmapped name at sendkey time rather than here).
+char_to_qemu_key() {
+ local char="$1"
+ case "$char" in
+ [a-z]) printf '%s' "$char" ;;
+ [A-Z]) printf 'shift-%s' "$(printf '%s' "$char" | tr '[:upper:]' '[:lower:]')" ;;
+ [0-9]) printf '%s' "$char" ;;
+ ' ') printf 'spc' ;;
+ '-') printf 'minus' ;;
+ '=') printf 'equal' ;;
+ '.') printf 'dot' ;;
+ ',') printf 'comma' ;;
+ '/') printf 'slash' ;;
+ '\') printf 'backslash' ;;
+ ';') printf 'semicolon' ;;
+ "'") printf 'apostrophe' ;;
+ '[') printf 'bracket_left' ;;
+ ']') printf 'bracket_right' ;;
+ '!') printf 'shift-1' ;;
+ '@') printf 'shift-2' ;;
+ '#') printf 'shift-3' ;;
+ '$') printf 'shift-4' ;;
+ *) printf '%s' "$char" ;;
+ esac
+}
+
# Send a string as QEMU monitor sendkey commands
# Converts each character to a QEMU key name and sends via monitor socket
monitor_sendkeys() {
@@ -297,28 +326,8 @@ monitor_sendkeys() {
for ((ci=0; ci<${#text}; ci++)); do
local char="${text:$ci:1}"
- local key=""
- case "$char" in
- [a-z]) key="$char" ;;
- [A-Z]) key="shift-$(echo "$char" | tr '[:upper:]' '[:lower:]')" ;;
- [0-9]) key="$char" ;;
- ' ') key="spc" ;;
- '-') key="minus" ;;
- '=') key="equal" ;;
- '.') key="dot" ;;
- ',') key="comma" ;;
- '/') key="slash" ;;
- '\\') key="backslash" ;;
- ';') key="semicolon" ;;
- "'") key="apostrophe" ;;
- '[') key="bracket_left" ;;
- ']') key="bracket_right" ;;
- '!') key="shift-1" ;;
- '@') key="shift-2" ;;
- '#') key="shift-3" ;;
- '$') key="shift-4" ;;
- *) key="$char" ;;
- esac
+ local key
+ key=$(char_to_qemu_key "$char")
echo "sendkey $key" | socat - UNIX-CONNECT:"$monitor_sock" >/dev/null 2>&1
sleep 0.05 # Small delay between keystrokes
done
@@ -414,6 +423,25 @@ ssh_cmd() {
-p "$SSH_PORT" root@localhost "$@" 2>/dev/null
}
+# Decide whether a failed install is a transient pacstrap/network flake
+# (worth retrying) or a deterministic regression (fail fast). Returns 0
+# only when the install log shows BOTH pacstrap's own base-install
+# failure marker AND a download/network indicator. Requiring both keeps
+# deterministic pacstrap failures — e.g. "target not found" for a real
+# missing package — from being retried, which is exactly the masking
+# risk the retry loop has to avoid.
+#
+# Usage: is_transient_install_failure "$install_log"
+is_transient_install_failure() {
+ local log="$1"
+ # Scope the match to the base-install step. A blip during some later
+ # pacman call shouldn't be read as a pacstrap flake.
+ grep -q "Failed to install packages to new root" <<<"$log" || return 1
+ grep -Eqi \
+ 'failed retrieving file|failed to retrieve|could not resolve host|connection timed out|connection refused|operation too slow|temporary failure in name resolution|failed to synchronize all databases' \
+ <<<"$log"
+}
+
# Copy config to VM and run install
run_install() {
local config="$1"
@@ -863,10 +891,36 @@ run_test() {
fi
info "VM is accessible via SSH"
- # Run install
+ # Run install. Retry up to twice on a transient pacstrap/network
+ # flake — first-run downloads (even through pacoloco, during cache
+ # population) can hit a mirror blip that aborts pacstrap with the
+ # same shape as a real install regression. archangel's failure trap
+ # exports the pool and unmounts on abort, so each retry re-partitions
+ # and re-pacstraps from a clean state.
step "Running installation (timeout: ${INSTALL_TIMEOUT}s)..."
- if timeout "$INSTALL_TIMEOUT" bash -c "$(declare -f ssh_cmd run_install); run_install '$config'"; then
- info "Installation completed"
+ local install_ok=0 attempt install_log
+ for attempt in 1 2 3; do
+ if timeout "$INSTALL_TIMEOUT" bash -c "$(declare -f ssh_cmd run_install); run_install '$config'"; then
+ install_ok=1
+ break
+ fi
+ # ssh_cmd swallows stderr, so the in-VM log is the only place
+ # pacstrap's failure text survives. Read just the latest log —
+ # a retry leaves a second timestamped log behind.
+ install_log=$(ssh_cmd "cat \"\$(ls -t /tmp/archangel-*.log 2>/dev/null | head -1)\"" 2>/dev/null) || true
+ if [[ "$attempt" -lt 3 ]] && is_transient_install_failure "$install_log"; then
+ warn "Install attempt $attempt hit a transient pacstrap flake — retrying ($((attempt + 1))/3)"
+ continue
+ fi
+ break
+ done
+
+ if [[ "$install_ok" -eq 1 ]]; then
+ if [[ "$attempt" -gt 1 ]]; then
+ info "Installation completed (passed on attempt $attempt)"
+ else
+ info "Installation completed"
+ fi
else
error "Installation failed or timed out"
stop_vm "$config_name"
@@ -1116,4 +1170,6 @@ main() {
fi
}
-main "$@"
+if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
+ main "$@"
+fi
diff --git a/tests/unit/test_test_install.bats b/tests/unit/test_test_install.bats
new file mode 100644
index 0000000..9cbb1aa
--- /dev/null
+++ b/tests/unit/test_test_install.bats
@@ -0,0 +1,216 @@
+#!/usr/bin/env bats
+# Unit tests for scripts/test-install.sh
+#
+# Coverage scope: pure-logic helpers. The VM lifecycle (start_vm,
+# run_install, verify_install, run_test) shells out to qemu / ssh /
+# archangel and is exercised by the integration run itself, not bats.
+#
+# Sourcing test-install.sh relies on the source-guard at the bottom of
+# the script: when sourced, function definitions load but main is not
+# called.
+
+setup() {
+ # shellcheck disable=SC1091
+ source "${BATS_TEST_DIRNAME}/../../scripts/test-install.sh"
+}
+
+#############################
+# is_transient_install_failure
+#############################
+
+# Normal: a flaky-mirror failure (pacstrap marker + download error) retries.
+@test "is_transient_install_failure matches a mirror download flake" {
+ local log="==> Installing base system
+error: failed retrieving file 'core.db' from mirror.example.org : Operation too slow
+error: failed to synchronize all databases
+==> ERROR: Failed to install packages to new root"
+ run is_transient_install_failure "$log"
+ [ "$status" -eq 0 ]
+}
+
+@test "is_transient_install_failure matches a name-resolution flake" {
+ local log="error: could not resolve host: mirror.archlinux.org
+==> ERROR: Failed to install packages to new root"
+ run is_transient_install_failure "$log"
+ [ "$status" -eq 0 ]
+}
+
+@test "is_transient_install_failure matches a connection timeout" {
+ local log="error: failed retrieving file: Connection timed out
+==> ERROR: Failed to install packages to new root"
+ run is_transient_install_failure "$log"
+ [ "$status" -eq 0 ]
+}
+
+# Error/deterministic: a real regression must NOT retry.
+@test "is_transient_install_failure does not match a missing-package failure" {
+ local log="error: target not found: bogus-package
+==> ERROR: Failed to install packages to new root"
+ run is_transient_install_failure "$log"
+ [ "$status" -ne 0 ]
+}
+
+@test "is_transient_install_failure does not match a network error without the pacstrap marker" {
+ # A transient blip somewhere other than base install (e.g. a later
+ # pacman step) should not be treated as a pacstrap flake.
+ local log="error: failed retrieving file 'extra.db' : Connection timed out
+==> Configuring system"
+ run is_transient_install_failure "$log"
+ [ "$status" -ne 0 ]
+}
+
+@test "is_transient_install_failure does not match a clean log" {
+ local log="==> Installing base system
+info: Base system installed.
+==> Installation complete"
+ run is_transient_install_failure "$log"
+ [ "$status" -ne 0 ]
+}
+
+# Boundary: empty input must not match (a timeout can leave an empty log).
+@test "is_transient_install_failure does not match empty input" {
+ run is_transient_install_failure ""
+ [ "$status" -ne 0 ]
+}
+
+# Boundary: matching is case-insensitive on the transient indicator.
+@test "is_transient_install_failure matches indicator regardless of case" {
+ local log="ERROR: Failed Retrieving File from mirror : CONNECTION REFUSED
+==> ERROR: Failed to install packages to new root"
+ run is_transient_install_failure "$log"
+ [ "$status" -eq 0 ]
+}
+
+#############################
+# char_to_qemu_key
+#############################
+
+# Normal: alphanumerics map to themselves; uppercase gains a shift- prefix.
+@test "char_to_qemu_key passes lowercase letters through unchanged" {
+ [ "$(char_to_qemu_key a)" = "a" ]
+ [ "$(char_to_qemu_key z)" = "z" ]
+}
+
+@test "char_to_qemu_key prefixes uppercase letters with shift-" {
+ [ "$(char_to_qemu_key A)" = "shift-a" ]
+ [ "$(char_to_qemu_key Z)" = "shift-z" ]
+}
+
+@test "char_to_qemu_key passes digits through unchanged" {
+ [ "$(char_to_qemu_key 0)" = "0" ]
+ [ "$(char_to_qemu_key 9)" = "9" ]
+}
+
+# Boundary: every special character in the mapping table.
+@test "char_to_qemu_key maps each special character to its QEMU name" {
+ while IFS='|' read -r ch want; do
+ run char_to_qemu_key "$ch"
+ [ "$status" -eq 0 ]
+ [ "$output" = "$want" ] || {
+ echo "char '$ch' => '$output', want '$want'"
+ false
+ }
+ done <<'EOF'
+ |spc
+-|minus
+=|equal
+.|dot
+,|comma
+/|slash
+\|backslash
+;|semicolon
+'|apostrophe
+[|bracket_left
+]|bracket_right
+!|shift-1
+@|shift-2
+#|shift-3
+$|shift-4
+EOF
+}
+
+# Error/passthrough: an unmapped character comes back verbatim.
+@test "char_to_qemu_key passes unmapped characters through unchanged" {
+ [ "$(char_to_qemu_key '%')" = "%" ]
+ [ "$(char_to_qemu_key '*')" = "*" ]
+}
+
+@test "char_to_qemu_key returns empty for empty input" {
+ run char_to_qemu_key ""
+ [ "$status" -eq 0 ]
+ [ "$output" = "" ]
+}
+
+#############################
+# get_disk_count
+#############################
+
+@test "get_disk_count returns 1 for a single-disk config" {
+ local cfg="$BATS_TEST_TMPDIR/single.conf"
+ printf 'DISKS=/dev/vda\n' > "$cfg"
+ run get_disk_count "$cfg"
+ [ "$status" -eq 0 ]
+ [ "$output" = "1" ]
+}
+
+@test "get_disk_count returns 2 for a two-disk config" {
+ local cfg="$BATS_TEST_TMPDIR/mirror.conf"
+ printf 'DISKS=/dev/vda,/dev/vdb\n' > "$cfg"
+ run get_disk_count "$cfg"
+ [ "$status" -eq 0 ]
+ [ "$output" = "2" ]
+}
+
+@test "get_disk_count returns 3 for a three-disk config" {
+ local cfg="$BATS_TEST_TMPDIR/raidz1.conf"
+ printf 'DISKS=/dev/vda,/dev/vdb,/dev/vdc\n' > "$cfg"
+ run get_disk_count "$cfg"
+ [ "$status" -eq 0 ]
+ [ "$output" = "3" ]
+}
+
+# Boundary: the ^DISKS= anchor must not match a decoy line.
+@test "get_disk_count ignores a non-anchored decoy line" {
+ local cfg="$BATS_TEST_TMPDIR/decoy.conf"
+ printf 'ROOT_DISKS=/dev/sda,/dev/sdb,/dev/sdc\nDISKS=/dev/vda\n' > "$cfg"
+ run get_disk_count "$cfg"
+ [ "$status" -eq 0 ]
+ [ "$output" = "1" ]
+}
+
+# Error/characterization: a config with no DISKS= line counts as 0.
+@test "get_disk_count returns 0 when no DISKS line is present" {
+ local cfg="$BATS_TEST_TMPDIR/nodisks.conf"
+ printf 'HOSTNAME=test\nFILESYSTEM=zfs\n' > "$cfg"
+ run get_disk_count "$cfg"
+ [ "$status" -eq 0 ]
+ [ "$output" = "0" ]
+}
+
+#############################
+# get_disk_args
+#############################
+
+@test "get_disk_args builds one -drive block for a single disk" {
+ run get_disk_args 1 single
+ [ "$status" -eq 0 ]
+ [ "$(grep -o -- '-drive' <<<"$output" | wc -l)" -eq 1 ]
+ [[ "$output" == *"test-single-disk1.qcow2"* ]]
+ [[ "$output" == *"format=qcow2"* ]]
+ [[ "$output" == *"if=virtio"* ]]
+}
+
+@test "get_disk_args builds one -drive block per disk for multiple disks" {
+ run get_disk_args 2 mirror
+ [ "$status" -eq 0 ]
+ [ "$(grep -o -- '-drive' <<<"$output" | wc -l)" -eq 2 ]
+ [[ "$output" == *"test-mirror-disk1.qcow2"* ]]
+ [[ "$output" == *"test-mirror-disk2.qcow2"* ]]
+}
+
+# Boundary: zero disks yields no arguments.
+@test "get_disk_args returns empty for a zero count" {
+ run get_disk_args 0 empty
+ [ "$status" -eq 0 ]
+ [ "$output" = "" ]
+}