5 files changed, 186 insertions, 850 deletions
diff --git a/scripts/testing/lib/logging.sh b/scripts/testing/lib/logging.sh
index ed20707..809d396 100755
--- a/scripts/testing/lib/logging.sh
+++ b/scripts/testing/lib/logging.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
+# SPDX-License-Identifier: GPL-3.0-or-later
 # Logging utilities for archsetup testing
 # Author: Craig Jennings <craigmartinjennings@gmail.com>
 # License: GNU GPLv3
diff --git a/scripts/testing/lib/network-diagnostics.sh b/scripts/testing/lib/network-diagnostics.sh
index 674aeba..38788e5 100644
--- a/scripts/testing/lib/network-diagnostics.sh
+++ b/scripts/testing/lib/network-diagnostics.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
+# SPDX-License-Identifier: GPL-3.0-or-later
 # Network diagnostics for VM testing
 # Author: Craig Jennings <craigmartinjennings@gmail.com>
 # License: GNU GPLv3
diff --git a/scripts/testing/lib/testinfra.sh b/scripts/testing/lib/testinfra.sh
new file mode 100644
index 0000000..0822a9f
--- /dev/null
+++ b/scripts/testing/lib/testinfra.sh
@@ -0,0 +1,120 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# Testinfra post-install validation sweep (runs on the host, over SSH).
+#
+# This is the primary post-install validator (it replaced the shell
+# run_all_validations sweep). It connects to the freshly-installed VM over SSH
+# and runs the pytest suite under scripts/testing/tests/. Its result drives the
+# run's pass/fail, and per-test failures are bucketed (archsetup / base_install
+# / unknown) into the same issue-attribution report the shell sweep produced.
+#
+# Auth: reuse the root key the harness already authorized (inject_root_key),
+# which survives the sshd prohibit-password hardening; mint our own only if the
+# harness didn't (standalone use). pytest connects key-only via a generated
+# ssh-config. Key + config live in the results dir and are discarded with it.
+#
+# Uses globals from run-test.sh / vm-utils.sh: SCRIPT_DIR, VM_IP, SSH_PORT,
+# ROOT_PASSWORD, ROOT_SSH_KEY, ARCHSETUP_VM_CONF, plus the validation.sh
+# helpers attribute_issue / VALIDATION_*. Toggle with RUN_TESTINFRA=false.
+
+# Record each pytest failure from the attribution file ($1: "[bucket]" header lines, each followed
+# by two-space-indented entries) via validation.sh's attribute_issue, so generate_issue_report covers them.
+_testinfra_record_attribution() {
+    local file="$1" bucket=unknown line  # entries before any header count as unknown; 'line' stays local
+    [ -f "$file" ] || return 0           # no attribution file: nothing to record
+    while IFS= read -r line || [ -n "$line" ]; do  # also take a final line that lacks a newline
+        case "$line" in
+            "[archsetup]")    bucket=archsetup ;;
+            "[base_install]") bucket=base ;;
+            "[unknown]")      bucket=unknown ;;
+            "  "*)            attribute_issue "testinfra: ${line#  }" "$bucket" ;;
+        esac
+    done < "$file"
+}
+
+# run_testinfra_validation <results_dir>  (ssh-config, log and any minted key are written there)
+# Returns 0 only when the pytest sweep ran and passed. Returns non-zero when it
+# failed OR could not run (missing tooling / SSH setup) — a sweep that can't run
+# is not a pass. RUN_TESTINFRA=false is the one explicit opt-out (returns 0).
+run_testinfra_validation() {
+    local results_dir="$1"
+    local tests_dir="$SCRIPT_DIR/tests"
+    local key="$results_dir/testinfra_key"
+    local sshcfg="$results_dir/testinfra_ssh_config"
+
+    if [ "${RUN_TESTINFRA:-true}" != "true" ]; then
+        warn "RUN_TESTINFRA=false - skipping the Testinfra validation sweep"
+        return 0
+    fi
+    if ! command -v pytest >/dev/null 2>&1 || ! python3 -c 'import testinfra' >/dev/null 2>&1; then
+        error "Testinfra/pytest not installed on host - cannot validate (run: make deps)"
+        return 1
+    fi
+
+    section "Running Validation Checks (Testinfra)"
+
+    # Prefer the harness's already-authorized root key; mint one if absent.
+    if [ -n "${ROOT_SSH_KEY:-}" ] && [ -f "${ROOT_SSH_KEY}" ]; then
+        key="$ROOT_SSH_KEY"
+    else
+        rm -f "$key" "$key.pub"
+        if ! ssh-keygen -t ed25519 -N "" -q -f "$key"; then
+            error "testinfra: ssh-keygen failed"
+            return 1
+        fi
+        if ! copy_to_vm "$key.pub" "/tmp/testinfra_key.pub" "$ROOT_PASSWORD"; then
+            error "testinfra: pubkey copy failed"
+            return 1
+        fi
+        if ! vm_exec "$ROOT_PASSWORD" \
+            "mkdir -p /root/.ssh && chmod 700 /root/.ssh && cat /tmp/testinfra_key.pub >> /root/.ssh/authorized_keys && chmod 600 /root/.ssh/authorized_keys"; then
+            error "testinfra: authorizing key in VM failed"
+            return 1
+        fi
+    fi
+
+    # ssh-config so testinfra connects key-only, no host-key prompt. The key path is quoted so one containing spaces stays a single argument.
+    cat > "$sshcfg" <<EOF
+Host testinfra-target
+    HostName ${VM_IP:-localhost}
+    Port ${SSH_PORT:-2222}
+    User root
+    IdentityFile "$key"
+    IdentitiesOnly yes
+    StrictHostKeyChecking no
+    UserKnownHostsFile /dev/null
+EOF
+
+    # The account archsetup created, for the tests that need it (first USERNAME= line of the VM conf; falls back to cjennings).
+    local test_user
+    test_user=$(sed -n 's/^USERNAME=//p' "${ARCHSETUP_VM_CONF:-/dev/null}" 2>/dev/null | head -n1)
+    : "${test_user:=cjennings}"
+
+    local logf="$results_dir/testinfra.log"
+    ARCHSETUP_TEST_USER="$test_user" pytest "$tests_dir" \
+        --hosts="ssh://testinfra-target" \
+        --ssh-config="$sshcfg" \
+        --attribution-file="$results_dir/testinfra-attribution.txt" \
+        -v >> "$logf" 2>&1
+    local rc=$?  # pytest's exit status; returned to the caller unchanged
+
+    # Surface pytest's counts through the shared validation counters so the issue report summary is
+    # meaningful. Only the LAST summary line is parsed: the log is appended to, so earlier text may hold stale counts.
+    local summary
+    summary=$(grep -E '[0-9]+ (passed|failed|errors?|skipped).* in [0-9.]+s' "$logf" | tail -n 1 | grep -oE '[0-9]+ (passed|failed|errors?|skipped)' || true)
+    VALIDATION_PASSED=$(printf '%s\n' "$summary" | awk '/passed/{n=$1} END{print n+0}')
+    VALIDATION_WARNINGS=$(printf '%s\n' "$summary" | awk '/skipped/{n=$1} END{print n+0}')
+    local nfail nerr
+    nfail=$(printf '%s\n' "$summary" | awk '/failed/{n=$1} END{print n+0}')
+    nerr=$(printf '%s\n' "$summary" | awk '/error/{n=$1} END{print n+0}')
+    VALIDATION_FAILED=$((nfail + nerr))
+
+    if [ "$rc" -eq 0 ]; then
+        success "Testinfra validation passed ($VALIDATION_PASSED passed, $VALIDATION_WARNINGS skipped)"
+    else
+        error "Testinfra validation failed ($VALIDATION_FAILED failed/error; see testinfra.log)"
+        _testinfra_record_attribution "$results_dir/testinfra-attribution.txt"
+    fi
+    return "$rc"
+}
diff --git a/scripts/testing/lib/validation.sh b/scripts/testing/lib/validation.sh
index 91270ef..fa7ddcc 100644
--- a/scripts/testing/lib/validation.sh
+++ b/scripts/testing/lib/validation.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
+# SPDX-License-Identifier: GPL-3.0-or-later
 # Validation utilities for archsetup testing
 # Author: Craig Jennings <craigmartinjennings@gmail.com>
 # License: GNU GPLv3
@@ -20,38 +21,7 @@ declare -a UNKNOWN_ISSUES
 # SSH helper (uses globals: VM_IP, ROOT_PASSWORD)
 ssh_cmd() {
     sshpass -p "$ROOT_PASSWORD" ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
-        -o ConnectTimeout=10 -p "${SSH_PORT:-22}" "root@$VM_IP" "$@" 2>/dev/null
-}
-
-# Validation result helpers
-validation_pass() {
-    local test_name="$1"
-    success "$test_name"
-    ((VALIDATION_PASSED++)) || true
-}
-
-validation_fail() {
-    local test_name="$1"
-    local details="${2:-}"
-    error "$test_name"
-    [ -n "$details" ] && info "  Details: $details"
-    ((VALIDATION_FAILED++)) || true
-}
-
-validation_warn() {
-    local test_name="$1"
-    local details="${2:-}"
-    warn "$test_name"
-    [ -n "$details" ] && info "  Details: $details"
-    ((VALIDATION_WARNINGS++)) || true
-}
-
-# A check whose precondition can't hold in this environment (headless VM,
-# slirp networking, pre-reboot state). Logged for the record, counted nowhere
-# — a warning that fires on every run trains readers to ignore warnings.
-validation_skip() {
-    local test_name="$1"
-    info "SKIP: $test_name"
+        -o ConnectTimeout=10 ${SSH_KEY_OPT:-} -p "${SSH_PORT:-22}" "root@$VM_IP" "$@" 2>/dev/null
 }
 
 # Attribute an issue to archsetup or base install
@@ -264,802 +234,6 @@ categorize_errors() {
 }
 
 #=============================================================================
-# VALIDATION CHECKS
-#=============================================================================
-
-run_all_validations() {
-    section "Running Validation Checks"
-
-    # User & Authentication
-    validate_user_created
-    validate_user_shell
-    validate_user_groups
-
-    # Dotfiles
-    validate_dotfiles
-
-    # Package Managers
-    validate_yay_installed
-    validate_pacman_working
-
-    # Window Manager (detects DWM or Hyprland automatically)
-    validate_window_manager
-
-    # Essential Services
-    validate_firewall
-    validate_dns_config
-    validate_avahi
-    validate_fail2ban
-    validate_networkmanager
-
-    # Developer Tools
-    validate_emacs
-    validate_git_config
-    validate_dev_tools
-
-    # System Configuration
-    validate_zfs_config
-    validate_boot_config
-    validate_autologin_config
-    validate_gnome_keyring_setup
-
-    # Boot & Initramfs (critical for ZFS systems)
-    validate_terminus_font
-    validate_mkinitcpio_hooks
-    validate_initramfs_consolefont
-    validate_nvme_module
-
-    # Archsetup Specific
-    validate_archsetup_log
-    validate_state_markers
-}
-
-#-----------------------------------------------------------------------------
-# User & Authentication Validations
-#-----------------------------------------------------------------------------
-
-validate_user_created() {
-    step "Checking if user 'cjennings' exists"
-    if ssh_cmd "id cjennings" &>> "$LOGFILE"; then
-        validation_pass "User cjennings exists"
-    else
-        validation_fail "User cjennings not found"
-        attribute_issue "User cjennings not created" "archsetup"
-    fi
-}
-
-validate_user_shell() {
-    step "Checking if ZSH is default shell"
-    local shell=$(ssh_cmd "getent passwd cjennings | cut -d: -f7")
-    if [ "$shell" = "/bin/zsh" ] || [ "$shell" = "/usr/bin/zsh" ]; then
-        validation_pass "ZSH is default shell"
-    else
-        validation_fail "ZSH not default shell (got: $shell)"
-        attribute_issue "ZSH not set as default shell" "archsetup"
-    fi
-}
-
-validate_user_groups() {
-    step "Checking user group memberships"
-    # Groups added by archsetup:
-    # - wheel (useradd -G wheel)
-    # - sys,adm,network,scanner,power,uucp,audio,lp,rfkill,video,storage,optical,users (usermod -aG)
-    # - docker (gpasswd -a, added later in developer_workstation)
-    local expected_groups="wheel sys adm network scanner power uucp audio lp rfkill video storage optical users docker"
-    local missing_groups=""
-
-    for group in $expected_groups; do
-        if ! ssh_cmd "groups cjennings" | grep -q "\b$group\b"; then
-            missing_groups="$missing_groups $group"
-        fi
-    done
-
-    if [ -z "$missing_groups" ]; then
-        validation_pass "User in all expected groups (15 groups)"
-    else
-        validation_fail "User missing groups:$missing_groups"
-        attribute_issue "User missing groups:$missing_groups" "archsetup"
-    fi
-}
-
-#-----------------------------------------------------------------------------
-# Dotfiles Validations
-#-----------------------------------------------------------------------------
-
-validate_dotfiles() {
-    step "Checking dotfiles setup"
-
-    # 1. Check if .zshrc is a symlink
-    if ! ssh_cmd "test -L /home/cjennings/.zshrc"; then
-        validation_fail "Dotfiles not stowed (.zshrc is not a symlink)"
-        attribute_issue "Dotfiles stow failed" "archsetup"
-        return 1
-    fi
-
-    # 2. Check symlink points to correct location. archsetup now clones the
-    # dotfiles repo to ~/.dotfiles and stows from there (DOTFILES_DIR default).
-    # Which tree owns .zshrc depends on DESKTOP_ENV: none stows the standalone
-    # minimal/ tree; dwm and hyprland stow common/.
-    local target=$(ssh_cmd "readlink /home/cjennings/.zshrc")
-    local desktop_env=$(sed -n 's/^DESKTOP_ENV=//p' "$ARCHSETUP_VM_CONF" 2>/dev/null | head -n1)
-    local expected_pattern=".dotfiles/common/.zshrc"
-    [ "$desktop_env" = "none" ] && expected_pattern=".dotfiles/minimal/.zshrc"
-
-    if ! echo "$target" | grep -q "$expected_pattern"; then
-        validation_fail "Dotfiles symlink points to wrong location: $target"
-        attribute_issue "Dotfiles symlink incorrect: $target" "archsetup"
-        return 1
-    fi
-
-    # 3. Check the target file actually exists (not a broken symlink)
-    if ! ssh_cmd "test -f /home/cjennings/.zshrc"; then
-        validation_fail "Dotfiles symlink is broken (target doesn't exist)"
-        ssh_cmd "ls -la /home/cjennings/.zshrc" >> "$LOGFILE" 2>&1
-        attribute_issue "Dotfiles symlink broken" "archsetup"
-        return 1
-    fi
-
-    # 4. Check user can actually read the file (not just root)
-    local result=$(ssh_cmd "sudo -u cjennings cat /home/cjennings/.zshrc > /dev/null 2>&1 && echo OK || echo FAIL")
-    if [ "$result" != "OK" ]; then
-        validation_fail "Dotfiles not readable by user (permission issue)"
-        ssh_cmd "ls -la /home/cjennings/.zshrc" >> "$LOGFILE" 2>&1
-        attribute_issue "Dotfiles not readable by user" "archsetup"
-        return 1
-    fi
-
-    validation_pass "Dotfiles configured correctly (symlink to $target, readable by user)"
-}
-
-#-----------------------------------------------------------------------------
-# Package Manager Validations
-#-----------------------------------------------------------------------------
-
-validate_yay_installed() {
-    step "Checking if yay (AUR helper) is installed and functional"
-
-    # Check binary exists
-    if ! ssh_cmd "which yay" &>> "$LOGFILE"; then
-        validation_fail "yay not found"
-        attribute_issue "yay not installed" "archsetup"
-        return 1
-    fi
-
-    # Check yay can query packages (functional test)
-    if ssh_cmd "sudo -u cjennings yay -Qi yay" &>> "$LOGFILE"; then
-        validation_pass "yay is installed and functional"
-    else
-        validation_fail "yay binary exists but query failed"
-        attribute_issue "yay not functional" "archsetup"
-    fi
-}
-
-validate_pacman_working() {
-    step "Checking if pacman is functional"
-    if ssh_cmd "pacman -Qi base" &>> "$LOGFILE"; then
-        validation_pass "pacman is functional"
-    else
-        validation_fail "pacman query failed"
-        attribute_issue "pacman not functional" "unknown"
-    fi
-}
-
-#-----------------------------------------------------------------------------
-# Window Manager Validations
-#-----------------------------------------------------------------------------
-
-validate_suckless_tools() {
-    step "Checking suckless tools (dwm, st, dmenu, slock)"
-    local missing=""
-
-    for tool in dwm st dmenu slock; do
-        if ! ssh_cmd "test -f /usr/local/bin/$tool"; then
-            missing="$missing $tool"
-        fi
-    done
-
-    if [ -z "$missing" ]; then
-        validation_pass "All suckless tools installed (dwm, st, dmenu, slock)"
-    else
-        validation_fail "Missing suckless tools:$missing"
-        attribute_issue "Missing suckless tools:$missing" "archsetup"
-    fi
-}
-
-validate_hyprland_tools() {
-    step "Checking Hyprland tools"
-    local missing=""
-
-    # Check core Hyprland packages
-    for pkg in hyprland hypridle hyprlock waybar fuzzel swww grim slurp gammastep foot; do
-        if ! ssh_cmd "pacman -Q $pkg &>/dev/null"; then
-            missing="$missing $pkg"
-        fi
-    done
-
-    if [ -z "$missing" ]; then
-        validation_pass "All Hyprland tools installed"
-    else
-        validation_fail "Missing Hyprland tools:$missing"
-        attribute_issue "Missing Hyprland tools:$missing" "archsetup"
-    fi
-}
-
-validate_hyprland_config() {
-    step "Checking Hyprland configuration files"
-    local missing=""
-
-    for config in ".config/hypr/hyprland.conf" ".config/hypr/hypridle.conf" \
-                  ".config/hypr/hyprlock.conf" ".config/waybar/config" \
-                  ".config/fuzzel/fuzzel.ini" ".config/gammastep/config.ini"; do
-        if ! ssh_cmd "test -f /home/cjennings/$config"; then
-            missing="$missing $config"
-        fi
-    done
-
-    if [ -z "$missing" ]; then
-        validation_pass "All Hyprland config files present"
-    else
-        validation_fail "Missing Hyprland configs:$missing"
-        attribute_issue "Missing Hyprland configs:$missing" "archsetup"
-    fi
-}
-
-validate_hyprland_socket() {
-    step "Checking Hyprland IPC socket"
-    # The socket only exists while the compositor runs. In the headless test
-    # VM nobody logs in graphically, so a missing socket with no Hyprland
-    # process is the expected state, not a finding.
-    if ssh_cmd "test -S /tmp/hypr/*/.socket.sock 2>/dev/null"; then
-        validation_pass "Hyprland socket exists"
-    elif ! ssh_cmd "pgrep -x Hyprland >/dev/null 2>&1"; then
-        validation_skip "Hyprland not running (headless) — socket check not applicable"
-    else
-        validation_warn "Hyprland running but IPC socket not found"
-    fi
-}
-
-validate_portal_dark_mode() {
-    step "Checking Settings portal returns dark mode"
-
-    # Check portals.conf exists and uses gtk for Settings
-    local portals_conf="/home/cjennings/.config/xdg-desktop-portal/portals.conf"
-    if ! ssh_cmd "test -f $portals_conf"; then
-        validation_fail "portals.conf not found"
-        attribute_issue "xdg-desktop-portal portals.conf missing" "archsetup"
-        return 1
-    fi
-
-    local settings_backend=$(ssh_cmd "grep 'org.freedesktop.impl.portal.Settings' $portals_conf 2>/dev/null | cut -d= -f2")
-    if [ "$settings_backend" = "none" ]; then
-        validation_fail "Settings portal disabled (set to 'none')"
-        attribute_issue "Settings portal disabled in portals.conf" "archsetup"
-        return 1
-    fi
-
-    # Query the portal for color-scheme (requires portal services running)
-    # Returns "v v u 1" for dark mode (1 = prefer-dark)
-    local color_scheme=$(ssh_cmd "sudo -u cjennings busctl --user call org.freedesktop.portal.Desktop /org/freedesktop/portal/desktop org.freedesktop.portal.Settings Read 'ss' 'org.freedesktop.appearance' 'color-scheme' 2>/dev/null | grep -o 'u [0-9]' | cut -d' ' -f2")
-
-    if [ "$color_scheme" = "1" ]; then
-        validation_pass "Settings portal returns dark mode (color-scheme=1)"
-    elif [ -z "$color_scheme" ] && ! ssh_cmd "pgrep -x Hyprland >/dev/null 2>&1"; then
-        # No compositor → no graphical session bus to query. A socket-activated
-        # xdg-desktop-portal process can exist even headless, so the compositor
-        # is the real precondition (same condition as the socket check). The
-        # conf-file checks above already validated what install controls.
-        validation_skip "No compositor running (headless) — portal query not applicable"
-    elif [ -z "$color_scheme" ]; then
-        validation_warn "Could not query Settings portal (portal may not be running)"
-    else
-        validation_fail "Settings portal not returning dark mode (color-scheme=$color_scheme, expected 1)"
-        attribute_issue "Settings portal not configured for dark mode" "archsetup"
-    fi
-}
-
-validate_window_manager() {
-    # Detect which desktop environment is installed and validate accordingly
-    if ssh_cmd "pacman -Q hyprland &>/dev/null"; then
-        section "Hyprland Desktop Environment"
-        validate_hyprland_tools
-        validate_hyprland_config
-        validate_hyprland_socket
-        validate_portal_dark_mode
-    elif ssh_cmd "test -f /usr/local/bin/dwm"; then
-        section "DWM Desktop Environment"
-        validate_suckless_tools
-    else
-        validation_warn "No window manager detected (DESKTOP_ENV=none?)"
-    fi
-}
-
-#-----------------------------------------------------------------------------
-# Essential Services Validations
-#-----------------------------------------------------------------------------
-
-validate_firewall() {
-    step "Checking if firewall (ufw) is enabled"
-    local status=$(ssh_cmd "systemctl is-enabled ufw.service 2>/dev/null || echo disabled")
-    if [ "$status" = "enabled" ]; then
-        validation_pass "UFW firewall is enabled"
-    else
-        validation_fail "UFW firewall not enabled"
-        attribute_issue "UFW not enabled" "archsetup"
-    fi
-}
-
-validate_dns_config() {
-    step "Checking DNS-over-TLS configuration"
-    if ssh_cmd "grep -q 'DNS=.*#' /etc/systemd/resolved.conf 2>/dev/null"; then
-        validation_pass "DNS-over-TLS configured"
-    else
-        validation_warn "DNS-over-TLS may not be configured"
-    fi
-}
-
-validate_avahi() {
-    step "Checking avahi-daemon status"
-    local status=$(ssh_cmd "systemctl is-enabled avahi-daemon.service 2>/dev/null || echo disabled")
-    if [ "$status" = "enabled" ]; then
-        validation_pass "avahi-daemon is enabled"
-
-        # Full-stack mDNS test: ping hostname.local. QEMU user-mode (slirp,
-        # 10.0.2.x) doesn't pass multicast, so mDNS genuinely can't resolve
-        # there — only run the ping on real networking.
-        if ssh_cmd "ip -4 addr show" 2>/dev/null | grep -q "10\.0\.2\."; then
-            validation_skip "mDNS ping not possible on slirp networking (no multicast)"
-        else
-            local hostname=$(ssh_cmd "hostname")
-            if ssh_cmd "ping -c 1 -W 2 ${hostname}.local" &>> "$LOGFILE"; then
-                validation_pass "mDNS working (${hostname}.local responds to ping)"
-            else
-                validation_warn "mDNS ping failed (avahi may need time to propagate)"
-            fi
-        fi
-    else
-        # This might be OK if avahi was pre-installed
-        validation_warn "avahi-daemon not enabled (may have been pre-configured)"
-    fi
-}
-
-validate_fail2ban() {
-    step "Checking fail2ban status"
-    local status=$(ssh_cmd "systemctl is-enabled fail2ban.service 2>/dev/null || echo disabled")
-    if [ "$status" = "enabled" ]; then
-        validation_pass "fail2ban is enabled"
-    else
-        validation_fail "fail2ban not enabled"
-        attribute_issue "fail2ban not enabled" "archsetup"
-    fi
-}
-
-validate_networkmanager() {
-    step "Checking NetworkManager status"
-    local status=$(ssh_cmd "systemctl is-enabled NetworkManager.service 2>/dev/null || echo disabled")
-    if [ "$status" = "enabled" ]; then
-        validation_pass "NetworkManager is enabled"
-        # Functional test
-        if ssh_cmd "nmcli general status" &>> "$LOGFILE"; then
-            validation_pass "NetworkManager is functional"
-        else
-            validation_warn "NetworkManager enabled but not responding"
-        fi
-    else
-        validation_fail "NetworkManager not enabled"
-        attribute_issue "NetworkManager not enabled" "archsetup"
-    fi
-}
-
-#-----------------------------------------------------------------------------
-# Service-Specific Validations
-#-----------------------------------------------------------------------------
-
-validate_all_services() {
-    section "Service Validations"
-
-    # Core services (always expected)
-    validate_service "sshd" "enabled" "active"
-    validate_service "systemd-resolved" "enabled" "active"
-    validate_service "ufw" "enabled" ""  # VM lacks iptables modules, can't be active
-    validate_service "fail2ban" "enabled" "active"
-    validate_service "NetworkManager" "enabled" "active"
-    validate_service "rngd" "enabled" "active"
-    validate_service "cronie" "enabled" ""
-    validate_service "atd" "enabled" ""
-
-    # Cron job: log cleanup
-    step "Checking log-cleanup cron job"
-    local crontab_entry=$(ssh_cmd "sudo -u cjennings crontab -l 2>/dev/null | grep log-cleanup")
-    if [ -n "$crontab_entry" ]; then
-        validation_pass "log-cleanup cron job installed"
-    else
-        validation_fail "log-cleanup cron job not in crontab"
-        attribute_issue "log-cleanup cron job missing from user crontab" "archsetup"
-    fi
-
-    # Timer services
-    validate_service "reflector.timer" "enabled" ""
-    validate_service "paccache.timer" "enabled" ""
-
-    # Optional services (warn if missing, don't fail)
-    validate_service_optional "avahi-daemon" "enabled"
-    validate_service_optional "bluetooth" "enabled"
-    validate_service_optional "cups" "enabled"
-    validate_service_optional "docker" "enabled"
-    validate_service_optional "tailscaled" "enabled"
-    # Syncthing uses user service (not system), check lingering is enabled.
-    # test -e, not ls: ls prints the path on success, so the old capture held
-    # "path\nyes" and could never equal "yes" — the check warned on every
-    # run, even with lingering correctly enabled.
-    step "Checking user lingering for syncthing"
-    local linger_enabled=$(ssh_cmd "test -e /var/lib/systemd/linger/cjennings && echo yes || echo no")
-    if [ "$linger_enabled" = "yes" ]; then
-        validation_pass "User lingering enabled for syncthing user service"
-    else
-        validation_warn "User lingering not enabled (syncthing may not autostart)"
-    fi
-
-    # Filesystem-specific
-    validate_zfs_services
-    validate_btrfs_services
-
-    # Functional tests
-    validate_service_functions
-}
-
-validate_service() {
-    local service="$1"
-    local expected_enabled="$2"  # "enabled" or ""
-    local expected_active="$3"   # "active" or ""
-
-    step "Checking $service"
-
-    if [ -n "$expected_enabled" ]; then
-        local enabled=$(ssh_cmd "systemctl is-enabled $service 2>/dev/null || echo disabled")
-        if [ "$enabled" = "enabled" ]; then
-            validation_pass "$service is enabled"
-        else
-            validation_fail "$service not enabled (got: $enabled)"
-            attribute_issue "$service not enabled" "archsetup"
-            return 1
-        fi
-    fi
-
-    if [ -n "$expected_active" ]; then
-        local active=$(ssh_cmd "systemctl is-active $service 2>/dev/null || echo inactive")
-        if [ "$active" = "active" ]; then
-            validation_pass "$service is active"
-        else
-            validation_fail "$service not active (got: $active)"
-            attribute_issue "$service not active" "archsetup"
-            return 1
-        fi
-    fi
-
-    return 0
-}
-
-validate_service_optional() {
-    local service="$1"
-    local expected_enabled="$2"
-
-    step "Checking optional service: $service"
-
-    local enabled=$(ssh_cmd "systemctl is-enabled $service 2>/dev/null || echo disabled")
-    if [ "$enabled" = "enabled" ]; then
-        validation_pass "$service is enabled"
-    else
-        validation_warn "$service not enabled (optional)"
-    fi
-}
-
-validate_zfs_services() {
-    # Only check if ZFS is installed
-    if ! ssh_cmd "which zfs" &>> "$LOGFILE"; then
-        return 0
-    fi
-
-    step "Checking ZFS-specific services"
-
-    validate_service_optional "sanoid.timer" "enabled"
-
-    # Check for zfs-scrub timer (pool name varies)
-    local scrub_enabled
-    scrub_enabled=$(ssh_cmd "systemctl list-unit-files 'zfs-scrub*' 2>/dev/null | grep -c enabled" | tr -d '[:space:]')
-    scrub_enabled=${scrub_enabled:-0}
-    if [ "$scrub_enabled" -gt 0 ]; then
-        validation_pass "ZFS scrub timer enabled"
-    else
-        validation_warn "ZFS scrub timer not found"
-    fi
-}
-
-validate_btrfs_services() {
-    # Only check if btrfs root
-    if ! ssh_cmd "mount | grep 'on / ' | grep -q btrfs"; then
-        return 0
-    fi
-
-    step "Checking btrfs-specific services"
-    validate_service_optional "grub-btrfsd" "enabled"
-}
-
-validate_service_functions() {
-    section "Service Functional Tests"
-
-    # UFW functional test
-    # NOTE: VM environment lacks iptables kernel modules, so UFW cannot activate.
-    # We only verify it's enabled; active status requires real hardware.
-    step "Testing UFW functionality"
-    local ufw_enabled
-    ufw_enabled=$(ssh_cmd "systemctl is-enabled ufw.service 2>/dev/null || echo disabled")
-    if [ "$ufw_enabled" = "enabled" ]; then
-        validation_pass "UFW is enabled (activation requires iptables kernel modules)"
-    else
-        validation_fail "UFW not enabled"
-        attribute_issue "UFW not enabled" "archsetup"
-    fi
-
-    # fail2ban functional test
-    step "Testing fail2ban functionality"
-    if ssh_cmd "fail2ban-client status" &>> "$LOGFILE"; then
-        validation_pass "fail2ban is responding"
-    else
-        validation_fail "fail2ban not responding"
-        attribute_issue "fail2ban not functioning" "archsetup"
-    fi
-
-    # DNS resolution test
-    step "Testing DNS resolution"
-    if ssh_cmd "resolvectl query archlinux.org" &>> "$LOGFILE"; then
-        validation_pass "DNS resolution working"
-    else
-        validation_warn "DNS resolution test failed (may be network issue)"
-    fi
-
-    # Docker functional test (if enabled)
-    if ssh_cmd "systemctl is-enabled docker" &>> "$LOGFILE"; then
-        step "Testing Docker functionality"
-        if ssh_cmd "docker info" &>> "$LOGFILE"; then
-            validation_pass "Docker is responding"
-        elif ! ssh_cmd "systemctl is-active --quiet docker"; then
-            # archsetup enables docker for next boot (enable, not enable --now,
-            # by design — the daemon is heavy). Validation runs pre-reboot, so
-            # enabled-but-not-started is the correct installed state.
-            validation_skip "Docker enabled but not started (starts on boot by design)"
-        else
-            validation_warn "Docker active but not responding"
-        fi
-    fi
-}
-
-#-----------------------------------------------------------------------------
-# Developer Tools Validations
-#-----------------------------------------------------------------------------
-
-validate_emacs() {
-    step "Checking if Emacs is installed"
-    if ssh_cmd "which emacs" &>> "$LOGFILE"; then
-        validation_pass "Emacs is installed"
-
-        # Check if config exists
-        if ssh_cmd "test -d /home/cjennings/.emacs.d"; then
-            validation_pass "Emacs config directory exists"
-
-            # Check user can access the directory
-            local result
-            result=$(ssh_cmd "sudo -u cjennings ls /home/cjennings/.emacs.d > /dev/null 2>&1 && echo OK || echo FAIL")
-            if [ "$result" = "OK" ]; then
-                validation_pass "Emacs config readable by user"
-            else
-                validation_fail "Emacs config not readable by user (permission issue)"
-                attribute_issue "Emacs .emacs.d not readable by user" "archsetup"
-            fi
-        else
-            validation_warn "Emacs config directory not found"
-        fi
-    else
-        validation_fail "Emacs not found"
-        attribute_issue "Emacs not installed" "archsetup"
-    fi
-}
-
-validate_git_config() {
-    step "Checking git installation"
-    if ssh_cmd "which git" &>> "$LOGFILE"; then
-        validation_pass "git is installed"
-    else
-        validation_fail "git not found"
-        attribute_issue "git not installed" "archsetup"
-    fi
-}
-
-validate_dev_tools() {
-    step "Checking developer tools"
-    local tools="python node npm go rustc"
-    local missing=""
-
-    for tool in $tools; do
-        if ! ssh_cmd "which $tool" &>> "$LOGFILE"; then
-            missing="$missing $tool"
-        fi
-    done
-
-    if [ -z "$missing" ]; then
-        validation_pass "Core dev tools installed"
-    else
-        validation_warn "Some dev tools missing:$missing"
-    fi
-}
-
-#-----------------------------------------------------------------------------
-# System Configuration Validations
-#-----------------------------------------------------------------------------
-
-validate_zfs_config() {
-    step "Checking ZFS configuration (if applicable)"
-    if ssh_cmd "which zfs" &>> "$LOGFILE"; then
-        # ZFS is installed, check for sanoid
-        if ssh_cmd "which sanoid" &>> "$LOGFILE"; then
-            validation_pass "ZFS with sanoid detected"
-        else
-            validation_warn "ZFS detected but sanoid not installed"
-        fi
-    else
-        info "ZFS not installed (non-ZFS system)"
-    fi
-}
-
-validate_boot_config() {
-    step "Checking GRUB configuration"
-    if ssh_cmd "test -f /boot/grub/grub.cfg" &>> "$LOGFILE"; then
-        validation_pass "GRUB config exists"
-    else
-        validation_warn "GRUB config not found (may use different bootloader)"
-    fi
-}
-
-validate_terminus_font() {
-    step "Checking terminus-font installation"
-    if ssh_cmd "pacman -Q terminus-font" &>> "$LOGFILE"; then
-        validation_pass "terminus-font package installed"
-    else
-        validation_fail "terminus-font package not installed"
-        attribute_issue "terminus-font not installed via pacman" "archsetup"
-    fi
-}
-
-validate_mkinitcpio_hooks() {
-    step "Checking mkinitcpio HOOKS configuration"
-    local hooks=$(ssh_cmd "grep '^HOOKS=' /etc/mkinitcpio.conf")
-    local is_zfs=$(ssh_cmd "findmnt -n -o FSTYPE / 2>/dev/null")
-
-    if [ "$is_zfs" = "zfs" ]; then
-        # ZFS system: must use udev, not systemd
-        if echo "$hooks" | grep -q '\budev\b'; then
-            validation_pass "ZFS system uses udev hook (correct)"
-        elif echo "$hooks" | grep -q '\bsystemd\b'; then
-            validation_fail "ZFS system uses systemd hook (will break boot)"
-            attribute_issue "mkinitcpio uses systemd hook on ZFS system" "archsetup"
-        else
-            validation_warn "Could not determine init hook type"
-        fi
-    else
-        # Non-ZFS: systemd hook is fine
-        if echo "$hooks" | grep -q '\bsystemd\b'; then
-            validation_pass "Non-ZFS system uses systemd hook"
-        elif echo "$hooks" | grep -q '\budev\b'; then
-            validation_pass "Non-ZFS system uses udev hook"
-        fi
-    fi
-}
-
-validate_initramfs_consolefont() {
-    step "Checking console font in initramfs"
-    local font_in_initramfs=$(ssh_cmd "lsinitcpio /boot/initramfs-linux*.img 2>/dev/null | grep -c 'consolefont.psf\\|ter-'")
-
-    if [ "${font_in_initramfs:-0}" -gt 0 ]; then
-        validation_pass "Console font included in initramfs"
-    else
-        validation_warn "Console font may not be in initramfs"
-    fi
-}
-
-validate_nvme_module() {
-    step "Checking NVMe module configuration"
-    local has_nvme=$(ssh_cmd "ls /dev/nvme* 2>/dev/null | head -1")
-
-    if [ -n "$has_nvme" ]; then
-        # System has NVMe drives
-        local modules=$(ssh_cmd "grep '^MODULES=' /etc/mkinitcpio.conf")
-        if echo "$modules" | grep -q 'nvme'; then
-            validation_pass "NVMe module in mkinitcpio MODULES"
-        else
-            validation_warn "NVMe system but nvme not in MODULES (may cause slow boot)"
-        fi
-    else
-        info "No NVMe drives detected"
-    fi
-}
-
-validate_autologin_config() {
-    step "Checking autologin configuration"
-    if ssh_cmd "test -f /etc/systemd/system/getty@tty1.service.d/autologin.conf" &>> "$LOGFILE"; then
-        validation_pass "Autologin configured"
-    else
-        info "Autologin not configured (may be intentional)"
-    fi
-}
-
-validate_gnome_keyring_setup() {
-    step "Checking gnome-keyring pre-configuration"
-    local keyring_dir="/home/cjennings/.local/share/keyrings"
-
-    # Check directory exists
-    if ! ssh_cmd "test -d $keyring_dir"; then
-        validation_fail "Keyring directory not created"
-        attribute_issue "gnome-keyring directory not pre-created" "archsetup"
-        return 1
-    fi
-
-    # Check directory permissions (should be 700)
-    local perms=$(ssh_cmd "stat -c '%a' $keyring_dir")
-    if [ "$perms" != "700" ]; then
-        validation_fail "Keyring directory has wrong permissions: $perms (expected 700)"
-        attribute_issue "gnome-keyring directory wrong permissions" "archsetup"
-        return 1
-    fi
-
-    # Check ownership
-    local owner=$(ssh_cmd "stat -c '%U' $keyring_dir")
-    if [ "$owner" != "cjennings" ]; then
-        validation_fail "Keyring directory owned by $owner (expected cjennings)"
-        attribute_issue "gnome-keyring directory wrong ownership" "archsetup"
-        return 1
-    fi
-
-    # Check default file exists and contains "login"
-    local default_keyring=$(ssh_cmd "cat $keyring_dir/default 2>/dev/null")
-    if [ "$default_keyring" != "login" ]; then
-        validation_fail "Default keyring not set to 'login' (got: '$default_keyring')"
-        attribute_issue "gnome-keyring default not set to login" "archsetup"
-        return 1
-    fi
-
-    validation_pass "gnome-keyring pre-configured (default=login, dir=700)"
-}
-
-#-----------------------------------------------------------------------------
-# Archsetup-Specific Validations
-#-----------------------------------------------------------------------------
-
-validate_archsetup_log() {
-    step "Checking archsetup log for errors"
-    local error_count
-    # Use grep -h to suppress filenames, then wc -l to count total matches
-    error_count=$(ssh_cmd "grep -h '^Error:' /var/log/archsetup-*.log 2>/dev/null | wc -l" | tr -d '[:space:]')
-    error_count=${error_count:-0}
-
-    if [ "$error_count" = "0" ]; then
-        validation_pass "No errors in archsetup log"
-    else
-        validation_fail "Found $error_count errors in archsetup log"
-        attribute_issue "Errors in archsetup log: $error_count" "archsetup"
-    fi
-}
-
-validate_state_markers() {
-    step "Checking archsetup state markers"
-    local state_count=$(ssh_cmd "ls /var/lib/archsetup/state/ 2>/dev/null | wc -l")
-
-    if [ "$state_count" -ge 12 ]; then
-        validation_pass "All 12 installation steps completed"
-    else
-        validation_warn "Only $state_count/12 steps completed"
-    fi
-}
-
-#=============================================================================
 # ISSUE REPORTING
 #=============================================================================
 
@@ -1138,18 +312,3 @@ EOF
     info "Issue report saved: $report_file"
 }
 
-#=============================================================================
-# MAIN VALIDATION ENTRY POINT
-#=============================================================================
-
-run_full_validation() {
-    local output_dir="$1"
-    local archzfs_inbox="${2:-}"
-
-    run_all_validations
-    analyze_log_diff "$output_dir"
-    generate_issue_report "$output_dir" "$archzfs_inbox"
-
-    # Return success if no failures
-    [ $VALIDATION_FAILED -eq 0 ]
-}
diff --git a/scripts/testing/lib/vm-utils.sh b/scripts/testing/lib/vm-utils.sh
index a8736a3..b85e773 100755
--- a/scripts/testing/lib/vm-utils.sh
+++ b/scripts/testing/lib/vm-utils.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
+# SPDX-License-Identifier: GPL-3.0-or-later
 # VM management utilities for archsetup testing (direct QEMU)
 # Author: Craig Jennings <craigmartinjennings@gmail.com>
 # License: GNU GPLv3
@@ -10,13 +11,26 @@
 
 # VM configuration defaults
 VM_CPUS="${VM_CPUS:-4}"
-VM_RAM="${VM_RAM:-4096}"  # MB
+# 8 GiB headroom for AUR builds: makepkg runs -j$VM_CPUS, and its parallel cc1plus
+# jobs (~700 MB each on heavy C++ packages) were OOM-killed under the old 4 GiB default.
+VM_RAM="${VM_RAM:-8192}"  # MB
 VM_DISK_SIZE="${VM_DISK_SIZE:-50}"  # GB
 
+# Filesystem profile: selects which base image + archangel config the harness
+# targets. "btrfs" is the historical default (its image name stays unsuffixed
+# so existing base images keep working); "zfs" gets its own image, since the
+# two on-disk layouts can't share a disk. Honoured by init_vm_paths below.
+FS_PROFILE="${FS_PROFILE:-btrfs}"
+
 # SSH configuration
 SSH_PORT="${SSH_PORT:-2222}"
 SSH_OPTS="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=10"
 ROOT_PASSWORD="${ROOT_PASSWORD:-archsetup}"
+# Set by inject_root_key once a root key is authorized in the VM. When set, the
+# ssh/scp helpers add "-i <key>" so they keep working after archsetup hardens
+# sshd to PermitRootLogin prohibit-password (which kills root *password* login
+# but still allows key auth). Left unquoted at use sites, like SSH_OPTS.
+SSH_KEY_OPT="${SSH_KEY_OPT:-}"
 
 # OVMF firmware paths
 OVMF_CODE="/usr/share/edk2/x64/OVMF_CODE.4m.fd"
@@ -36,9 +50,22 @@ init_vm_paths() {
     local images_dir="${1:-$VM_IMAGES_DIR}"
     [ -z "$images_dir" ] && fatal "VM_IMAGES_DIR not set"
 
+    case "$FS_PROFILE" in
+        btrfs|zfs) ;;
+        *) fatal "Invalid FS_PROFILE: $FS_PROFILE (must be 'btrfs' or 'zfs')" ;;
+    esac
+
     VM_IMAGES_DIR="$images_dir"
-    DISK_PATH="$VM_IMAGES_DIR/archsetup-base.qcow2"
-    OVMF_VARS="$VM_IMAGES_DIR/OVMF_VARS.fd"
+    # btrfs keeps the legacy unsuffixed name; other profiles get a suffix so
+    # their images sit side by side without clobbering each other.
+    local img_suffix=""
+    [ "$FS_PROFILE" != "btrfs" ] && img_suffix="-$FS_PROFILE"
+    DISK_PATH="$VM_IMAGES_DIR/archsetup-base${img_suffix}.qcow2"
+    # Per-profile NVRAM: UEFI boot entries live here, outside the qcow2, so a
+    # disk-snapshot revert can't restore them. Sharing one file across profiles
+    # let a zfs run's ZFSBootMenu entries clobber the btrfs GRUB entry, leaving
+    # the btrfs base unbootable (no removable ESP fallback to recover from).
+    OVMF_VARS="$VM_IMAGES_DIR/OVMF_VARS${img_suffix}.fd"
     PID_FILE="$VM_IMAGES_DIR/qemu.pid"
     MONITOR_SOCK="$VM_IMAGES_DIR/qemu-monitor.sock"
     SERIAL_LOG="$VM_IMAGES_DIR/qemu-serial.log"
@@ -350,7 +377,7 @@ wait_for_ssh() {
 
     progress "Waiting for SSH on localhost:$SSH_PORT..."
     while [ "$elapsed" -lt "$timeout" ]; do
-        if sshpass -p "$password" ssh $SSH_OPTS -p "$SSH_PORT" root@localhost true 2>/dev/null; then
+        if sshpass -p "$password" ssh $SSH_OPTS $SSH_KEY_OPT -p "$SSH_PORT" root@localhost true 2>/dev/null; then
             success "SSH is available"
             return 0
         fi
@@ -366,7 +393,7 @@ wait_for_ssh() {
 vm_exec() {
     local password="${1:-$ROOT_PASSWORD}"
     shift
-    sshpass -p "$password" ssh $SSH_OPTS \
+    sshpass -p "$password" ssh $SSH_OPTS $SSH_KEY_OPT \
         -o ServerAliveInterval=30 -o ServerAliveCountMax=10 \
         -p "$SSH_PORT" root@localhost "$@" 2>> "$LOGFILE"
 }
@@ -378,7 +405,7 @@ copy_to_vm() {
     local password="${3:-$ROOT_PASSWORD}"
 
     step "Copying $(basename "$local_file") to VM:$remote_path"
-    if sshpass -p "$password" scp $SSH_OPTS -P "$SSH_PORT" \
+    if sshpass -p "$password" scp $SSH_OPTS $SSH_KEY_OPT -P "$SSH_PORT" \
         "$local_file" "root@localhost:$remote_path" >> "$LOGFILE" 2>&1; then
         success "File copied to VM"
         return 0
@@ -395,7 +422,7 @@ copy_from_vm() {
     local password="${3:-$ROOT_PASSWORD}"
 
     step "Copying $remote_file from VM"
-    if sshpass -p "$password" scp $SSH_OPTS -P "$SSH_PORT" \
+    if sshpass -p "$password" scp $SSH_OPTS $SSH_KEY_OPT -P "$SSH_PORT" \
         "root@localhost:$remote_file" "$local_path" >> "$LOGFILE" 2>&1; then
         success "File copied from VM"
         return 0
@@ -404,3 +431,55 @@ copy_from_vm() {
         return 1
     fi
 }
+
+# inject_root_key <key_path>
+# Authorize a throwaway root key over the initial password session and switch
+# all the helpers above to key auth (sets SSH_KEY_OPT + ROOT_SSH_KEY). Call once,
+# right after wait_for_ssh and before running archsetup: archsetup sets
+# PermitRootLogin prohibit-password and reloads sshd partway through, which kills
+# root *password* login. Without a key in place first, every SSH after that step
+# fails and the run aborts before any validation. Key auth survives the hardening.
+# Targets root@$VM_IP on $SSH_PORT so it works for both the local VM runner
+# (VM_IP=localhost, port 2222) and the bare-metal runner (VM_IP=host, port 22).
+#
+# Arguments: $1 - path for the new private key ("$1.pub" is written beside it).
+#            Any existing pair there is deleted first. Must be non-empty and
+#            free of whitespace, because SSH_KEY_OPT is expanded unquoted.
+# Globals:   reads ROOT_PASSWORD, SSH_OPTS, SSH_PORT, VM_IP, LOGFILE;
+#            sets SSH_KEY_OPT and exports ROOT_SSH_KEY, on success only.
+# Returns:   0 once the key is authorized; 1 (after a warning) on a bad path,
+#            a key-generation failure, or an authorization failure.
+inject_root_key() {
+    local key="${1:-}"
+
+    # Reject unusable paths up front: an empty one would make the rm below
+    # target "./.pub", and whitespace would split the unquoted $SSH_KEY_OPT.
+    case "$key" in
+        ""|*[[:space:]]*)
+            warn "Root key path is empty or contains whitespace - run may break at sshd hardening"
+            return 1
+            ;;
+    esac
+
+    # Drop any stale pair from an earlier run so a fresh key is generated.
+    rm -f -- "$key" "$key.pub"
+    if ! ssh-keygen -t ed25519 -N "" -q -f "$key"; then
+        warn "Root key generation failed - run may break at sshd hardening"
+        return 1
+    fi
+
+    # Feed the public key on stdin and append it to root's authorized_keys
+    # (700 on the directory, 600 on the file); session output goes to $LOGFILE.
+    if sshpass -p "$ROOT_PASSWORD" ssh $SSH_OPTS -p "$SSH_PORT" "root@${VM_IP:-localhost}" \
+        "mkdir -p /root/.ssh && chmod 700 /root/.ssh && cat >> /root/.ssh/authorized_keys && chmod 600 /root/.ssh/authorized_keys" \
+        < "$key.pub" >> "$LOGFILE" 2>&1; then
+        # IdentitiesOnly keeps ssh from offering agent/default keys first, which
+        # can use up the server's auth attempts before this key is tried.
+        SSH_KEY_OPT="-i $key -o IdentitiesOnly=yes"
+        export ROOT_SSH_KEY="$key"
+        success "Root SSH key authorized (survives sshd prohibit-password hardening)"
+        return 0
+    fi
+    warn "Root key authorization failed - run may break at sshd hardening"
+    return 1
+}