fix(testing): authorize a root key so make test survives sshd hardening

The VM test SSHes into the guest as root with a password for the whole run. archsetup hardens sshd to PermitRootLogin prohibit-password and reloads it partway through the install, so every SSH connection after that step failed with "Permission denied" and the run aborted before any validation ran — make test had been silently broken since the hardening landed. inject_root_key now authorizes a throwaway root key right after the first successful SSH (before archsetup runs), and the ssh/scp helpers add -i <key> via SSH_KEY_OPT. prohibit-password still allows root key auth, so the harness survives the very hardening it validates. The helpers still pass the password through sshpass as a fallback (it only works until sshd is hardened), so the change is additive.
author: Craig Jennings <c@cjennings.net> 2026-06-25 01:24:33 -0400
committer: Craig Jennings <c@cjennings.net> 2026-06-25 01:24:33 -0400
commit: f50fc1def85c1dbbb0ec781be4071b7ec9285785 (patch)
tree: 256b852c91a0a9289d130fcd8e79f5146b73c6cf
parent: 3cac3b3dfcd432395201a309920c2491ee9caf01 (diff)
download: archsetup-f50fc1def85c1dbbb0ec781be4071b7ec9285785.tar.gz
archsetup-f50fc1def85c1dbbb0ec781be4071b7ec9285785.zip
5 files changed, 65 insertions, 19 deletions
diff --git a/scripts/testing/lib/testinfra.sh b/scripts/testing/lib/testinfra.sh
index 0db0ec9..bfcd43a 100644
--- a/scripts/testing/lib/testinfra.sh
+++ b/scripts/testing/lib/testinfra.sh
@@ -32,20 +32,26 @@ run_testinfra_validation() {
 
     step "Running Testinfra validation sweep (advisory)"
 
-    # Ephemeral keypair; authorize the pubkey in the VM over the existing channel.
-    rm -f "$key" "$key.pub"
-    if ! ssh-keygen -t ed25519 -N "" -q -f "$key"; then
-        warn "testinfra: ssh-keygen failed - skipping"
-        return 0
-    fi
-    if ! copy_to_vm "$key.pub" "/tmp/testinfra_key.pub" "$ROOT_PASSWORD"; then
-        warn "testinfra: pubkey copy failed - skipping"
-        return 0
-    fi
-    if ! vm_exec "$ROOT_PASSWORD" \
-        "mkdir -p /root/.ssh && chmod 700 /root/.ssh && cat /tmp/testinfra_key.pub >> /root/.ssh/authorized_keys && chmod 600 /root/.ssh/authorized_keys"; then
-        warn "testinfra: authorizing key in VM failed - skipping"
-        return 0
+    # Prefer the root key the harness already authorized (inject_root_key). It
+    # survives the sshd prohibit-password hardening, so reuse it rather than
+    # authorizing a second key. Fall back to minting our own for standalone use.
+    if [ -n "${ROOT_SSH_KEY:-}" ] && [ -f "${ROOT_SSH_KEY}" ]; then
+        key="$ROOT_SSH_KEY"
+    else
+        rm -f "$key" "$key.pub"
+        if ! ssh-keygen -t ed25519 -N "" -q -f "$key"; then
+            warn "testinfra: ssh-keygen failed - skipping"
+            return 0
+        fi
+        if ! copy_to_vm "$key.pub" "/tmp/testinfra_key.pub" "$ROOT_PASSWORD"; then
+            warn "testinfra: pubkey copy failed - skipping"
+            return 0
+        fi
+        if ! vm_exec "$ROOT_PASSWORD" \
+            "mkdir -p /root/.ssh && chmod 700 /root/.ssh && cat /tmp/testinfra_key.pub >> /root/.ssh/authorized_keys && chmod 600 /root/.ssh/authorized_keys"; then
+            warn "testinfra: authorizing key in VM failed - skipping"
+            return 0
+        fi
     fi
 
     # ssh-config so testinfra connects key-only, no host-key prompt.
diff --git a/scripts/testing/lib/validation.sh b/scripts/testing/lib/validation.sh
index 6855da7..900d675 100644
--- a/scripts/testing/lib/validation.sh
+++ b/scripts/testing/lib/validation.sh
@@ -21,7 +21,7 @@ declare -a UNKNOWN_ISSUES
 # SSH helper (uses globals: VM_IP, ROOT_PASSWORD)
 ssh_cmd() {
     sshpass -p "$ROOT_PASSWORD" ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
-        -o ConnectTimeout=10 -p "${SSH_PORT:-22}" "root@$VM_IP" "$@" 2>/dev/null
+        -o ConnectTimeout=10 ${SSH_KEY_OPT:-} -p "${SSH_PORT:-22}" "root@$VM_IP" "$@" 2>/dev/null
 }
 
 # Validation result helpers
diff --git a/scripts/testing/lib/vm-utils.sh b/scripts/testing/lib/vm-utils.sh
index d029d58..f86e583 100755
--- a/scripts/testing/lib/vm-utils.sh
+++ b/scripts/testing/lib/vm-utils.sh
@@ -18,6 +18,11 @@ VM_DISK_SIZE="${VM_DISK_SIZE:-50}"  # GB
 SSH_PORT="${SSH_PORT:-2222}"
 SSH_OPTS="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=10"
 ROOT_PASSWORD="${ROOT_PASSWORD:-archsetup}"
+# Set by inject_root_key once a root key is authorized in the VM. When set, the
+# ssh/scp helpers add "-i <key>" so they keep working after archsetup hardens
+# sshd to PermitRootLogin prohibit-password (which kills root *password* login
+# but still allows key auth). Left unquoted at use sites, like SSH_OPTS.
+SSH_KEY_OPT="${SSH_KEY_OPT:-}"
 
 # OVMF firmware paths
 OVMF_CODE="/usr/share/edk2/x64/OVMF_CODE.4m.fd"
@@ -351,7 +356,7 @@ wait_for_ssh() {
 
     progress "Waiting for SSH on localhost:$SSH_PORT..."
     while [ "$elapsed" -lt "$timeout" ]; do
-        if sshpass -p "$password" ssh $SSH_OPTS -p "$SSH_PORT" root@localhost true 2>/dev/null; then
+        if sshpass -p "$password" ssh $SSH_OPTS $SSH_KEY_OPT -p "$SSH_PORT" root@localhost true 2>/dev/null; then
             success "SSH is available"
             return 0
         fi
@@ -367,7 +372,7 @@ wait_for_ssh() {
 vm_exec() {
     local password="${1:-$ROOT_PASSWORD}"
     shift
-    sshpass -p "$password" ssh $SSH_OPTS \
+    sshpass -p "$password" ssh $SSH_OPTS $SSH_KEY_OPT \
         -o ServerAliveInterval=30 -o ServerAliveCountMax=10 \
         -p "$SSH_PORT" root@localhost "$@" 2>> "$LOGFILE"
 }
@@ -379,7 +384,7 @@ copy_to_vm() {
     local password="${3:-$ROOT_PASSWORD}"
 
     step "Copying $(basename "$local_file") to VM:$remote_path"
-    if sshpass -p "$password" scp $SSH_OPTS -P "$SSH_PORT" \
+    if sshpass -p "$password" scp $SSH_OPTS $SSH_KEY_OPT -P "$SSH_PORT" \
         "$local_file" "root@localhost:$remote_path" >> "$LOGFILE" 2>&1; then
         success "File copied to VM"
         return 0
@@ -396,7 +401,7 @@ copy_from_vm() {
     local password="${3:-$ROOT_PASSWORD}"
 
     step "Copying $remote_file from VM"
-    if sshpass -p "$password" scp $SSH_OPTS -P "$SSH_PORT" \
+    if sshpass -p "$password" scp $SSH_OPTS $SSH_KEY_OPT -P "$SSH_PORT" \
         "root@localhost:$remote_file" "$local_path" >> "$LOGFILE" 2>&1; then
         success "File copied from VM"
         return 0
@@ -405,3 +410,29 @@ copy_from_vm() {
         return 1
     fi
 }
+
+# inject_root_key <key_path>
+# Generate a throwaway ed25519 key at <key_path>, authorize it for root over the
+# initial password session, and switch all the helpers above to key auth by
+# setting SSH_KEY_OPT and exporting ROOT_SSH_KEY. Call once, right after
+# wait_for_ssh and before running archsetup: archsetup sets PermitRootLogin
+# prohibit-password and reloads sshd partway through, which kills root
+# *password* login. Without a key in place first, every SSH after that step
+# fails and the run aborts before any validation. Key auth survives the hardening.
+# Globals:   ROOT_PASSWORD, SSH_OPTS, SSH_PORT, LOGFILE (read);
+#            SSH_KEY_OPT, ROOT_SSH_KEY (written on success only)
+# Returns:   0 once the key is authorized; 1 (after a warn) on any failure, in
+#            which case SSH_KEY_OPT is left untouched and password auth is used.
+inject_root_key() {
+    local key="${1:-}"
+    # SSH_KEY_OPT is expanded unquoted at its use sites, so an empty path or one
+    # containing whitespace would be word-split into stray ssh/scp arguments and
+    # break every helper. Refuse it here and stay on password auth instead.
+    case "$key" in
+        "" | *[[:space:]]*)
+            warn "Root key path is empty or contains whitespace - run may break at sshd hardening"
+            return 1
+            ;;
+    esac
+    rm -f -- "$key" "$key.pub"
+    if ! ssh-keygen -t ed25519 -N "" -q -f "$key"; then
+        warn "Root key generation failed - run may break at sshd hardening"
+        return 1
+    fi
+    # The public key travels on stdin and is appended remotely by cat, so no
+    # separate scp step is needed.
+    if ! sshpass -p "$ROOT_PASSWORD" ssh $SSH_OPTS -p "$SSH_PORT" root@localhost \
+        "mkdir -p /root/.ssh && chmod 700 /root/.ssh && cat >> /root/.ssh/authorized_keys && chmod 600 /root/.ssh/authorized_keys" \
+        < "$key.pub" >> "$LOGFILE" 2>&1; then
+        warn "Root key authorization failed - run may break at sshd hardening"
+        return 1
+    fi
+    # IdentitiesOnly stops ssh from offering agent keys ahead of the injected
+    # key, which could otherwise exhaust the server's MaxAuthTries first.
+    SSH_KEY_OPT="-i $key -o IdentitiesOnly=yes"
+    export ROOT_SSH_KEY="$key"
+    success "Root SSH key authorized (survives sshd prohibit-password hardening)"
+    return 0
+}
diff --git a/scripts/testing/run-test.sh b/scripts/testing/run-test.sh
index 314097a..90022d3 100755
--- a/scripts/testing/run-test.sh
+++ b/scripts/testing/run-test.sh
@@ -142,6 +142,13 @@ start_qemu "$DISK_PATH" "disk" "" "none" || fatal "Failed to start VM"
 wait_for_ssh "$ROOT_PASSWORD" 120 || fatal "VM SSH not available"
 stop_timer "boot"
 
+# Authorize a root key now, before archsetup runs. archsetup hardens sshd to
+# PermitRootLogin prohibit-password partway through, which breaks the harness's
+# root password SSH; key auth survives it. Without this, the run aborts mid-way
+# (before any validation) once the hardening step lands.
+inject_root_key "$TEST_RESULTS_DIR/root_key" || \
+    warn "Continuing without root key - run may break at the sshd hardening step"
+
 # Run network diagnostics
 if ! run_network_diagnostics; then
     fatal "Network diagnostics failed - aborting test"
diff --git a/todo.org b/todo.org
index e0da8e6..8329094 100644
--- a/todo.org
+++ b/todo.org
@@ -542,6 +542,8 @@ Reviewed against the existing harness: =scripts/testing/lib/validation.sh= alrea
 Built the Testinfra harness skeleton: =scripts/testing/tests/= (conftest.py with the attribution marker + report hook + =target_user= fixture; 3 parity checks — user exists/shell, ufw enabled, dotfiles stowed+readable), =scripts/testing/lib/testinfra.sh= (=run_testinfra_validation=: ephemeral-key injection, ssh-config, pytest-over-SSH; advisory + non-fatal, =RUN_TESTINFRA= toggle), wired into run-test.sh after the shell sweep, and added =python-pytest python-pytest-testinfra= to =make deps=. Verified on host: py_compile clean, =pytest --collect-only= green in a throwaway venv (4 tests, fixtures resolve), =bash -n= + shellcheck clean, unit suite still green. Integration (the pytest sweep actually running against a VM) is unverified here — needs a =make test= run. Decisions locked: inject test key; run both through parity; full expansion (P4) in this task after the P3 cutover.
 *** 2026-06-25 Thu @ 01:12:09 -0400 P2 full parity port (88 tests)
 Ported the whole shell sweep to pytest: test_users (exists/shell/15 groups parametrized), test_packages (yay+functional, pacman, terminus-font, emacs+config readable, git, 5 dev tools), test_services (required enabled/active, enabled-only, timers, optional skip-if-absent, DoT drop-in, fail2ban/nmcli responds, log-cleanup cron, syncthing lingering, DNS/mDNS/docker skips), test_desktop (Hyprland tools+configs+portal+socket gated on install/compositor, DWM suckless, autologin), test_boot (grub, mkinitcpio hooks branched on zfs_root, console-font-in-initramfs, nvme gated, zfs/sanoid), test_keyring (dir 700/owner/default=login), test_archsetup (log no Error:, ≥12 state markers). conftest fixtures: target_user/home/zfs_root/has_nvme/hyprland_installed/dwm_installed/compositor_running/on_slirp. 88 tests collected, py_compile clean. Correctness fix vs the shell sweep: check =awww= not the stale =swww=. Installed python-pytest-testinfra on velox so the harness gate passes. Next: VM run to diff pytest vs shell sweep for parity.
+*** 2026-06-25 Thu @ 01:24:11 -0400 Fixed: sshd hardening had silently broken =make test=
+VM run #1 aborted ~6 min in (Error 5), before any validation ran. Root cause (pre-existing, not the Testinfra work): the 2026-06-24 sshd hardening sets =PermitRootLogin prohibit-password= + reloads sshd mid-install, and the harness SSHes as root by *password* throughout — so every op after that step got "Permission denied" and run-test.sh fataled before validations. Fix: =inject_root_key= authorizes a throwaway root key right after first SSH (before archsetup runs) and all helpers (=wait_for_ssh=/=vm_exec=/=copy_to_vm=/=copy_from_vm=/=ssh_cmd=) gained =$SSH_KEY_OPT= so they use key auth, which =prohibit-password= still allows. testinfra.sh reuses that key. Additive (password stays as fallback). bash -n + shellcheck clean. Re-running the VM suite to confirm it now reaches the validation + pytest phases.
 Create comprehensive integration tests using Testinfra (Python + pytest) to validate archsetup installations
 
 Tests should cover:
author	Craig Jennings <c@cjennings.net>	2026-06-25 01:24:33 -0400
committer	Craig Jennings <c@cjennings.net>	2026-06-25 01:24:33 -0400
commit	f50fc1def85c1dbbb0ec781be4071b7ec9285785 (patch)
tree	256b852c91a0a9289d130fcd8e79f5146b73c6cf
parent	3cac3b3dfcd432395201a309920c2491ee9caf01 (diff)
download	archsetup-f50fc1def85c1dbbb0ec781be4071b7ec9285785.tar.gz archsetup-f50fc1def85c1dbbb0ec781be4071b7ec9285785.zip