From f50fc1def85c1dbbb0ec781be4071b7ec9285785 Mon Sep 17 00:00:00 2001 From: Craig Jennings Date: Thu, 25 Jun 2026 01:24:33 -0400 Subject: fix(testing): authorize a root key so make test survives sshd hardening MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The VM test SSHes into the guest as root with a password for the whole run. archsetup hardens sshd to PermitRootLogin prohibit-password and reloads it partway through the install, so every SSH after that step failed with "Permission denied" and the run aborted before any validation — make test had been silently broken since the hardening landed. inject_root_key authorizes a throwaway root key right after the first SSH (before archsetup runs) and the ssh/scp helpers now add -i via SSH_KEY_OPT. prohibit-password still allows root key auth, so the harness survives the very hardening it validates. Password stays as the fallback, so the change is additive. --- scripts/testing/lib/testinfra.sh | 34 ++++++++++++++++++++-------------- scripts/testing/lib/validation.sh | 2 +- scripts/testing/lib/vm-utils.sh | 39 +++++++++++++++++++++++++++++++++++---- scripts/testing/run-test.sh | 7 +++++++ todo.org | 2 ++ 5 files changed, 65 insertions(+), 19 deletions(-) diff --git a/scripts/testing/lib/testinfra.sh b/scripts/testing/lib/testinfra.sh index 0db0ec9..bfcd43a 100644 --- a/scripts/testing/lib/testinfra.sh +++ b/scripts/testing/lib/testinfra.sh @@ -32,20 +32,26 @@ run_testinfra_validation() { step "Running Testinfra validation sweep (advisory)" - # Ephemeral keypair; authorize the pubkey in the VM over the existing channel. - rm -f "$key" "$key.pub" - if ! ssh-keygen -t ed25519 -N "" -q -f "$key"; then - warn "testinfra: ssh-keygen failed - skipping" - return 0 - fi - if ! copy_to_vm "$key.pub" "/tmp/testinfra_key.pub" "$ROOT_PASSWORD"; then - warn "testinfra: pubkey copy failed - skipping" - return 0 - fi - if ! vm_exec "$ROOT_PASSWORD" \ - "mkdir -p /root/.ssh && chmod 700 /root/.ssh && cat /tmp/testinfra_key.pub >> /root/.ssh/authorized_keys && chmod 600 /root/.ssh/authorized_keys"; then - warn "testinfra: authorizing key in VM failed - skipping" - return 0 + # Prefer the root key the harness already authorized (inject_root_key). It + # survives the sshd prohibit-password hardening, so reuse it rather than + # authorizing a second key. Fall back to minting our own for standalone use. + if [ -n "${ROOT_SSH_KEY:-}" ] && [ -f "${ROOT_SSH_KEY}" ]; then + key="$ROOT_SSH_KEY" + else + rm -f "$key" "$key.pub" + if ! ssh-keygen -t ed25519 -N "" -q -f "$key"; then + warn "testinfra: ssh-keygen failed - skipping" + return 0 + fi + if ! copy_to_vm "$key.pub" "/tmp/testinfra_key.pub" "$ROOT_PASSWORD"; then + warn "testinfra: pubkey copy failed - skipping" + return 0 + fi + if ! vm_exec "$ROOT_PASSWORD" \ + "mkdir -p /root/.ssh && chmod 700 /root/.ssh && cat /tmp/testinfra_key.pub >> /root/.ssh/authorized_keys && chmod 600 /root/.ssh/authorized_keys"; then + warn "testinfra: authorizing key in VM failed - skipping" + return 0 + fi fi # ssh-config so testinfra connects key-only, no host-key prompt. diff --git a/scripts/testing/lib/validation.sh b/scripts/testing/lib/validation.sh index 6855da7..900d675 100644 --- a/scripts/testing/lib/validation.sh +++ b/scripts/testing/lib/validation.sh @@ -21,7 +21,7 @@ declare -a UNKNOWN_ISSUES # SSH helper (uses globals: VM_IP, ROOT_PASSWORD) ssh_cmd() { sshpass -p "$ROOT_PASSWORD" ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ - -o ConnectTimeout=10 -p "${SSH_PORT:-22}" "root@$VM_IP" "$@" 2>/dev/null + -o ConnectTimeout=10 ${SSH_KEY_OPT:-} -p "${SSH_PORT:-22}" "root@$VM_IP" "$@" 2>/dev/null } # Validation result helpers diff --git a/scripts/testing/lib/vm-utils.sh b/scripts/testing/lib/vm-utils.sh index d029d58..f86e583 100755 --- a/scripts/testing/lib/vm-utils.sh +++ b/scripts/testing/lib/vm-utils.sh @@ -18,6 +18,11 @@ VM_DISK_SIZE="${VM_DISK_SIZE:-50}" # GB SSH_PORT="${SSH_PORT:-2222}" SSH_OPTS="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=10" ROOT_PASSWORD="${ROOT_PASSWORD:-archsetup}" +# Set by inject_root_key once a root key is authorized in the VM. When set, the +# ssh/scp helpers add "-i " so they keep working after archsetup hardens +# sshd to PermitRootLogin prohibit-password (which kills root *password* login +# but still allows key auth). Left unquoted at use sites, like SSH_OPTS. +SSH_KEY_OPT="${SSH_KEY_OPT:-}" # OVMF firmware paths OVMF_CODE="/usr/share/edk2/x64/OVMF_CODE.4m.fd" @@ -351,7 +356,7 @@ wait_for_ssh() { progress "Waiting for SSH on localhost:$SSH_PORT..." while [ "$elapsed" -lt "$timeout" ]; do - if sshpass -p "$password" ssh $SSH_OPTS -p "$SSH_PORT" root@localhost true 2>/dev/null; then + if sshpass -p "$password" ssh $SSH_OPTS $SSH_KEY_OPT -p "$SSH_PORT" root@localhost true 2>/dev/null; then success "SSH is available" return 0 fi @@ -367,7 +372,7 @@ wait_for_ssh() { vm_exec() { local password="${1:-$ROOT_PASSWORD}" shift - sshpass -p "$password" ssh $SSH_OPTS \ + sshpass -p "$password" ssh $SSH_OPTS $SSH_KEY_OPT \ -o ServerAliveInterval=30 -o ServerAliveCountMax=10 \ -p "$SSH_PORT" root@localhost "$@" 2>> "$LOGFILE" } @@ -379,7 +384,7 @@ copy_to_vm() { local password="${3:-$ROOT_PASSWORD}" step "Copying $(basename "$local_file") to VM:$remote_path" - if sshpass -p "$password" scp $SSH_OPTS -P "$SSH_PORT" \ + if sshpass -p "$password" scp $SSH_OPTS $SSH_KEY_OPT -P "$SSH_PORT" \ "$local_file" "root@localhost:$remote_path" >> "$LOGFILE" 2>&1; then success "File copied to VM" return 0 @@ -396,7 +401,7 @@ copy_from_vm() { local password="${3:-$ROOT_PASSWORD}" step "Copying $remote_file from VM" - if sshpass -p "$password" scp $SSH_OPTS -P "$SSH_PORT" \ + if sshpass -p "$password" scp $SSH_OPTS $SSH_KEY_OPT -P "$SSH_PORT" \ "root@localhost:$remote_file" "$local_path" >> "$LOGFILE" 2>&1; then success "File copied from VM" return 0 @@ -405,3 +410,29 @@ copy_from_vm() { return 1 fi } + +# inject_root_key +# Authorize a throwaway root key over the initial password session and switch +# all the helpers above to key auth (sets SSH_KEY_OPT + ROOT_SSH_KEY). Call once, +# right after wait_for_ssh and before running archsetup: archsetup sets +# PermitRootLogin prohibit-password and reloads sshd partway through, which kills +# root *password* login. Without a key in place first, every SSH after that step +# fails and the run aborts before any validation. Key auth survives the hardening. +inject_root_key() { + local key="$1" + rm -f "$key" "$key.pub" + if ! ssh-keygen -t ed25519 -N "" -q -f "$key"; then + warn "Root key generation failed - run may break at sshd hardening" + return 1 + fi + if sshpass -p "$ROOT_PASSWORD" ssh $SSH_OPTS -p "$SSH_PORT" root@localhost \ + "mkdir -p /root/.ssh && chmod 700 /root/.ssh && cat >> /root/.ssh/authorized_keys && chmod 600 /root/.ssh/authorized_keys" \ + < "$key.pub" >> "$LOGFILE" 2>&1; then + SSH_KEY_OPT="-i $key" + export ROOT_SSH_KEY="$key" + success "Root SSH key authorized (survives sshd prohibit-password hardening)" + return 0 + fi + warn "Root key authorization failed - run may break at sshd hardening" + return 1 +} diff --git a/scripts/testing/run-test.sh b/scripts/testing/run-test.sh index 314097a..90022d3 100755 --- a/scripts/testing/run-test.sh +++ b/scripts/testing/run-test.sh @@ -142,6 +142,13 @@ start_qemu "$DISK_PATH" "disk" "" "none" || fatal "Failed to start VM" wait_for_ssh "$ROOT_PASSWORD" 120 || fatal "VM SSH not available" stop_timer "boot" +# Authorize a root key now, before archsetup runs. archsetup hardens sshd to +# PermitRootLogin prohibit-password partway through, which breaks the harness's +# root password SSH; key auth survives it. Without this, the run aborts mid-way +# (before any validation) once the hardening step lands. +inject_root_key "$TEST_RESULTS_DIR/root_key" || \ + warn "Continuing without root key - run may break at the sshd hardening step" + # Run network diagnostics if ! run_network_diagnostics; then fatal "Network diagnostics failed - aborting test" diff --git a/todo.org b/todo.org index e0da8e6..8329094 100644 --- a/todo.org +++ b/todo.org @@ -542,6 +542,8 @@ Reviewed against the existing harness: =scripts/testing/lib/validation.sh= alrea Built the Testinfra harness skeleton: =scripts/testing/tests/= (conftest.py with the attribution marker + report hook + =target_user= fixture; 3 parity checks — user exists/shell, ufw enabled, dotfiles stowed+readable), =scripts/testing/lib/testinfra.sh= (=run_testinfra_validation=: ephemeral-key injection, ssh-config, pytest-over-SSH; advisory + non-fatal, =RUN_TESTINFRA= toggle), wired into run-test.sh after the shell sweep, and added =python-pytest python-pytest-testinfra= to =make deps=. Verified on host: py_compile clean, =pytest --collect-only= green in a throwaway venv (4 tests, fixtures resolve), =bash -n= + shellcheck clean, unit suite still green. Integration (the pytest sweep actually running against a VM) is unverified here — needs a =make test= run. Decisions locked: inject test key; run both through parity; full expansion (P4) in this task after the P3 cutover. *** 2026-06-25 Thu @ 01:12:09 -0400 P2 full parity port (88 tests) Ported the whole shell sweep to pytest: test_users (exists/shell/15 groups parametrized), test_packages (yay+functional, pacman, terminus-font, emacs+config readable, git, 5 dev tools), test_services (required enabled/active, enabled-only, timers, optional skip-if-absent, DoT drop-in, fail2ban/nmcli responds, log-cleanup cron, syncthing lingering, DNS/mDNS/docker skips), test_desktop (Hyprland tools+configs+portal+socket gated on install/compositor, DWM suckless, autologin), test_boot (grub, mkinitcpio hooks branched on zfs_root, console-font-in-initramfs, nvme gated, zfs/sanoid), test_keyring (dir 700/owner/default=login), test_archsetup (log no Error:, ≥12 state markers). conftest fixtures: target_user/home/zfs_root/has_nvme/hyprland_installed/dwm_installed/compositor_running/on_slirp. 88 tests collected, py_compile clean. Correctness fix vs the shell sweep: check =awww= not the stale =swww=. Installed python-pytest-testinfra on velox so the harness gate passes. Next: VM run to diff pytest vs shell sweep for parity. +*** 2026-06-25 Thu @ 01:24:11 -0400 Fixed: sshd hardening had silently broken =make test= +VM run #1 aborted ~6 min in (Error 5), before any validation ran. Root cause (pre-existing, not the Testinfra work): the 2026-06-24 sshd hardening sets =PermitRootLogin prohibit-password= + reloads sshd mid-install, and the harness SSHes as root by *password* throughout — so every op after that step got "Permission denied" and run-test.sh fataled before validations. Fix: =inject_root_key= authorizes a throwaway root key right after first SSH (before archsetup runs) and all helpers (=wait_for_ssh=/=vm_exec=/=copy_to_vm=/=copy_from_vm=/=ssh_cmd=) gained =$SSH_KEY_OPT= so they use key auth, which =prohibit-password= still allows. testinfra.sh reuses that key. Additive (password stays as fallback). bash -n + shellcheck clean. Re-running the VM suite to confirm it now reaches the validation + pytest phases. Create comprehensive integration tests using Testinfra (Python + pytest) to validate archsetup installations Tests should cover: -- cgit v1.2.3