aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Craig Jennings <c@cjennings.net>, 2026-07-04 16:23:12 -0500
committer: Craig Jennings <c@cjennings.net>, 2026-07-04 16:23:57 -0500
commit: f8cb4cd1562aceef39f45e33951143c1c1a95b16 (patch)
tree: 5e88bed934d59550ffca702165578b9071002aec
parent: 518ffd7578dbc74689b5303a35f402bfe081aa91 (diff)
download: archsetup-f8cb4cd1562aceef39f45e33951143c1c1a95b16.tar.gz
download: archsetup-f8cb4cd1562aceef39f45e33951143c1c1a95b16.zip
test(vm): collect network evidence before failing in pre-flight diagnostics
run_network_diagnostics tested HTTP before DNS and returned on the first failure, so a DNS failure surfaced as a generic "no internet" and the IP/route/resolver evidence was never reached. It now collects read-only facts first (ip -brief addr, default route, resolv.conf) and prints them regardless of outcome, runs every check, and reports all failures together in a summary. Generic checks (DNS, egress, TLS) are split from Arch-specific ones (mirror, AUR) so a DNS failure is named as DNS, not a mirror problem. Raw fact output is saved to the results dir when one is set.
-rw-r--r--  scripts/testing/lib/network-diagnostics.sh  (96 lines changed)
-rw-r--r--  tests/network-diagnostics/test_network_diagnostics.py  (215 lines changed)
2 files changed, 289 insertions, 22 deletions
diff --git a/scripts/testing/lib/network-diagnostics.sh b/scripts/testing/lib/network-diagnostics.sh
index 38788e5..dc54334 100644
--- a/scripts/testing/lib/network-diagnostics.sh
+++ b/scripts/testing/lib/network-diagnostics.sh
@@ -6,58 +6,110 @@
# Note: logging.sh and vm-utils.sh should already be sourced by the calling script
# Uses globals: ROOT_PASSWORD, SSH_PORT, SSH_OPTS, VM_IP (from vm-utils.sh or calling script)
+# Optional global: TEST_RESULTS_DIR (raw command outputs are saved there when set)
-# Run quick network diagnostics
# Gather one read-only fact from the VM, print it, and save the raw output.
# Facts are collected regardless of pass/fail so a failing install still leaves
# the IP/route/resolver evidence in the log and the results dir.
#   $1 label  human-readable label for the fact
#   $2 slug   filename slug for the saved raw output
#   $3 cmd    remote command to run over the shared ssh_base
# Uses the caller's locals ssh_base and results_dir (dynamic scope). An unset
# or empty results_dir simply skips the save.
# Outputs: the label and every line of the command's combined stdout/stderr
#   via info(); when the remote command exits non-zero, one extra info line
#   carrying the exit status so an ssh/command failure is not mistaken for
#   real evidence.
# Returns: 0 always -- facts are evidence and never gate the outcome.
_netdiag_fact() {
  local label="$1" slug="$2" cmd="$3"
  local out rc=0 line dest

  # ssh_base is a space-separated command string and is split on purpose.
  # stdin is tied to /dev/null so the remote shell cannot swallow input the
  # caller is still reading (e.g. when invoked from inside a `while read`).
  # shellcheck disable=SC2086
  out="$($ssh_base "$cmd" 2>&1 </dev/null)" || rc=$?

  info "${label}:"
  while IFS= read -r line; do
    info " $line"
  done <<< "$out"
  if (( rc != 0 )); then
    info " (command exited with status ${rc})"
  fi

  if [[ -n "${results_dir:-}" ]]; then
    dest="${results_dir}/netdiag-${slug}.txt"
    # Best effort. The group-level 2>/dev/null is in place *before* the file
    # redirection is attempted, so a missing/unwritable results dir stays
    # quiet instead of leaking a shell redirection error into the log.
    { printf '%s\n' "$out" > "$dest"; } 2>/dev/null || true
  fi
  return 0
}
+
# Run one reachability check on the VM and record the outcome.
#   $1 step title shown before the check
#   $2 message passed to success() when the remote command exits 0
#   $3 message passed to error() otherwise
#   $4 entry appended to the failure summary otherwise
#   $5 remote command to run over the shared ssh_base
# Uses the caller's locals ssh_base and failures (dynamic scope).
_netdiag_check() {
  local title="$1" ok_msg="$2" fail_msg="$3" summary="$4" remote_cmd="$5"

  step "$title"
  # ssh_base is a space-separated command string and is split on purpose.
  # stdin is tied to /dev/null so ssh cannot consume the caller's input.
  # shellcheck disable=SC2086
  if $ssh_base "$remote_cmd" </dev/null 2>/dev/null; then
    success "$ok_msg"
  else
    error "$fail_msg"
    failures+=("$summary")
  fi
}

# Run quick network diagnostics.
#
# Evidence first: collect read-only facts (interfaces, route, resolver)
# unconditionally, then run every reachability check and report all failures at
# the end. A DNS failure is named as a DNS failure, not masked as a generic "no
# internet" or misattributed to the Arch mirror.
#
# Because no check short-circuits any more, every curl probe carries both a
# connect timeout and an overall time limit; otherwise a VM with no egress
# would sit in each remaining probe until the OS-level connect timeout expires.
#
# Globals (read): ROOT_PASSWORD, SSH_PORT, VM_IP, SSH_OPTS, TEST_RESULTS_DIR
#   (all optional; defaults are applied below).
# Returns: 0 when all checks pass, 1 when any check fails, so callers keep
#   their success/failure contract.
run_network_diagnostics() {
  local password="${ROOT_PASSWORD:-archsetup}"
  local port="${SSH_PORT:-22}"
  local host="${VM_IP:-localhost}"
  # NOTE(review): the command is a flat string, so a password or option
  # containing whitespace or glob characters would be split -- confirm none do.
  local ssh_base="sshpass -p $password ssh ${SSH_OPTS:-} -p $port root@$host"
  # Read by _netdiag_fact through dynamic scope.
  # shellcheck disable=SC2034
  local results_dir="${TEST_RESULTS_DIR:-}"
  local failures=()
  local curl_limits="--connect-timeout 5 --max-time 20"
  local f

  section "Pre-flight Network Diagnostics"

  # --- Phase 1: collect read-only facts, unconditionally ---
  # These never gate the outcome; they exist so a failed install still has
  # the interface/route/resolver evidence to diagnose from.
  step "Collecting interface addresses"
  _netdiag_fact "Interface addresses (ip -brief addr)" "ip-addr" "ip -brief addr"

  step "Collecting default route"
  _netdiag_fact "Default route (ip route show default)" "ip-route" "ip route show default"

  step "Reading resolver configuration"
  _netdiag_fact "Resolver (/etc/resolv.conf)" "resolv-conf" "cat /etc/resolv.conf"

  # --- Phase 2: generic connectivity checks (run all, don't short-circuit) ---
  # DNS, egress, and TLS are independent failure modes. Keeping them separate
  # means a resolver problem reads as DNS, not as a downstream mirror failure.
  _netdiag_check "Testing DNS resolution" \
    "DNS resolution OK" \
    "DNS resolution failed" \
    "DNS resolution (getent hosts archlinux.org)" \
    "getent hosts archlinux.org >/dev/null 2>&1"

  _netdiag_check "Testing HTTP egress" \
    "HTTP egress OK" \
    "HTTP egress failed" \
    "HTTP egress (http://archlinux.org)" \
    "curl -s $curl_limits -o /dev/null http://archlinux.org"

  _netdiag_check "Testing TLS/HTTPS egress" \
    "TLS/HTTPS egress OK" \
    "TLS/HTTPS egress failed" \
    "TLS/HTTPS egress (https://archlinux.org)" \
    "curl -s $curl_limits -o /dev/null https://archlinux.org"

  # --- Phase 3: Arch-specific checks (run all, don't short-circuit) ---
  _netdiag_check "Testing Arch mirror access" \
    "Arch mirrors accessible" \
    "Cannot reach Arch mirrors" \
    "Arch mirror (https://geo.mirror.pkgbuild.com/)" \
    "curl -s -I $curl_limits https://geo.mirror.pkgbuild.com/ | head -1 | grep -qE '(200|301|302)'"

  _netdiag_check "Testing AUR access" \
    "AUR accessible" \
    "Cannot reach AUR" \
    "AUR (https://aur.archlinux.org/)" \
    "curl -s -I $curl_limits https://aur.archlinux.org/ | head -1 | grep -qE '(200|405)'"

  # --- Summary: report every failure, not just the first ---
  if (( ${#failures[@]} == 0 )); then
    success "Network diagnostics complete - all checks passed"
    return 0
  fi

  error "Network diagnostics found ${#failures[@]} failure(s):"
  for f in "${failures[@]}"; do
    error " - $f"
  done
  return 1
}
diff --git a/tests/network-diagnostics/test_network_diagnostics.py b/tests/network-diagnostics/test_network_diagnostics.py
new file mode 100644
index 0000000..1a8073f
--- /dev/null
+++ b/tests/network-diagnostics/test_network_diagnostics.py
@@ -0,0 +1,215 @@
+"""Tests for run_network_diagnostics in the VM testing harness.
+
+run_network_diagnostics is the VM install pre-flight network check. It
+collects read-only facts (interfaces, default route, resolver) first and
+unconditionally, then runs every reachability check -- DNS, HTTP egress,
+TLS egress, Arch mirror, AUR -- accumulating failures and reporting them all
+at the end. Facts are printed regardless of pass/fail, so a failed install
+still leaves the evidence. Generic checks (DNS/egress/TLS) are kept separate
+from Arch-specific checks (mirror/AUR) so a DNS failure is named as DNS, not
+misattributed to the mirror. Returns 0 when all checks pass, non-zero
+otherwise, preserving the caller's success/failure contract.
+
+These tests exercise the REAL function body (sourced out of
+network-diagnostics.sh, not a copy) with:
+ - stub logging functions (section/step/info/success/error/warn) that just
+ echo, so output is assertable;
+ - a fake `sshpass` on PATH that dispatches on the remote command string and
+ returns canned exit codes driven by FAKE_*_FAIL env vars. This is the
+ system boundary -- the real function shells out through
+ `sshpass ... ssh ... "<remote cmd>"`, and the fake stands in for the VM.
+
+Run from repo root:
+ python3 -m unittest tests.network-diagnostics.test_network_diagnostics
+"""
+
+import os
+import shutil
+import subprocess
+import tempfile
+import unittest
+
+
+REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
+NETDIAG = os.path.join(
+ REPO_ROOT, "scripts", "testing", "lib", "network-diagnostics.sh"
+)
+
+# A fake sshpass. The real invocation is:
+# sshpass -p <pw> ssh <opts> -p <port> root@<host> "<remote cmd>"
+# so the remote command is always the last argument. This stub inspects it and
+# returns a canned exit code per check, driven by FAKE_*_FAIL env vars. Fact
+# commands (ip/route/resolv) always succeed and print sample output so the
+# evidence-collection path is exercised.
+FAKE_SSHPASS = r"""#!/bin/bash
+cmd="${@: -1}"
+case "$cmd" in
+ *"ip -brief addr"*)
+ echo "lo UNKNOWN 127.0.0.1/8"
+ echo "eth0 UP 10.0.2.15/24"
+ exit 0 ;;
+ *"ip route show default"*)
+ echo "default via 10.0.2.2 dev eth0"
+ exit 0 ;;
+ *"resolv.conf"*)
+ echo "nameserver 10.0.2.3"
+ exit 0 ;;
+ *"getent hosts"*)
+ [ "${FAKE_DNS_FAIL:-0}" = "1" ] && exit 2
+ exit 0 ;;
+ *"https://archlinux.org"*)
+ [ "${FAKE_TLS_FAIL:-0}" = "1" ] && exit 7
+ exit 0 ;;
+ *"http://archlinux.org"*)
+ [ "${FAKE_HTTP_FAIL:-0}" = "1" ] && exit 7
+ exit 0 ;;
+ *"geo.mirror.pkgbuild.com"*)
+ [ "${FAKE_MIRROR_FAIL:-0}" = "1" ] && exit 1
+ exit 0 ;;
+ *"aur.archlinux.org"*)
+ [ "${FAKE_AUR_FAIL:-0}" = "1" ] && exit 1
+ exit 0 ;;
+ *)
+ exit 0 ;;
+esac
+"""
+
+# Stub logging functions plus the sourced real file, then call the function.
+WRAPPER = r"""#!/bin/bash
+section() { echo "=== $1 ==="; }
+step() { echo " -> $1"; }
+info() { echo "[i] $1"; }
+success() { echo "[OK] $1"; }
+warn() { echo "[!] $1" >&2; }
+error() { echo "[X] $1" >&2; }
+source "$1"
+run_network_diagnostics
+"""
+
+
+class NetworkDiagnosticsHarness(unittest.TestCase):
+ def setUp(self):
+ self.tmp = tempfile.mkdtemp(prefix="netdiag-test-")
+ self.fakebin = os.path.join(self.tmp, "bin")
+ os.makedirs(self.fakebin)
+ sshpass = os.path.join(self.fakebin, "sshpass")
+ with open(sshpass, "w") as f:
+ f.write(FAKE_SSHPASS)
+ os.chmod(sshpass, 0o755)
+ self.wrapper = os.path.join(self.tmp, "run.sh")
+ with open(self.wrapper, "w") as f:
+ f.write(WRAPPER)
+ os.chmod(self.wrapper, 0o755)
+
+ def tearDown(self):
+ shutil.rmtree(self.tmp, ignore_errors=True)
+
+ def run_diag(self, results_dir=None, **fail_flags):
+ env = dict(os.environ)
+ env["PATH"] = self.fakebin + os.pathsep + env["PATH"]
+ # Keep the harness deterministic regardless of the host's SSH config.
+ env["SSH_OPTS"] = "-o StrictHostKeyChecking=no"
+ env["ROOT_PASSWORD"] = "archsetup"
+ env["SSH_PORT"] = "22"
+ env["VM_IP"] = "localhost"
+ if results_dir is not None:
+ env["TEST_RESULTS_DIR"] = results_dir
+ for k, v in fail_flags.items():
+ env[k] = v
+ return subprocess.run(
+ ["bash", self.wrapper, NETDIAG],
+ capture_output=True, text=True, timeout=20, env=env,
+ )
+
+ # --- Normal case: everything reachable -----------------------------
+
+ def test_all_checks_pass_returns_zero(self):
+ r = self.run_diag()
+ self.assertEqual(r.returncode, 0, r.stdout + r.stderr)
+ self.assertIn("all checks passed", r.stdout)
+
+ def test_facts_collected_on_success(self):
+ r = self.run_diag()
+ out = r.stdout + r.stderr
+ self.assertIn("10.0.2.15/24", out) # interface fact
+ self.assertIn("default via 10.0.2.2", out) # route fact
+ self.assertIn("nameserver 10.0.2.3", out) # resolver fact
+
+ # --- DNS-failure case ----------------------------------------------
+
+ def test_dns_failure_returns_nonzero(self):
+ r = self.run_diag(FAKE_DNS_FAIL="1")
+ self.assertNotEqual(r.returncode, 0)
+
+ def test_dns_failure_names_dns_not_mirror(self):
+ r = self.run_diag(FAKE_DNS_FAIL="1")
+ out = r.stdout + r.stderr
+ self.assertIn("DNS resolution failed", out)
+ # A DNS failure must not be misreported as a mirror failure. With only
+ # DNS failing, the mirror check still runs and passes.
+ self.assertNotIn("Cannot reach Arch mirrors", out)
+
+ def test_dns_failure_still_collects_evidence(self):
+ # The whole point of the change: evidence is gathered before any check
+ # can bail, so a DNS failure still leaves the facts in the output.
+ r = self.run_diag(FAKE_DNS_FAIL="1")
+ out = r.stdout + r.stderr
+ self.assertIn("10.0.2.15/24", out)
+ self.assertIn("default via 10.0.2.2", out)
+ self.assertIn("nameserver 10.0.2.3", out)
+
+ def test_dns_failure_summary_lists_the_failure(self):
+ r = self.run_diag(FAKE_DNS_FAIL="1")
+ out = r.stdout + r.stderr
+ self.assertIn("found 1 failure", out)
+ self.assertIn("getent hosts archlinux.org", out)
+
+ # --- Mirror-only-failure case --------------------------------------
+
+ def test_mirror_only_failure_returns_nonzero(self):
+ r = self.run_diag(FAKE_MIRROR_FAIL="1")
+ self.assertNotEqual(r.returncode, 0)
+
+ def test_mirror_only_failure_generic_checks_pass(self):
+ r = self.run_diag(FAKE_MIRROR_FAIL="1")
+ out = r.stdout + r.stderr
+ # Generic checks are healthy; only the Arch-specific mirror check fails.
+ self.assertIn("DNS resolution OK", out)
+ self.assertIn("HTTP egress OK", out)
+ self.assertIn("TLS/HTTPS egress OK", out)
+ self.assertIn("Cannot reach Arch mirrors", out)
+ self.assertNotIn("DNS resolution failed", out)
+
+ def test_mirror_only_failure_summary_names_mirror(self):
+ r = self.run_diag(FAKE_MIRROR_FAIL="1")
+ out = r.stdout + r.stderr
+ self.assertIn("geo.mirror.pkgbuild.com", out)
+
+ # --- All checks run: multiple failures are all reported ------------
+
+ def test_multiple_failures_all_reported(self):
+ r = self.run_diag(FAKE_DNS_FAIL="1", FAKE_AUR_FAIL="1")
+ out = r.stdout + r.stderr
+ self.assertIn("found 2 failure", out)
+ self.assertIn("getent hosts archlinux.org", out)
+ self.assertIn("aur.archlinux.org", out)
+
+ # --- Raw outputs saved to the results dir --------------------------
+
+ def test_raw_facts_saved_to_results_dir(self):
+ results = os.path.join(self.tmp, "results")
+ os.makedirs(results)
+ self.run_diag(results_dir=results)
+ for slug, needle in (
+ ("ip-addr", "10.0.2.15/24"),
+ ("ip-route", "default via 10.0.2.2"),
+ ("resolv-conf", "nameserver 10.0.2.3"),
+ ):
+ path = os.path.join(results, "netdiag-%s.txt" % slug)
+ self.assertTrue(os.path.exists(path), "missing " + path)
+ with open(path) as f:
+ self.assertIn(needle, f.read())
+
+
+if __name__ == "__main__":
+ unittest.main()