diff options
| -rw-r--r-- | scripts/testing/lib/network-diagnostics.sh | 96 | ||||
| -rw-r--r-- | tests/network-diagnostics/test_network_diagnostics.py | 215 |
2 files changed, 289 insertions, 22 deletions
# File: scripts/testing/lib/network-diagnostics.sh
# (portion of the file visible in this view, shown in its post-change form)

# Note: logging.sh and vm-utils.sh should already be sourced by the calling script
# Uses globals: ROOT_PASSWORD, SSH_PORT, SSH_OPTS, VM_IP (from vm-utils.sh or calling script)
# Optional global: TEST_RESULTS_DIR (raw command outputs are saved there when set)

# Gather one read-only fact from the VM, print it, and save the raw output.
# Facts are collected regardless of pass/fail so a failing install still leaves
# the IP/route/resolver evidence in the log and the results dir.
#   $1 label  human-readable label for the fact
#   $2 slug   filename slug for the saved raw output
#   $3 cmd    remote command to run over the shared ssh_base
# Uses the caller's locals ssh_base (array) and results_dir (dynamic scope).
# Never fails: a fact that cannot be collected is reported, not fatal.
_netdiag_fact() {
    local label="$1" slug="$2" cmd="$3" out status=0

    # Capture stderr too: when the command fails, the error text is the evidence.
    out="$("${ssh_base[@]}" "$cmd" 2>&1)" || status=$?

    info "${label}:"
    if [ "$status" -ne 0 ]; then
        # Make it obvious that what follows may be an error message, not a fact.
        info "  (remote command exited with status $status)"
    fi
    printf '%s\n' "$out" | while IFS= read -r line; do
        info "  $line"
    done

    if [ -n "$results_dir" ]; then
        # Best effort. The group is required: redirections apply left to right,
        # so a trailing 2>/dev/null on the printf itself would NOT hide the
        # shell's own "cannot open file" error from the > redirection.
        {
            mkdir -p "$results_dir" &&
                printf '%s\n' "$out" > "$results_dir/netdiag-${slug}.txt"
        } 2>/dev/null || true
    fi
    return 0
}

# Run one reachability check on the VM and record the outcome.
#   $1 step_msg  message passed to step before the check runs
#   $2 ok_msg    message passed to success when the remote command exits 0
#   $3 fail_msg  message passed to error when the remote command exits non-zero
#   $4 failure   entry appended to the caller's failures array on failure
#   $5 cmd       remote command to run over the shared ssh_base
# Uses the caller's locals ssh_base (array) and failures (dynamic scope).
# Always returns 0 so one failed check never prevents the next from running.
_netdiag_check() {
    local step_msg="$1" ok_msg="$2" fail_msg="$3" failure="$4" cmd="$5"

    step "$step_msg"
    if "${ssh_base[@]}" "$cmd" 2>/dev/null; then
        success "$ok_msg"
    else
        error "$fail_msg"
        failures+=("$failure")
    fi
    return 0
}

# Run quick network diagnostics.
#
# Evidence first: collect read-only facts (interfaces, route, resolver)
# unconditionally, then run every reachability check and report all failures at
# the end. A DNS failure is named as a DNS failure, not masked as a generic "no
# internet" or misattributed to the Arch mirror. Returns 0 when all checks pass,
# non-zero when any check fails, so callers keep their success/failure contract.
run_network_diagnostics() {
    local password="${ROOT_PASSWORD:-archsetup}"
    local port="${SSH_PORT:-22}"
    local host="${VM_IP:-localhost}"
    local results_dir="${TEST_RESULTS_DIR:-}"
    local failures=()

    # An array keeps a password containing spaces or glob characters as one
    # argument. SSH_OPTS is a whitespace-separated option string and is split
    # on purpose.
    # shellcheck disable=SC2206
    local -a ssh_base=(sshpass -p "$password" ssh ${SSH_OPTS:-} -p "$port" "root@$host")

    # Every curl probe gets both a connect timeout and an overall deadline.
    # Because all checks run (no short-circuit), an unbounded probe against a
    # black-holed host would otherwise stall the whole pre-flight.
    local curl_limits="--connect-timeout 5 --max-time 15"

    section "Pre-flight Network Diagnostics"

    # --- Phase 1: collect read-only facts, unconditionally ---
    # These never gate the outcome; they exist so a failed install still has
    # the interface/route/resolver evidence to diagnose from.
    step "Collecting interface addresses"
    _netdiag_fact "Interface addresses (ip -brief addr)" "ip-addr" "ip -brief addr"

    step "Collecting default route"
    _netdiag_fact "Default route (ip route show default)" "ip-route" "ip route show default"

    step "Reading resolver configuration"
    _netdiag_fact "Resolver (/etc/resolv.conf)" "resolv-conf" "cat /etc/resolv.conf"

    # --- Phase 2: generic connectivity checks (run all, don't short-circuit) ---
    # DNS, egress, and TLS are independent failure modes. Keeping them separate
    # means a resolver problem reads as DNS, not as a downstream mirror failure.
    # getent is used for DNS because it is always available, unlike nslookup/dig.
    # curl is used for egress instead of ping because SLIRP may not handle ICMP.
    _netdiag_check "Testing DNS resolution" \
        "DNS resolution OK" \
        "DNS resolution failed" \
        "DNS resolution (getent hosts archlinux.org)" \
        "getent hosts archlinux.org >/dev/null 2>&1"

    _netdiag_check "Testing HTTP egress" \
        "HTTP egress OK" \
        "HTTP egress failed" \
        "HTTP egress (http://archlinux.org)" \
        "curl -s $curl_limits -o /dev/null http://archlinux.org"

    _netdiag_check "Testing TLS/HTTPS egress" \
        "TLS/HTTPS egress OK" \
        "TLS/HTTPS egress failed" \
        "TLS/HTTPS egress (https://archlinux.org)" \
        "curl -s $curl_limits -o /dev/null https://archlinux.org"

    # --- Phase 3: Arch-specific checks (run all, don't short-circuit) ---
    # Only the first response line (the HTTP status line) is inspected.
    _netdiag_check "Testing Arch mirror access" \
        "Arch mirrors accessible" \
        "Cannot reach Arch mirrors" \
        "Arch mirror (https://geo.mirror.pkgbuild.com/)" \
        "curl -s -I $curl_limits https://geo.mirror.pkgbuild.com/ | head -1 | grep -qE '(200|301|302)'"

    _netdiag_check "Testing AUR access" \
        "AUR accessible" \
        "Cannot reach AUR" \
        "AUR (https://aur.archlinux.org/)" \
        "curl -s -I $curl_limits https://aur.archlinux.org/ | head -1 | grep -qE '(200|405)'"

    # --- Summary: report every failure, not just the first ---
    if [ ${#failures[@]} -eq 0 ]; then
        success "Network diagnostics complete - all checks passed"
        return 0
    fi

    error "Network diagnostics found ${#failures[@]} failure(s):"
    local f
    for f in "${failures[@]}"; do
        error "  - $f"
    done
    return 1
}
"""Tests for run_network_diagnostics in the VM testing harness.

run_network_diagnostics is the VM install pre-flight network check. It
collects read-only facts (interfaces, default route, resolver) first and
unconditionally, then runs every reachability check -- DNS, HTTP egress,
TLS egress, Arch mirror, AUR -- accumulating failures and reporting them all
at the end. Facts are printed regardless of pass/fail, so a failed install
still leaves the evidence. Generic checks (DNS/egress/TLS) are kept separate
from Arch-specific checks (mirror/AUR) so a DNS failure is named as DNS, not
misattributed to the mirror. Returns 0 when all checks pass, non-zero
otherwise, preserving the caller's success/failure contract.

These tests exercise the REAL function body (sourced out of
network-diagnostics.sh, not a copy) with:
  - stub logging functions (section/step/info/success/error/warn) that just
    echo, so output is assertable;
  - a fake `sshpass` on PATH that dispatches on the remote command string and
    returns canned exit codes driven by FAKE_*_FAIL env vars. This is the
    system boundary -- the real function shells out through
    `sshpass ... ssh ... "<remote cmd>"`, and the fake stands in for the VM.

Run from repo root:
    python3 -m unittest tests.network-diagnostics.test_network_diagnostics
"""

import os
import shutil
import subprocess
import tempfile
import unittest
from unittest import mock


REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
NETDIAG = os.path.join(
    REPO_ROOT, "scripts", "testing", "lib", "network-diagnostics.sh"
)

# Every failure switch the fake sshpass understands. run_diag() rejects any
# other keyword so a misspelt flag cannot turn a test into a silent no-op.
FAIL_FLAGS = (
    "FAKE_DNS_FAIL",
    "FAKE_HTTP_FAIL",
    "FAKE_TLS_FAIL",
    "FAKE_MIRROR_FAIL",
    "FAKE_AUR_FAIL",
)

# Host variables that must never reach the script under test unless a test
# sets them explicitly: they either flip check outcomes or redirect file writes.
_SCRUBBED_ENV = FAIL_FLAGS + ("TEST_RESULTS_DIR",)

# A fake sshpass. The real invocation is:
#   sshpass -p <pw> ssh <opts> -p <port> root@<host> "<remote cmd>"
# so the remote command is always the last argument. This stub inspects it and
# returns a canned exit code per check, driven by FAKE_*_FAIL env vars. Fact
# commands (ip/route/resolv) always succeed and print sample output so the
# evidence-collection path is exercised.
# Pattern order matters: the https:// pattern is tried before http://.
FAKE_SSHPASS = r"""#!/bin/bash
cmd="${@: -1}"
case "$cmd" in
  *"ip -brief addr"*)
    echo "lo UNKNOWN 127.0.0.1/8"
    echo "eth0 UP 10.0.2.15/24"
    exit 0 ;;
  *"ip route show default"*)
    echo "default via 10.0.2.2 dev eth0"
    exit 0 ;;
  *"resolv.conf"*)
    echo "nameserver 10.0.2.3"
    exit 0 ;;
  *"getent hosts"*)
    [ "${FAKE_DNS_FAIL:-0}" = "1" ] && exit 2
    exit 0 ;;
  *"https://archlinux.org"*)
    [ "${FAKE_TLS_FAIL:-0}" = "1" ] && exit 7
    exit 0 ;;
  *"http://archlinux.org"*)
    [ "${FAKE_HTTP_FAIL:-0}" = "1" ] && exit 7
    exit 0 ;;
  *"geo.mirror.pkgbuild.com"*)
    [ "${FAKE_MIRROR_FAIL:-0}" = "1" ] && exit 1
    exit 0 ;;
  *"aur.archlinux.org"*)
    [ "${FAKE_AUR_FAIL:-0}" = "1" ] && exit 1
    exit 0 ;;
  *)
    exit 0 ;;
esac
"""

# Stub logging functions plus the sourced real file, then call the function.
# warn/error write to stderr, which is why tests search stdout + stderr.
WRAPPER = r"""#!/bin/bash
section() { echo "=== $1 ==="; }
step() { echo " -> $1"; }
info() { echo "[i] $1"; }
success() { echo "[OK] $1"; }
warn() { echo "[!] $1" >&2; }
error() { echo "[X] $1" >&2; }
source "$1"
run_network_diagnostics
"""


class NetworkDiagnosticsHarness(unittest.TestCase):
    """Drive the real run_network_diagnostics against a fake sshpass."""

    def setUp(self):
        self.tmp = tempfile.mkdtemp(prefix="netdiag-test-")
        self.fakebin = os.path.join(self.tmp, "bin")
        os.makedirs(self.fakebin)
        sshpass = os.path.join(self.fakebin, "sshpass")
        with open(sshpass, "w", encoding="utf-8") as f:
            f.write(FAKE_SSHPASS)
        os.chmod(sshpass, 0o755)
        self.wrapper = os.path.join(self.tmp, "run.sh")
        with open(self.wrapper, "w", encoding="utf-8") as f:
            f.write(WRAPPER)
        os.chmod(self.wrapper, 0o755)

    def tearDown(self):
        shutil.rmtree(self.tmp, ignore_errors=True)

    def run_diag(self, results_dir=None, **fail_flags):
        """Run the diagnostics once and return the CompletedProcess.

        results_dir, when given, is exported as TEST_RESULTS_DIR. Each keyword
        in fail_flags must be one of FAIL_FLAGS and is exported as-is; "1"
        makes the matching check fail in the fake sshpass.

        Raises ValueError for a keyword that is not a known failure flag.
        """
        unknown = sorted(set(fail_flags) - set(FAIL_FLAGS))
        if unknown:
            raise ValueError(f"unknown failure flag(s): {', '.join(unknown)}")

        # Start from the host environment but drop anything that would change
        # the outcome or make the script write outside the test's temp dir.
        env = {k: v for k, v in os.environ.items() if k not in _SCRUBBED_ENV}
        env["PATH"] = self.fakebin + os.pathsep + env.get("PATH", os.defpath)
        # Keep the harness deterministic regardless of the host's SSH config.
        env["SSH_OPTS"] = "-o StrictHostKeyChecking=no"
        env["ROOT_PASSWORD"] = "archsetup"
        env["SSH_PORT"] = "22"
        env["VM_IP"] = "localhost"
        if results_dir is not None:
            env["TEST_RESULTS_DIR"] = results_dir
        env.update(fail_flags)
        return subprocess.run(
            ["bash", self.wrapper, NETDIAG],
            capture_output=True, text=True, timeout=20, env=env,
        )

    # --- Normal case: everything reachable -----------------------------

    def test_all_checks_pass_returns_zero(self):
        r = self.run_diag()
        self.assertEqual(r.returncode, 0, r.stdout + r.stderr)
        self.assertIn("all checks passed", r.stdout)

    def test_facts_collected_on_success(self):
        r = self.run_diag()
        out = r.stdout + r.stderr
        self.assertIn("10.0.2.15/24", out)          # interface fact
        self.assertIn("default via 10.0.2.2", out)  # route fact
        self.assertIn("nameserver 10.0.2.3", out)   # resolver fact

    def test_host_environment_does_not_leak_into_run(self):
        # A developer's shell may export these; neither may affect the run.
        leaked = os.path.join(self.tmp, "leaked")
        os.makedirs(leaked)
        with mock.patch.dict(
            os.environ, {"TEST_RESULTS_DIR": leaked, "FAKE_DNS_FAIL": "1"}
        ):
            r = self.run_diag()
        self.assertEqual(r.returncode, 0, r.stdout + r.stderr)
        self.assertEqual(os.listdir(leaked), [])

    def test_unknown_fail_flag_is_rejected(self):
        with self.assertRaises(ValueError):
            self.run_diag(FAKE_MIRRORS_FAIL="1")

    # --- DNS-failure case ----------------------------------------------

    def test_dns_failure_returns_nonzero(self):
        r = self.run_diag(FAKE_DNS_FAIL="1")
        self.assertNotEqual(r.returncode, 0)

    def test_dns_failure_names_dns_not_mirror(self):
        r = self.run_diag(FAKE_DNS_FAIL="1")
        out = r.stdout + r.stderr
        self.assertIn("DNS resolution failed", out)
        # A DNS failure must not be misreported as a mirror failure. With only
        # DNS failing, the mirror check still runs and passes.
        self.assertNotIn("Cannot reach Arch mirrors", out)

    def test_dns_failure_still_collects_evidence(self):
        # The whole point of the change: evidence is gathered before any check
        # can bail, so a DNS failure still leaves the facts in the output.
        r = self.run_diag(FAKE_DNS_FAIL="1")
        out = r.stdout + r.stderr
        self.assertIn("10.0.2.15/24", out)
        self.assertIn("default via 10.0.2.2", out)
        self.assertIn("nameserver 10.0.2.3", out)

    def test_dns_failure_summary_lists_the_failure(self):
        r = self.run_diag(FAKE_DNS_FAIL="1")
        out = r.stdout + r.stderr
        self.assertIn("found 1 failure", out)
        self.assertIn("getent hosts archlinux.org", out)

    # --- Egress failures are told apart --------------------------------

    def test_http_only_failure_named_as_http(self):
        r = self.run_diag(FAKE_HTTP_FAIL="1")
        out = r.stdout + r.stderr
        self.assertNotEqual(r.returncode, 0)
        self.assertIn("HTTP egress failed", out)
        self.assertIn("TLS/HTTPS egress OK", out)
        self.assertIn("found 1 failure", out)

    def test_tls_only_failure_named_as_tls(self):
        r = self.run_diag(FAKE_TLS_FAIL="1")
        out = r.stdout + r.stderr
        self.assertNotEqual(r.returncode, 0)
        self.assertIn("TLS/HTTPS egress failed", out)
        self.assertIn("HTTP egress OK", out)
        self.assertIn("found 1 failure", out)

    # --- Mirror-only-failure case --------------------------------------

    def test_mirror_only_failure_returns_nonzero(self):
        r = self.run_diag(FAKE_MIRROR_FAIL="1")
        self.assertNotEqual(r.returncode, 0)

    def test_mirror_only_failure_generic_checks_pass(self):
        r = self.run_diag(FAKE_MIRROR_FAIL="1")
        out = r.stdout + r.stderr
        # Generic checks are healthy; only the Arch-specific mirror check fails.
        self.assertIn("DNS resolution OK", out)
        self.assertIn("HTTP egress OK", out)
        self.assertIn("TLS/HTTPS egress OK", out)
        self.assertIn("Cannot reach Arch mirrors", out)
        self.assertNotIn("DNS resolution failed", out)

    def test_mirror_only_failure_summary_names_mirror(self):
        r = self.run_diag(FAKE_MIRROR_FAIL="1")
        out = r.stdout + r.stderr
        self.assertIn("geo.mirror.pkgbuild.com", out)

    # --- All checks run: multiple failures are all reported ------------

    def test_multiple_failures_all_reported(self):
        r = self.run_diag(FAKE_DNS_FAIL="1", FAKE_AUR_FAIL="1")
        out = r.stdout + r.stderr
        self.assertIn("found 2 failure", out)
        self.assertIn("getent hosts archlinux.org", out)
        self.assertIn("aur.archlinux.org", out)

    # --- Raw outputs saved to the results dir --------------------------

    def test_raw_facts_saved_to_results_dir(self):
        results = os.path.join(self.tmp, "results")
        os.makedirs(results)
        r = self.run_diag(results_dir=results)
        # Surface the script's own output if the run itself went wrong, rather
        # than only reporting a missing file below.
        self.assertEqual(r.returncode, 0, r.stdout + r.stderr)
        for slug, needle in (
            ("ip-addr", "10.0.2.15/24"),
            ("ip-route", "default via 10.0.2.2"),
            ("resolv-conf", "nameserver 10.0.2.3"),
        ):
            path = os.path.join(results, f"netdiag-{slug}.txt")
            self.assertTrue(os.path.exists(path), "missing " + path)
            with open(path, encoding="utf-8") as f:
                self.assertIn(needle, f.read())


if __name__ == "__main__":
    unittest.main()
