2 files changed, 289 insertions, 22 deletions
diff --git a/scripts/testing/lib/network-diagnostics.sh b/scripts/testing/lib/network-diagnostics.sh
index 38788e5..dc54334 100644
--- a/scripts/testing/lib/network-diagnostics.sh
+++ b/scripts/testing/lib/network-diagnostics.sh
@@ -6,58 +6,110 @@
 
 # Note: logging.sh and vm-utils.sh should already be sourced by the calling script
 # Uses globals: ROOT_PASSWORD, SSH_PORT, SSH_OPTS, VM_IP (from vm-utils.sh or calling script)
+# Optional global: TEST_RESULTS_DIR (raw command outputs are saved there when set)
 
-# Run quick network diagnostics
+# Gather one read-only fact from the VM, print it, and save the raw output.
+# Best-effort: a failing remote command or an unwritable results dir must not
+# abort the caller (even under `set -e`) or add noise; output is kept as-is.
+#   $1 label   human-readable label for the fact
+#   $2 slug    filename slug for the saved raw output
+#   $3 cmd     remote command to run over the shared ssh_base
+# Uses the caller's locals ssh_base and results_dir (dynamic scope).
+_netdiag_fact() {
+    local label="$1" slug="$2" cmd="$3" out
+    out="$($ssh_base "$cmd" 2>&1)" || true
+    info "${label}:"
+    printf '%s\n' "$out" | while IFS= read -r line; do
+        info "  $line"
+    done
+    if [ -n "$results_dir" ]; then
+        { printf '%s\n' "$out" > "$results_dir/netdiag-${slug}.txt"; } 2>/dev/null || true
+    fi
+}
+
+# Run quick network diagnostics.
+#
+# Evidence first: collect read-only facts (interfaces, route, resolver)
+# unconditionally, then run every reachability check and report all failures at
+# the end. A DNS failure is named as a DNS failure, not masked as a generic "no
+# internet" or misattributed to the Arch mirror. Returns 0 when all checks pass,
+# non-zero when any check fails, so callers keep their success/failure contract.
 run_network_diagnostics() {
     local password="${ROOT_PASSWORD:-archsetup}"
     local port="${SSH_PORT:-22}"
     local host="${VM_IP:-localhost}"
     local ssh_base="sshpass -p $password ssh $SSH_OPTS -p $port root@$host"
+    local results_dir="${TEST_RESULTS_DIR:-}"
+    local failures=()
 
     section "Pre-flight Network Diagnostics"
 
-    # Test 1: Basic connectivity (use curl instead of ping - SLIRP may not handle ICMP)
-    step "Testing internet connectivity"
-    if $ssh_base "curl -s --connect-timeout 5 -o /dev/null http://archlinux.org" 2>/dev/null; then
-        success "Internet connectivity OK"
-    else
-        error "No internet connectivity"
-        return 1
-    fi
+    # --- Phase 1: collect read-only facts, unconditionally ---
+    # These never gate the outcome; they exist so a failed install still has
+    # the interface/route/resolver evidence to diagnose from.
+    step "Collecting interface addresses"
+    _netdiag_fact "Interface addresses (ip -brief addr)" "ip-addr" "ip -brief addr"
+
+    step "Collecting default route"
+    _netdiag_fact "Default route (ip route show default)" "ip-route" "ip route show default"
 
-    # Test 2: DNS resolution (use getent which is always available, unlike nslookup/dig)
+    step "Reading resolver configuration"
+    _netdiag_fact "Resolver (/etc/resolv.conf)" "resolv-conf" "cat /etc/resolv.conf"
+
+    # --- Phase 2: generic connectivity checks (run all, don't short-circuit) ---
+    # getent is always available (unlike nslookup/dig); curl, not ping, because
+    # SLIRP may not handle ICMP. NOTE: DNS failure can cascade to later checks.
     step "Testing DNS resolution"
     if $ssh_base "getent hosts archlinux.org >/dev/null 2>&1" 2>/dev/null; then
         success "DNS resolution OK"
     else
         error "DNS resolution failed"
-        return 1
+        failures+=("DNS resolution (getent hosts archlinux.org)")
     fi
 
-    # Test 3: Arch mirror accessibility
+    step "Testing HTTP egress"
+    if $ssh_base "curl -s --connect-timeout 5 -o /dev/null http://archlinux.org" 2>/dev/null; then
+        success "HTTP egress OK"
+    else
+        error "HTTP egress failed"
+        failures+=("HTTP egress (http://archlinux.org)")
+    fi
+
+    step "Testing TLS/HTTPS egress"
+    if $ssh_base "curl -s --connect-timeout 5 -o /dev/null https://archlinux.org" 2>/dev/null; then
+        success "TLS/HTTPS egress OK"
+    else
+        error "TLS/HTTPS egress failed"
+        failures+=("TLS/HTTPS egress (https://archlinux.org)")
+    fi
+
+    # --- Phase 3: Arch-specific checks (run all, don't short-circuit) ---
     step "Testing Arch mirror access"
     if $ssh_base "curl -s -I https://geo.mirror.pkgbuild.com/ | head -1 | grep -qE '(200|301|302)'" 2>/dev/null; then
         success "Arch mirrors accessible"
     else
         error "Cannot reach Arch mirrors"
-        return 1
+        failures+=("Arch mirror (https://geo.mirror.pkgbuild.com/)")
     fi
 
-    # Test 4: AUR accessibility
     step "Testing AUR access"
     if $ssh_base "curl -s -I https://aur.archlinux.org/ | head -1 | grep -qE '(200|405)'" 2>/dev/null; then
         success "AUR accessible"
     else
         error "Cannot reach AUR"
-        return 1
+        failures+=("AUR (https://aur.archlinux.org/)")
     fi
 
-    # Show network info
-    info "Network configuration:"
-    $ssh_base "ip addr show | grep 'inet ' | grep -v '127.0.0.1'" 2>/dev/null | while IFS= read -r line; do
-        info "  $line"
-    done
+    # --- Summary: report every failure, not just the first ---
+    if [ ${#failures[@]} -eq 0 ]; then
+        success "Network diagnostics complete - all checks passed"
+        return 0
+    fi
 
-    success "Network diagnostics complete"
-    return 0
+    error "Network diagnostics found ${#failures[@]} failure(s):"
+    local f
+    for f in "${failures[@]}"; do
+        error "  - $f"
+    done
+    return 1
 }
diff --git a/tests/network-diagnostics/test_network_diagnostics.py b/tests/network-diagnostics/test_network_diagnostics.py
new file mode 100644
index 0000000..1a8073f
--- /dev/null
+++ b/tests/network-diagnostics/test_network_diagnostics.py
@@ -0,0 +1,215 @@
+"""Tests for run_network_diagnostics in the VM testing harness.
+
+run_network_diagnostics is the VM install pre-flight network check. It
+collects read-only facts (interfaces, default route, resolver) first and
+unconditionally, then runs every reachability check -- DNS, HTTP egress,
+TLS egress, Arch mirror, AUR -- accumulating failures and reporting them all
+at the end. Facts are printed regardless of pass/fail, so a failed install
+still leaves the evidence. Generic checks (DNS/egress/TLS) are kept separate
+from Arch-specific checks (mirror/AUR) so a DNS failure is named as DNS, not
+misattributed to the mirror. Returns 0 when all checks pass, non-zero
+otherwise, preserving the caller's success/failure contract.
+
+These tests exercise the REAL function body (sourced out of
+network-diagnostics.sh, not a copy) with:
+  - stub logging functions (section/step/info/success/error/warn) that just
+    echo, so output is assertable;
+  - a fake `sshpass` on PATH that dispatches on the remote command string and
+    returns canned exit codes driven by FAKE_*_FAIL env vars. This is the
+    system boundary -- the real function shells out through
+    `sshpass ... ssh ... "<remote cmd>"`, and the fake stands in for the VM.
+
+Run from repo root:
+    python3 -m unittest tests.network-diagnostics.test_network_diagnostics
+"""
+
+import os
+import shutil
+import subprocess
+import tempfile
+import unittest
+
+
+REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
+NETDIAG = os.path.join(
+    REPO_ROOT, "scripts", "testing", "lib", "network-diagnostics.sh"
+)
+
+# A fake sshpass. The real invocation is:
+#   sshpass -p <pw> ssh <opts> -p <port> root@<host> "<remote cmd>"
+# so the remote command is always the last argument. This stub inspects it and
+# returns a canned exit code per check, driven by FAKE_*_FAIL env vars. Fact
+# commands (ip/route/resolv) always succeed and print sample output so the
+# evidence-collection path is exercised; any unrecognized command exits 0.
+FAKE_SSHPASS = r"""#!/bin/bash
+cmd="${@: -1}"
+case "$cmd" in
+    *"ip -brief addr"*)
+        echo "lo    UNKNOWN 127.0.0.1/8"
+        echo "eth0  UP      10.0.2.15/24"
+        exit 0 ;;
+    *"ip route show default"*)
+        echo "default via 10.0.2.2 dev eth0"
+        exit 0 ;;
+    *"resolv.conf"*)
+        echo "nameserver 10.0.2.3"
+        exit 0 ;;
+    *"getent hosts"*)
+        [ "${FAKE_DNS_FAIL:-0}" = "1" ] && exit 2
+        exit 0 ;;
+    *"https://archlinux.org"*)
+        [ "${FAKE_TLS_FAIL:-0}" = "1" ] && exit 7
+        exit 0 ;;
+    *"http://archlinux.org"*)
+        [ "${FAKE_HTTP_FAIL:-0}" = "1" ] && exit 7
+        exit 0 ;;
+    *"geo.mirror.pkgbuild.com"*)
+        [ "${FAKE_MIRROR_FAIL:-0}" = "1" ] && exit 1
+        exit 0 ;;
+    *"aur.archlinux.org"*)
+        [ "${FAKE_AUR_FAIL:-0}" = "1" ] && exit 1
+        exit 0 ;;
+    *)
+        exit 0 ;;
+esac
+"""
+
+# Wrapper: stub logging (warn/error -> stderr), source the real file ($1), run it.
+WRAPPER = r"""#!/bin/bash
+section() { echo "=== $1 ==="; }
+step()    { echo "  -> $1"; }
+info()    { echo "[i] $1"; }
+success() { echo "[OK] $1"; }
+warn()    { echo "[!] $1" >&2; }
+error()   { echo "[X] $1" >&2; }
+source "$1"
+run_network_diagnostics
+"""
+
+
+class NetworkDiagnosticsHarness(unittest.TestCase):
+    """Run the real function via a wrapper script, with a fake sshpass on PATH."""
+
+    def setUp(self):
+        self.tmp = tempfile.mkdtemp(prefix="netdiag-test-")
+        self.fakebin = os.path.join(self.tmp, "bin")
+        os.makedirs(self.fakebin)
+        sshpass = os.path.join(self.fakebin, "sshpass")
+        self.wrapper = os.path.join(self.tmp, "run.sh")
+        for path, text in ((sshpass, FAKE_SSHPASS), (self.wrapper, WRAPPER)):
+            with open(path, "w") as f:
+                f.write(text)
+            os.chmod(path, 0o755)
+
+    def tearDown(self):
+        shutil.rmtree(self.tmp, ignore_errors=True)
+
+    def run_diag(self, results_dir=None, **fail_flags):
+        """Run the wrapper with a pinned env; fail_flags set FAKE_*_FAIL vars."""
+        env = {k: v for k, v in os.environ.items() if not k.startswith("FAKE_")}
+        env["PATH"] = self.fakebin + os.pathsep + env["PATH"]
+        # Keep the harness deterministic regardless of the host's SSH config.
+        env["SSH_OPTS"] = "-o StrictHostKeyChecking=no"
+        env["ROOT_PASSWORD"] = "archsetup"
+        env["SSH_PORT"] = "22"
+        env["VM_IP"] = "localhost"
+        # Never inherit the host's TEST_RESULTS_DIR; empty means "save nothing".
+        env["TEST_RESULTS_DIR"] = results_dir or ""
+        env.update(fail_flags)
+        return subprocess.run(
+            ["bash", self.wrapper, NETDIAG],
+            capture_output=True, text=True, timeout=20, env=env,
+        )
+
+    # --- Normal case: everything reachable -----------------------------
+
+    def test_all_checks_pass_returns_zero(self):
+        r = self.run_diag()
+        self.assertEqual(r.returncode, 0, r.stdout + r.stderr)
+        self.assertIn("all checks passed", r.stdout)
+
+    def test_facts_collected_on_success(self):
+        r = self.run_diag()
+        out = r.stdout + r.stderr
+        self.assertIn("10.0.2.15/24", out)          # interface fact
+        self.assertIn("default via 10.0.2.2", out)  # route fact
+        self.assertIn("nameserver 10.0.2.3", out)   # resolver fact
+
+    # --- DNS-failure case ----------------------------------------------
+
+    def test_dns_failure_returns_nonzero(self):
+        r = self.run_diag(FAKE_DNS_FAIL="1")
+        self.assertNotEqual(r.returncode, 0)
+
+    def test_dns_failure_names_dns_not_mirror(self):
+        r = self.run_diag(FAKE_DNS_FAIL="1")
+        out = r.stdout + r.stderr
+        self.assertIn("DNS resolution failed", out)
+        # A DNS failure must not be misreported as a mirror failure. With only
+        # DNS failing, the mirror check still runs and passes.
+        self.assertNotIn("Cannot reach Arch mirrors", out)
+
+    def test_dns_failure_still_collects_evidence(self):
+        # The whole point of the change: evidence is gathered before any check
+        # can bail, so a DNS failure still leaves the facts in the output.
+        r = self.run_diag(FAKE_DNS_FAIL="1")
+        out = r.stdout + r.stderr
+        self.assertIn("10.0.2.15/24", out)
+        self.assertIn("default via 10.0.2.2", out)
+        self.assertIn("nameserver 10.0.2.3", out)
+
+    def test_dns_failure_summary_lists_the_failure(self):
+        r = self.run_diag(FAKE_DNS_FAIL="1")
+        out = r.stdout + r.stderr
+        self.assertIn("found 1 failure", out)
+        self.assertIn("getent hosts archlinux.org", out)
+
+    # --- Mirror-only-failure case --------------------------------------
+
+    def test_mirror_only_failure_returns_nonzero(self):
+        r = self.run_diag(FAKE_MIRROR_FAIL="1")
+        self.assertNotEqual(r.returncode, 0)
+
+    def test_mirror_only_failure_generic_checks_pass(self):
+        r = self.run_diag(FAKE_MIRROR_FAIL="1")
+        out = r.stdout + r.stderr
+        # Generic checks are healthy; only the Arch-specific mirror check fails.
+        self.assertIn("DNS resolution OK", out)
+        self.assertIn("HTTP egress OK", out)
+        self.assertIn("TLS/HTTPS egress OK", out)
+        self.assertIn("Cannot reach Arch mirrors", out)
+        self.assertNotIn("DNS resolution failed", out)
+
+    def test_mirror_only_failure_summary_names_mirror(self):
+        r = self.run_diag(FAKE_MIRROR_FAIL="1")
+        out = r.stdout + r.stderr
+        self.assertIn("geo.mirror.pkgbuild.com", out)
+
+    # --- All checks run: multiple failures are all reported ------------
+
+    def test_multiple_failures_all_reported(self):
+        r = self.run_diag(FAKE_DNS_FAIL="1", FAKE_AUR_FAIL="1")
+        out = r.stdout + r.stderr
+        self.assertIn("found 2 failure", out)
+        self.assertIn("getent hosts archlinux.org", out)
+        self.assertIn("aur.archlinux.org", out)
+
+    # --- Raw outputs saved to the results dir --------------------------
+
+    def test_raw_facts_saved_to_results_dir(self):
+        results = os.path.join(self.tmp, "results")
+        os.makedirs(results)
+        self.run_diag(results_dir=results)
+        for slug, needle in (
+            ("ip-addr", "10.0.2.15/24"),
+            ("ip-route", "default via 10.0.2.2"),
+            ("resolv-conf", "nameserver 10.0.2.3"),
+        ):
+            path = os.path.join(results, "netdiag-%s.txt" % slug)
+            self.assertTrue(os.path.exists(path), "missing " + path)
+            with open(path) as f:
+                self.assertIn(needle, f.read())
+
+
+if __name__ == "__main__":
+    unittest.main()  # lets this file be run directly as a script