#!/usr/bin/env python3 """Failsafe halt for cross-agent comms. See cross-agent-halt.md. Touches ~/.config/cross-agent-comms/HALT and stops the cross-agent-watch systemd user service. With --tailnet, propagates the HALT file to every peer in peers.toml via SSH; reports per-peer status with non-zero exit on partial halt. Does NOT pkill in-flight scripts — they detect HALT on next iteration and stop themselves. """ from __future__ import annotations import argparse import subprocess import sys import tomllib from pathlib import Path CONFIG_DIR = Path.home() / ".config" / "cross-agent-comms" HALT_FILE = CONFIG_DIR / "HALT" PEERS_TOML = CONFIG_DIR / "peers.toml" EXIT_OK = 0 EXIT_PARTIAL = 1 def err(msg: str) -> None: print(msg, file=sys.stderr) def write_halt_file(reason: str) -> None: CONFIG_DIR.mkdir(parents=True, exist_ok=True) HALT_FILE.write_text((reason + "\n") if reason else "") def stop_watcher_service() -> None: """Best-effort stop of the systemd watcher service. Failures are logged but not fatal.""" try: subprocess.run( ["systemctl", "--user", "stop", "cross-agent-watch.path"], capture_output=True, text=True, timeout=5, ) except (FileNotFoundError, subprocess.TimeoutExpired): # Watcher service may not be installed — fine. pass def load_peers() -> dict: if not PEERS_TOML.exists(): return {} try: return tomllib.loads(PEERS_TOML.read_text()) except (tomllib.TOMLDecodeError, OSError) as e: err(f"cannot parse peers.toml: {e}") return {} def ssh_touch_halt(host: str, ssh_user: str | None, reason: str) -> tuple[bool, str]: target = f"{ssh_user}@{host}" if ssh_user else host # Build the remote command. Quote the reason carefully. remote_cmd = ( f"mkdir -p ~/.config/cross-agent-comms && " f"printf %s {_sh_quote(reason)} > ~/.config/cross-agent-comms/HALT" ) try: result = subprocess.run( ["ssh", "-o", "ConnectTimeout=3", "-o", "BatchMode=yes", target, remote_cmd], capture_output=True, text=True, timeout=10, ) except (FileNotFoundError, subprocess.TimeoutExpired): return False, "ssh unavailable or timed out" if result.returncode == 0: return True, "HALT file written" return False, (result.stderr.strip().splitlines() or [f"exit {result.returncode}"])[-1] def _sh_quote(s: str) -> str: return "'" + s.replace("'", "'\"'\"'") + "'" def main() -> int: parser = argparse.ArgumentParser(description="Halt all cross-agent comms on this machine (and optionally tailnet).") parser.add_argument("reason", nargs="?", default="", help="Optional human-readable reason") parser.add_argument("--tailnet", action="store_true", help="Propagate HALT to every peer in peers.toml") args = parser.parse_args() # Local halt. write_halt_file(args.reason) stop_watcher_service() print("Halting locally ✓ (HALT file written)") if not args.tailnet: print() print(f"Halt active. Remove {HALT_FILE} or run cross-agent-resume to clear.") print("Agent polling will stop within ~5 min (one cadence cycle).") return EXIT_OK peers = load_peers().get("peers", {}) if not peers: print() print("No peers configured in peers.toml — local-only halt complete.") return EXIT_OK print() successes = 1 # local already counted failures = [] for name, cfg in sorted(peers.items()): host = cfg.get("host", name) ssh_user = cfg.get("ssh_user") ok, detail = ssh_touch_halt(host, ssh_user, args.reason) marker = "✓" if ok else "✗" print(f"Halting {host:<28} {marker} ({detail})") if ok: successes += 1 else: failures.append(f"{name} ({host}): {detail}") print() total = len(peers) + 1 if failures: print(f"PARTIAL HALT: {successes}/{total} machines halted.") for f in failures: print(f" - {f}") print("Resolve the failures or manually halt each machine.") return EXIT_PARTIAL print(f"Halt active across {total} machine(s).") return EXIT_OK if __name__ == "__main__": sys.exit(main())