#!/usr/bin/env python3 """Resume cross-agent comms after a halt. See cross-agent-resume.md. Removes ~/.config/cross-agent-comms/HALT and restarts the cross-agent-watch systemd user service. With --tailnet, propagates the removal to every peer in peers.toml via SSH; reports per-peer status with non-zero exit on partial resume. Per the asymmetry rule: clearing HALT does NOT auto-resume agent polling. Each session must explicitly re-engage. """ from __future__ import annotations import argparse import subprocess import sys import tomllib from pathlib import Path CONFIG_DIR = Path.home() / ".config" / "cross-agent-comms" HALT_FILE = CONFIG_DIR / "HALT" PEERS_TOML = CONFIG_DIR / "peers.toml" EXIT_OK = 0 EXIT_PARTIAL = 1 def err(msg: str) -> None: print(msg, file=sys.stderr) def remove_halt_file() -> bool: """Returns True if HALT was removed, False if it didn't exist.""" if HALT_FILE.exists(): try: HALT_FILE.unlink() return True except OSError as e: err(f"could not remove HALT: {e}") return False return False def start_watcher_service() -> None: """Best-effort start of the systemd watcher path unit.""" try: subprocess.run( ["systemctl", "--user", "start", "cross-agent-watch.path"], capture_output=True, text=True, timeout=5, ) except (FileNotFoundError, subprocess.TimeoutExpired): pass def load_peers() -> dict: if not PEERS_TOML.exists(): return {} try: return tomllib.loads(PEERS_TOML.read_text()) except (tomllib.TOMLDecodeError, OSError) as e: err(f"cannot parse peers.toml: {e}") return {} def ssh_remove_halt(host: str, ssh_user: str | None) -> tuple[bool, str]: target = f"{ssh_user}@{host}" if ssh_user else host remote_cmd = "rm -f ~/.config/cross-agent-comms/HALT" try: result = subprocess.run( ["ssh", "-o", "ConnectTimeout=3", "-o", "BatchMode=yes", target, remote_cmd], capture_output=True, text=True, timeout=10, ) except (FileNotFoundError, subprocess.TimeoutExpired): return False, "ssh unavailable or timed out" if result.returncode == 0: return True, "HALT cleared" return False, (result.stderr.strip().splitlines() or [f"exit {result.returncode}"])[-1] def print_re_engage_instructions() -> None: print() print("Halt cleared. Watcher restarted.") print() print("Agent polling does NOT auto-resume — per the failsafe asymmetry rule,") print("agents stay paused until you explicitly re-engage each session.") print("Open the relevant Claude session and tell the agent to resume polling") print("for its conversation.") def main() -> int: parser = argparse.ArgumentParser(description="Resume cross-agent comms after a halt.") parser.add_argument("--tailnet", action="store_true", help="Propagate HALT removal to every peer in peers.toml") args = parser.parse_args() removed = remove_halt_file() start_watcher_service() if removed: print("Resuming locally ✓ (HALT cleared)") else: print("Resuming locally ✓ (no HALT was active)") if not args.tailnet: print_re_engage_instructions() return EXIT_OK peers = load_peers().get("peers", {}) if not peers: print() print("No peers configured in peers.toml — local-only resume complete.") print_re_engage_instructions() return EXIT_OK print() successes = 1 failures = [] for name, cfg in sorted(peers.items()): host = cfg.get("host", name) ssh_user = cfg.get("ssh_user") ok, detail = ssh_remove_halt(host, ssh_user) marker = "✓" if ok else "✗" print(f"Resuming {host:<27} {marker} ({detail})") if ok: successes += 1 else: failures.append(f"{name} ({host}): {detail}") print() total = len(peers) + 1 if failures: print(f"PARTIAL RESUME: {successes}/{total} machines cleared.") for f in failures: print(f" - {f}") print("Resolve the failures or manually clear HALT on each machine.") print_re_engage_instructions() return EXIT_PARTIAL print(f"Resume complete across {total} machine(s).") print_re_engage_instructions() return EXIT_OK if __name__ == "__main__": sys.exit(main())