1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
|
#!/usr/bin/env python3
"""Failsafe halt for cross-agent comms.
See cross-agent-halt.md. Writes ~/.config/cross-agent-comms/HALT and stops
the cross-agent-watch.path systemd user unit. With --tailnet, propagates the
HALT file to every peer in peers.toml via SSH; reports per-peer status with
non-zero exit on partial halt.
Does NOT pkill in-flight scripts — they detect HALT on next iteration and
stop themselves.
"""
from __future__ import annotations
import argparse
import subprocess
import sys
import tomllib
from pathlib import Path
# Directory under the user's home that holds the HALT marker and peers.toml.
CONFIG_DIR = Path.home() / ".config" / "cross-agent-comms"
# Marker file written by write_halt_file(); holds the optional halt reason.
HALT_FILE = CONFIG_DIR / "HALT"
# Peer inventory read by load_peers() when --tailnet is given.
PEERS_TOML = CONFIG_DIR / "peers.toml"
# Exit codes returned by main(): full success vs. at least one peer not halted.
EXIT_OK = 0
EXIT_PARTIAL = 1
def err(msg: str) -> None:
    """Emit ``msg`` as a single newline-terminated line on standard error."""
    sys.stderr.write(f"{msg}\n")
def write_halt_file(reason: str) -> None:
    """Create (or overwrite) the local HALT marker file.

    The config directory is created first if it does not exist. The file
    contains ``reason`` followed by a newline, or is empty when ``reason``
    is empty.

    Args:
        reason: Optional human-readable explanation stored in the file.

    Raises:
        OSError: If the directory or the file cannot be written.
    """
    CONFIG_DIR.mkdir(parents=True, exist_ok=True)
    # Pin the encoding: the reason is free text and may contain non-ASCII
    # characters that the locale's default encoding cannot represent.
    HALT_FILE.write_text(f"{reason}\n" if reason else "", encoding="utf-8")
def stop_watcher_service() -> None:
    """Best-effort stop of the systemd watcher unit.

    Runs ``systemctl --user stop cross-agent-watch.path`` with a 5 second
    timeout. A timeout or a non-zero exit status is reported on stderr but
    never raised; a missing ``systemctl`` binary is ignored silently.
    """
    unit = "cross-agent-watch.path"
    try:
        result = subprocess.run(
            ["systemctl", "--user", "stop", unit],
            capture_output=True, text=True, errors="replace", timeout=5,
        )
    except FileNotFoundError:
        # No systemctl on this machine, so there is no watcher to stop — fine.
        return
    except subprocess.TimeoutExpired:
        err(f"warning: timed out stopping {unit}")
        return
    if result.returncode != 0:
        # Surface the failure so the operator knows the watcher may still be
        # running; the HALT file remains the authoritative stop signal.
        detail = result.stderr.strip() or f"exit {result.returncode}"
        err(f"warning: could not stop {unit}: {detail}")
def load_peers() -> dict:
    """Load and parse ``peers.toml``.

    Returns:
        The parsed TOML document as a dict, or ``{}`` when the file is
        missing, unreadable, or malformed. Every failure other than a
        missing file is reported on stderr.
    """
    try:
        # tomllib.load() takes a binary stream and decodes it as UTF-8 itself,
        # so parsing does not depend on the locale's default encoding.
        with PEERS_TOML.open("rb") as fh:
            return tomllib.load(fh)
    except FileNotFoundError:
        # No peers file simply means no peers are configured.
        return {}
    except (tomllib.TOMLDecodeError, UnicodeDecodeError, OSError) as e:
        err(f"cannot parse peers.toml: {e}")
        return {}
def ssh_touch_halt(host: str, ssh_user: str | None, reason: str) -> tuple[bool, str]:
    """Write the HALT file on a remote peer over SSH.

    Args:
        host: Hostname or address of the peer.
        ssh_user: Login user; when falsy, ``host`` alone is the destination.
        reason: Text written into the remote HALT file (may be empty).

    Returns:
        ``(ok, detail)``. ``ok`` is True when the remote command exited 0.
        ``detail`` is a short status: a success note, the last line of ssh's
        stderr, ``exit N`` when stderr was empty, or a local error
        description.
    """
    target = f"{ssh_user}@{host}" if ssh_user else str(host)
    # Refuse destinations that are empty or begin with "-": ssh would parse
    # the latter as a command-line option rather than as a host.
    if not target or target.startswith("-"):
        return False, f"invalid ssh destination {target!r}"

    # Build the remote command. The reason is single-quoted for the remote
    # shell so arbitrary text cannot break out of the printf argument.
    # NOTE(review): unlike write_halt_file(), this writes no trailing newline
    # after the reason — confirm whether readers of HALT care.
    remote_cmd = (
        f"mkdir -p ~/.config/cross-agent-comms && "
        f"printf %s {_sh_quote(reason)} > ~/.config/cross-agent-comms/HALT"
    )
    try:
        result = subprocess.run(
            ["ssh", "-o", "ConnectTimeout=3", "-o", "BatchMode=yes", target, remote_cmd],
            # errors="replace": remote stderr may not be valid in our locale.
            capture_output=True, text=True, errors="replace", timeout=10,
        )
    except FileNotFoundError:
        return False, "ssh unavailable (not installed)"
    except subprocess.TimeoutExpired:
        return False, "ssh timed out"

    if result.returncode == 0:
        return True, "HALT file written"
    # The last stderr line is normally the most specific ssh diagnostic.
    stderr_lines = result.stderr.strip().splitlines()
    return False, stderr_lines[-1] if stderr_lines else f"exit {result.returncode}"
def _sh_quote(s: str) -> str:
return "'" + s.replace("'", "'\"'\"'") + "'"
def main() -> int:
    """Halt cross-agent comms locally and, with ``--tailnet``, on every peer.

    Parses the command line, writes the local HALT file, stops the watcher
    unit, and (when ``--tailnet`` is given) writes the HALT file on each peer
    listed under ``peers`` in peers.toml, printing one status line per peer.

    Returns:
        EXIT_OK when every targeted machine was halted (or no peers are
        configured); EXIT_PARTIAL when at least one peer could not be halted
        or the ``peers`` value in peers.toml is not a table.
    """
    parser = argparse.ArgumentParser(description="Halt all cross-agent comms on this machine (and optionally tailnet).")
    parser.add_argument("reason", nargs="?", default="", help="Optional human-readable reason")
    parser.add_argument("--tailnet", action="store_true",
                        help="Propagate HALT to every peer in peers.toml")
    args = parser.parse_args()

    # Local halt.
    write_halt_file(args.reason)
    stop_watcher_service()
    print("Halting locally ✓ (HALT file written)")

    if not args.tailnet:
        print()
        print(f"Halt active. Remove {HALT_FILE} or run cross-agent-resume to clear.")
        print("Agent polling will stop within ~5 min (one cadence cycle).")
        return EXIT_OK

    # NOTE(review): load_peers() returns {} both for a missing peers.toml and
    # for an unreadable or malformed one, so a broken file is reported below
    # as "no peers" with a zero exit status. Telling the two apart would need
    # a change to load_peers()' return contract.
    peers = load_peers().get("peers", {})
    if not peers:
        print()
        print("No peers configured in peers.toml — local-only halt complete.")
        return EXIT_OK

    if not isinstance(peers, dict):
        # e.g. `[[peers]]` (array of tables): there are no name -> config
        # pairs to iterate, so nothing can be contacted.
        print()
        print("PARTIAL HALT: 'peers' in peers.toml is not a table; no peer was contacted.")
        return EXIT_PARTIAL

    print()
    failures = []
    for name in sorted(peers):
        cfg = peers[name]
        if not isinstance(cfg, dict):
            # A scalar entry such as `peers.foo = "host"` must not abort the
            # loop: record it as a failure and keep halting the other peers.
            detail = "invalid peer entry (expected a table)"
            print(f"Halting {name:<28} ✗ ({detail})")
            failures.append(f"{name}: {detail}")
            continue
        host = cfg.get("host", name)
        ssh_user = cfg.get("ssh_user")
        ok, detail = ssh_touch_halt(host, ssh_user, args.reason)
        marker = "✓" if ok else "✗"
        print(f"Halting {host:<28} {marker} ({detail})")
        if not ok:
            failures.append(f"{name} ({host}): {detail}")

    print()
    total = len(peers) + 1  # +1 for the local machine, halted above
    if failures:
        successes = total - len(failures)
        print(f"PARTIAL HALT: {successes}/{total} machines halted.")
        for f in failures:
            print(f"  - {f}")
        print("Resolve the failures or manually halt each machine.")
        return EXIT_PARTIAL
    print(f"Halt active across {total} machine(s).")
    return EXIT_OK
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())
|