1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
|
#!/usr/bin/env python3
"""Resume cross-agent comms after a halt.
See cross-agent-resume.md. Removes ~/.config/cross-agent-comms/HALT and
starts the cross-agent-watch.path systemd user unit. With --tailnet,
also removes HALT on every peer listed in peers.toml via SSH, reports
per-peer status, and exits non-zero on a partial resume.
Per the asymmetry rule: clearing HALT does NOT auto-resume agent polling.
Each session must explicitly re-engage.
"""
from __future__ import annotations
import argparse
import subprocess
import sys
import tomllib
from pathlib import Path
# Per-user directory that holds the HALT sentinel and the peer list.
CONFIG_DIR = Path.home().joinpath(".config", "cross-agent-comms")
# Sentinel file that this script removes in order to resume.
HALT_FILE = CONFIG_DIR.joinpath("HALT")
# TOML document whose "peers" table is read when --tailnet is given.
PEERS_TOML = CONFIG_DIR.joinpath("peers.toml")

# Process exit statuses returned by main().
EXIT_OK = 0
EXIT_PARTIAL = 1
def err(msg: str) -> None:
    """Print *msg* as a single line on standard error."""
    # sys.stderr is looked up on every call, so later redirection is honoured.
    stream = sys.stderr
    print(msg, file=stream)
def remove_halt_file() -> bool:
    """Delete the local HALT file.

    Returns:
        True if HALT was removed by this call. False if there was nothing to
        remove, or if removal failed (the failure is reported on stderr).
    """
    try:
        # Unlink directly instead of testing exists() first: exists() follows
        # symlinks, so a dangling HALT symlink would be reported as absent and
        # left in place, and a check-then-unlink pair races with other removers.
        HALT_FILE.unlink()
    except (FileNotFoundError, NotADirectoryError):
        # Nothing at that path: no HALT was active.
        return False
    except OSError as e:
        err(f"could not remove HALT: {e}")
        return False
    return True
def start_watcher_service() -> None:
    """Ask the user's systemd instance to start cross-agent-watch.path.

    Best effort only: the command's output is captured and discarded, its
    exit status is not inspected, and a missing ``systemctl`` binary or a
    run longer than five seconds is silently ignored.
    """
    command = ["systemctl", "--user", "start", "cross-agent-watch.path"]
    try:
        subprocess.run(command, capture_output=True, text=True, timeout=5)
    except FileNotFoundError:
        # systemctl could not be executed on this machine.
        return
    except subprocess.TimeoutExpired:
        # systemctl did not finish within the timeout.
        return
def load_peers() -> dict:
    """Read and parse peers.toml.

    Returns:
        The parsed TOML document as a dict, or ``{}`` when the file does not
        exist, cannot be read, is not valid UTF-8, or is not valid TOML. All
        cases except a missing file are reported on stderr.
    """
    try:
        # TOML documents are UTF-8 by specification, so decode explicitly
        # rather than with the locale's default encoding.
        return tomllib.loads(PEERS_TOML.read_text(encoding="utf-8"))
    except (FileNotFoundError, NotADirectoryError):
        # No peers file is a normal configuration, not an error.
        return {}
    except (tomllib.TOMLDecodeError, UnicodeDecodeError, OSError) as e:
        err(f"cannot parse peers.toml: {e}")
        return {}
def ssh_remove_halt(host: str, ssh_user: str | None) -> tuple[bool, str]:
target = f"{ssh_user}@{host}" if ssh_user else host
remote_cmd = "rm -f ~/.config/cross-agent-comms/HALT"
try:
result = subprocess.run(
["ssh", "-o", "ConnectTimeout=3", "-o", "BatchMode=yes", target, remote_cmd],
capture_output=True, text=True, timeout=10,
)
except (FileNotFoundError, subprocess.TimeoutExpired):
return False, "ssh unavailable or timed out"
if result.returncode == 0:
return True, "HALT cleared"
return False, (result.stderr.strip().splitlines() or [f"exit {result.returncode}"])[-1]
def print_re_engage_instructions() -> None:
    """Print the reminder that agents must be told explicitly to resume polling."""
    message_lines = (
        "",
        "Halt cleared. Watcher restarted.",
        "",
        "Agent polling does NOT auto-resume — per the failsafe asymmetry rule,",
        "agents stay paused until you explicitly re-engage each session.",
        "Open the relevant Claude session and tell the agent to resume polling",
        "for its conversation.",
    )
    # One print call; the joined text is the same as one print() per line.
    print("\n".join(message_lines))
def _halt_still_present() -> bool:
    """Return True unless the local HALT path is verifiably gone."""
    try:
        # lstat() rather than exists(): a dangling symlink named HALT is still
        # a directory entry, and exists() would report it as absent.
        HALT_FILE.lstat()
    except (FileNotFoundError, NotADirectoryError):
        return False
    except OSError:
        # The path cannot be inspected; err on the side of reporting failure.
        return True
    return True


def main() -> int:
    """Clear the local HALT, start the watcher, and optionally clear peers.

    With ``--tailnet`` every entry of the ``peers`` table in peers.toml is
    contacted via ssh and a per-peer status line is printed.

    Returns:
        EXIT_OK when HALT is clear on every machine that was attempted,
        EXIT_PARTIAL when the local HALT or any peer's HALT could not be
        cleared.
    """
    parser = argparse.ArgumentParser(description="Resume cross-agent comms after a halt.")
    parser.add_argument("--tailnet", action="store_true",
                        help="Propagate HALT removal to every peer in peers.toml")
    args = parser.parse_args()

    removed = remove_halt_file()
    # remove_halt_file() returns False both for "no HALT" and for "removal
    # failed", so check the filesystem to tell the two apart.
    local_failed = not removed and _halt_still_present()
    start_watcher_service()

    if local_failed:
        print("Resuming locally ✗ (could not remove HALT)")
    elif removed:
        print("Resuming locally ✓ (HALT cleared)")
    else:
        print("Resuming locally ✓ (no HALT was active)")

    peers = {}
    if args.tailnet:
        peers = load_peers().get("peers", {})
        if not isinstance(peers, dict):
            # e.g. `peers = ["a", "b"]`; .items() below needs a table.
            err("peers.toml: 'peers' must be a table of peer entries; ignoring it")
            peers = {}

    if not peers:
        if local_failed:
            # Nothing was cleared, so do not print the "Halt cleared" text.
            err(f"HALT is still present at {HALT_FILE}; clear it manually and re-run.")
            return EXIT_PARTIAL
        if args.tailnet:
            print()
            print("No peers configured in peers.toml — local-only resume complete.")
        print_re_engage_instructions()
        return EXIT_OK

    print()
    # The local machine counts as one of the machines in the summary.
    successes = 0 if local_failed else 1
    failures = [f"local: HALT still present at {HALT_FILE}"] if local_failed else []
    for name, cfg in sorted(peers.items()):
        if not isinstance(cfg, dict):
            # e.g. `peers.foo = "host"`; report it instead of crashing on .get().
            detail = "invalid peer entry in peers.toml (expected a table)"
            print(f"Resuming {name:<27} ✗ ({detail})")
            failures.append(f"{name}: {detail}")
            continue
        host = cfg.get("host", name)  # the peer's key doubles as its host name
        ssh_user = cfg.get("ssh_user")
        ok, detail = ssh_remove_halt(host, ssh_user)
        marker = "✓" if ok else "✗"
        print(f"Resuming {host:<27} {marker} ({detail})")
        if ok:
            successes += 1
        else:
            failures.append(f"{name} ({host}): {detail}")

    print()
    total = len(peers) + 1
    if failures:
        print(f"PARTIAL RESUME: {successes}/{total} machines cleared.")
        for f in failures:
            print(f"  - {f}")
        print("Resolve the failures or manually clear HALT on each machine.")
        print_re_engage_instructions()
        return EXIT_PARTIAL

    print(f"Resume complete across {total} machine(s).")
    print_re_engage_instructions()
    return EXIT_OK
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|