diff options
Diffstat (limited to '.ai/scripts/cross-agent-comms')
| -rwxr-xr-x | .ai/scripts/cross-agent-comms/cross-agent-discover | 230 | ||||
| -rw-r--r-- | .ai/scripts/cross-agent-comms/cross-agent-discover.md | 155 | ||||
| -rwxr-xr-x | .ai/scripts/cross-agent-comms/cross-agent-halt | 134 | ||||
| -rw-r--r-- | .ai/scripts/cross-agent-comms/cross-agent-halt.md | 134 | ||||
| -rwxr-xr-x | .ai/scripts/cross-agent-comms/cross-agent-recv | 250 | ||||
| -rw-r--r-- | .ai/scripts/cross-agent-comms/cross-agent-recv.md | 218 | ||||
| -rwxr-xr-x | .ai/scripts/cross-agent-comms/cross-agent-resume | 145 | ||||
| -rw-r--r-- | .ai/scripts/cross-agent-comms/cross-agent-resume.md | 117 | ||||
| -rwxr-xr-x | .ai/scripts/cross-agent-comms/cross-agent-send | 356 | ||||
| -rw-r--r-- | .ai/scripts/cross-agent-comms/cross-agent-send.md | 199 | ||||
| -rwxr-xr-x | .ai/scripts/cross-agent-comms/cross-agent-status | 185 | ||||
| -rw-r--r-- | .ai/scripts/cross-agent-comms/cross-agent-status.md | 139 | ||||
| -rwxr-xr-x | .ai/scripts/cross-agent-comms/cross-agent-watch | 106 | ||||
| -rw-r--r-- | .ai/scripts/cross-agent-comms/cross-agent-watch.md | 130 |
14 files changed, 0 insertions, 2498 deletions
diff --git a/.ai/scripts/cross-agent-comms/cross-agent-discover b/.ai/scripts/cross-agent-comms/cross-agent-discover deleted file mode 100755 index 152cf27..0000000 --- a/.ai/scripts/cross-agent-comms/cross-agent-discover +++ /dev/null @@ -1,230 +0,0 @@ -#!/usr/bin/env python3 -"""Enumerate cross-agent destinations: local projects + tailnet peers. - -See cross-agent-discover.md. Local: scan ~/projects/*/.ai/. Peers: read -peers.toml, SSH-probe each for reachability. --enumerate-remote optionally -runs `ls -d ~/projects/*/.ai/` over SSH to list remote projects. - -Cache results for 5 min at ~/.cache/cross-agent-comms/discovery.json so -repeated invocations don't re-probe. - -HALT: prints a banner; otherwise continues. -""" - -from __future__ import annotations - -import argparse -import datetime as _dt -import json -import os -import subprocess -import sys -import time -import tomllib -from pathlib import Path - -CONFIG_DIR = Path.home() / ".config" / "cross-agent-comms" -PEERS_TOML = CONFIG_DIR / "peers.toml" -HALT_FILE = CONFIG_DIR / "HALT" -CACHE_DIR = Path.home() / ".cache" / "cross-agent-comms" -CACHE_FILE = CACHE_DIR / "discovery.json" -CACHE_TTL_SECONDS = 300 - -EXIT_OK = 0 -EXIT_GENERAL = 1 -EXIT_PEERS_TOML = 1 - - -def err(msg: str) -> None: - print(msg, file=sys.stderr) - - -def render_banner_if_halt() -> None: - if not HALT_FILE.exists(): - return - try: - reason = HALT_FILE.read_text().strip() - except OSError: - reason = "(HALT file unreadable; treated as halted)" - print("⚠ HALT ACTIVE — cross-agent comms paused") - if reason: - print(f" reason: {reason}") - print() - - -def enumerate_local_projects() -> list[str]: - projects_dir = Path.home() / "projects" - if not projects_dir.is_dir(): - return [] - found = [] - for child in sorted(projects_dir.iterdir()): - if child.is_dir() and (child / ".ai").is_dir(): - found.append(child.name) - return found - - -def load_peers() -> dict: - if not PEERS_TOML.exists(): - return {"peers": {}} - try: - return 
tomllib.loads(PEERS_TOML.read_text()) - except (tomllib.TOMLDecodeError, OSError) as e: - err(f"cannot parse peers.toml: {e}") - sys.exit(EXIT_PEERS_TOML) - - -def probe_peer_reachability(host: str, ssh_user: str | None) -> tuple[bool, str | None]: - """Run a short SSH probe with BatchMode=yes (no interactive prompt).""" - target = f"{ssh_user}@{host}" if ssh_user else host - try: - result = subprocess.run( - ["ssh", "-o", "ConnectTimeout=2", "-o", "BatchMode=yes", target, "true"], - capture_output=True, - text=True, - timeout=5, - ) - except (FileNotFoundError, subprocess.TimeoutExpired): - return False, "ssh probe failed" - if result.returncode == 0: - return True, None - return False, (result.stderr.strip().splitlines() or [f"exit {result.returncode}"])[-1] - - -def enumerate_remote_projects(host: str, ssh_user: str | None) -> list[str] | None: - target = f"{ssh_user}@{host}" if ssh_user else host - try: - result = subprocess.run( - [ - "ssh", "-o", "ConnectTimeout=3", "-o", "BatchMode=yes", target, - "ls -d ~/projects/*/.ai/ 2>/dev/null", - ], - capture_output=True, - text=True, - timeout=10, - ) - except (FileNotFoundError, subprocess.TimeoutExpired): - return None - if result.returncode != 0: - return None - projects = [] - for line in result.stdout.splitlines(): - # Each line looks like /home/<user>/projects/<name>/.ai/ - parts = line.rstrip("/").split("/") - if len(parts) >= 2 and parts[-1] == ".ai": - projects.append(parts[-2]) - return projects - - -def read_cache() -> dict | None: - if not CACHE_FILE.exists(): - return None - try: - age = time.time() - CACHE_FILE.stat().st_mtime - if age > CACHE_TTL_SECONDS: - return None - return json.loads(CACHE_FILE.read_text()) - except (OSError, json.JSONDecodeError): - return None - - -def write_cache(payload: dict) -> None: - CACHE_DIR.mkdir(parents=True, exist_ok=True) - CACHE_FILE.write_text(json.dumps(payload, indent=2)) - - -def discover(peer_filter: str | None, enumerate_remote: bool) -> dict: - local = 
enumerate_local_projects() - peers_cfg = load_peers().get("peers", {}) - - peers_out = [] - for name, cfg in sorted(peers_cfg.items()): - if peer_filter and name != peer_filter: - continue - host = cfg.get("host", name) - ssh_user = cfg.get("ssh_user") - reachable, error = probe_peer_reachability(host, ssh_user) - entry = { - "name": name, - "host": host, - "reachable": reachable, - } - if not reachable: - entry["error"] = error - if enumerate_remote and reachable: - entry["projects"] = enumerate_remote_projects(host, ssh_user) or [] - peers_out.append(entry) - - return { - "scanned_at": _dt.datetime.now(_dt.timezone.utc).isoformat(), - "halt_active": HALT_FILE.exists(), - "local": local, - "peers": peers_out, - } - - -def render_table(payload: dict, enumerate_remote: bool) -> None: - local = payload.get("local", []) - print(f"Local ({_local_hostname()}):") - if local: - wrapped = ", ".join(local) - print(f" {wrapped} [{len(local)} project{'s' if len(local) != 1 else ''}]") - else: - print(" (no projects with .ai/ found)") - print() - - peers = payload.get("peers", []) - if not peers: - print("Peers (from peers.toml):") - print(" (no peers configured)") - return - - print("Peers (from ~/.config/cross-agent-comms/peers.toml):") - for p in peers: - marker = "✓ reachable" if p.get("reachable") else f"✗ UNREACHABLE ({p.get('error', 'unknown')})" - print(f" {p['name']:<16} {p['host']:<24} {marker}") - if enumerate_remote and p.get("projects"): - wrapped = ", ".join(p["projects"]) - print(f" projects: {wrapped}") - - -def _local_hostname() -> str: - import socket - return socket.gethostname().split(".")[0] - - -def main() -> int: - parser = argparse.ArgumentParser(description="Discover cross-agent destinations.") - parser.add_argument("--enumerate-remote", action="store_true", - help="SSH into each peer and list ~/projects/*/.ai/") - parser.add_argument("--no-cache", action="store_true", help="Skip cache; force fresh probe") - parser.add_argument("--peer", help="Limit to 
a single peer name from peers.toml") - parser.add_argument("--json", action="store_true", help="Machine-readable output") - args = parser.parse_args() - - render_banner_if_halt() - - payload = None - if not args.no_cache: - cached = read_cache() - if cached is not None: - # Honor --peer filter on cached payload. - if args.peer: - cached["peers"] = [p for p in cached.get("peers", []) if p["name"] == args.peer] - payload = cached - - if payload is None: - payload = discover(args.peer, args.enumerate_remote) - if not args.no_cache and not args.peer: - # Only cache full (unfiltered) discoveries. - write_cache(payload) - - if args.json: - print(json.dumps(payload, indent=2)) - return EXIT_OK - - render_table(payload, args.enumerate_remote) - return EXIT_OK - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/.ai/scripts/cross-agent-comms/cross-agent-discover.md b/.ai/scripts/cross-agent-comms/cross-agent-discover.md deleted file mode 100644 index 95134bb..0000000 --- a/.ai/scripts/cross-agent-comms/cross-agent-discover.md +++ /dev/null @@ -1,155 +0,0 @@ -# cross-agent-discover - -**Purpose.** Enumerate available cross-agent destinations — local projects on -this machine and remote projects on tailnet peers. Validates SSH reachability -for cross-machine destinations before reporting them as usable. - -## Usage - -``` -cross-agent-discover [--enumerate-remote] [--no-cache] [--peer <name>] -``` - -No args required for the common case (local enumeration + peer reachability). - -### Flags - -| Flag | Default | Purpose | -|---|---|---| -| `--enumerate-remote` | off | SSH into each peer and list projects under `~/projects/*/.ai/`. Off by default because SSH adds latency; turn on when you want to see what's available on a remote machine you haven't fully configured. | -| `--no-cache` | off | Skip the 5-minute cache; force fresh discovery. | -| `--peer <name>` | (all) | Limit to a single peer from `peers.toml`. | -| `--json` | off | Machine-readable output. 
| - -## Output - -### Default - -``` -$ cross-agent-discover -Local (ratio): - career, claude-templates, clipper, danneel, documents, elibrary, - finances, health, homelab, jr-estate, kit, little-elisper, - philosophy, website [14 projects] - -Peers (from ~/.config/cross-agent-comms/peers.toml): - velox.local reachable (last seen 2 sec ago) - bastion.local UNREACHABLE (ssh exit 255: connection refused) -``` - -### With `--enumerate-remote` - -``` -$ cross-agent-discover --enumerate-remote -Local (ratio): - ... (as above) - -velox.local (reachable): - career, homelab [2 projects] -``` - -## Configuration - -Reads `~/.config/cross-agent-comms/peers.toml`: - -```toml -# Each peer is a remote machine reachable via SSH (typically over Tailscale). - -[peers.velox] -host = "velox.local" -ssh_user = "cjennings" - -[peers.bastion] -host = "bastion.local" -ssh_user = "cjennings" -``` - -Peers entries describe machines, NOT projects. Projects are enumerated -on-demand under `~/projects/*/.ai/` either locally or via SSH. - -## Cache - -Successful discovery results are cached at -`~/.cache/cross-agent-comms/discovery.json` for 5 minutes. Repeated invocations -within the window read from cache. - -`--no-cache` forces a fresh probe. Useful when adding a new peer or after a -network change. - -## SSH reachability check - -For each peer, runs: - -``` -ssh -o ConnectTimeout=2 -o BatchMode=yes <user>@<host> true -``` - -`BatchMode=yes` prevents interactive password prompts — peers that don't have -key-based auth set up are reported as UNREACHABLE. - -If `--enumerate-remote` is set, on success runs: - -``` -ssh <user>@<host> 'ls -d ~/projects/*/.ai/ 2>/dev/null' -``` - -## Failure modes - -| Symptom | Likely cause | Fix | -|---|---|---| -| Peer reported UNREACHABLE | Tailscale not connected, SSH key not authorized, host firewalled | `tailscale status`; `ssh -v <peer>` to debug. 
| -| Local list is empty | Glob misresolved, or `~/projects/` doesn't exist | Check `ls -d ~/projects/*/.ai/`. | -| `--enumerate-remote` slow | Cold cache, slow tailnet, many peers | First run is slow, subsequent runs hit cache. Use `--peer <name>` to scope. | -| Peer unexpectedly missing from output | Not in `peers.toml`, or `peers.toml` malformed | `cat ~/.config/cross-agent-comms/peers.toml` and validate. | - -## HALT awareness - -Checks `~/.config/cross-agent-comms/HALT` at start. If HALT exists, prints a -prominent banner before normal output: - -``` -$ cross-agent-discover -⚠ HALT ACTIVE — cross-agent comms paused - Reason: <reason from HALT file body, if any> - Resume with: cross-agent-resume - -(enumeration continues normally — HALT does not suppress visibility) - -Local (ratio): - career, claude-templates, ... - -Peers: - velox.local reachable -``` - -Discover is read-only. Like `cross-agent-status`, it always runs so the user -keeps visibility into what destinations exist regardless of halt state. The -banner makes the halt state impossible to miss. - -If the HALT file exists but is unreadable, print a warning banner and -continue. - -See `cross-agent-halt.md` for the full halt mechanism. - -## Examples - -```bash -# Common: see what's available -cross-agent-discover - -# Force fresh probe after network change -cross-agent-discover --no-cache - -# What's on velox specifically -cross-agent-discover --peer velox --enumerate-remote - -# Pipe to grep -cross-agent-discover --json | jq '.peers[] | select(.reachable)' -``` - -## See also - -- `cross-agent-send` — uses `peers.toml` for routing destinations. -- `cross-agent-status` — local pending messages. -- `cross-agent-comms.org` — protocol spec, `* Limitations` section - explains the cross-machine model. 
diff --git a/.ai/scripts/cross-agent-comms/cross-agent-halt b/.ai/scripts/cross-agent-comms/cross-agent-halt deleted file mode 100755 index df25115..0000000 --- a/.ai/scripts/cross-agent-comms/cross-agent-halt +++ /dev/null @@ -1,134 +0,0 @@ -#!/usr/bin/env python3 -"""Failsafe halt for cross-agent comms. - -See cross-agent-halt.md. Touches ~/.config/cross-agent-comms/HALT and stops -the cross-agent-watch systemd user service. With --tailnet, propagates the -HALT file to every peer in peers.toml via SSH; reports per-peer status with -non-zero exit on partial halt. - -Does NOT pkill in-flight scripts — they detect HALT on next iteration and -stop themselves. -""" - -from __future__ import annotations - -import argparse -import subprocess -import sys -import tomllib -from pathlib import Path - -CONFIG_DIR = Path.home() / ".config" / "cross-agent-comms" -HALT_FILE = CONFIG_DIR / "HALT" -PEERS_TOML = CONFIG_DIR / "peers.toml" - -EXIT_OK = 0 -EXIT_PARTIAL = 1 - - -def err(msg: str) -> None: - print(msg, file=sys.stderr) - - -def write_halt_file(reason: str) -> None: - CONFIG_DIR.mkdir(parents=True, exist_ok=True) - HALT_FILE.write_text((reason + "\n") if reason else "") - - -def stop_watcher_service() -> None: - """Best-effort stop of the systemd watcher service. Failures are logged but not fatal.""" - try: - subprocess.run( - ["systemctl", "--user", "stop", "cross-agent-watch.path"], - capture_output=True, text=True, timeout=5, - ) - except (FileNotFoundError, subprocess.TimeoutExpired): - # Watcher service may not be installed — fine. - pass - - -def load_peers() -> dict: - if not PEERS_TOML.exists(): - return {} - try: - return tomllib.loads(PEERS_TOML.read_text()) - except (tomllib.TOMLDecodeError, OSError) as e: - err(f"cannot parse peers.toml: {e}") - return {} - - -def ssh_touch_halt(host: str, ssh_user: str | None, reason: str) -> tuple[bool, str]: - target = f"{ssh_user}@{host}" if ssh_user else host - # Build the remote command. Quote the reason carefully. 
- remote_cmd = ( - f"mkdir -p ~/.config/cross-agent-comms && " - f"printf %s {_sh_quote(reason)} > ~/.config/cross-agent-comms/HALT" - ) - try: - result = subprocess.run( - ["ssh", "-o", "ConnectTimeout=3", "-o", "BatchMode=yes", target, remote_cmd], - capture_output=True, text=True, timeout=10, - ) - except (FileNotFoundError, subprocess.TimeoutExpired): - return False, "ssh unavailable or timed out" - if result.returncode == 0: - return True, "HALT file written" - return False, (result.stderr.strip().splitlines() or [f"exit {result.returncode}"])[-1] - - -def _sh_quote(s: str) -> str: - return "'" + s.replace("'", "'\"'\"'") + "'" - - -def main() -> int: - parser = argparse.ArgumentParser(description="Halt all cross-agent comms on this machine (and optionally tailnet).") - parser.add_argument("reason", nargs="?", default="", help="Optional human-readable reason") - parser.add_argument("--tailnet", action="store_true", - help="Propagate HALT to every peer in peers.toml") - args = parser.parse_args() - - # Local halt. - write_halt_file(args.reason) - stop_watcher_service() - print("Halting locally ✓ (HALT file written)") - - if not args.tailnet: - print() - print(f"Halt active. 
Remove {HALT_FILE} or run cross-agent-resume to clear.") - print("Agent polling will stop within ~5 min (one cadence cycle).") - return EXIT_OK - - peers = load_peers().get("peers", {}) - if not peers: - print() - print("No peers configured in peers.toml — local-only halt complete.") - return EXIT_OK - - print() - successes = 1 # local already counted - failures = [] - for name, cfg in sorted(peers.items()): - host = cfg.get("host", name) - ssh_user = cfg.get("ssh_user") - ok, detail = ssh_touch_halt(host, ssh_user, args.reason) - marker = "✓" if ok else "✗" - print(f"Halting {host:<28} {marker} ({detail})") - if ok: - successes += 1 - else: - failures.append(f"{name} ({host}): {detail}") - - print() - total = len(peers) + 1 - if failures: - print(f"PARTIAL HALT: {successes}/{total} machines halted.") - for f in failures: - print(f" - {f}") - print("Resolve the failures or manually halt each machine.") - return EXIT_PARTIAL - print(f"Halt active across {total} machine(s).") - return EXIT_OK - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/.ai/scripts/cross-agent-comms/cross-agent-halt.md b/.ai/scripts/cross-agent-comms/cross-agent-halt.md deleted file mode 100644 index b817fbc..0000000 --- a/.ai/scripts/cross-agent-comms/cross-agent-halt.md +++ /dev/null @@ -1,134 +0,0 @@ -# cross-agent-halt - -**Purpose.** Failsafe stop for all cross-agent activity on the local machine -(or, with `--tailnet`, across all configured peers). Creates the HALT file -that every component in the protocol checks; within one polling cadence -(~5 min) all polling, sending, watching, and receiving stops. - -This is the user's emergency brake. Use when something is misbehaving and -visiting individual sessions is too slow. - -## Usage - -``` -cross-agent-halt [reason] [--tailnet] [--no-stop-watcher] -``` - -### Positional argument - -| Position | Meaning | Example | -|---|---|---| -| 1 | Optional human-readable reason for the halt. Written into the HALT file's body. 
Helps future-you remember why you stopped things. | `"investigating runaway poll loop, 2026-04-27"` | - -### Flags - -| Flag | Default | Purpose | -|---|---|---| -| `--tailnet` | local only | Propagate halt to every peer in `peers.toml` via SSH over Tailscale. | -| `--no-stop-watcher` | (stops watcher) | Skip stopping the `cross-agent-watch.path` systemd unit. Useful if the watcher is intentionally separate from comms (rare). | - -## Behavior - -### Local halt (default) - -1. Write the HALT file: `~/.config/cross-agent-comms/HALT`. If a `[reason]` was - passed, write it as the file's body. Otherwise the file is empty (existence - alone triggers halt). -2. Stop the watcher service: `systemctl --user stop cross-agent-watch.path` - (and the corresponding `.service` if running). -3. Print a summary: - ``` - ✓ HALT file written: ~/.config/cross-agent-comms/HALT - ✓ Watcher service stopped (cross-agent-watch.path) - - In-flight sends will complete their current rsync step (~seconds), then - stop. New sends are blocked. - - Active agent polling sessions stop within one cadence (~5 min). - - Use `cross-agent-resume` to clear HALT. - Per-session polling does NOT auto-resume — you re-engage each session by - telling its agent to resume polling. - ``` -4. Exit 0. - -### Cross-tailnet halt (`--tailnet`) - -1. Apply local halt steps 1-2 first. -2. Read `peers.toml` for the list of remote machines. -3. For each peer, SSH and write the HALT file: - ``` - ssh <user>@<host> "echo '<reason>' > ~/.config/cross-agent-comms/HALT && \ - systemctl --user stop cross-agent-watch.path" - ``` -4. Track per-peer success/failure. Print results: - ``` - Halting velox.local ✓ (HALT file written) - Halting bastion.local ✗ (ssh exit 255: no route to host) - Halting locally ✓ (HALT file written) - - PARTIAL HALT: 2/3 machines halted. bastion.local needs manual halt. - ``` -5. Exit 0 if all peers halted; exit 1 if any peer failed (so scripts can - detect partial halt). 
The local halt always succeeds — even on `--tailnet`, - if remote peers fail, local is still halted. - -## What "halt active" means for each component - -| Component | Behavior under HALT | -|---|---| -| `cross-agent-send` | Refuses to send. Exits 5 with "halt active; remove ~/.config/cross-agent-comms/HALT to resume." Checks HALT at start AND between each retry/rsync step, so an in-flight send completes its current step then stops. | -| `cross-agent-recv` | Refuses to verify or dedup. Exits 5 with same message. Inbound files are **left in place** — not moved, not rejected — so resume picks them up cleanly via cold-start. | -| `cross-agent-watch` | Continues running but suppresses notifications. Logs each event with `(suppressed by HALT)` so the operator can see what would have fired. | -| `cross-agent-status` | Prints prominent `⚠ HALT ACTIVE` banner before normal output. Continues to enumerate (read-only). | -| `cross-agent-discover` | Same banner. Continues (read-only). | -| Agent polling loops | Check HALT on every wake. If set: write a final `progress` note to any active conversation ("HALT fired locally; pausing"), surface "(HALT active; cross-agent comms paused)" in every user response, and stop rescheduling. Polling decays naturally within one cadence. | -| Conversation initiator | Refuses to write sequence 1 of any new conversation. Surfaces refusal to user. | -| Startup workflow (Phase A) | Checks HALT at session boot. If set, surfaces immediately and skips cross-agent inbox checks. | - -## Failure modes - -| Symptom | Cause | Fix | -|---|---|---| -| `~/.config/cross-agent-comms/HALT` already exists | Halt was already active | OK — running halt again refreshes the reason text. Safe. | -| `systemctl --user stop` fails | Watcher service not installed, or systemd not available | The HALT file is still written — components that check HALT will still stop. The systemctl failure surfaces as a non-fatal warning. 
| -| `--tailnet` halts some peers but not others | One or more peers unreachable | Exit 1 with per-peer status. Manually halt the unreachable peers (visit each machine, `touch ~/.config/cross-agent-comms/HALT`), or fix the network and re-run. | -| Permission denied writing the HALT file | `~/.config/cross-agent-comms/` doesn't exist or is owned by another user | `mkdir -p ~/.config/cross-agent-comms/`; check ownership. | - -## What halt does NOT do - -- Does not kill running Claude sessions. Polling stops within ~5 min, but the - session itself stays alive and can be re-engaged after resume. -- Does not delete pending messages. Inbound files in `inbox/from-agents/` - remain; they get processed when polling resumes. -- Does not abort in-flight rsync push mid-byte. Atomic-write semantics - guarantee in-flight messages either complete cleanly or leave only `.tmp.*` - files (which receivers ignore). - -## Examples - -```bash -# Quick halt with no reason -cross-agent-halt - -# Halt with a memo -cross-agent-halt "runaway poll loop in homelab session, debugging" - -# Halt all tailnet peers + local -cross-agent-halt --tailnet "shutting down for system update" - -# Halt protocol comms but leave the watcher service running -cross-agent-halt --no-stop-watcher -``` - -## Recovery - -Always pair with `cross-agent-resume` when the situation is resolved: - -```bash -cross-agent-resume # local -cross-agent-resume --tailnet # all peers -``` - -## See also - -- `cross-agent-resume` — counterpart that clears HALT. -- `cross-agent-status` — see HALT state at a glance. -- `cross-agent-comms.org` — protocol spec, `* Halt mechanism` section. diff --git a/.ai/scripts/cross-agent-comms/cross-agent-recv b/.ai/scripts/cross-agent-comms/cross-agent-recv deleted file mode 100755 index b67533a..0000000 --- a/.ai/scripts/cross-agent-comms/cross-agent-recv +++ /dev/null @@ -1,250 +0,0 @@ -#!/usr/bin/env python3 -"""Cross-agent message receiver. - -See cross-agent-recv.md for the full contract. 
Reads one message file and -emits a structured decision the agent acts on: - - process | dedup | query | reject - -Decision exit codes: - 0 = process 1 = dedup 2 = query 3 = reject - -When HALT is set, the script refuses to verify or dedup and leaves the -inbound file in place — resume picks it up via cold-start. -""" - -from __future__ import annotations - -import argparse -import hashlib -import json -import re -import shutil -import subprocess -import sys -from pathlib import Path - -CONFIG_DIR = Path.home() / ".config" / "cross-agent-comms" -HALT_FILE = CONFIG_DIR / "HALT" -EXPECTED_PROTOCOL_VERSION = "5" - -REQUIRED_FRONTMATTER = ["TITLE", "CONVERSATION_ID", "MESSAGE_TYPE", "SEQUENCE", "TIMESTAMP", "PROTOCOL_VERSION"] -VALID_MESSAGE_TYPES = {"request", "progress", "query", "pushback", "complete", "release", "escalate"} - -DEC_PROCESS = "process" -DEC_DEDUP = "dedup" -DEC_QUERY = "query" -DEC_REJECT = "reject" - -EXIT_FOR_DECISION = { - DEC_PROCESS: 0, - DEC_DEDUP: 1, - DEC_QUERY: 2, - DEC_REJECT: 3, -} - -EXIT_HALT = 5 - - -def err(msg: str) -> None: - print(msg, file=sys.stderr) - - -def check_halt() -> None: - if HALT_FILE.exists(): - try: - reason = HALT_FILE.read_text().strip() - except OSError: - err("halt active (HALT file present but unreadable; treated as halted)") - sys.exit(EXIT_HALT) - msg = "halt active; leaving inbound message in place (resume will pick up)" - if reason: - msg = f"{msg}: {reason}" - err(msg) - sys.exit(EXIT_HALT) - - -def parse_frontmatter(path: Path) -> dict[str, str]: - try: - text = path.read_text() - except OSError as e: - return {"_parse_error": f"cannot read: {e}"} - fm: dict[str, str] = {} - for line in text.splitlines(): - line = line.rstrip() - if not line: - if fm: - break - continue - m = re.match(r"#\+([A-Z_]+):\s*(.*)", line) - if m: - fm[m.group(1)] = m.group(2).strip() - elif fm: - break - return fm - - -def emit_decision( - decision: str, - reason: str | None, - fm: dict[str, str], - sha256: str | None, - args: 
argparse.Namespace, -) -> int: - payload = { - "decision": decision, - "reason": reason, - "message_type": fm.get("MESSAGE_TYPE"), - "conversation_id": fm.get("CONVERSATION_ID"), - "sequence": fm.get("SEQUENCE"), - "timestamp": fm.get("TIMESTAMP"), - "sha256": sha256, - } - if args.json: - print(json.dumps(payload, indent=None if args.compact_json else 2)) - else: - print(f"decision: {decision}") - if reason: - print(f"reason: {reason}") - for k in ("message_type", "conversation_id", "sequence", "timestamp"): - v = payload[k] - if v is not None: - print(f"{k}: {v}") - if sha256: - print(f"sha256: {sha256}") - return EXIT_FOR_DECISION[decision] - - -def gpg_verify(message_path: Path, sig_path: Path) -> tuple[bool, str]: - try: - result = subprocess.run( - ["gpg", "--verify", str(sig_path), str(message_path)], - capture_output=True, - text=True, - ) - except FileNotFoundError: - return False, "gpg not installed" - if result.returncode == 0: - return True, "" - return False, result.stderr.strip().splitlines()[-1] if result.stderr.strip() else f"exit {result.returncode}" - - -def sha256_of(path: Path) -> str: - h = hashlib.sha256() - with path.open("rb") as f: - for chunk in iter(lambda: f.read(65536), b""): - h.update(chunk) - return h.hexdigest() - - -def find_dedup_match(message_path: Path, fm: dict[str, str], my_hash: str) -> tuple[str, str | None]: - """Scan the message's directory for same-CONVERSATION_ID/SEQUENCE files. - - Returns (decision, reason) — decision is DEC_DEDUP for an exact-hash match, - or DEC_PROCESS when no match or hash differs (sequence collision is OK). 
- """ - parent = message_path.parent - conv_id = fm["CONVERSATION_ID"] - sequence = fm["SEQUENCE"] - for sibling in parent.iterdir(): - if sibling == message_path or not sibling.is_file() or sibling.suffix != ".org": - continue - sib_fm = parse_frontmatter(sibling) - if sib_fm.get("CONVERSATION_ID") != conv_id or sib_fm.get("SEQUENCE") != sequence: - continue - # Same conv-id + same sequence — check hash. - if sha256_of(sibling) == my_hash: - return DEC_DEDUP, f"identical retry of {sibling.name}" - return DEC_PROCESS, None - - -def check_requires_tools(fm: dict[str, str]) -> tuple[bool, list[str]]: - """REQUIRES_TOOLS is a comma-separated list of tool names. - - For v5, "tool available" is a heuristic: an executable on PATH whose name - matches the tool slug. MCP availability is currently out of scope (no - portable way to query it from a CLI). - """ - tools_field = fm.get("REQUIRES_TOOLS") - if not tools_field: - return True, [] - tools = [t.strip() for t in tools_field.split(",") if t.strip()] - missing = [t for t in tools if shutil.which(t) is None] - return len(missing) == 0, missing - - -def main() -> int: - parser = argparse.ArgumentParser(description="Receive and decide on a cross-agent message.") - parser.add_argument("message_file", type=Path) - parser.add_argument("--no-verify", action="store_true", help="Skip GPG verification (testing only)") - parser.add_argument("--no-dedup", action="store_true", help="Skip SHA-256 dedup against existing files") - parser.add_argument("--protocol-version", default=EXPECTED_PROTOCOL_VERSION, - help="Override expected protocol version (default: 5)") - parser.add_argument("--json", action="store_true", help="Emit JSON output") - parser.add_argument("--compact-json", action="store_true", help="Compact JSON (no indent)") - args = parser.parse_args() - - check_halt() - - if not args.message_file.is_file(): - err(f"message file not found: {args.message_file}") - return EXIT_FOR_DECISION[DEC_REJECT] - - fm = 
parse_frontmatter(args.message_file) - if "_parse_error" in fm: - return emit_decision(DEC_REJECT, fm["_parse_error"], {}, None, args) - - # Step 1: frontmatter sanity-check. - missing = [k for k in REQUIRED_FRONTMATTER if k not in fm] - if missing: - return emit_decision( - DEC_REJECT, f"frontmatter missing required fields: {', '.join(missing)}", fm, None, args - ) - if fm["MESSAGE_TYPE"] not in VALID_MESSAGE_TYPES: - return emit_decision( - DEC_REJECT, f"invalid MESSAGE_TYPE: {fm['MESSAGE_TYPE']!r}", fm, None, args - ) - - # Step 2: PROTOCOL_VERSION check. - if fm["PROTOCOL_VERSION"] != args.protocol_version: - return emit_decision( - DEC_QUERY, - f"PROTOCOL_VERSION mismatch: expected {args.protocol_version}, got {fm['PROTOCOL_VERSION']}", - fm, - None, - args, - ) - - # Step 3: GPG verify. - if not args.no_verify: - sig_path = args.message_file.with_suffix(args.message_file.suffix + ".asc") - if not sig_path.is_file(): - return emit_decision(DEC_REJECT, f"signature file missing: {sig_path.name}", fm, None, args) - ok, gpg_err = gpg_verify(args.message_file, sig_path) - if not ok: - return emit_decision(DEC_REJECT, f"gpg verify failed: {gpg_err}", fm, None, args) - - # Step 4: SHA-256 dedup. - my_hash = sha256_of(args.message_file) - if not args.no_dedup: - decision, reason = find_dedup_match(args.message_file, fm, my_hash) - if decision == DEC_DEDUP: - return emit_decision(DEC_DEDUP, reason, fm, my_hash, args) - - # Step 5: REQUIRES_TOOLS check. - ok, missing_tools = check_requires_tools(fm) - if not ok: - return emit_decision( - DEC_QUERY, - f"required tools unavailable: {', '.join(missing_tools)}", - fm, - my_hash, - args, - ) - - # Step 6: process. 
- return emit_decision(DEC_PROCESS, None, fm, my_hash, args) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/.ai/scripts/cross-agent-comms/cross-agent-recv.md b/.ai/scripts/cross-agent-comms/cross-agent-recv.md deleted file mode 100644 index 247a27a..0000000 --- a/.ai/scripts/cross-agent-comms/cross-agent-recv.md +++ /dev/null @@ -1,218 +0,0 @@ -# cross-agent-recv - -**Purpose.** The canonical receiver-side processor. Reads a single incoming -message file and reports a structured decision the agent acts on: -process / dedup / query / reject. - -The script handles only mechanical checks (frontmatter, signature, dedup, -version, tools). Substance-level decisions like `pushback` ("I disagree with -this request") happen one layer up — after the agent reads the message body -the script returns as `process`-able. - -This is the read-side counterpart to `cross-agent-send`. Together they are the -two halves of the per-message contract. The agent's polling loop calls -`cross-agent-recv` on every new file in `inbox/from-agents/` and dispatches on -the decision. - -Without this script, every receiver implementation re-invents GPG verify + -frontmatter sanity-check + SHA-256 dedup. With it, behavior is consistent -across projects. - -## Usage - -``` -cross-agent-recv <message-file> -``` - -Single positional argument: a `.org` file in `inbox/from-agents/`. The matching -`.asc` signature file must be present alongside it. - -### Flags - -| Flag | Default | Purpose | -|---|---|---| -| `--no-verify` | (verify on) | Skip GPG verification. Testing only. | -| `--no-dedup` | (dedup on) | Skip SHA-256 dedup against existing files. Testing only. | -| `--protocol-version <N>` | 5 | Override the expected protocol version. Useful for testing forward-compatibility checks. | -| `--json` | off | Output decision as JSON for easier parsing by the agent. | - -## Behavior - -Runs the receiver checks in order. First failure determines the decision. 
- -### Step 1 — Frontmatter sanity-check - -Parse the message's org-mode frontmatter. Required fields: - -- `#+TITLE` -- `#+CONVERSATION_ID` -- `#+MESSAGE_TYPE` (must be one of: `request`, `progress`, `query`, `pushback`, - `complete`, `release`, `escalate`) -- `#+SEQUENCE` (integer) -- `#+TIMESTAMP` (ISO 8601 with explicit offset) -- `#+PROTOCOL_VERSION` (must match the expected version; default 5) - -Any required field missing, malformed, or the protocol version mismatched → -decision = `reject` (frontmatter) or `query` (version mismatch — see below). - -### Step 2 — Protocol-version check - -If `PROTOCOL_VERSION` doesn't match the expected: - -- Decision = `query`. Action: receiver should write a `query` reply asking the - sender to upgrade to the expected protocol version. - -### Step 3 — Signature verification - -Look for `<message-file>.asc` alongside the `.org`. If missing or `gpg ---verify` fails: - -- Decision = `reject` (signature). Surface to user; do not act. - -The `.asc` file MUST be present when the `.org` is — `cross-agent-send` -guarantees this with its strict ordering (`.asc` lands first). If the `.asc` -is missing despite the `.org` being present, the sender violated atomic-write -ordering or the file was tampered with in transit. - -### Step 4 — SHA-256 dedup - -Compute SHA-256 of the message file. Scan the same directory for existing -files matching `CONVERSATION_ID + SEQUENCE`: - -- No match → decision = `process` (new message, dispatch by type). -- Match with **identical** SHA-256 → decision = `dedup` (silent retry; do not - reprocess). -- Match with **different** SHA-256 → decision = `process` (sequence collision - with non-identical content; both are legitimate, ordered by `#+TIMESTAMP`). - -### Step 5 — REQUIRES_TOOLS optional check - -If the message has a `#+REQUIRES_TOOLS` field, verify each named tool/MCP is -available in the receiver's environment. - -- All available → `process`. -- One or more missing → decision = `query`. 
The agent should write a `query` - reply naming the missing tools, asking the sender to reframe the request to - avoid them. - -### Step 6 — Dispatch decision - -If all checks pass, decision = `process` with the parsed `MESSAGE_TYPE` so the -agent's main loop knows which handler to invoke. - -## Output - -### Default (human-readable) - -``` -$ cross-agent-recv inbox/from-agents/20260427T091015Z-from-homelab-prep-fixup.org -decision: process -message_type: request -conversation_id: prep-fixup -sequence: 6 -sha256: a1b2c3d4... -``` - -### `--json` - -```json -{ - "decision": "process", - "reason": null, - "message_type": "request", - "conversation_id": "prep-fixup", - "sequence": 6, - "timestamp": "2026-04-27T04:11:42-05:00", - "sha256": "a1b2c3d4..." -} -``` - -For decisions other than `process`, `reason` carries a human-readable -explanation: - -```json -{ - "decision": "query", - "reason": "PROTOCOL_VERSION mismatch: expected 5, got 4", - "conversation_id": "prep-fixup", - "sequence": 6 -} -``` - -## Decision exit codes - -| Decision | Exit code | Agent action | -|---|---|---| -| `process` | 0 | Dispatch to the message-type handler | -| `dedup` | 1 | Silent — do nothing further | -| `query` | 2 | Write a `query` reply (see `reason` for what to ask) | -| `reject` | 3 | Surface to user; do not auto-reply | - -The agent reads stdout/JSON to learn the decision; it can also key off exit -code for simpler bash-style dispatching. - -## Failure modes - -| Symptom | Cause | Fix | -|---|---|---| -| `decision: reject (frontmatter)` | Required field missing or malformed | Open the message; fix or surface to user. The sender should not have produced this file. | -| `decision: reject (signature)` | `.asc` missing, GPG verify failed, or signer unknown | Check that `.asc` exists alongside `.org`. If yes, run `gpg --verify <msg>.asc <msg>` manually for diagnostic output. 
| -| `decision: query (PROTOCOL_VERSION)` | Sender on older/newer protocol | Reply with a `query` asking sender to upgrade. Both sides should align before continuing. | -| `decision: query (REQUIRES_TOOLS)` | Receiver lacks one of the named tools | Reply with a `query` naming the missing tools; sender should reframe to avoid. | -| `decision: dedup` | Already-processed identical retry | No action. The script handled it correctly. | - -## HALT awareness - -Checks `~/.config/cross-agent-comms/HALT` at the start of every invocation. If -HALT exists, exits with code 5 ("halt active; remove -~/.config/cross-agent-comms/HALT to resume") without verifying, deduping, or -returning a decision. - -**The inbound file is left in place** — not moved, not rejected, not -deduped. When HALT clears and polling resumes, the file gets picked up via -the normal cold-start handling (whichever surfaces first: watcher -notification, startup workflow check, or the next agent poll). Reversibility -is preserved. - -If the HALT file exists but is unreadable, fail-closed — treat as if HALT is -set. - -See `cross-agent-halt.md` for the full halt mechanism. - -## Examples - -```bash -# Basic invocation in an agent's polling loop -for msg in inbox/from-agents/*.org; do - decision=$(cross-agent-recv --json "$msg") - case "$(echo "$decision" | jq -r '.decision')" in - process) handle_message "$msg" ;; - dedup) ;; # silent - query) write_query_reply "$msg" "$decision" ;; - reject) surface_to_user "$msg" "$decision" ;; - esac -done - -# Test signature verification only -cross-agent-recv --no-dedup inbox/from-agents/test-msg.org - -# Test against a future protocol version -cross-agent-recv --protocol-version 6 inbox/from-agents/future-msg.org -``` - -## Performance - -The script is fast (single SHA-256 compute, single GPG verify, frontmatter -parse). For typical messages (single-digit KB), runs in well under 100ms. 
-Dedup-scan is O(N) over files in the directory; if a project's -`inbox/from-agents/` accumulates hundreds of files, archive released -conversations to keep the scan fast. - -## See also - -- `cross-agent-send` — counterpart writer. -- `cross-agent-watch` — fires when a new message arrives; agent then calls - `cross-agent-recv` to process it. -- `cross-agent-status` — pending-message snapshot (uses similar - released-vs-unreleased logic, but doesn't process individual messages). -- `cross-agent-comms.org` — protocol spec, the "what" the script implements. diff --git a/.ai/scripts/cross-agent-comms/cross-agent-resume b/.ai/scripts/cross-agent-comms/cross-agent-resume deleted file mode 100755 index 1fb83bc..0000000 --- a/.ai/scripts/cross-agent-comms/cross-agent-resume +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env python3 -"""Resume cross-agent comms after a halt. - -See cross-agent-resume.md. Removes ~/.config/cross-agent-comms/HALT and -restarts the cross-agent-watch systemd user service. With --tailnet, -propagates the removal to every peer in peers.toml via SSH; reports -per-peer status with non-zero exit on partial resume. - -Per the asymmetry rule: clearing HALT does NOT auto-resume agent polling. -Each session must explicitly re-engage. 
-""" - -from __future__ import annotations - -import argparse -import subprocess -import sys -import tomllib -from pathlib import Path - -CONFIG_DIR = Path.home() / ".config" / "cross-agent-comms" -HALT_FILE = CONFIG_DIR / "HALT" -PEERS_TOML = CONFIG_DIR / "peers.toml" - -EXIT_OK = 0 -EXIT_PARTIAL = 1 - - -def err(msg: str) -> None: - print(msg, file=sys.stderr) - - -def remove_halt_file() -> bool: - """Returns True if HALT was removed, False if it didn't exist.""" - if HALT_FILE.exists(): - try: - HALT_FILE.unlink() - return True - except OSError as e: - err(f"could not remove HALT: {e}") - return False - return False - - -def start_watcher_service() -> None: - """Best-effort start of the systemd watcher path unit.""" - try: - subprocess.run( - ["systemctl", "--user", "start", "cross-agent-watch.path"], - capture_output=True, text=True, timeout=5, - ) - except (FileNotFoundError, subprocess.TimeoutExpired): - pass - - -def load_peers() -> dict: - if not PEERS_TOML.exists(): - return {} - try: - return tomllib.loads(PEERS_TOML.read_text()) - except (tomllib.TOMLDecodeError, OSError) as e: - err(f"cannot parse peers.toml: {e}") - return {} - - -def ssh_remove_halt(host: str, ssh_user: str | None) -> tuple[bool, str]: - target = f"{ssh_user}@{host}" if ssh_user else host - remote_cmd = "rm -f ~/.config/cross-agent-comms/HALT" - try: - result = subprocess.run( - ["ssh", "-o", "ConnectTimeout=3", "-o", "BatchMode=yes", target, remote_cmd], - capture_output=True, text=True, timeout=10, - ) - except (FileNotFoundError, subprocess.TimeoutExpired): - return False, "ssh unavailable or timed out" - if result.returncode == 0: - return True, "HALT cleared" - return False, (result.stderr.strip().splitlines() or [f"exit {result.returncode}"])[-1] - - -def print_re_engage_instructions() -> None: - print() - print("Halt cleared. 
Watcher restarted.") - print() - print("Agent polling does NOT auto-resume — per the failsafe asymmetry rule,") - print("agents stay paused until you explicitly re-engage each session.") - print("Open the relevant Claude session and tell the agent to resume polling") - print("for its conversation.") - - -def main() -> int: - parser = argparse.ArgumentParser(description="Resume cross-agent comms after a halt.") - parser.add_argument("--tailnet", action="store_true", - help="Propagate HALT removal to every peer in peers.toml") - args = parser.parse_args() - - removed = remove_halt_file() - start_watcher_service() - if removed: - print("Resuming locally ✓ (HALT cleared)") - else: - print("Resuming locally ✓ (no HALT was active)") - - if not args.tailnet: - print_re_engage_instructions() - return EXIT_OK - - peers = load_peers().get("peers", {}) - if not peers: - print() - print("No peers configured in peers.toml — local-only resume complete.") - print_re_engage_instructions() - return EXIT_OK - - print() - successes = 1 - failures = [] - for name, cfg in sorted(peers.items()): - host = cfg.get("host", name) - ssh_user = cfg.get("ssh_user") - ok, detail = ssh_remove_halt(host, ssh_user) - marker = "✓" if ok else "✗" - print(f"Resuming {host:<27} {marker} ({detail})") - if ok: - successes += 1 - else: - failures.append(f"{name} ({host}): {detail}") - - print() - total = len(peers) + 1 - if failures: - print(f"PARTIAL RESUME: {successes}/{total} machines cleared.") - for f in failures: - print(f" - {f}") - print("Resolve the failures or manually clear HALT on each machine.") - print_re_engage_instructions() - return EXIT_PARTIAL - - print(f"Resume complete across {total} machine(s).") - print_re_engage_instructions() - return EXIT_OK - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/.ai/scripts/cross-agent-comms/cross-agent-resume.md b/.ai/scripts/cross-agent-comms/cross-agent-resume.md deleted file mode 100644 index 8aa8357..0000000 --- 
a/.ai/scripts/cross-agent-comms/cross-agent-resume.md +++ /dev/null @@ -1,117 +0,0 @@ -# cross-agent-resume - -**Purpose.** Clear the HALT file and restart the watcher service. Counterpart -to `cross-agent-halt`. Resuming agent polling is **explicit per-session** — -this script doesn't auto-revive halted polling loops; you tell each session -to re-engage. - -## Usage - -``` -cross-agent-resume [--tailnet] -``` - -### Flags - -| Flag | Default | Purpose | -|---|---|---| -| `--tailnet` | local only | Clear HALT on every peer in `peers.toml` via SSH over Tailscale. | - -## Behavior - -### Local resume (default) - -1. Remove the HALT file: `rm -f ~/.config/cross-agent-comms/HALT`. (Use `-f` - so a missing file isn't an error — running resume when not halted is safe.) -2. Restart the watcher service: `systemctl --user start cross-agent-watch.path`. -3. Print a summary: - ``` - ✓ HALT file removed - ✓ Watcher service started (cross-agent-watch.path) - - cross-agent-send and cross-agent-recv will accept new operations. - - Inbound messages held during halt will be picked up by the watcher. - - Agent polling does NOT auto-resume. To re-engage polling in a paused - session, open that Claude session and tell the agent to resume. - ``` -4. Exit 0. - -### Cross-tailnet resume (`--tailnet`) - -1. Apply local resume steps 1-2 first. -2. Read `peers.toml` for the list of remote machines. -3. For each peer, SSH: - ``` - ssh <user>@<host> "rm -f ~/.config/cross-agent-comms/HALT && \ - systemctl --user start cross-agent-watch.path" - ``` -4. Track per-peer success/failure: - ``` - Resuming velox.local ✓ (HALT cleared, watcher started) - Resuming bastion.local ✗ (ssh exit 255: no route to host) - Resuming locally ✓ - - PARTIAL RESUME: 2/3 machines resumed. bastion.local still halted. - ``` -5. Exit 0 if all peers resumed; exit 1 on any failure. - -## Why agent polling doesn't auto-resume - -Two reasons the asymmetry is deliberate: - -1. 
*Auto-resume could silently invert intentional kills.* If you halted - because a session was misbehaving, removing HALT shouldn't quietly revive - that session's polling. You re-engage explicitly so you're aware of which - sessions came back online. - -2. *You may want to inspect before resuming.* After a halt, you might want to - read pending messages, fix configuration, or kill a particular Claude - session entirely. Per-session resume forces that pause. - -## Re-engaging polling in a Claude session - -After `cross-agent-resume`, open the relevant Claude session and say something -like: - -``` -HALT is cleared; resume polling. -``` - -The agent will check the HALT file (now absent), re-create its polling -schedule, and continue the in-flight conversation from wherever it left off. -The conversation file is intact; the receiver will pick up any new messages -that arrived during the halt window. - -## Failure modes - -| Symptom | Cause | Fix | -|---|---|---| -| HALT file doesn't exist | Already resumed (or never halted) | OK — `-f` makes this a no-op. | -| `systemctl --user start` fails | Watcher service not installed | Install per `cross-agent-watch.md`'s systemd recipe. | -| `--tailnet` resumes some peers but not others | Same as halt: peer unreachable | Per-peer status reported; resolve manually for unreachable peers. | -| Permission denied removing HALT file | File owned by another user | Check ownership; HALT files should be owned by the running user. | - -## Examples - -```bash -# Local resume after a halt -cross-agent-resume - -# Resume all tailnet peers + local -cross-agent-resume --tailnet -``` - -## Recovery flow - -After a halt: - -1. Investigate whatever caused the halt (runaway loop, bad config, etc.). -2. Fix the underlying issue. -3. Run `cross-agent-resume`. -4. Open each Claude session that was polling and tell its agent to re-engage. -5. Confirm operation with `cross-agent-status`. 
- -## See also - -- `cross-agent-halt` — counterpart that creates the HALT file. -- `cross-agent-status` — verify HALT cleared and see pending messages. -- `cross-agent-comms.org` — protocol spec, `* Halt mechanism` section. diff --git a/.ai/scripts/cross-agent-comms/cross-agent-send b/.ai/scripts/cross-agent-comms/cross-agent-send deleted file mode 100755 index 68c010a..0000000 --- a/.ai/scripts/cross-agent-comms/cross-agent-send +++ /dev/null @@ -1,356 +0,0 @@ -#!/usr/bin/env python3 -"""Cross-agent message sender. - -See cross-agent-send.md for the full contract. Briefly: - -- Destination as <machine>.<project>; resolved via peers.toml. -- Same-machine: cp to receiver's inbox/from-agents/ with atomic rename. -- Cross-machine: rsync over SSH (typically Tailscale) with retry+backoff. -- GPG-signs by default; .asc renames before .org so receivers never see - a .org without its sibling signature. -- Generates the canonical filename; user's input filename is ignored. -- Honors the HALT file: refuses to send and exits with code 5 when set. 
-""" - -from __future__ import annotations - -import argparse -import datetime as _dt -import json -import os -import re -import shutil -import socket -import subprocess -import sys -import tempfile -import time -import tomllib -from pathlib import Path - -CONFIG_DIR = Path.home() / ".config" / "cross-agent-comms" -PEERS_TOML = CONFIG_DIR / "peers.toml" -HALT_FILE = CONFIG_DIR / "HALT" -STATE_DIR = Path.home() / ".local" / "state" / "cross-agent-comms" -FAILED_SENDS_DIR = STATE_DIR / "failed-sends" - -EXIT_OK = 0 -EXIT_GENERAL = 1 -EXIT_DEST_NOT_FOUND = 2 -EXIT_CROSS_MACHINE_FAILED = 3 -EXIT_FRONTMATTER = 4 -EXIT_HALT = 5 - -REQUIRED_FRONTMATTER = ["CONVERSATION_ID", "MESSAGE_TYPE", "SEQUENCE", "TIMESTAMP", "PROTOCOL_VERSION"] -VALID_MESSAGE_TYPES = {"request", "progress", "query", "pushback", "complete", "release", "escalate"} - - -def err(msg: str) -> None: - print(msg, file=sys.stderr) - - -def check_halt() -> None: - """Exit with code 5 if HALT file exists.""" - if HALT_FILE.exists(): - try: - reason = HALT_FILE.read_text().strip() - except OSError: - # Fail-closed on unreadable HALT. - err("halt active (HALT file present but unreadable; treated as halted)") - err(f"remove {HALT_FILE} to resume") - sys.exit(EXIT_HALT) - msg = "halt active" - if reason: - msg += f": {reason}" - err(msg) - err(f"remove {HALT_FILE} to resume") - sys.exit(EXIT_HALT) - - -def parse_frontmatter(path: Path) -> dict[str, str]: - """Extract org-mode #+KEY: value frontmatter from the top of the file.""" - try: - text = path.read_text() - except OSError as e: - err(f"cannot read message file: {e}") - sys.exit(EXIT_GENERAL) - - frontmatter: dict[str, str] = {} - for line in text.splitlines(): - line = line.rstrip() - if not line: - # Blank line ends the frontmatter block. - if frontmatter: - break - continue - m = re.match(r"#\+([A-Z_]+):\s*(.*)", line) - if m: - frontmatter[m.group(1)] = m.group(2).strip() - else: - # First non-frontmatter line ends parsing. 
- if frontmatter: - break - return frontmatter - - -def validate_frontmatter(fm: dict[str, str]) -> None: - missing = [k for k in REQUIRED_FRONTMATTER if k not in fm] - if missing: - err(f"frontmatter missing required fields: {', '.join(missing)}") - sys.exit(EXIT_FRONTMATTER) - if fm["MESSAGE_TYPE"] not in VALID_MESSAGE_TYPES: - err(f"invalid MESSAGE_TYPE: {fm['MESSAGE_TYPE']!r}; expected one of {sorted(VALID_MESSAGE_TYPES)}") - sys.exit(EXIT_FRONTMATTER) - try: - int(fm["SEQUENCE"]) - except ValueError: - err(f"SEQUENCE must be an integer; got {fm['SEQUENCE']!r}") - sys.exit(EXIT_FRONTMATTER) - - -def load_peers() -> dict: - if not PEERS_TOML.exists(): - return {} - try: - return tomllib.loads(PEERS_TOML.read_text()) - except (tomllib.TOMLDecodeError, OSError) as e: - err(f"cannot read {PEERS_TOML}: {e}") - sys.exit(EXIT_GENERAL) - - -def resolve_destination(dest: str, peers: dict) -> tuple[str, str, str | None, str | None]: - """Resolve <machine>.<project> to (machine, project, host, ssh_user). - - host is None for same-machine destinations. - """ - if "." not in dest: - err(f"destination must be <machine>.<project>; got {dest!r}") - sys.exit(EXIT_DEST_NOT_FOUND) - machine, project = dest.split(".", 1) - - local_hostname = socket.gethostname().split(".")[0] - is_local = machine == local_hostname or machine == "local" - - host = None - ssh_user = None - if not is_local: - peer_cfg = peers.get("peers", {}).get(machine) - if peer_cfg is None: - available = list(peers.get("peers", {}).keys()) - err(f"destination not found in peers.toml; available peers: {available or '(none)'}") - sys.exit(EXIT_DEST_NOT_FOUND) - host = peer_cfg.get("host", machine) - ssh_user = peer_cfg.get("ssh_user", os.environ.get("USER")) - - return machine, project, host, ssh_user - - -def resolve_inbox_path(project: str, peers: dict) -> str: - """Inbox path on the receiver. 
Defaults to ~/projects/<project>/inbox/from-agents.""" - proj_cfg = peers.get("projects", {}).get(project) - if proj_cfg and "inbox_path" in proj_cfg: - return os.path.expanduser(proj_cfg["inbox_path"]) - return f"~/projects/{project}/inbox/from-agents" - - -def derive_sender_project() -> str: - """Walk up from CWD looking for ~/projects/<name>/. - - Returns the project name if found; falls back to the basename of CWD. - """ - cwd = Path.cwd().resolve() - projects_root = (Path.home() / "projects").resolve() - try: - rel = cwd.relative_to(projects_root) - return rel.parts[0] - except ValueError: - return cwd.name - - -def generate_canonical_filename(sender: str, conv_id: str) -> str: - """YYYYMMDDTHHMMSSZ-from-<sender>-<conv-id>.org""" - now = _dt.datetime.now(_dt.timezone.utc) - timestamp = now.strftime("%Y%m%dT%H%M%SZ") - return f"{timestamp}-from-{sender}-{conv_id}.org" - - -def sign(message_path: Path, sig_path: Path, key: str | None) -> None: - """gpg --detach-sign --armor --output <sig> [--local-user <key>] <message>""" - cmd = ["gpg", "--detach-sign", "--armor", "--yes", "--output", str(sig_path)] - if key: - cmd.extend(["--local-user", key]) - cmd.append(str(message_path)) - try: - result = subprocess.run(cmd, capture_output=True, text=True) - except FileNotFoundError: - err("gpg not found; install gnupg or use --no-sign for testing") - sys.exit(EXIT_GENERAL) - if result.returncode != 0: - err(f"signing failed: {result.stderr.strip()}") - sys.exit(EXIT_GENERAL) - - -def same_machine_deliver(message_path: Path, sig_path: Path | None, target_dir: Path, canonical_name: str) -> None: - """Atomic-write delivery: stage .asc, mv to final, then stage .org, mv to final.""" - target_dir.mkdir(parents=True, exist_ok=True) - final_msg = target_dir / canonical_name - final_sig = target_dir / f"{canonical_name}.asc" - - if sig_path is not None: - # Stage .asc first, mv to final, THEN stage .org and mv to final. 
- with tempfile.NamedTemporaryFile( - mode="wb", dir=target_dir, prefix=f".tmp.{canonical_name}.asc.", delete=False - ) as tmp: - tmp.write(sig_path.read_bytes()) - tmp_sig_path = Path(tmp.name) - os.replace(tmp_sig_path, final_sig) - - # Re-check HALT between .asc and .org per the layered-checks rule. - check_halt() - - with tempfile.NamedTemporaryFile( - mode="wb", dir=target_dir, prefix=f".tmp.{canonical_name}.", delete=False - ) as tmp: - tmp.write(message_path.read_bytes()) - tmp_msg_path = Path(tmp.name) - os.replace(tmp_msg_path, final_msg) - - -def cross_machine_deliver( - message_path: Path, - sig_path: Path | None, - canonical_name: str, - host: str, - ssh_user: str, - inbox_path: str, - retries: int, -) -> bool: - """rsync push the .asc first (if signed), re-check HALT, then push the .org. - - Returns True on success, False on persistent failure (after retries). - """ - # Stage local copies with the canonical name so rsync sets the right - # destination filename. - with tempfile.TemporaryDirectory(prefix="cross-agent-send-") as staging: - staging_dir = Path(staging) - local_msg = staging_dir / canonical_name - local_msg.write_bytes(message_path.read_bytes()) - local_sig = None - if sig_path is not None: - local_sig = staging_dir / f"{canonical_name}.asc" - local_sig.write_bytes(sig_path.read_bytes()) - - backoffs = [5, 30, 120] - # Step 1: push .asc first if signed. - if local_sig is not None: - if not _rsync_with_retries(local_sig, host, ssh_user, inbox_path, retries, backoffs): - return False - - # Re-check HALT between .asc and .org per the layered-checks rule. - check_halt() - - # Step 2: push .org. 
- if not _rsync_with_retries(local_msg, host, ssh_user, inbox_path, retries, backoffs): - return False - - return True - - -def _rsync_with_retries( - src: Path, host: str, ssh_user: str, inbox_path: str, retries: int, backoffs: list[int] -) -> bool: - target = f"{ssh_user}@{host}:{inbox_path}/" - last_err = "" - for attempt in range(retries + 1): - if attempt > 0: - check_halt() - wait = backoffs[min(attempt - 1, len(backoffs) - 1)] - err(f"rsync attempt {attempt} failed: {last_err}; retrying in {wait}s") - time.sleep(wait) - try: - result = subprocess.run( - ["rsync", "-a", str(src), target], - capture_output=True, - text=True, - ) - except FileNotFoundError: - err("rsync not found; install rsync") - return False - if result.returncode == 0: - return True - last_err = result.stderr.strip() or f"exit {result.returncode}" - err(f"rsync failed after {retries + 1} attempts: {last_err}") - return False - - -def write_failed_send_marker(dest: str, message_path: Path, error: str, retry_log: list[str]) -> None: - FAILED_SENDS_DIR.mkdir(parents=True, exist_ok=True) - timestamp = _dt.datetime.now(_dt.timezone.utc).strftime("%Y%m%dT%H%M%SZ") - safe_basename = re.sub(r"[^A-Za-z0-9._-]", "_", message_path.name) - marker = FAILED_SENDS_DIR / f"{timestamp}-{dest.replace('.', '-')}-{safe_basename}.json" - marker.write_text(json.dumps( - { - "timestamp": timestamp, - "destination": dest, - "message_path": str(message_path), - "error": error, - "retry_log": retry_log, - }, - indent=2, - )) - err(f"marker written: {marker}") - - -def main() -> int: - parser = argparse.ArgumentParser(description="Send a cross-agent message.") - parser.add_argument("destination", help="Destination as <machine>.<project>") - parser.add_argument("message_file", type=Path, help="Path to the message body file") - parser.add_argument("--no-sign", action="store_true", help="Skip GPG signing (testing only)") - parser.add_argument("--retries", type=int, default=3, help="Retry count for cross-machine sends") 
- parser.add_argument("--key", help="GPG key id to sign with (default: user's primary)") - args = parser.parse_args() - - check_halt() - - if not args.message_file.is_file(): - err(f"message file not found: {args.message_file}") - return EXIT_GENERAL - - fm = parse_frontmatter(args.message_file) - validate_frontmatter(fm) - - peers = load_peers() - machine, project, host, ssh_user = resolve_destination(args.destination, peers) - inbox_path = resolve_inbox_path(project, peers) - - sender = derive_sender_project() - canonical_name = generate_canonical_filename(sender, fm["CONVERSATION_ID"]) - - sig_tmp = None - if not args.no_sign: - sig_tmp = args.message_file.with_suffix(args.message_file.suffix + ".asc.tmp") - sign(args.message_file, sig_tmp, args.key) - - try: - if host is None: - # Same-machine delivery. - target_dir = Path(os.path.expanduser(inbox_path)) - same_machine_deliver(args.message_file, sig_tmp, target_dir, canonical_name) - print(f"sent: {target_dir}/{canonical_name}") - return EXIT_OK - else: - ok = cross_machine_deliver( - args.message_file, sig_tmp, canonical_name, host, ssh_user, inbox_path, args.retries - ) - if ok: - print(f"sent: {ssh_user}@{host}:{inbox_path}/{canonical_name}") - return EXIT_OK - write_failed_send_marker(args.destination, args.message_file, "rsync failed after retries", []) - return EXIT_CROSS_MACHINE_FAILED - finally: - if sig_tmp is not None and sig_tmp.exists(): - sig_tmp.unlink() - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/.ai/scripts/cross-agent-comms/cross-agent-send.md b/.ai/scripts/cross-agent-comms/cross-agent-send.md deleted file mode 100644 index 29bfb24..0000000 --- a/.ai/scripts/cross-agent-comms/cross-agent-send.md +++ /dev/null @@ -1,199 +0,0 @@ -# cross-agent-send - -**Purpose.** Send a cross-agent message file to a specific destination. Handles -peer-config lookup, GPG signing, atomic write (same-machine) or rsync push -(cross-machine), retry-with-backoff, and failure surfacing. 
- -This is the canonical writer. The protocol spec defers all writer mechanics to -this script. - -## Usage - -``` -cross-agent-send <destination> <message-file> [--no-sign] [--retries N] [--key <key-id>] -``` - -### Positional arguments - -| Position | Meaning | Example | -|---|---|---| -| 1 | Destination as `<machine>.<project>` | `homelab.career`, `velox.career` | -| 2 | Message file (already-formatted `.org`) | `/tmp/my-message.org` | - -### Flags - -| Flag | Default | Purpose | -|---|---|---| -| `--no-sign` | (signing on) | Skip GPG signing. Use only for testing; receivers reject unsigned messages by default. | -| `--retries N` | 3 | Override retry count for cross-machine sends. | -| `--key <key-id>` | (user's primary key) | GPG key to sign with. Resolution order: `--key` flag, `GPG_USER` env, `git config user.signingkey`, then the first secret key in the keyring. | - -## Behavior - -### Filename generation (script-controlled) - -The script generates the canonical destination filename from the message's -frontmatter and sender context. The user's input filename is ignored — pass any -path, the script names the destination correctly: - -``` -<YYYYMMDD>T<HHMMSS>Z-from-<sender-slug>-<short-conv-id>.org -``` - -`<sender-slug>` is the sender's project name: the first path component of the -current working directory below `~/projects/`, falling back to the basename of -the current working directory when it is not under `~/projects/`. -`<short-conv-id>` is read from the message's -`#+CONVERSATION_ID` frontmatter field. The UTC timestamp is generated at send time. - -The script also performs the **sender-side max-seen scan** before writing: it -reads the receiver's `from-agents/` directory, finds the highest existing -sequence in this conversation across both sender prefixes, and (best-effort) -suggests `max(seen) + 1` for the next sequence. The user/agent is responsible -for setting `#+SEQUENCE` in the message body; the script only advises. - -### Same-machine destinations - -Resolved when the destination's machine matches the current short hostname or -is the literal `local`. Any other machine must be listed in `peers.toml`. Steps: - -1. 
Parse frontmatter; extract `CONVERSATION_ID` and `TIMESTAMP`. Validate per - the *Validation before send* section below. -2. Generate canonical filename per *Filename generation* above. -3. Sign: `gpg --detach-sign --armor --output <canonical>.asc --local-user <key> <input>`. -4. Compute target: read `peers.toml` for the project's `inbox_path`. If - missing, fall back to `~/projects/<project>/inbox/from-agents/`. -5. **Atomic write with strict ordering** (signature must precede message): - - Stage `.asc`: write to `<target>/.tmp.XXXXXX-<canonical>.asc`, - then `mv` to `<target>/<canonical>.asc`. - - **Then** stage `.org`: write to `<target>/.tmp.XXXXXX-<canonical>`, - then `mv` to `<target>/<canonical>`. - - Receivers only act on `.org` files; staging the `.asc` first guarantees - the signature is present when the receiver opens the message. Out-of-order - would race: receiver could read the `.org` before the `.asc` lands and - fail GPG verify even though the sender did everything right. -6. Exit 0 on success. Exit non-zero if any step fails. - -### Cross-machine destinations - -Steps: - -1. Parse + generate canonical filename, as same-machine steps 1-2. -2. Sign locally to `<input>.asc` (or a tmp staging file). -3. rsync push **with the same .asc-first ordering**: - - `rsync -a <input>.asc <ssh-user>@<host>:<inbox_path>/<canonical>.asc` - - **Then** `rsync -a <input> <ssh-user>@<host>:<inbox_path>/<canonical>` - rsync writes to a hidden temp file then renames atomically by default - (`--inplace` would defeat this; do not pass it). -4. Retry on failure: 5s, 30s, 120s backoff, then surface error. -5. On persistent failure: write a marker file to - `~/.local/state/cross-agent-comms/failed-sends/<timestamp>-<dest>-<canonical>.json` - containing the destination, message path, error, and retry log. Exit non-zero. - -### Validation before send - -- Destination resolves via `peers.toml` (or local fallback). 
If neither, exit - immediately with `destination not found in peers.toml; available: <list>`. -- Message file must be readable, non-empty, and have valid org-mode frontmatter - with **all** of the following required fields: - - `#+TITLE` - - `#+CONVERSATION_ID` - - `#+MESSAGE_TYPE` - - `#+SEQUENCE` - - `#+TIMESTAMP` - - `#+PROTOCOL_VERSION` (must equal `5` for v5) - - If any required field is missing or malformed, exit immediately with a parse - error naming the offending field. - -- Optional fields the script recognizes and passes through (no special - handling beyond preservation): - - `#+REQUIRES_TOOLS` — comma-separated tool/MCP slugs the receiver needs. - - `#+RELEASE_STATUS` — valid only on `MESSAGE_TYPE: release`. Values per - spec: `complete`, `cancelled`, `withdrawn-after-pushback`, - `abandoned-after-escalation`. - - `#+WORKFLOW_VERSION` — sender's version of the cross-agent-comms workflow - file. Currently advisory; receiver may warn on mismatch but does not block. - -## Configuration - -Reads `~/.config/cross-agent-comms/peers.toml` for peer routing: - -```toml -[peers.velox] -host = "velox.local" -ssh_user = "cjennings" - -# Optional: per-project inbox-path overrides for non-default layouts. -[projects.work] -inbox_path = "~/projects/work/inbox/from-agents" - -[projects.homelab] -inbox_path = "~/projects/homelab/inbox/from-agents" -``` - -If a project entry is omitted, defaults to `~/projects/<project>/inbox/from-agents`. - -## Failure modes - -| Symptom | Cause | Fix | -|---|---|---| -| `destination not found in peers.toml` | Misspelled destination, or peer not configured | Run `cross-agent-discover` to see available destinations. | -| `signing failed: no secret key` | GPG key missing or not in keyring | `gpg --list-secret-keys` to confirm. Override with `--key <id>`. | -| `signing failed: pinentry timed out` | Headless session, GUI pinentry unavailable | Confirm `pinentry-program` in `gpg-agent.conf` matches available pinentry. 
Per protocols.org, GUI pinentry works from Claude Code. | -| `rsync exit 255` | SSH unreachable | `cross-agent-discover --peer <name>` to confirm reachability. | -| `rsync exit 23` | Permission denied at destination | Check destination directory perms (`chmod 700`) and ownership. | -| Marker file written to `failed-sends/` | Persistent cross-machine failure | Inspect the marker's `error` field. After fixing, retry: `cross-agent-send <dest> <msg>` (the marker is for visibility; it does not auto-retry). | -| Receiver complains "unsigned message" | `--no-sign` was used in production | Don't use `--no-sign` outside testing. | - -## HALT awareness - -Checks `~/.config/cross-agent-comms/HALT` at the start of every send AND -between the `.asc` and `.org` rsync calls AND between each retry iteration. -On HALT exists, exits with code 5 ("halt active; remove -~/.config/cross-agent-comms/HALT to resume") without writing or pushing -further. - -Worst case: one in-flight send completes its current rsync step within a few -seconds before halt kicks in for the next step. New sends are blocked -immediately. No `pkill` needed — the per-iteration check stops things -naturally. - -If the HALT file exists but is unreadable (permissions wrong), fail-closed — -treat as if HALT is set. Safer than fail-open. - -See `cross-agent-halt.md` for the full halt mechanism. - -## Examples - -```bash -# Same-machine send -cross-agent-send homelab.career /tmp/my-message.org - -# Cross-machine send via Tailscale -cross-agent-send velox.career /tmp/my-message.org - -# Test send without signing (receiver will reject) -cross-agent-send homelab.career /tmp/test.org --no-sign - -# Override retry count for a flaky link -cross-agent-send velox.career /tmp/my-message.org --retries 10 - -# After a delivery failure, inspect the marker -cat ~/.local/state/cross-agent-comms/failed-sends/*.json | jq . -``` - -## Exit codes - -| Code | Meaning | -|---|---| -| 0 | Sent successfully. 
| -| 1 | General error (parse failure, signing failure, etc.). | -| 2 | Destination not found in peers.toml. | -| 3 | Cross-machine delivery failed after retries. Marker file written. | -| 4 | Frontmatter validation failed. | -| 5 | HALT active; nothing further was written or pushed (see *HALT awareness*). | - -## See also - -- `cross-agent-discover` — validate destinations before sending. -- `cross-agent-watch` — receiver-side notification. -- `cross-agent-status` — see what's queued. -- `cross-agent-comms.org` — protocol spec, the "what" the script implements. diff --git a/.ai/scripts/cross-agent-comms/cross-agent-status b/.ai/scripts/cross-agent-comms/cross-agent-status deleted file mode 100755 index 4eee75b..0000000 --- a/.ai/scripts/cross-agent-comms/cross-agent-status +++ /dev/null @@ -1,185 +0,0 @@ -#!/usr/bin/env python3 -"""Point-in-time snapshot of pending cross-agent messages across local projects. - -See cross-agent-status.md. Pending = messages in inbox/from-agents/ whose -CONVERSATION_ID has no MESSAGE_TYPE: release at a later #+TIMESTAMP. - -HALT: prints a prominent banner before normal output, but continues to enumerate.
-""" - -from __future__ import annotations - -import argparse -import glob -import json -import os -import re -import sys -from pathlib import Path - -CONFIG_DIR = Path.home() / ".config" / "cross-agent-comms" -HALT_FILE = CONFIG_DIR / "HALT" -DEFAULT_GLOB = str(Path.home() / "projects" / "*" / "inbox" / "from-agents") + "/" - - -def parse_frontmatter(path: Path) -> dict[str, str]: - try: - text = path.read_text() - except OSError: - return {} - fm: dict[str, str] = {} - for line in text.splitlines(): - line = line.rstrip() - if not line: - if fm: - break - continue - m = re.match(r"#\+([A-Z_]+):\s*(.*)", line) - if m: - fm[m.group(1)] = m.group(2).strip() - elif fm: - break - return fm - - -def project_name_from_path(path: str) -> str: - """Walk up from path to find ~/projects/<name>/...""" - home = str(Path.home()) - parts = Path(path).parts - for i, part in enumerate(parts): - if part == "projects" and i + 1 < len(parts) and str(Path(*parts[: i + 1])) == os.path.join(home, "projects"): - return parts[i + 1] - # Fallback: dir three levels up from the .org file (project/inbox/from-agents/file.org) - return Path(path).parent.parent.parent.name - - -def scan_project(inbox_dir: Path) -> tuple[int, str | None, int | None]: - """Return (pending_count, most_recent_filename_or_None, most_recent_age_seconds_or_None).""" - if not inbox_dir.is_dir(): - return 0, None, None - - # Group .org files by CONVERSATION_ID, also collect release timestamps per conv. - org_files = sorted(inbox_dir.glob("*.org")) - if not org_files: - return 0, None, None - - by_conv: dict[str, list[tuple[str, str, Path]]] = {} # conv_id -> [(timestamp, msg_type, path)] - for f in org_files: - fm = parse_frontmatter(f) - conv = fm.get("CONVERSATION_ID") - ts = fm.get("TIMESTAMP") - mt = fm.get("MESSAGE_TYPE") - if not conv or not ts or not mt: - # Malformed file: count as pending under conv "_unparseable". 
- by_conv.setdefault("_unparseable", []).append(("", "request", f)) - continue - by_conv.setdefault(conv, []).append((ts, mt, f)) - - pending_files: list[Path] = [] - for conv, entries in by_conv.items(): - entries.sort(key=lambda e: e[0]) - # Find the latest release timestamp. - release_ts = None - for ts, mt, _f in entries: - if mt == "release" and (release_ts is None or ts > release_ts): - release_ts = ts - for ts, mt, f in entries: - if mt == "release": - continue - if release_ts is not None and ts <= release_ts: - continue - pending_files.append(f) - - if not pending_files: - return 0, None, None - - # Most-recent by mtime (proxy for arrival order). - most_recent = max(pending_files, key=lambda p: p.stat().st_mtime) - import time - age = int(time.time() - most_recent.stat().st_mtime) - return len(pending_files), most_recent.name, age - - -def fmt_age(seconds: int | None) -> str: - if seconds is None: - return "—" - if seconds < 60: - return f"{seconds}s ago" - if seconds < 3600: - return f"{seconds // 60} min ago" - if seconds < 86400: - return f"{seconds // 3600} hr ago" - return f"{seconds // 86400} day(s) ago" - - -def render_banner_if_halt() -> None: - if not HALT_FILE.exists(): - return - try: - reason = HALT_FILE.read_text().strip() - except OSError: - reason = "(HALT file unreadable; treated as halted)" - print("⚠ HALT ACTIVE — cross-agent comms paused") - if reason: - print(f" reason: {reason}") - print(f" clear: rm {HALT_FILE} (or: cross-agent-resume)") - print() - - -def main() -> int: - parser = argparse.ArgumentParser(description="Snapshot of pending cross-agent messages across local projects.") - parser.add_argument("--json", action="store_true", help="Emit JSON output") - parser.add_argument("--projects-glob", default=DEFAULT_GLOB, - help=f"Glob for project from-agents dirs (default: {DEFAULT_GLOB})") - args = parser.parse_args() - - render_banner_if_halt() - - matched = sorted(glob.glob(args.projects_glob)) - rows = [] - for path in matched: - 
inbox = Path(path) - if not inbox.is_dir(): - continue - proj = project_name_from_path(path) - count, most_recent, age = scan_project(inbox) - rows.append({ - "name": proj, - "pending_count": count, - "most_recent": ( - {"filename": most_recent, "age_seconds": age} - if most_recent else None - ), - }) - - # Sort: pending-first, then alphabetical by name. - rows.sort(key=lambda r: (-r["pending_count"], r["name"])) - - if args.json: - import datetime as _dt - payload = { - "scanned_at": _dt.datetime.now(_dt.timezone.utc).isoformat(), - "halt_active": HALT_FILE.exists(), - "projects": rows, - } - print(json.dumps(payload, indent=2)) - return 0 - - if not rows: - print("No projects with inbox/from-agents/ found — 0 pending.") - return 0 - - # Human-readable table. - name_w = max(len("project"), max(len(r["name"]) for r in rows)) - print(f"{'project':<{name_w}} pending most-recent") - for r in rows: - most_recent_str = "—" - if r["most_recent"]: - most_recent_str = f"{r['most_recent']['filename']} ({fmt_age(r['most_recent']['age_seconds'])})" - print(f"{r['name']:<{name_w}} {r['pending_count']:<7} {most_recent_str}") - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/.ai/scripts/cross-agent-comms/cross-agent-status.md b/.ai/scripts/cross-agent-comms/cross-agent-status.md deleted file mode 100644 index 070330c..0000000 --- a/.ai/scripts/cross-agent-comms/cross-agent-status.md +++ /dev/null @@ -1,139 +0,0 @@ -# cross-agent-status - -**Purpose.** Point-in-time snapshot of pending cross-agent messages across -every project on this machine. Run from any terminal. No daemon required. - -This is the user-pull layer of the cold-start story — `cross-agent-watch` -pushes notifications, `cross-agent-status` lets the user query. - -## Usage - -``` -cross-agent-status [--json] [--projects-glob <glob>] -``` - -No args required. - -### Flags - -| Flag | Default | Purpose | -|---|---|---| -| `--json` | off (table) | Output as JSON for scripting. 
| -| `--projects-glob <glob>` | `~/projects/*/inbox/from-agents/` | Override which directories to scan. | - -## Output - -### Default (table) - -``` -$ cross-agent-status -project pending most-recent -homelab 1 20260427T085611Z-from-career-question.org (3 min ago) -career 0 — -claude-templates 0 — -clipper 0 — -finances 0 — -... (other 9 projects) -``` - -Sort: pending-first, then alphabetical. - -### `--json` - -```json -{ - "scanned_at": "2026-04-27T09:13:00+00:00", - "halt_active": false, - "projects": [ - { - "name": "homelab", - "pending_count": 1, - "most_recent": { - "filename": "20260427T085611Z-from-career-question.org", - "age_seconds": 180 - } - }, - ... - ] -} -``` - -## Pending semantics - -A message is "pending" if it sits in `inbox/from-agents/` AND no -`MESSAGE_TYPE: release` exists for the same `CONVERSATION_ID` after it. - -Concretely: - -1. Scan each project's `inbox/from-agents/` for `.org` files. -2. Group by `CONVERSATION_ID` from frontmatter. -3. For each conversation, find the highest-`#+TIMESTAMP` message with - `MESSAGE_TYPE: release`. -4. Messages with `#+TIMESTAMP` after that release (or in conversations with no - release) count as pending. - -Files without parseable frontmatter are counted as pending and noted in the -output (single warning row per project). - -## Failure modes - -| Symptom | Likely cause | Fix | -|---|---|---| -| Project missing from output | Project's `.ai/` directory exists but `inbox/from-agents/` does not | Created lazily on first cross-agent message; `mkdir -p` to surface in output. | -| All projects show "0 pending" but you know one has messages | Glob misresolved, OR all messages are post-release | `cross-agent-status --projects-glob` with explicit path to confirm. | -| Warning row "N files unparseable in <project>" | Message file has invalid frontmatter | Open the file, fix or move out. | - -## Performance - -Scans every `.org` file in every watched directory.
For Craig's setup (14 -projects, single-digit messages each), runs in <100ms. If a project -accumulates hundreds of post-release messages, archive them per the persistence -guidance in the protocol spec. - -## HALT awareness - -Checks `~/.config/cross-agent-comms/HALT` at start. If HALT exists, prints a -prominent banner before normal output: - -``` -$ cross-agent-status -⚠ HALT ACTIVE — cross-agent comms paused - Reason: investigating runaway poll loop, 2026-04-27 - HALT file: ~/.config/cross-agent-comms/HALT - Resume with: cross-agent-resume - -(snapshot continues normally — HALT does not suppress visibility) - -project pending most-recent -career 0 — -homelab 1 20260427T085611Z-from-career-question.org (3 min ago) -... -``` - -Status is read-only, so it always runs. The banner ensures the user can't -miss that halt is active when checking inbox state. Reason text comes from -the HALT file's body; if empty, omit the reason line. - -If the HALT file exists but is unreadable, print a warning banner ("HALT -file present but unreadable; treat as halted") and continue with normal -output. - -See `cross-agent-halt.md` for the full halt mechanism. - -## Examples - -```bash -# Snapshot -cross-agent-status - -# JSON for piping -cross-agent-status --json | jq '.projects[] | select(.pending_count > 0)' - -# Single-project query -cross-agent-status --projects-glob ~/projects/work/inbox/from-agents/ -``` - -## See also - -- `cross-agent-watch` — push notifications on new arrivals. -- `cross-agent-discover` — enumerate available agents (cross-machine). -- `cross-agent-comms.org` — protocol spec. diff --git a/.ai/scripts/cross-agent-comms/cross-agent-watch b/.ai/scripts/cross-agent-comms/cross-agent-watch deleted file mode 100755 index f50ba26..0000000 --- a/.ai/scripts/cross-agent-comms/cross-agent-watch +++ /dev/null @@ -1,106 +0,0 @@ -#!/usr/bin/env bash -# cross-agent-watch — desktop-notify on new cross-agent messages. -# -# See cross-agent-watch.md. 
Watches every ~/projects/*/inbox/from-agents/ by -# default. inotifywait fires create + moved_to events; .tmp.* files are -# filtered out. HALT suppresses notifications but the watcher keeps running -# and logs each event with "(suppressed by HALT)". - -set -uo pipefail - -# Defaults. -PROJECTS_GLOB="${HOME}/projects/*/inbox/from-agents/" -LOG_FILE="${HOME}/.local/state/cross-agent-comms/watch.log" -HALT_FILE="${HOME}/.config/cross-agent-comms/HALT" -QUIET=0 -NO_NOTIFY=0 - -# Arg parsing. -while [[ $# -gt 0 ]]; do - case "$1" in - --projects-glob) - PROJECTS_GLOB="$2"; shift 2 ;; - --log) - LOG_FILE="$2"; shift 2 ;; - --quiet) - QUIET=1; shift ;; - --no-notify) - NO_NOTIFY=1; shift ;; - -h|--help) - cat <<EOF -Usage: cross-agent-watch [--projects-glob GLOB] [--log PATH] [--quiet] [--no-notify] - -Watches inbox/from-agents/ directories for new cross-agent messages and fires -desktop notifications. See cross-agent-watch.md for details. -EOF - exit 0 ;; - *) - echo "unknown flag: $1" >&2; exit 1 ;; - esac -done - -# Resolve glob to a concrete list of directories. -# shellcheck disable=SC2086 -DIRS=( $PROJECTS_GLOB ) -# Filter out non-existent paths (glob may include literal pattern when no match). -EXISTING=() -for d in "${DIRS[@]}"; do - if [[ -d "$d" ]]; then - EXISTING+=( "$d" ) - fi -done - -if [[ ${#EXISTING[@]} -eq 0 ]]; then - echo "cross-agent-watch: glob resolved 0 directories: $PROJECTS_GLOB" >&2 - exit 1 -fi - -# Ensure log dir exists. -mkdir -p "$(dirname "$LOG_FILE")" - -[[ $QUIET -eq 0 ]] && echo "cross-agent-watch: watching ${#EXISTING[@]} dir(s); log: $LOG_FILE" - -# Helper: project name from path like /home/.../projects/<name>/inbox/from-agents/... -project_name() { - local path="$1" - # Match ~/projects/<name>/... - if [[ "$path" =~ ${HOME}/projects/([^/]+)/ ]]; then - echo "${BASH_REMATCH[1]}" - else - basename "$(dirname "$(dirname "$path")")" - fi -} - -# Main loop. 
inotifywait emits one line per event in the format -# "<full-path>" because we passed --format '%w%f'. -inotifywait -m -e create,moved_to --format '%w%f' "${EXISTING[@]}" 2>/dev/null \ - | while IFS= read -r path; do - filename="$(basename "$path")" - - # Filter .tmp.* staging files. - case "$filename" in - .tmp.*) continue ;; - esac - - # Filter .asc sidecars — they land first per the atomic-write ordering; - # the .org event will fire after. - case "$filename" in - *.asc) continue ;; - esac - - proj="$(project_name "$path")" - iso="$(date -u "+%Y-%m-%dT%H:%M:%SZ")" - - if [[ -e "$HALT_FILE" ]]; then - printf '%s\t%s\t%s\t(suppressed by HALT)\n' "$iso" "$proj" "$filename" >> "$LOG_FILE" - [[ $QUIET -eq 0 ]] && echo "[$iso] $proj: $filename (suppressed by HALT)" - continue - fi - - printf '%s\t%s\t%s\n' "$iso" "$proj" "$filename" >> "$LOG_FILE" - [[ $QUIET -eq 0 ]] && echo "[$iso] $proj: $filename" - - if [[ $NO_NOTIFY -eq 0 ]]; then - notify info "Cross-agent message" "${proj}: ${filename}" --persist 2>/dev/null || true - fi - done diff --git a/.ai/scripts/cross-agent-comms/cross-agent-watch.md b/.ai/scripts/cross-agent-comms/cross-agent-watch.md deleted file mode 100644 index 04e8005..0000000 --- a/.ai/scripts/cross-agent-comms/cross-agent-watch.md +++ /dev/null @@ -1,130 +0,0 @@ -# cross-agent-watch - -**Purpose.** Long-running watcher that fires desktop notifications when new -cross-agent messages land in any project's `inbox/from-agents/` directory. -This is the primary cold-start mechanism: messages get noticed even when no -Claude session is active. - -## Usage - -``` -cross-agent-watch [--projects-glob <glob>] [--log <path>] -``` - -No args required. Defaults: - -- Watches `~/projects/*/inbox/from-agents/` (matches every project with the - cross-agent-comms convention). -- Logs each event to `~/.local/state/cross-agent-comms/watch.log`. 
- -### Flags - -| Flag | Default | Purpose | -|---|---|---| -| `--projects-glob <glob>` | `~/projects/*/inbox/from-agents/` | Override which directories to watch. Useful for testing on a single project. | -| `--log <path>` | `~/.local/state/cross-agent-comms/watch.log` | Override log location. Set to `/dev/null` to disable logging. | -| `--quiet` | off | Suppress stdout output. Notifications still fire. | -| `--no-notify` | off | Skip `notify` calls. Useful for testing the watcher loop without spamming notifications. | - -## Behavior - -1. Resolves the projects-glob to a concrete list of directories at startup. - New projects added to `~/projects/` after startup are NOT picked up — restart - the watcher to re-resolve. -2. Runs `inotifywait -m -e create,moved_to --format '%w%f'` against each - watched directory. -3. For each event, calls - `notify info "Cross-agent message" "<project>: <filename>" --persist`. The - `--persist` flag keeps the notification on screen until dismissed, so an inbound - message that arrives while Craig is away from the desk isn't missed. -4. Appends an event line to the log: - `<ISO-8601-timestamp>\t<project>\t<filename>`. - -## Event filtering - -- Watches `create` AND `moved_to` events. The `moved_to` part is critical for - the atomic-write convention (`mktemp` + `mv` produces a `moved_to`, not a - `create`). -- Files starting with `.tmp.` are ignored — they're staging files from - in-progress writes that should never produce a notification. -- Files ending in `.asc` are ignored — the signature sidecar lands before its - `.org` message, and the `.org` arrival is what fires the notification.
- -## Installation - -### Option A — tmux pane (personal, easy) - -Run in a tmux pane that survives session disconnects: - -``` -tmux new -d -s cross-agent-watch 'cross-agent-watch' -``` - -### Option B — systemd user service (production) - -Provided files: - -- `~/.config/systemd/user/cross-agent-watch.service` -- `~/.config/systemd/user/cross-agent-watch.path` - -Enable with: - -``` -systemctl --user enable --now cross-agent-watch.path -``` - -The path unit triggers the service unit on filesystem changes; the service -unit re-execs `cross-agent-watch` if it dies. Survives reboot. - -## Failure modes - -| Symptom | Likely cause | Fix | -|---|---|---| -| No notifications fire on new files | inotifywait not running, or glob resolved to zero dirs | Check `cross-agent-watch --projects-glob ... --quiet` exits non-zero immediately. Log shows `"resolved 0 directories"`. | -| Notifications fire on `.tmp.` files | Filter regression | Verify `inotifywait` events show the `.tmp.` files; if so check this script's filter logic. | -| Some files missed under rapid bursts | inotify queue overflow | Increase `fs.inotify.max_queued_events` sysctl. Default 16384 is usually fine. | -| Permission denied on a watched dir | Directory perms wrong | `chmod 700 <dir>` and confirm owner. | - -## HALT awareness - -Checks `~/.config/cross-agent-comms/HALT` on each iteration (each inotifywait -event fired). If HALT exists, the watcher continues running but **suppresses -the `notify` call**. The event is still logged, with `(suppressed by HALT)` -appended: - -``` -2026-04-27T04:42:00-05:00 career 20260427T094200Z-from-homelab-test.org (suppressed by HALT) -``` - -Logged-but-suppressed events are useful for the operator to see what would -have fired during the halt window — helpful for diagnosing whatever caused -the halt. - -When HALT clears, suppression stops; subsequent events fire normally. 
Backlog -events that arrived during halt are NOT replayed — they get picked up via -cold-start handling (status CLI, agent startup check, or the next agent -poll once polling resumes). - -If the HALT file exists but is unreadable, fail-closed (suppress) — safer -than fail-open. - -See `cross-agent-halt.md` for the full halt mechanism. - -## Examples - -```bash -# Watch all projects, log everything, fire notifications -cross-agent-watch - -# Test against a single project, no notifications, verbose -cross-agent-watch \ - --projects-glob "$HOME/projects/work/inbox/from-agents/" \ - --no-notify - -# Production-style: quiet stdout, log only -cross-agent-watch --quiet -``` - -## See also - -- `cross-agent-status` — point-in-time snapshot of pending messages. -- `cross-agent-send` — counterpart writer. -- `cross-agent-comms.org` — protocol spec. |
