diff options
| author | Craig Jennings <c@cjennings.net> | 2026-05-06 21:59:52 -0500 |
|---|---|---|
| committer | Craig Jennings <c@cjennings.net> | 2026-05-06 21:59:52 -0500 |
| commit | d81b23ad6b6e437dfe3c338a00a4be39bc555146 (patch) | |
| tree | 2d4b0d7890fd1fc70d81282b81fed2808c28a106 /.ai/scripts/cross-agent-comms | |
| parent | 201377f57430ef28d02e703a2191434bbee55c75 (diff) | |
| download | rulesets-d81b23ad6b6e437dfe3c338a00a4be39bc555146.tar.gz rulesets-d81b23ad6b6e437dfe3c338a00a4be39bc555146.zip | |
chore(ai): initialize project notes and Claude tooling surfaces
Replace the seed notes.org with project-specific context (layout, install modes, task tracker location, recent inflection point). Bring in the synced template surfaces (protocols, workflows, scripts, references, retrospectives, someday-maybe) as tracked content for this content/documentation project.
Diffstat (limited to '.ai/scripts/cross-agent-comms')
| -rwxr-xr-x | .ai/scripts/cross-agent-comms/cross-agent-discover | 230 | ||||
| -rw-r--r-- | .ai/scripts/cross-agent-comms/cross-agent-discover.md | 155 | ||||
| -rwxr-xr-x | .ai/scripts/cross-agent-comms/cross-agent-halt | 134 | ||||
| -rw-r--r-- | .ai/scripts/cross-agent-comms/cross-agent-halt.md | 134 | ||||
| -rwxr-xr-x | .ai/scripts/cross-agent-comms/cross-agent-recv | 250 | ||||
| -rw-r--r-- | .ai/scripts/cross-agent-comms/cross-agent-recv.md | 218 | ||||
| -rwxr-xr-x | .ai/scripts/cross-agent-comms/cross-agent-resume | 145 | ||||
| -rw-r--r-- | .ai/scripts/cross-agent-comms/cross-agent-resume.md | 117 | ||||
| -rwxr-xr-x | .ai/scripts/cross-agent-comms/cross-agent-send | 356 | ||||
| -rw-r--r-- | .ai/scripts/cross-agent-comms/cross-agent-send.md | 199 | ||||
| -rwxr-xr-x | .ai/scripts/cross-agent-comms/cross-agent-status | 185 | ||||
| -rw-r--r-- | .ai/scripts/cross-agent-comms/cross-agent-status.md | 139 | ||||
| -rwxr-xr-x | .ai/scripts/cross-agent-comms/cross-agent-watch | 106 | ||||
| -rw-r--r-- | .ai/scripts/cross-agent-comms/cross-agent-watch.md | 128 |
14 files changed, 2496 insertions, 0 deletions
diff --git a/.ai/scripts/cross-agent-comms/cross-agent-discover b/.ai/scripts/cross-agent-comms/cross-agent-discover new file mode 100755 index 0000000..152cf27 --- /dev/null +++ b/.ai/scripts/cross-agent-comms/cross-agent-discover @@ -0,0 +1,230 @@ +#!/usr/bin/env python3 +"""Enumerate cross-agent destinations: local projects + tailnet peers. + +See cross-agent-discover.md. Local: scan ~/projects/*/.ai/. Peers: read +peers.toml, SSH-probe each for reachability. --enumerate-remote optionally +runs `ls -d ~/projects/*/.ai/` over SSH to list remote projects. + +Cache results for 5 min at ~/.cache/cross-agent-comms/discovery.json so +repeated invocations don't re-probe. + +HALT: prints a banner; otherwise continues. +""" + +from __future__ import annotations + +import argparse +import datetime as _dt +import json +import os +import subprocess +import sys +import time +import tomllib +from pathlib import Path + +CONFIG_DIR = Path.home() / ".config" / "cross-agent-comms" +PEERS_TOML = CONFIG_DIR / "peers.toml" +HALT_FILE = CONFIG_DIR / "HALT" +CACHE_DIR = Path.home() / ".cache" / "cross-agent-comms" +CACHE_FILE = CACHE_DIR / "discovery.json" +CACHE_TTL_SECONDS = 300 + +EXIT_OK = 0 +EXIT_GENERAL = 1 +EXIT_PEERS_TOML = 1 + + +def err(msg: str) -> None: + print(msg, file=sys.stderr) + + +def render_banner_if_halt() -> None: + if not HALT_FILE.exists(): + return + try: + reason = HALT_FILE.read_text().strip() + except OSError: + reason = "(HALT file unreadable; treated as halted)" + print("⚠ HALT ACTIVE — cross-agent comms paused") + if reason: + print(f" reason: {reason}") + print() + + +def enumerate_local_projects() -> list[str]: + projects_dir = Path.home() / "projects" + if not projects_dir.is_dir(): + return [] + found = [] + for child in sorted(projects_dir.iterdir()): + if child.is_dir() and (child / ".ai").is_dir(): + found.append(child.name) + return found + + +def load_peers() -> dict: + if not PEERS_TOML.exists(): + return {"peers": {}} + try: + return tomllib.loads(PEERS_TOML.read_text()) + except (tomllib.TOMLDecodeError, OSError) as e: + err(f"cannot parse peers.toml: {e}") + sys.exit(EXIT_PEERS_TOML) + + +def probe_peer_reachability(host: str, ssh_user: str | None) -> tuple[bool, str | None]: + """Run a short SSH probe with BatchMode=yes (no interactive prompt).""" + target = f"{ssh_user}@{host}" if ssh_user else host + try: + result = subprocess.run( + ["ssh", "-o", "ConnectTimeout=2", "-o", "BatchMode=yes", target, "true"], + capture_output=True, + text=True, + timeout=5, + ) + except (FileNotFoundError, subprocess.TimeoutExpired): + return False, "ssh probe failed" + if result.returncode == 0: + return True, None + return False, (result.stderr.strip().splitlines() or [f"exit {result.returncode}"])[-1] + + +def enumerate_remote_projects(host: str, ssh_user: str | None) -> list[str] | None: + target = f"{ssh_user}@{host}" if ssh_user else host + try: + result = subprocess.run( + [ + "ssh", "-o", "ConnectTimeout=3", "-o", "BatchMode=yes", target, + "ls -d ~/projects/*/.ai/ 2>/dev/null", + ], + capture_output=True, + text=True, + timeout=10, + ) + except (FileNotFoundError, subprocess.TimeoutExpired): + return None + if result.returncode != 0: + return None + projects = [] + for line in result.stdout.splitlines(): + # Each line looks like /home/<user>/projects/<name>/.ai/ + parts = line.rstrip("/").split("/") + if len(parts) >= 2 and parts[-1] == ".ai": + projects.append(parts[-2]) + return projects + + +def read_cache() -> dict | None: + if not CACHE_FILE.exists(): + return None + try: + age = time.time() - CACHE_FILE.stat().st_mtime + if age > CACHE_TTL_SECONDS: + return None + return json.loads(CACHE_FILE.read_text()) + except (OSError, json.JSONDecodeError): + return None + + +def write_cache(payload: dict) -> None: + CACHE_DIR.mkdir(parents=True, exist_ok=True) + CACHE_FILE.write_text(json.dumps(payload, indent=2)) + + +def discover(peer_filter: str | None, enumerate_remote: bool) -> dict: + local = enumerate_local_projects() + peers_cfg = load_peers().get("peers", {}) + + peers_out = [] + for name, cfg in sorted(peers_cfg.items()): + if peer_filter and name != peer_filter: + continue + host = cfg.get("host", name) + ssh_user = cfg.get("ssh_user") + reachable, error = probe_peer_reachability(host, ssh_user) + entry = { + "name": name, + "host": host, + "reachable": reachable, + } + if not reachable: + entry["error"] = error + if enumerate_remote and reachable: + entry["projects"] = enumerate_remote_projects(host, ssh_user) or [] + peers_out.append(entry) + + return { + "scanned_at": _dt.datetime.now(_dt.timezone.utc).isoformat(), + "halt_active": HALT_FILE.exists(), + "local": local, + "peers": peers_out, + } + + +def render_table(payload: dict, enumerate_remote: bool) -> None: + local = payload.get("local", []) + print(f"Local ({_local_hostname()}):") + if local: + wrapped = ", ".join(local) + print(f" {wrapped} [{len(local)} project{'s' if len(local) != 1 else ''}]") + else: + print(" (no projects with .ai/ found)") + print() + + peers = payload.get("peers", []) + if not peers: + print("Peers (from peers.toml):") + print(" (no peers configured)") + return + + print("Peers (from ~/.config/cross-agent-comms/peers.toml):") + for p in peers: + marker = "✓ reachable" if p.get("reachable") else f"✗ UNREACHABLE ({p.get('error', 'unknown')})" + print(f" {p['name']:<16} {p['host']:<24} {marker}") + if enumerate_remote and p.get("projects"): + wrapped = ", ".join(p["projects"]) + print(f" projects: {wrapped}") + + +def _local_hostname() -> str: + import socket + return socket.gethostname().split(".")[0] + + +def main() -> int: + parser = argparse.ArgumentParser(description="Discover cross-agent destinations.") + parser.add_argument("--enumerate-remote", action="store_true", + help="SSH into each peer and list ~/projects/*/.ai/") + parser.add_argument("--no-cache", action="store_true", help="Skip cache; force fresh probe") + parser.add_argument("--peer", help="Limit to a single peer name from peers.toml") + parser.add_argument("--json", action="store_true", help="Machine-readable output") + args = parser.parse_args() + + render_banner_if_halt() + + payload = None + if not args.no_cache: + cached = read_cache() + if cached is not None: + # Honor --peer filter on cached payload. + if args.peer: + cached["peers"] = [p for p in cached.get("peers", []) if p["name"] == args.peer] + payload = cached + + if payload is None: + payload = discover(args.peer, args.enumerate_remote) + if not args.no_cache and not args.peer: + # Only cache full (unfiltered) discoveries. + write_cache(payload) + + if args.json: + print(json.dumps(payload, indent=2)) + return EXIT_OK + + render_table(payload, args.enumerate_remote) + return EXIT_OK + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/.ai/scripts/cross-agent-comms/cross-agent-discover.md b/.ai/scripts/cross-agent-comms/cross-agent-discover.md new file mode 100644 index 0000000..95134bb --- /dev/null +++ b/.ai/scripts/cross-agent-comms/cross-agent-discover.md @@ -0,0 +1,155 @@ +# cross-agent-discover + +**Purpose.** Enumerate available cross-agent destinations — local projects on +this machine and remote projects on tailnet peers. Validates SSH reachability +for cross-machine destinations before reporting them as usable. + +## Usage + +``` +cross-agent-discover [--enumerate-remote] [--no-cache] [--peer <name>] +``` + +No args required for the common case (local enumeration + peer reachability). + +### Flags + +| Flag | Default | Purpose | +|---|---|---| +| `--enumerate-remote` | off | SSH into each peer and list projects under `~/projects/*/.ai/`. Off by default because SSH adds latency; turn on when you want to see what's available on a remote machine you haven't fully configured. | +| `--no-cache` | off | Skip the 5-minute cache; force fresh discovery. | +| `--peer <name>` | (all) | Limit to a single peer from `peers.toml`. | +| `--json` | off | Machine-readable output. | + +## Output + +### Default + +``` +$ cross-agent-discover +Local (ratio): + career, claude-templates, clipper, danneel, documents, elibrary, + finances, health, homelab, jr-estate, kit, little-elisper, + philosophy, website [14 projects] + +Peers (from ~/.config/cross-agent-comms/peers.toml): + velox.local reachable (last seen 2 sec ago) + bastion.local UNREACHABLE (ssh exit 255: connection refused) +``` + +### With `--enumerate-remote` + +``` +$ cross-agent-discover --enumerate-remote +Local (ratio): + ... (as above) + +velox.local (reachable): + career, homelab [2 projects] +``` + +## Configuration + +Reads `~/.config/cross-agent-comms/peers.toml`: + +```toml +# Each peer is a remote machine reachable via SSH (typically over Tailscale). + +[peers.velox] +host = "velox.local" +ssh_user = "cjennings" + +[peers.bastion] +host = "bastion.local" +ssh_user = "cjennings" +``` + +Peers entries describe machines, NOT projects. Projects are enumerated +on-demand under `~/projects/*/.ai/` either locally or via SSH. + +## Cache + +Successful discovery results are cached at +`~/.cache/cross-agent-comms/discovery.json` for 5 minutes. Repeated invocations +within the window read from cache. + +`--no-cache` forces a fresh probe. Useful when adding a new peer or after a +network change. + +## SSH reachability check + +For each peer, runs: + +``` +ssh -o ConnectTimeout=2 -o BatchMode=yes <user>@<host> true +``` + +`BatchMode=yes` prevents interactive password prompts — peers that don't have +key-based auth set up are reported as UNREACHABLE. + +If `--enumerate-remote` is set, on success runs: + +``` +ssh <user>@<host> 'ls -d ~/projects/*/.ai/ 2>/dev/null' +``` + +## Failure modes + +| Symptom | Likely cause | Fix | +|---|---|---| +| Peer reported UNREACHABLE | Tailscale not connected, SSH key not authorized, host firewalled | `tailscale status`; `ssh -v <peer>` to debug. | +| Local list is empty | Glob misresolved, or `~/projects/` doesn't exist | Check `ls -d ~/projects/*/.ai/`. | +| `--enumerate-remote` slow | Cold cache, slow tailnet, many peers | First run is slow, subsequent runs hit cache. Use `--peer <name>` to scope. | +| Peer unexpectedly missing from output | Not in `peers.toml`, or `peers.toml` malformed | `cat ~/.config/cross-agent-comms/peers.toml` and validate. | + +## HALT awareness + +Checks `~/.config/cross-agent-comms/HALT` at start. If HALT exists, prints a +prominent banner before normal output: + +``` +$ cross-agent-discover +⚠ HALT ACTIVE — cross-agent comms paused + Reason: <reason from HALT file body, if any> + Resume with: cross-agent-resume + +(enumeration continues normally — HALT does not suppress visibility) + +Local (ratio): + career, claude-templates, ... + +Peers: + velox.local reachable +``` + +Discover is read-only. Like `cross-agent-status`, it always runs so the user +keeps visibility into what destinations exist regardless of halt state. The +banner makes the halt state impossible to miss. + +If the HALT file exists but is unreadable, print a warning banner and +continue. + +See `cross-agent-halt.md` for the full halt mechanism. + +## Examples + +```bash +# Common: see what's available +cross-agent-discover + +# Force fresh probe after network change +cross-agent-discover --no-cache + +# What's on velox specifically +cross-agent-discover --peer velox --enumerate-remote + +# Pipe to grep +cross-agent-discover --json | jq '.peers[] | select(.reachable)' +``` + +## See also + +- `cross-agent-send` — uses `peers.toml` for routing destinations. +- `cross-agent-status` — local pending messages. +- `cross-agent-comms.org` — protocol spec, `* Limitations` section + explains the cross-machine model. diff --git a/.ai/scripts/cross-agent-comms/cross-agent-halt b/.ai/scripts/cross-agent-comms/cross-agent-halt new file mode 100755 index 0000000..df25115 --- /dev/null +++ b/.ai/scripts/cross-agent-comms/cross-agent-halt @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 +"""Failsafe halt for cross-agent comms. + +See cross-agent-halt.md. Touches ~/.config/cross-agent-comms/HALT and stops +the cross-agent-watch systemd user service. With --tailnet, propagates the +HALT file to every peer in peers.toml via SSH; reports per-peer status with +non-zero exit on partial halt. + +Does NOT pkill in-flight scripts — they detect HALT on next iteration and +stop themselves. +""" + +from __future__ import annotations + +import argparse +import subprocess +import sys +import tomllib +from pathlib import Path + +CONFIG_DIR = Path.home() / ".config" / "cross-agent-comms" +HALT_FILE = CONFIG_DIR / "HALT" +PEERS_TOML = CONFIG_DIR / "peers.toml" + +EXIT_OK = 0 +EXIT_PARTIAL = 1 + + +def err(msg: str) -> None: + print(msg, file=sys.stderr) + + +def write_halt_file(reason: str) -> None: + CONFIG_DIR.mkdir(parents=True, exist_ok=True) + HALT_FILE.write_text((reason + "\n") if reason else "") + + +def stop_watcher_service() -> None: + """Best-effort stop of the systemd watcher service. Failures are logged but not fatal.""" + try: + subprocess.run( + ["systemctl", "--user", "stop", "cross-agent-watch.path"], + capture_output=True, text=True, timeout=5, + ) + except (FileNotFoundError, subprocess.TimeoutExpired): + # Watcher service may not be installed — fine. + pass + + +def load_peers() -> dict: + if not PEERS_TOML.exists(): + return {} + try: + return tomllib.loads(PEERS_TOML.read_text()) + except (tomllib.TOMLDecodeError, OSError) as e: + err(f"cannot parse peers.toml: {e}") + return {} + + +def ssh_touch_halt(host: str, ssh_user: str | None, reason: str) -> tuple[bool, str]: + target = f"{ssh_user}@{host}" if ssh_user else host + # Build the remote command. Quote the reason carefully. + remote_cmd = ( + f"mkdir -p ~/.config/cross-agent-comms && " + f"printf %s {_sh_quote(reason)} > ~/.config/cross-agent-comms/HALT" + ) + try: + result = subprocess.run( + ["ssh", "-o", "ConnectTimeout=3", "-o", "BatchMode=yes", target, remote_cmd], + capture_output=True, text=True, timeout=10, + ) + except (FileNotFoundError, subprocess.TimeoutExpired): + return False, "ssh unavailable or timed out" + if result.returncode == 0: + return True, "HALT file written" + return False, (result.stderr.strip().splitlines() or [f"exit {result.returncode}"])[-1] + + +def _sh_quote(s: str) -> str: + return "'" + s.replace("'", "'\"'\"'") + "'" + + +def main() -> int: + parser = argparse.ArgumentParser(description="Halt all cross-agent comms on this machine (and optionally tailnet).") + parser.add_argument("reason", nargs="?", default="", help="Optional human-readable reason") + parser.add_argument("--tailnet", action="store_true", + help="Propagate HALT to every peer in peers.toml") + args = parser.parse_args() + + # Local halt. + write_halt_file(args.reason) + stop_watcher_service() + print("Halting locally ✓ (HALT file written)") + + if not args.tailnet: + print() + print(f"Halt active. Remove {HALT_FILE} or run cross-agent-resume to clear.") + print("Agent polling will stop within ~5 min (one cadence cycle).") + return EXIT_OK + + peers = load_peers().get("peers", {}) + if not peers: + print() + print("No peers configured in peers.toml — local-only halt complete.") + return EXIT_OK + + print() + successes = 1 # local already counted + failures = [] + for name, cfg in sorted(peers.items()): + host = cfg.get("host", name) + ssh_user = cfg.get("ssh_user") + ok, detail = ssh_touch_halt(host, ssh_user, args.reason) + marker = "✓" if ok else "✗" + print(f"Halting {host:<28} {marker} ({detail})") + if ok: + successes += 1 + else: + failures.append(f"{name} ({host}): {detail}") + + print() + total = len(peers) + 1 + if failures: + print(f"PARTIAL HALT: {successes}/{total} machines halted.") + for f in failures: + print(f" - {f}") + print("Resolve the failures or manually halt each machine.") + return EXIT_PARTIAL + print(f"Halt active across {total} machine(s).") + return EXIT_OK + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/.ai/scripts/cross-agent-comms/cross-agent-halt.md b/.ai/scripts/cross-agent-comms/cross-agent-halt.md new file mode 100644 index 0000000..b817fbc --- /dev/null +++ b/.ai/scripts/cross-agent-comms/cross-agent-halt.md @@ -0,0 +1,134 @@ +# cross-agent-halt + +**Purpose.** Failsafe stop for all cross-agent activity on the local machine +(or, with `--tailnet`, across all configured peers). Creates the HALT file +that every component in the protocol checks; within one polling cadence +(~5 min) all polling, sending, watching, and receiving stops. + +This is the user's emergency brake. Use when something is misbehaving and +visiting individual sessions is too slow. + +## Usage + +``` +cross-agent-halt [reason] [--tailnet] [--no-stop-watcher] +``` + +### Positional argument + +| Position | Meaning | Example | +|---|---|---| +| 1 | Optional human-readable reason for the halt. Written into the HALT file's body. Helps future-you remember why you stopped things. | `"investigating runaway poll loop, 2026-04-27"` | + +### Flags + +| Flag | Default | Purpose | +|---|---|---| +| `--tailnet` | local only | Propagate halt to every peer in `peers.toml` via SSH over Tailscale. | +| `--no-stop-watcher` | (stops watcher) | Skip stopping the `cross-agent-watch.path` systemd unit. Useful if the watcher is intentionally separate from comms (rare). | + +## Behavior + +### Local halt (default) + +1. Write the HALT file: `~/.config/cross-agent-comms/HALT`. If a `[reason]` was + passed, write it as the file's body. Otherwise the file is empty (existence + alone triggers halt). +2. Stop the watcher service: `systemctl --user stop cross-agent-watch.path` + (and the corresponding `.service` if running). +3. Print a summary: + ``` + ✓ HALT file written: ~/.config/cross-agent-comms/HALT + ✓ Watcher service stopped (cross-agent-watch.path) + - In-flight sends will complete their current rsync step (~seconds), then + stop. New sends are blocked. + - Active agent polling sessions stop within one cadence (~5 min). + - Use `cross-agent-resume` to clear HALT. + Per-session polling does NOT auto-resume — you re-engage each session by + telling its agent to resume polling. + ``` +4. Exit 0. + +### Cross-tailnet halt (`--tailnet`) + +1. Apply local halt steps 1-2 first. +2. Read `peers.toml` for the list of remote machines. +3. For each peer, SSH and write the HALT file: + ``` + ssh <user>@<host> "echo '<reason>' > ~/.config/cross-agent-comms/HALT && \ + systemctl --user stop cross-agent-watch.path" + ``` +4. Track per-peer success/failure. Print results: + ``` + Halting velox.local ✓ (HALT file written) + Halting bastion.local ✗ (ssh exit 255: no route to host) + Halting locally ✓ (HALT file written) + + PARTIAL HALT: 2/3 machines halted. bastion.local needs manual halt. + ``` +5. Exit 0 if all peers halted; exit 1 if any peer failed (so scripts can + detect partial halt). The local halt always succeeds — even on `--tailnet`, + if remote peers fail, local is still halted. + +## What "halt active" means for each component + +| Component | Behavior under HALT | +|---|---| +| `cross-agent-send` | Refuses to send. Exits 5 with "halt active; remove ~/.config/cross-agent-comms/HALT to resume." Checks HALT at start AND between each retry/rsync step, so an in-flight send completes its current step then stops. | +| `cross-agent-recv` | Refuses to verify or dedup. Exits 5 with same message. Inbound files are **left in place** — not moved, not rejected — so resume picks them up cleanly via cold-start. | +| `cross-agent-watch` | Continues running but suppresses notifications. Logs each event with `(suppressed by HALT)` so the operator can see what would have fired. | +| `cross-agent-status` | Prints prominent `⚠ HALT ACTIVE` banner before normal output. Continues to enumerate (read-only). | +| `cross-agent-discover` | Same banner. Continues (read-only). | +| Agent polling loops | Check HALT on every wake. If set: write a final `progress` note to any active conversation ("HALT fired locally; pausing"), surface "(HALT active; cross-agent comms paused)" in every user response, and stop rescheduling. Polling decays naturally within one cadence. | +| Conversation initiator | Refuses to write sequence 1 of any new conversation. Surfaces refusal to user. | +| Startup workflow (Phase A) | Checks HALT at session boot. If set, surfaces immediately and skips cross-agent inbox checks. | + +## Failure modes + +| Symptom | Cause | Fix | +|---|---|---| +| `~/.config/cross-agent-comms/HALT` already exists | Halt was already active | OK — running halt again refreshes the reason text. Safe. | +| `systemctl --user stop` fails | Watcher service not installed, or systemd not available | The HALT file is still written — components that check HALT will still stop. The systemctl failure surfaces as a non-fatal warning. | +| `--tailnet` halts some peers but not others | One or more peers unreachable | Exit 1 with per-peer status. Manually halt the unreachable peers (visit each machine, `touch ~/.config/cross-agent-comms/HALT`), or fix the network and re-run. | +| Permission denied writing the HALT file | `~/.config/cross-agent-comms/` doesn't exist or is owned by another user | `mkdir -p ~/.config/cross-agent-comms/`; check ownership. | + +## What halt does NOT do + +- Does not kill running Claude sessions. Polling stops within ~5 min, but the + session itself stays alive and can be re-engaged after resume. +- Does not delete pending messages. Inbound files in `inbox/from-agents/` + remain; they get processed when polling resumes. +- Does not abort in-flight rsync push mid-byte. Atomic-write semantics + guarantee in-flight messages either complete cleanly or leave only `.tmp.*` + files (which receivers ignore). + +## Examples + +```bash +# Quick halt with no reason +cross-agent-halt + +# Halt with a memo +cross-agent-halt "runaway poll loop in homelab session, debugging" + +# Halt all tailnet peers + local +cross-agent-halt --tailnet "shutting down for system update" + +# Halt protocol comms but leave the watcher service running +cross-agent-halt --no-stop-watcher +``` + +## Recovery + +Always pair with `cross-agent-resume` when the situation is resolved: + +```bash +cross-agent-resume # local +cross-agent-resume --tailnet # all peers +``` + +## See also + +- `cross-agent-resume` — counterpart that clears HALT. +- `cross-agent-status` — see HALT state at a glance. +- `cross-agent-comms.org` — protocol spec, `* Halt mechanism` section. diff --git a/.ai/scripts/cross-agent-comms/cross-agent-recv b/.ai/scripts/cross-agent-comms/cross-agent-recv new file mode 100755 index 0000000..b67533a --- /dev/null +++ b/.ai/scripts/cross-agent-comms/cross-agent-recv @@ -0,0 +1,250 @@ +#!/usr/bin/env python3 +"""Cross-agent message receiver. + +See cross-agent-recv.md for the full contract. Reads one message file and +emits a structured decision the agent acts on: + + process | dedup | query | reject + +Decision exit codes: + 0 = process 1 = dedup 2 = query 3 = reject + +When HALT is set, the script refuses to verify or dedup and leaves the +inbound file in place — resume picks it up via cold-start. +""" + +from __future__ import annotations + +import argparse +import hashlib +import json +import re +import shutil +import subprocess +import sys +from pathlib import Path + +CONFIG_DIR = Path.home() / ".config" / "cross-agent-comms" +HALT_FILE = CONFIG_DIR / "HALT" +EXPECTED_PROTOCOL_VERSION = "5" + +REQUIRED_FRONTMATTER = ["TITLE", "CONVERSATION_ID", "MESSAGE_TYPE", "SEQUENCE", "TIMESTAMP", "PROTOCOL_VERSION"] +VALID_MESSAGE_TYPES = {"request", "progress", "query", "pushback", "complete", "release", "escalate"} + +DEC_PROCESS = "process" +DEC_DEDUP = "dedup" +DEC_QUERY = "query" +DEC_REJECT = "reject" + +EXIT_FOR_DECISION = { + DEC_PROCESS: 0, + DEC_DEDUP: 1, + DEC_QUERY: 2, + DEC_REJECT: 3, +} + +EXIT_HALT = 5 + + +def err(msg: str) -> None: + print(msg, file=sys.stderr) + + +def check_halt() -> None: + if HALT_FILE.exists(): + try: + reason = HALT_FILE.read_text().strip() + except OSError: + err("halt active (HALT file present but unreadable; treated as halted)") + sys.exit(EXIT_HALT) + msg = "halt active; leaving inbound message in place (resume will pick up)" + if reason: + msg = f"{msg}: {reason}" + err(msg) + sys.exit(EXIT_HALT) + + +def parse_frontmatter(path: Path) -> dict[str, str]: + try: + text = path.read_text() + except OSError as e: + return {"_parse_error": f"cannot read: {e}"} + fm: dict[str, str] = {} + for line in text.splitlines(): + line = line.rstrip() + if not line: + if fm: + break + continue + m = re.match(r"#\+([A-Z_]+):\s*(.*)", line) + if m: + fm[m.group(1)] = m.group(2).strip() + elif fm: + break + return fm + + +def emit_decision( + decision: str, + reason: str | None, + fm: dict[str, str], + sha256: str | None, + args: argparse.Namespace, +) -> int: + payload = { + "decision": decision, + "reason": reason, + "message_type": fm.get("MESSAGE_TYPE"), + "conversation_id": fm.get("CONVERSATION_ID"), + "sequence": fm.get("SEQUENCE"), + "timestamp": fm.get("TIMESTAMP"), + "sha256": sha256, + } + if args.json: + print(json.dumps(payload, indent=None if args.compact_json else 2)) + else: + print(f"decision: {decision}") + if reason: + print(f"reason: {reason}") + for k in ("message_type", "conversation_id", "sequence", "timestamp"): + v = payload[k] + if v is not None: + print(f"{k}: {v}") + if sha256: + print(f"sha256: {sha256}") + return EXIT_FOR_DECISION[decision] + + +def gpg_verify(message_path: Path, sig_path: Path) -> tuple[bool, str]: + try: + result = subprocess.run( + ["gpg", "--verify", str(sig_path), str(message_path)], + capture_output=True, + text=True, + ) + except FileNotFoundError: + return False, "gpg not installed" + if result.returncode == 0: + return True, "" + return False, result.stderr.strip().splitlines()[-1] if result.stderr.strip() else f"exit {result.returncode}" + + +def sha256_of(path: Path) -> str: + h = hashlib.sha256() + with path.open("rb") as f: + for chunk in iter(lambda: f.read(65536), b""): + h.update(chunk) + return h.hexdigest() + + +def find_dedup_match(message_path: Path, fm: dict[str, str], my_hash: str) -> tuple[str, str | None]: + """Scan the message's directory for same-CONVERSATION_ID/SEQUENCE files. + + Returns (decision, reason) — decision is DEC_DEDUP for an exact-hash match, + or DEC_PROCESS when no match or hash differs (sequence collision is OK). + """ + parent = message_path.parent + conv_id = fm["CONVERSATION_ID"] + sequence = fm["SEQUENCE"] + for sibling in parent.iterdir(): + if sibling == message_path or not sibling.is_file() or sibling.suffix != ".org": + continue + sib_fm = parse_frontmatter(sibling) + if sib_fm.get("CONVERSATION_ID") != conv_id or sib_fm.get("SEQUENCE") != sequence: + continue + # Same conv-id + same sequence — check hash. + if sha256_of(sibling) == my_hash: + return DEC_DEDUP, f"identical retry of {sibling.name}" + return DEC_PROCESS, None + + +def check_requires_tools(fm: dict[str, str]) -> tuple[bool, list[str]]: + """REQUIRES_TOOLS is a comma-separated list of tool names. + + For v5, "tool available" is a heuristic: an executable on PATH whose name + matches the tool slug. MCP availability is currently out of scope (no + portable way to query it from a CLI). + """ + tools_field = fm.get("REQUIRES_TOOLS") + if not tools_field: + return True, [] + tools = [t.strip() for t in tools_field.split(",") if t.strip()] + missing = [t for t in tools if shutil.which(t) is None] + return len(missing) == 0, missing + + +def main() -> int: + parser = argparse.ArgumentParser(description="Receive and decide on a cross-agent message.") + parser.add_argument("message_file", type=Path) + parser.add_argument("--no-verify", action="store_true", help="Skip GPG verification (testing only)") + parser.add_argument("--no-dedup", action="store_true", help="Skip SHA-256 dedup against existing files") + parser.add_argument("--protocol-version", default=EXPECTED_PROTOCOL_VERSION, + help="Override expected protocol version (default: 5)") + parser.add_argument("--json", action="store_true", help="Emit JSON output") + parser.add_argument("--compact-json", action="store_true", help="Compact JSON (no indent)") + args = parser.parse_args() + + check_halt() + + if not args.message_file.is_file(): + err(f"message file not found: {args.message_file}") + return EXIT_FOR_DECISION[DEC_REJECT] + + fm = parse_frontmatter(args.message_file) + if "_parse_error" in fm: + return emit_decision(DEC_REJECT, fm["_parse_error"], {}, None, args) + + # Step 1: frontmatter sanity-check. + missing = [k for k in REQUIRED_FRONTMATTER if k not in fm] + if missing: + return emit_decision( + DEC_REJECT, f"frontmatter missing required fields: {', '.join(missing)}", fm, None, args + ) + if fm["MESSAGE_TYPE"] not in VALID_MESSAGE_TYPES: + return emit_decision( + DEC_REJECT, f"invalid MESSAGE_TYPE: {fm['MESSAGE_TYPE']!r}", fm, None, args + ) + + # Step 2: PROTOCOL_VERSION check. + if fm["PROTOCOL_VERSION"] != args.protocol_version: + return emit_decision( + DEC_QUERY, + f"PROTOCOL_VERSION mismatch: expected {args.protocol_version}, got {fm['PROTOCOL_VERSION']}", + fm, + None, + args, + ) + + # Step 3: GPG verify. + if not args.no_verify: + sig_path = args.message_file.with_suffix(args.message_file.suffix + ".asc") + if not sig_path.is_file(): + return emit_decision(DEC_REJECT, f"signature file missing: {sig_path.name}", fm, None, args) + ok, gpg_err = gpg_verify(args.message_file, sig_path) + if not ok: + return emit_decision(DEC_REJECT, f"gpg verify failed: {gpg_err}", fm, None, args) + + # Step 4: SHA-256 dedup. + my_hash = sha256_of(args.message_file) + if not args.no_dedup: + decision, reason = find_dedup_match(args.message_file, fm, my_hash) + if decision == DEC_DEDUP: + return emit_decision(DEC_DEDUP, reason, fm, my_hash, args) + + # Step 5: REQUIRES_TOOLS check. + ok, missing_tools = check_requires_tools(fm) + if not ok: + return emit_decision( + DEC_QUERY, + f"required tools unavailable: {', '.join(missing_tools)}", + fm, + my_hash, + args, + ) + + # Step 6: process. + return emit_decision(DEC_PROCESS, None, fm, my_hash, args) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/.ai/scripts/cross-agent-comms/cross-agent-recv.md b/.ai/scripts/cross-agent-comms/cross-agent-recv.md new file mode 100644 index 0000000..247a27a --- /dev/null +++ b/.ai/scripts/cross-agent-comms/cross-agent-recv.md @@ -0,0 +1,218 @@ +# cross-agent-recv + +**Purpose.** The canonical receiver-side processor. Reads a single incoming +message file and reports a structured decision the agent acts on: +process / dedup / query / reject. + +The script handles only mechanical checks (frontmatter, signature, dedup, +version, tools). Substance-level decisions like `pushback` ("I disagree with +this request") happen one layer up — after the agent reads the message body +the script returns as `process`-able. + +This is the read-side counterpart to `cross-agent-send`. Together they are the +two halves of the per-message contract. The agent's polling loop calls +`cross-agent-recv` on every new file in `inbox/from-agents/` and dispatches on +the decision. + +Without this script, every receiver implementation re-invents GPG verify + +frontmatter sanity-check + SHA-256 dedup. With it, behavior is consistent +across projects. + +## Usage + +``` +cross-agent-recv <message-file> +``` + +Single positional argument: a `.org` file in `inbox/from-agents/`. The matching +`.asc` signature file must be present alongside it. + +### Flags + +| Flag | Default | Purpose | +|---|---|---| +| `--no-verify` | (verify on) | Skip GPG verification. Testing only. | +| `--no-dedup` | (dedup on) | Skip SHA-256 dedup against existing files. Testing only. | +| `--protocol-version <N>` | 5 | Override the expected protocol version. Useful for testing forward-compatibility checks. | +| `--json` | off | Output decision as JSON for easier parsing by the agent. | + +## Behavior + +Runs the receiver checks in order. First failure determines the decision. + +### Step 1 — Frontmatter sanity-check + +Parse the message's org-mode frontmatter. Required fields: + +- `#+TITLE` +- `#+CONVERSATION_ID` +- `#+MESSAGE_TYPE` (must be one of: `request`, `progress`, `query`, `pushback`, + `complete`, `release`, `escalate`) +- `#+SEQUENCE` (integer) +- `#+TIMESTAMP` (ISO 8601 with explicit offset) +- `#+PROTOCOL_VERSION` (must match the expected version; default 5) + +Any required field missing, malformed, or the protocol version mismatched → +decision = `reject` (frontmatter) or `query` (version mismatch — see below). + +### Step 2 — Protocol-version check + +If `PROTOCOL_VERSION` doesn't match the expected: + +- Decision = `query`. Action: receiver should write a `query` reply asking the + sender to upgrade to the expected protocol version. + +### Step 3 — Signature verification + +Look for `<message-file>.asc` alongside the `.org`. If missing or `gpg +--verify` fails: + +- Decision = `reject` (signature). Surface to user; do not act. + +The `.asc` file MUST be present when the `.org` is — `cross-agent-send` +guarantees this with its strict ordering (`.asc` lands first). If the `.asc` +is missing despite the `.org` being present, the sender violated atomic-write +ordering or the file was tampered with in transit. + +### Step 4 — SHA-256 dedup + +Compute SHA-256 of the message file. Scan the same directory for existing +files matching `CONVERSATION_ID + SEQUENCE`: + +- No match → decision = `process` (new message, dispatch by type). +- Match with **identical** SHA-256 → decision = `dedup` (silent retry; do not + reprocess). +- Match with **different** SHA-256 → decision = `process` (sequence collision + with non-identical content; both are legitimate, ordered by `#+TIMESTAMP`). + +### Step 5 — REQUIRES_TOOLS optional check + +If the message has a `#+REQUIRES_TOOLS` field, verify each named tool/MCP is +available in the receiver's environment. + +- All available → `process`. +- One or more missing → decision = `query`. The agent should write a `query` + reply naming the missing tools, asking the sender to reframe the request to + avoid them. + +### Step 6 — Dispatch decision + +If all checks pass, decision = `process` with the parsed `MESSAGE_TYPE` so the +agent's main loop knows which handler to invoke. + +## Output + +### Default (human-readable) + +``` +$ cross-agent-recv inbox/from-agents/20260427T091015Z-from-homelab-prep-fixup.org +decision: process +message_type: request +conversation_id: prep-fixup +sequence: 6 +sha256: a1b2c3d4... +``` + +### `--json` + +```json +{ + "decision": "process", + "reason": null, + "message_type": "request", + "conversation_id": "prep-fixup", + "sequence": 6, + "timestamp": "2026-04-27T04:11:42-05:00", + "sha256": "a1b2c3d4..." +} +``` + +For decisions other than `process`, `reason` carries a human-readable +explanation: + +```json +{ + "decision": "query", + "reason": "PROTOCOL_VERSION mismatch: expected 5, got 4", + "conversation_id": "prep-fixup", + "sequence": 6 +} +``` + +## Decision exit codes + +| Decision | Exit code | Agent action | +|---|---|---| +| `process` | 0 | Dispatch to the message-type handler | +| `dedup` | 1 | Silent — do nothing further | +| `query` | 2 | Write a `query` reply (see `reason` for what to ask) | +| `reject` | 3 | Surface to user; do not auto-reply | + +The agent reads stdout/JSON to learn the decision; it can also key off exit +code for simpler bash-style dispatching. + +## Failure modes + +| Symptom | Cause | Fix | +|---|---|---| +| `decision: reject (frontmatter)` | Required field missing or malformed | Open the message; fix or surface to user. The sender should not have produced this file. | +| `decision: reject (signature)` | `.asc` missing, GPG verify failed, or signer unknown | Check that `.asc` exists alongside `.org`. If yes, run `gpg --verify <msg>.asc <msg>` manually for diagnostic output. | +| `decision: query (PROTOCOL_VERSION)` | Sender on older/newer protocol | Reply with a `query` asking sender to upgrade. Both sides should align before continuing. | +| `decision: query (REQUIRES_TOOLS)` | Receiver lacks one of the named tools | Reply with a `query` naming the missing tools; sender should reframe to avoid. | +| `decision: dedup` | Already-processed identical retry | No action. The script handled it correctly. | + +## HALT awareness + +Checks `~/.config/cross-agent-comms/HALT` at the start of every invocation. If +HALT exists, exits with code 5 ("halt active; remove +~/.config/cross-agent-comms/HALT to resume") without verifying, deduping, or +returning a decision. + +**The inbound file is left in place** — not moved, not rejected, not +deduped. When HALT clears and polling resumes, the file gets picked up via +the normal cold-start handling (whichever surfaces first: watcher +notification, startup workflow check, or the next agent poll). Reversibility +is preserved. + +If the HALT file exists but is unreadable, fail-closed — treat as if HALT is +set. + +See `cross-agent-halt.md` for the full halt mechanism. + +## Examples + +```bash +# Basic invocation in an agent's polling loop +for msg in inbox/from-agents/*.org; do + decision=$(cross-agent-recv --json "$msg") + case "$(echo "$decision" | jq -r '.decision')" in + process) handle_message "$msg" ;; + dedup) ;; # silent + query) write_query_reply "$msg" "$decision" ;; + reject) surface_to_user "$msg" "$decision" ;; + esac +done + +# Test signature verification only +cross-agent-recv --no-dedup inbox/from-agents/test-msg.org + +# Test against a future protocol version +cross-agent-recv --protocol-version 6 inbox/from-agents/future-msg.org +``` + +## Performance + +The script is fast (single SHA-256 compute, single GPG verify, frontmatter +parse). For typical messages (single-digit KB), runs in well under 100ms. +Dedup-scan is O(N) over files in the directory; if a project's +`inbox/from-agents/` accumulates hundreds of files, archive released +conversations to keep the scan fast. + +## See also + +- `cross-agent-send` — counterpart writer. +- `cross-agent-watch` — fires when a new message arrives; agent then calls + `cross-agent-recv` to process it. +- `cross-agent-status` — pending-message snapshot (uses similar + released-vs-unreleased logic, but doesn't process individual messages). +- `cross-agent-comms.org` — protocol spec, the "what" the script implements. diff --git a/.ai/scripts/cross-agent-comms/cross-agent-resume b/.ai/scripts/cross-agent-comms/cross-agent-resume new file mode 100755 index 0000000..1fb83bc --- /dev/null +++ b/.ai/scripts/cross-agent-comms/cross-agent-resume @@ -0,0 +1,145 @@ +#!/usr/bin/env python3 +"""Resume cross-agent comms after a halt. + +See cross-agent-resume.md. Removes ~/.config/cross-agent-comms/HALT and +restarts the cross-agent-watch systemd user service. With --tailnet, +propagates the removal to every peer in peers.toml via SSH; reports +per-peer status with non-zero exit on partial resume. + +Per the asymmetry rule: clearing HALT does NOT auto-resume agent polling. +Each session must explicitly re-engage. +""" + +from __future__ import annotations + +import argparse +import subprocess +import sys +import tomllib +from pathlib import Path + +CONFIG_DIR = Path.home() / ".config" / "cross-agent-comms" +HALT_FILE = CONFIG_DIR / "HALT" +PEERS_TOML = CONFIG_DIR / "peers.toml" + +EXIT_OK = 0 +EXIT_PARTIAL = 1 + + +def err(msg: str) -> None: + print(msg, file=sys.stderr) + + +def remove_halt_file() -> bool: + """Returns True if HALT was removed, False if it didn't exist.""" + if HALT_FILE.exists(): + try: + HALT_FILE.unlink() + return True + except OSError as e: + err(f"could not remove HALT: {e}") + return False + return False + + +def start_watcher_service() -> None: + """Best-effort start of the systemd watcher path unit.""" + try: + subprocess.run( + ["systemctl", "--user", "start", "cross-agent-watch.path"], + capture_output=True, text=True, timeout=5, + ) + except (FileNotFoundError, subprocess.TimeoutExpired): + pass + + +def load_peers() -> dict: + if not PEERS_TOML.exists(): + return {} + try: + return tomllib.loads(PEERS_TOML.read_text()) + except (tomllib.TOMLDecodeError, OSError) as e: + err(f"cannot parse peers.toml: {e}") + return {} + + +def ssh_remove_halt(host: str, ssh_user: str | None) -> tuple[bool, str]: + target = f"{ssh_user}@{host}" if ssh_user else host + remote_cmd = "rm -f ~/.config/cross-agent-comms/HALT" + try: + result = subprocess.run( + ["ssh", "-o", "ConnectTimeout=3", "-o", "BatchMode=yes", target, remote_cmd], + capture_output=True, text=True, timeout=10, + ) + except (FileNotFoundError, subprocess.TimeoutExpired): + return False, "ssh unavailable or timed out" + if result.returncode == 0: + return True, "HALT cleared" + return False, (result.stderr.strip().splitlines() or [f"exit {result.returncode}"])[-1] + + +def print_re_engage_instructions() -> None: + print() + print("Halt cleared. Watcher restarted.") + print() + print("Agent polling does NOT auto-resume — per the failsafe asymmetry rule,") + print("agents stay paused until you explicitly re-engage each session.") + print("Open the relevant Claude session and tell the agent to resume polling") + print("for its conversation.") + + +def main() -> int: + parser = argparse.ArgumentParser(description="Resume cross-agent comms after a halt.") + parser.add_argument("--tailnet", action="store_true", + help="Propagate HALT removal to every peer in peers.toml") + args = parser.parse_args() + + removed = remove_halt_file() + start_watcher_service() + if removed: + print("Resuming locally ✓ (HALT cleared)") + else: + print("Resuming locally ✓ (no HALT was active)") + + if not args.tailnet: + print_re_engage_instructions() + return EXIT_OK + + peers = load_peers().get("peers", {}) + if not peers: + print() + print("No peers configured in peers.toml — local-only resume complete.") + print_re_engage_instructions() + return EXIT_OK + + print() + successes = 1 + failures = [] + for name, cfg in sorted(peers.items()): + host = cfg.get("host", name) + ssh_user = cfg.get("ssh_user") + ok, detail = ssh_remove_halt(host, ssh_user) + marker = "✓" if ok else "✗" + print(f"Resuming {host:<27} {marker} ({detail})") + if ok: + successes += 1 + else: + failures.append(f"{name} ({host}): {detail}") + + print() + total = len(peers) + 1 + if failures: + print(f"PARTIAL RESUME: {successes}/{total} machines cleared.") + for f in failures: + print(f" - {f}") + print("Resolve the failures or manually clear HALT on each machine.") + print_re_engage_instructions() + return EXIT_PARTIAL + + print(f"Resume complete across {total} machine(s).") + print_re_engage_instructions() + return EXIT_OK + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/.ai/scripts/cross-agent-comms/cross-agent-resume.md b/.ai/scripts/cross-agent-comms/cross-agent-resume.md new file mode 100644 index 0000000..8aa8357 --- /dev/null +++ b/.ai/scripts/cross-agent-comms/cross-agent-resume.md @@ -0,0 +1,117 @@ +# cross-agent-resume + +**Purpose.** Clear the HALT file and restart the watcher service. Counterpart +to `cross-agent-halt`. Resuming agent polling is **explicit per-session** — +this script doesn't auto-revive halted polling loops; you tell each session +to re-engage. + +## Usage + +``` +cross-agent-resume [--tailnet] +``` + +### Flags + +| Flag | Default | Purpose | +|---|---|---| +| `--tailnet` | local only | Clear HALT on every peer in `peers.toml` via SSH over Tailscale. | + +## Behavior + +### Local resume (default) + +1. Remove the HALT file: `rm -f ~/.config/cross-agent-comms/HALT`. (Use `-f` + so a missing file isn't an error — running resume when not halted is safe.) +2. Restart the watcher service: `systemctl --user start cross-agent-watch.path`. +3. Print a summary: + ``` + ✓ HALT file removed + ✓ Watcher service started (cross-agent-watch.path) + - cross-agent-send and cross-agent-recv will accept new operations. + - Inbound messages held during halt will be picked up by the watcher. + - Agent polling does NOT auto-resume. To re-engage polling in a paused + session, open that Claude session and tell the agent to resume. + ``` +4. Exit 0. + +### Cross-tailnet resume (`--tailnet`) + +1. Apply local resume steps 1-2 first. +2. Read `peers.toml` for the list of remote machines. +3. For each peer, SSH: + ``` + ssh <user>@<host> "rm -f ~/.config/cross-agent-comms/HALT && \ + systemctl --user start cross-agent-watch.path" + ``` +4. Track per-peer success/failure: + ``` + Resuming velox.local ✓ (HALT cleared, watcher started) + Resuming bastion.local ✗ (ssh exit 255: no route to host) + Resuming locally ✓ + + PARTIAL RESUME: 2/3 machines resumed. bastion.local still halted. + ``` +5. Exit 0 if all peers resumed; exit 1 on any failure. + +## Why agent polling doesn't auto-resume + +Two reasons the asymmetry is deliberate: + +1. *Auto-resume could silently invert intentional kills.* If you halted + because a session was misbehaving, removing HALT shouldn't quietly revive + that session's polling. You re-engage explicitly so you're aware of which + sessions came back online. + +2. *You may want to inspect before resuming.* After a halt, you might want to + read pending messages, fix configuration, or kill a particular Claude + session entirely. Per-session resume forces that pause. + +## Re-engaging polling in a Claude session + +After `cross-agent-resume`, open the relevant Claude session and say something +like: + +``` +HALT is cleared; resume polling. +``` + +The agent will check the HALT file (now absent), re-create its polling +schedule, and continue the in-flight conversation from wherever it left off. +The conversation file is intact; the receiver will pick up any new messages +that arrived during the halt window. + +## Failure modes + +| Symptom | Cause | Fix | +|---|---|---| +| HALT file doesn't exist | Already resumed (or never halted) | OK — `-f` makes this a no-op. | +| `systemctl --user start` fails | Watcher service not installed | Install per `cross-agent-watch.md`'s systemd recipe. | +| `--tailnet` resumes some peers but not others | Same as halt: peer unreachable | Per-peer status reported; resolve manually for unreachable peers. | +| Permission denied removing HALT file | File owned by another user | Check ownership; HALT files should be owned by the running user. | + +## Examples + +```bash +# Local resume after a halt +cross-agent-resume + +# Resume all tailnet peers + local +cross-agent-resume --tailnet +``` + +## Recovery flow + +After a halt: + +1. Investigate whatever caused the halt (runaway loop, bad config, etc.). +2. Fix the underlying issue. +3. Run `cross-agent-resume`. +4. Open each Claude session that was polling and tell its agent to re-engage. +5. Confirm operation with `cross-agent-status`. + +## See also + +- `cross-agent-halt` — counterpart that creates the HALT file. +- `cross-agent-status` — verify HALT cleared and see pending messages. +- `cross-agent-comms.org` — protocol spec, `* Halt mechanism` section. diff --git a/.ai/scripts/cross-agent-comms/cross-agent-send b/.ai/scripts/cross-agent-comms/cross-agent-send new file mode 100755 index 0000000..68c010a --- /dev/null +++ b/.ai/scripts/cross-agent-comms/cross-agent-send @@ -0,0 +1,356 @@ +#!/usr/bin/env python3 +"""Cross-agent message sender. + +See cross-agent-send.md for the full contract. Briefly: + +- Destination as <machine>.<project>; resolved via peers.toml. +- Same-machine: cp to receiver's inbox/from-agents/ with atomic rename. +- Cross-machine: rsync over SSH (typically Tailscale) with retry+backoff. +- GPG-signs by default; .asc renames before .org so receivers never see + a .org without its sibling signature. +- Generates the canonical filename; user's input filename is ignored. +- Honors the HALT file: refuses to send and exits with code 5 when set. +""" + +from __future__ import annotations + +import argparse +import datetime as _dt +import json +import os +import re +import shutil +import socket +import subprocess +import sys +import tempfile +import time +import tomllib +from pathlib import Path + +CONFIG_DIR = Path.home() / ".config" / "cross-agent-comms" +PEERS_TOML = CONFIG_DIR / "peers.toml" +HALT_FILE = CONFIG_DIR / "HALT" +STATE_DIR = Path.home() / ".local" / "state" / "cross-agent-comms" +FAILED_SENDS_DIR = STATE_DIR / "failed-sends" + +EXIT_OK = 0 +EXIT_GENERAL = 1 +EXIT_DEST_NOT_FOUND = 2 +EXIT_CROSS_MACHINE_FAILED = 3 +EXIT_FRONTMATTER = 4 +EXIT_HALT = 5 + +REQUIRED_FRONTMATTER = ["CONVERSATION_ID", "MESSAGE_TYPE", "SEQUENCE", "TIMESTAMP", "PROTOCOL_VERSION"] +VALID_MESSAGE_TYPES = {"request", "progress", "query", "pushback", "complete", "release", "escalate"} + + +def err(msg: str) -> None: + print(msg, file=sys.stderr) + + +def check_halt() -> None: + """Exit with code 5 if HALT file exists.""" + if HALT_FILE.exists(): + try: + reason = HALT_FILE.read_text().strip() + except OSError: + # Fail-closed on unreadable HALT. + err("halt active (HALT file present but unreadable; treated as halted)") + err(f"remove {HALT_FILE} to resume") + sys.exit(EXIT_HALT) + msg = "halt active" + if reason: + msg += f": {reason}" + err(msg) + err(f"remove {HALT_FILE} to resume") + sys.exit(EXIT_HALT) + + +def parse_frontmatter(path: Path) -> dict[str, str]: + """Extract org-mode #+KEY: value frontmatter from the top of the file.""" + try: + text = path.read_text() + except OSError as e: + err(f"cannot read message file: {e}") + sys.exit(EXIT_GENERAL) + + frontmatter: dict[str, str] = {} + for line in text.splitlines(): + line = line.rstrip() + if not line: + # Blank line ends the frontmatter block. + if frontmatter: + break + continue + m = re.match(r"#\+([A-Z_]+):\s*(.*)", line) + if m: + frontmatter[m.group(1)] = m.group(2).strip() + else: + # First non-frontmatter line ends parsing. + if frontmatter: + break + return frontmatter + + +def validate_frontmatter(fm: dict[str, str]) -> None: + missing = [k for k in REQUIRED_FRONTMATTER if k not in fm] + if missing: + err(f"frontmatter missing required fields: {', '.join(missing)}") + sys.exit(EXIT_FRONTMATTER) + if fm["MESSAGE_TYPE"] not in VALID_MESSAGE_TYPES: + err(f"invalid MESSAGE_TYPE: {fm['MESSAGE_TYPE']!r}; expected one of {sorted(VALID_MESSAGE_TYPES)}") + sys.exit(EXIT_FRONTMATTER) + try: + int(fm["SEQUENCE"]) + except ValueError: + err(f"SEQUENCE must be an integer; got {fm['SEQUENCE']!r}") + sys.exit(EXIT_FRONTMATTER) + + +def load_peers() -> dict: + if not PEERS_TOML.exists(): + return {} + try: + return tomllib.loads(PEERS_TOML.read_text()) + except (tomllib.TOMLDecodeError, OSError) as e: + err(f"cannot read {PEERS_TOML}: {e}") + sys.exit(EXIT_GENERAL) + + +def resolve_destination(dest: str, peers: dict) -> tuple[str, str, str | None, str | None]: + """Resolve <machine>.<project> to (machine, project, host, ssh_user). + + host is None for same-machine destinations. + """ + if "." not in dest: + err(f"destination must be <machine>.<project>; got {dest!r}") + sys.exit(EXIT_DEST_NOT_FOUND) + machine, project = dest.split(".", 1) + + local_hostname = socket.gethostname().split(".")[0] + is_local = machine == local_hostname or machine == "local" + + host = None + ssh_user = None + if not is_local: + peer_cfg = peers.get("peers", {}).get(machine) + if peer_cfg is None: + available = list(peers.get("peers", {}).keys()) + err(f"destination not found in peers.toml; available peers: {available or '(none)'}") + sys.exit(EXIT_DEST_NOT_FOUND) + host = peer_cfg.get("host", machine) + ssh_user = peer_cfg.get("ssh_user", os.environ.get("USER")) + + return machine, project, host, ssh_user + + +def resolve_inbox_path(project: str, peers: dict) -> str: + """Inbox path on the receiver. Defaults to ~/projects/<project>/inbox/from-agents.""" + proj_cfg = peers.get("projects", {}).get(project) + if proj_cfg and "inbox_path" in proj_cfg: + return os.path.expanduser(proj_cfg["inbox_path"]) + return f"~/projects/{project}/inbox/from-agents" + + +def derive_sender_project() -> str: + """Walk up from CWD looking for ~/projects/<name>/. + + Returns the project name if found; falls back to the basename of CWD. + """ + cwd = Path.cwd().resolve() + projects_root = (Path.home() / "projects").resolve() + try: + rel = cwd.relative_to(projects_root) + return rel.parts[0] + except ValueError: + return cwd.name + + +def generate_canonical_filename(sender: str, conv_id: str) -> str: + """YYYYMMDDTHHMMSSZ-from-<sender>-<conv-id>.org""" + now = _dt.datetime.now(_dt.timezone.utc) + timestamp = now.strftime("%Y%m%dT%H%M%SZ") + return f"{timestamp}-from-{sender}-{conv_id}.org" + + +def sign(message_path: Path, sig_path: Path, key: str | None) -> None: + """gpg --detach-sign --armor --output <sig> [--local-user <key>] <message>""" + cmd = ["gpg", "--detach-sign", "--armor", "--yes", "--output", str(sig_path)] + if key: + cmd.extend(["--local-user", key]) + cmd.append(str(message_path)) + try: + result = subprocess.run(cmd, capture_output=True, text=True) + except FileNotFoundError: + err("gpg not found; install gnupg or use --no-sign for testing") + sys.exit(EXIT_GENERAL) + if result.returncode != 0: + err(f"signing failed: {result.stderr.strip()}") + sys.exit(EXIT_GENERAL) + + +def same_machine_deliver(message_path: Path, sig_path: Path | None, target_dir: Path, canonical_name: str) -> None: + """Atomic-write delivery: stage .asc, mv to final, then stage .org, mv to final.""" + target_dir.mkdir(parents=True, exist_ok=True) + final_msg = target_dir / canonical_name + final_sig = target_dir / f"{canonical_name}.asc" + + if sig_path is not None: + # Stage .asc first, mv to final, THEN stage .org and mv to final. + with tempfile.NamedTemporaryFile( + mode="wb", dir=target_dir, prefix=f".tmp.{canonical_name}.asc.", delete=False + ) as tmp: + tmp.write(sig_path.read_bytes()) + tmp_sig_path = Path(tmp.name) + os.replace(tmp_sig_path, final_sig) + + # Re-check HALT between .asc and .org per the layered-checks rule. + check_halt() + + with tempfile.NamedTemporaryFile( + mode="wb", dir=target_dir, prefix=f".tmp.{canonical_name}.", delete=False + ) as tmp: + tmp.write(message_path.read_bytes()) + tmp_msg_path = Path(tmp.name) + os.replace(tmp_msg_path, final_msg) + + +def cross_machine_deliver( + message_path: Path, + sig_path: Path | None, + canonical_name: str, + host: str, + ssh_user: str, + inbox_path: str, + retries: int, +) -> bool: + """rsync push the .asc first (if signed), re-check HALT, then push the .org. + + Returns True on success, False on persistent failure (after retries). + """ + # Stage local copies with the canonical name so rsync sets the right + # destination filename. + with tempfile.TemporaryDirectory(prefix="cross-agent-send-") as staging: + staging_dir = Path(staging) + local_msg = staging_dir / canonical_name + local_msg.write_bytes(message_path.read_bytes()) + local_sig = None + if sig_path is not None: + local_sig = staging_dir / f"{canonical_name}.asc" + local_sig.write_bytes(sig_path.read_bytes()) + + backoffs = [5, 30, 120] + # Step 1: push .asc first if signed. + if local_sig is not None: + if not _rsync_with_retries(local_sig, host, ssh_user, inbox_path, retries, backoffs): + return False + + # Re-check HALT between .asc and .org per the layered-checks rule. + check_halt() + + # Step 2: push .org. + if not _rsync_with_retries(local_msg, host, ssh_user, inbox_path, retries, backoffs): + return False + + return True + + +def _rsync_with_retries( + src: Path, host: str, ssh_user: str, inbox_path: str, retries: int, backoffs: list[int] +) -> bool: + target = f"{ssh_user}@{host}:{inbox_path}/" + last_err = "" + for attempt in range(retries + 1): + if attempt > 0: + check_halt() + wait = backoffs[min(attempt - 1, len(backoffs) - 1)] + err(f"rsync attempt {attempt} failed: {last_err}; retrying in {wait}s") + time.sleep(wait) + try: + result = subprocess.run( + ["rsync", "-a", str(src), target], + capture_output=True, + text=True, + ) + except FileNotFoundError: + err("rsync not found; install rsync") + return False + if result.returncode == 0: + return True + last_err = result.stderr.strip() or f"exit {result.returncode}" + err(f"rsync failed after {retries + 1} attempts: {last_err}") + return False + + +def write_failed_send_marker(dest: str, message_path: Path, error: str, retry_log: list[str]) -> None: + FAILED_SENDS_DIR.mkdir(parents=True, exist_ok=True) + timestamp = _dt.datetime.now(_dt.timezone.utc).strftime("%Y%m%dT%H%M%SZ") + safe_basename = re.sub(r"[^A-Za-z0-9._-]", "_", message_path.name) + marker = FAILED_SENDS_DIR / f"{timestamp}-{dest.replace('.', '-')}-{safe_basename}.json" + marker.write_text(json.dumps( + { + "timestamp": timestamp, + "destination": dest, + "message_path": str(message_path), + "error": error, + "retry_log": retry_log, + }, + indent=2, + )) + err(f"marker written: {marker}") + + +def main() -> int: + parser = argparse.ArgumentParser(description="Send a cross-agent message.") + parser.add_argument("destination", help="Destination as <machine>.<project>") + parser.add_argument("message_file", type=Path, help="Path to the message body file") + parser.add_argument("--no-sign", action="store_true", help="Skip GPG signing (testing only)") + parser.add_argument("--retries", type=int, default=3, help="Retry count for cross-machine sends") + parser.add_argument("--key", help="GPG key id to sign with (default: user's primary)") + args = parser.parse_args() + + check_halt() + + if not args.message_file.is_file(): + err(f"message file not found: {args.message_file}") + return EXIT_GENERAL + + fm = parse_frontmatter(args.message_file) + validate_frontmatter(fm) + + peers = load_peers() + machine, project, host, ssh_user = resolve_destination(args.destination, peers) + inbox_path = resolve_inbox_path(project, peers) + + sender = derive_sender_project() + canonical_name = generate_canonical_filename(sender, fm["CONVERSATION_ID"]) + + sig_tmp = None + if not args.no_sign: + sig_tmp = args.message_file.with_suffix(args.message_file.suffix + ".asc.tmp") + sign(args.message_file, sig_tmp, args.key) + + try: + if host is None: + # Same-machine delivery. + target_dir = Path(os.path.expanduser(inbox_path)) + same_machine_deliver(args.message_file, sig_tmp, target_dir, canonical_name) + print(f"sent: {target_dir}/{canonical_name}") + return EXIT_OK + else: + ok = cross_machine_deliver( + args.message_file, sig_tmp, canonical_name, host, ssh_user, inbox_path, args.retries + ) + if ok: + print(f"sent: {ssh_user}@{host}:{inbox_path}/{canonical_name}") + return EXIT_OK + write_failed_send_marker(args.destination, args.message_file, "rsync failed after retries", []) + return EXIT_CROSS_MACHINE_FAILED + finally: + if sig_tmp is not None and sig_tmp.exists(): + sig_tmp.unlink() + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/.ai/scripts/cross-agent-comms/cross-agent-send.md b/.ai/scripts/cross-agent-comms/cross-agent-send.md new file mode 100644 index 0000000..b06dbce --- /dev/null +++ b/.ai/scripts/cross-agent-comms/cross-agent-send.md @@ -0,0 +1,199 @@ +# cross-agent-send + +**Purpose.** Send a cross-agent message file to a specific destination. Handles +peer-config lookup, GPG signing, atomic write (same-machine) or rsync push +(cross-machine), retry-with-backoff, and failure surfacing. + +This is the canonical writer. The protocol spec defers all writer mechanics to +this script. + +## Usage + +``` +cross-agent-send <destination> <message-file> [--no-sign] [--retries N] +``` + +### Positional arguments + +| Position | Meaning | Example | +|---|---|---| +| 1 | Destination as `<machine>.<project>` | `homelab.career`, `velox.career` | +| 2 | Message file (already-formatted `.org`) | `/tmp/my-message.org` | + +### Flags + +| Flag | Default | Purpose | +|---|---|---| +| `--no-sign` | (signing on) | Skip GPG signing. Use only for testing; receivers reject unsigned messages by default. | +| `--retries N` | 3 | Override retry count for cross-machine sends. | +| `--key <key-id>` | (user's primary key) | GPG key to sign with. Resolution order: `--key` flag, `GPG_USER` env, `git config user.signingkey`, then the first secret key in the keyring. | + +## Behavior + +### Filename generation (script-controlled) + +The script generates the canonical destination filename from the message's +frontmatter and sender context. The user's input filename is ignored — pass any +path, the script names the destination correctly: + +``` +<UTC-now>T<HHMMSS>Z-from-<sender-slug>-<short-conv-id>.org +``` + +`<sender-slug>` comes from the sender machine's project name (config or +hostname-based). `<short-conv-id>` is read from the message's +`#+CONVERSATION_ID` frontmatter field. UTC timestamp is generated at send time. + +The script also performs the **sender-side max-seen scan** before writing: it +reads the receiver's `from-agents/` directory, finds the highest existing +sequence in this conversation across both sender prefixes, and (best-effort) +suggests `max(seen) + 1` for the next sequence. The user/agent is responsible +for setting `#+SEQUENCE` in the message body; the script only advises. + +### Same-machine destinations + +Resolved when the destination's machine matches the current hostname (or is +not in `peers.toml` as a remote). Steps: + +1. Parse frontmatter; extract `CONVERSATION_ID` and `TIMESTAMP`. Validate per + the *Validation before send* section below. +2. Generate canonical filename per *Filename generation* above. +3. Sign: `gpg --detach-sign --armor --output <canonical>.asc --local-user <key> <input>`. +4. Compute target: read `peers.toml` for the project's `inbox_path`. If + missing, fall back to `~/projects/<project>/inbox/from-agents/`. +5. **Atomic write with strict ordering** (signature must precede message): + - Stage `.asc`: write to `<target>/.tmp.XXXXXX-<canonical>.asc`, + then `mv` to `<target>/<canonical>.asc`. + - **Then** stage `.org`: write to `<target>/.tmp.XXXXXX-<canonical>`, + then `mv` to `<target>/<canonical>`. + - Receivers only act on `.org` files; staging the `.asc` first guarantees + the signature is present when the receiver opens the message. Out-of-order + would race: receiver could read the `.org` before the `.asc` lands and + fail GPG verify even though the sender did everything right. +6. Exit 0 on success. Exit non-zero if any step fails. + +### Cross-machine destinations + +Steps: + +1. Parse + generate canonical filename, as same-machine steps 1-2. +2. Sign locally to `<input>.asc` (or a tmp staging file). +3. rsync push **with the same .asc-first ordering**: + - `rsync -a <input>.asc <ssh-user>@<host>:<inbox_path>/<canonical>.asc` + - **Then** `rsync -a <input> <ssh-user>@<host>:<inbox_path>/<canonical>` + rsync writes to a hidden temp file then renames atomically by default + (`--inplace` would defeat this; do not pass it). +4. Retry on failure: 5s, 30s, 120s backoff, then surface error. +5. On persistent failure: write a marker file to + `~/.local/state/cross-agent-comms/failed-sends/<timestamp>-<dest>-<canonical>.json` + containing the destination, message path, error, and retry log. Exit non-zero. + +### Validation before send + +- Destination resolves via `peers.toml` (or local fallback). If neither, exit + immediately with `destination not found in peers.toml; available: <list>`. +- Message file must be readable, non-empty, and have valid org-mode frontmatter + with **all** of the following required fields: + - `#+TITLE` + - `#+CONVERSATION_ID` + - `#+MESSAGE_TYPE` + - `#+SEQUENCE` + - `#+TIMESTAMP` + - `#+PROTOCOL_VERSION` (must equal `5` for v5) + + If any required field is missing or malformed, exit immediately with a parse + error naming the offending field. + +- Optional fields the script recognizes and passes through (no special + handling beyond preservation): + - `#+REQUIRES_TOOLS` — comma-separated tool/MCP slugs the receiver needs. + - `#+RELEASE_STATUS` — valid only on `MESSAGE_TYPE: release`. Values per + spec: `complete`, `cancelled`, `withdrawn-after-pushback`, + `abandoned-after-escalation`. + - `#+WORKFLOW_VERSION` — sender's version of the cross-agent-comms workflow + file. Currently advisory; receiver may warn on mismatch but does not block. + +## Configuration + +Reads `~/.config/cross-agent-comms/peers.toml` for peer routing: + +```toml +[peers.velox] +host = "velox.local" +ssh_user = "cjennings" + +# Optional: per-project inbox-path overrides for non-default layouts. +[projects.career] +inbox_path = "~/projects/career/inbox/from-agents" + +[projects.homelab] +inbox_path = "~/projects/homelab/inbox/from-agents" +``` + +If a project entry is omitted, defaults to `~/projects/<project>/inbox/from-agents`. + +## Failure modes + +| Symptom | Cause | Fix | +|---|---|---| +| `destination not found in peers.toml` | Misspelled destination, or peer not configured | Run `cross-agent-discover` to see available destinations. | +| `signing failed: no secret key` | GPG key missing or not in keyring | `gpg --list-secret-keys` to confirm. Override with `--key <id>`. | +| `signing failed: pinentry timed out` | Headless session, GUI pinentry unavailable | Confirm `pinentry-program` in `gpg-agent.conf` matches available pinentry. Per protocols.org, GUI pinentry works from Claude Code. | +| `rsync exit 255` | SSH unreachable | `cross-agent-discover --peer <name>` to confirm reachability. | +| `rsync exit 23` | Permission denied at destination | Check destination directory perms (`chmod 700`) and ownership. | +| Marker file written to `failed-sends/` | Persistent cross-machine failure | Inspect the marker's `error` field. After fixing, retry: `cross-agent-send <dest> <msg>` (the marker is for visibility; it does not auto-retry). | +| Receiver complains "unsigned message" | `--no-sign` was used in production | Don't use `--no-sign` outside testing. | + +## HALT awareness + +Checks `~/.config/cross-agent-comms/HALT` at the start of every send AND +between the `.asc` and `.org` rsync calls AND between each retry iteration. +On HALT exists, exits with code 5 ("halt active; remove +~/.config/cross-agent-comms/HALT to resume") without writing or pushing +further. + +Worst case: one in-flight send completes its current rsync step within a few +seconds before halt kicks in for the next step. New sends are blocked +immediately. No `pkill` needed — the per-iteration check stops things +naturally. + +If the HALT file exists but is unreadable (permissions wrong), fail-closed — +treat as if HALT is set. Safer than fail-open. + +See `cross-agent-halt.md` for the full halt mechanism. + +## Examples + +```bash +# Same-machine send +cross-agent-send homelab.career /tmp/my-message.org + +# Cross-machine send via Tailscale +cross-agent-send velox.career /tmp/my-message.org + +# Test send without signing (receiver will reject) +cross-agent-send homelab.career /tmp/test.org --no-sign + +# Override retry count for a flaky link +cross-agent-send velox.career /tmp/my-message.org --retries 10 + +# After a delivery failure, inspect the marker +cat ~/.local/state/cross-agent-comms/failed-sends/*.json | jq . +``` + +## Exit codes + +| Code | Meaning | +|---|---| +| 0 | Sent successfully. | +| 1 | General error (parse failure, signing failure, etc.). | +| 2 | Destination not found in peers.toml. | +| 3 | Cross-machine delivery failed after retries. Marker file written. | +| 4 | Frontmatter validation failed. | + +## See also + +- `cross-agent-discover` — validate destinations before sending. +- `cross-agent-watch` — receiver-side notification. +- `cross-agent-status` — see what's queued. +- `cross-agent-comms.org` — protocol spec, the "what" the script implements. diff --git a/.ai/scripts/cross-agent-comms/cross-agent-status b/.ai/scripts/cross-agent-comms/cross-agent-status new file mode 100755 index 0000000..4eee75b --- /dev/null +++ b/.ai/scripts/cross-agent-comms/cross-agent-status @@ -0,0 +1,185 @@ +#!/usr/bin/env python3 +"""Point-in-time snapshot of pending cross-agent messages across local projects. + +See cross-agent-status.md. Pending = messages in inbox/from-agents/ whose +CONVERSATION_ID has no MESSAGE_TYPE: release at a later #+TIMESTAMP. + +HALT: prints a prominent banner before normal output, but continues to enumerate. +""" + +from __future__ import annotations + +import argparse +import glob +import json +import os +import re +import sys +from pathlib import Path + +CONFIG_DIR = Path.home() / ".config" / "cross-agent-comms" +HALT_FILE = CONFIG_DIR / "HALT" +DEFAULT_GLOB = str(Path.home() / "projects" / "*" / "inbox" / "from-agents") + "/" + + +def parse_frontmatter(path: Path) -> dict[str, str]: + try: + text = path.read_text() + except OSError: + return {} + fm: dict[str, str] = {} + for line in text.splitlines(): + line = line.rstrip() + if not line: + if fm: + break + continue + m = re.match(r"#\+([A-Z_]+):\s*(.*)", line) + if m: + fm[m.group(1)] = m.group(2).strip() + elif fm: + break + return fm + + +def project_name_from_path(path: str) -> str: + """Walk up from path to find ~/projects/<name>/...""" + home = str(Path.home()) + parts = Path(path).parts + for i, part in enumerate(parts): + if part == "projects" and i + 1 < len(parts) and str(Path(*parts[: i + 1])) == os.path.join(home, "projects"): + return parts[i + 1] + # Fallback: dir three levels up from the .org file (project/inbox/from-agents/file.org) + return Path(path).parent.parent.parent.name + + +def scan_project(inbox_dir: Path) -> tuple[int, str | None, int | None]: + """Return (pending_count, most_recent_filename_or_None, most_recent_age_seconds_or_None).""" + if not inbox_dir.is_dir(): + return 0, None, None + + # Group .org files by CONVERSATION_ID, also collect release timestamps per conv. + org_files = sorted(inbox_dir.glob("*.org")) + if not org_files: + return 0, None, None + + by_conv: dict[str, list[tuple[str, str, Path]]] = {} # conv_id -> [(timestamp, msg_type, path)] + for f in org_files: + fm = parse_frontmatter(f) + conv = fm.get("CONVERSATION_ID") + ts = fm.get("TIMESTAMP") + mt = fm.get("MESSAGE_TYPE") + if not conv or not ts or not mt: + # Malformed file: count as pending under conv "_unparseable". + by_conv.setdefault("_unparseable", []).append(("", "request", f)) + continue + by_conv.setdefault(conv, []).append((ts, mt, f)) + + pending_files: list[Path] = [] + for conv, entries in by_conv.items(): + entries.sort(key=lambda e: e[0]) + # Find the latest release timestamp. + release_ts = None + for ts, mt, _f in entries: + if mt == "release" and (release_ts is None or ts > release_ts): + release_ts = ts + for ts, mt, f in entries: + if mt == "release": + continue + if release_ts is not None and ts <= release_ts: + continue + pending_files.append(f) + + if not pending_files: + return 0, None, None + + # Most-recent by mtime (proxy for arrival order). + most_recent = max(pending_files, key=lambda p: p.stat().st_mtime) + import time + age = int(time.time() - most_recent.stat().st_mtime) + return len(pending_files), most_recent.name, age + + +def fmt_age(seconds: int | None) -> str: + if seconds is None: + return "—" + if seconds < 60: + return f"{seconds}s ago" + if seconds < 3600: + return f"{seconds // 60} min ago" + if seconds < 86400: + return f"{seconds // 3600} hr ago" + return f"{seconds // 86400} day(s) ago" + + +def render_banner_if_halt() -> None: + if not HALT_FILE.exists(): + return + try: + reason = HALT_FILE.read_text().strip() + except OSError: + reason = "(HALT file unreadable; treated as halted)" + print("⚠ HALT ACTIVE — cross-agent comms paused") + if reason: + print(f" reason: {reason}") + print(f" clear: rm {HALT_FILE} (or: cross-agent-resume)") + print() + + +def main() -> int: + parser = argparse.ArgumentParser(description="Snapshot of pending cross-agent messages across local projects.") + parser.add_argument("--json", action="store_true", help="Emit JSON output") + parser.add_argument("--projects-glob", default=DEFAULT_GLOB, + help=f"Glob for project from-agents dirs (default: {DEFAULT_GLOB})") + args = parser.parse_args() + + render_banner_if_halt() + + matched = sorted(glob.glob(args.projects_glob)) + rows = [] + for path in matched: + inbox = Path(path) + if not inbox.is_dir(): + continue + proj = project_name_from_path(path) + count, most_recent, age = scan_project(inbox) + rows.append({ + "name": proj, + "pending_count": count, + "most_recent": ( + {"filename": most_recent, "age_seconds": age} + if most_recent else None + ), + }) + + # Sort: pending-first, then alphabetical by name. + rows.sort(key=lambda r: (-r["pending_count"], r["name"])) + + if args.json: + import datetime as _dt + payload = { + "scanned_at": _dt.datetime.now(_dt.timezone.utc).isoformat(), + "halt_active": HALT_FILE.exists(), + "projects": rows, + } + print(json.dumps(payload, indent=2)) + return 0 + + if not rows: + print("No projects with inbox/from-agents/ found — 0 pending.") + return 0 + + # Human-readable table. + name_w = max(len("project"), max(len(r["name"]) for r in rows)) + print(f"{'project':<{name_w}} pending most-recent") + for r in rows: + most_recent_str = "—" + if r["most_recent"]: + most_recent_str = f"{r['most_recent']['filename']} ({fmt_age(r['most_recent']['age_seconds'])})" + print(f"{r['name']:<{name_w}} {r['pending_count']:<7} {most_recent_str}") + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/.ai/scripts/cross-agent-comms/cross-agent-status.md b/.ai/scripts/cross-agent-comms/cross-agent-status.md new file mode 100644 index 0000000..e700919 --- /dev/null +++ b/.ai/scripts/cross-agent-comms/cross-agent-status.md @@ -0,0 +1,139 @@ +# cross-agent-status + +**Purpose.** Point-in-time snapshot of pending cross-agent messages across +every project on this machine. Run from any terminal. No daemon required. + +This is the user-pull layer of the cold-start story — `cross-agent-watch` +pushes notifications, `cross-agent-status` lets the user query. + +## Usage + +``` +cross-agent-status [--json] [--projects-glob <glob>] +``` + +No args required. + +### Flags + +| Flag | Default | Purpose | +|---|---|---| +| `--json` | off (table) | Output as JSON for scripting. | +| `--projects-glob <glob>` | `~/projects/*/inbox/from-agents/` | Override which directories to scan. | + +## Output + +### Default (table) + +``` +$ cross-agent-status +project pending most-recent +career 0 — +claude-templates 0 — +clipper 0 — +homelab 1 20260427T085611Z-from-career-question.org (3 min ago) +finances 0 — +... (other 9 projects) +``` + +Sort: pending-first, then alphabetical. + +### `--json` + +```json +{ + "scanned_at": "2026-04-27T04:13:00-05:00", + "projects": [ + { + "name": "homelab", + "pending_count": 1, + "most_recent": { + "filename": "20260427T085611Z-from-career-question.org", + "age_seconds": 180 + } + }, + ... + ] +} +``` + +## Pending semantics + +A message is "pending" if it sits in `inbox/from-agents/` AND no +`MESSAGE_TYPE: release` exists for the same `CONVERSATION_ID` after it. + +Concretely: + +1. Scan each project's `inbox/from-agents/` for `.org` files. +2. Group by `CONVERSATION_ID` from frontmatter. +3. For each conversation, find the highest-`#+TIMESTAMP` message with + `MESSAGE_TYPE: release`. +4. Messages with `#+TIMESTAMP` after that release (or in conversations with no + release) count as pending. + +Files without parseable frontmatter are counted as pending and noted in the +output (single warning row per project). + +## Failure modes + +| Symptom | Likely cause | Fix | +|---|---|---| +| Project missing from output | Project's `.ai/` directory exists but `inbox/from-agents/` does not | Created lazily on first cross-agent message; `mkdir -p` to surface in output. | +| All projects show "0 pending" but you know one has messages | Glob misresolved, OR all messages are post-release | `cross-agent-status --projects-glob` with explicit path to confirm. | +| Warning row "N files unparseable in <project>" | Message file has invalid frontmatter | Open the file, fix or move out. | + +## Performance + +Scans every `.org` file in every watched directory. For Craig's setup (14 +projects, single-digit messages each), runs in <100ms. If a project +accumulates hundreds of post-release messages, archive them per the persistence +guidance in the protocol spec. + +## HALT awareness + +Checks `~/.config/cross-agent-comms/HALT` at start. If HALT exists, prints a +prominent banner before normal output: + +``` +$ cross-agent-status +⚠ HALT ACTIVE — cross-agent comms paused + Reason: investigating runaway poll loop, 2026-04-27 + HALT file: ~/.config/cross-agent-comms/HALT + Resume with: cross-agent-resume + +(snapshot continues normally — HALT does not suppress visibility) + +project pending most-recent +career 0 — +homelab 1 20260427T085611Z-from-career-question.org (3 min ago) +... +``` + +Status is read-only, so it always runs. The banner ensures the user can't +miss that halt is active when checking inbox state. Reason text comes from +the HALT file's body; if empty, omit the reason line. + +If the HALT file exists but is unreadable, print a warning banner ("HALT +file present but unreadable; treat as halted") and continue with normal +output. + +See `cross-agent-halt.md` for the full halt mechanism. + +## Examples + +```bash +# Snapshot +cross-agent-status + +# JSON for piping +cross-agent-status --json | jq '.projects[] | select(.pending_count > 0)' + +# Single-project query +cross-agent-status --projects-glob ~/projects/career/inbox/from-agents/ +``` + +## See also + +- `cross-agent-watch` — push notifications on new arrivals. +- `cross-agent-discover` — enumerate available agents (cross-machine). +- `cross-agent-comms.org` — protocol spec. diff --git a/.ai/scripts/cross-agent-comms/cross-agent-watch b/.ai/scripts/cross-agent-comms/cross-agent-watch new file mode 100755 index 0000000..3978f49 --- /dev/null +++ b/.ai/scripts/cross-agent-comms/cross-agent-watch @@ -0,0 +1,106 @@ +#!/usr/bin/env bash +# cross-agent-watch — desktop-notify on new cross-agent messages. +# +# See cross-agent-watch.md. Watches every ~/projects/*/inbox/from-agents/ by +# default. inotifywait fires create + moved_to events; .tmp.* files are +# filtered out. HALT suppresses notifications but the watcher keeps running +# and logs each event with "(suppressed by HALT)". + +set -uo pipefail + +# Defaults. +PROJECTS_GLOB="${HOME}/projects/*/inbox/from-agents/" +LOG_FILE="${HOME}/.local/state/cross-agent-comms/watch.log" +HALT_FILE="${HOME}/.config/cross-agent-comms/HALT" +QUIET=0 +NO_NOTIFY=0 + +# Arg parsing. +while [[ $# -gt 0 ]]; do + case "$1" in + --projects-glob) + PROJECTS_GLOB="$2"; shift 2 ;; + --log) + LOG_FILE="$2"; shift 2 ;; + --quiet) + QUIET=1; shift ;; + --no-notify) + NO_NOTIFY=1; shift ;; + -h|--help) + cat <<EOF +Usage: cross-agent-watch [--projects-glob GLOB] [--log PATH] [--quiet] [--no-notify] + +Watches inbox/from-agents/ directories for new cross-agent messages and fires +desktop notifications. See cross-agent-watch.md for details. +EOF + exit 0 ;; + *) + echo "unknown flag: $1" >&2; exit 1 ;; + esac +done + +# Resolve glob to a concrete list of directories. +# shellcheck disable=SC2086 +DIRS=( $PROJECTS_GLOB ) +# Filter out non-existent paths (glob may include literal pattern when no match). +EXISTING=() +for d in "${DIRS[@]}"; do + if [[ -d "$d" ]]; then + EXISTING+=( "$d" ) + fi +done + +if [[ ${#EXISTING[@]} -eq 0 ]]; then + echo "cross-agent-watch: glob resolved 0 directories: $PROJECTS_GLOB" >&2 + exit 1 +fi + +# Ensure log dir exists. +mkdir -p "$(dirname "$LOG_FILE")" + +[[ $QUIET -eq 0 ]] && echo "cross-agent-watch: watching ${#EXISTING[@]} dir(s); log: $LOG_FILE" + +# Helper: project name from path like /home/.../projects/<name>/inbox/from-agents/... +project_name() { + local path="$1" + # Match ~/projects/<name>/... + if [[ "$path" =~ ${HOME}/projects/([^/]+)/ ]]; then + echo "${BASH_REMATCH[1]}" + else + basename "$(dirname "$(dirname "$path")")" + fi +} + +# Main loop. inotifywait emits one line per event in the format +# "<full-path>" because we passed --format '%w%f'. +inotifywait -m -e create,moved_to --format '%w%f' "${EXISTING[@]}" 2>/dev/null \ + | while IFS= read -r path; do + filename="$(basename "$path")" + + # Filter .tmp.* staging files. + case "$filename" in + .tmp.*) continue ;; + esac + + # Filter .asc sidecars — they land first per the atomic-write ordering; + # the .org event will fire after. + case "$filename" in + *.asc) continue ;; + esac + + proj="$(project_name "$path")" + iso="$(date -u "+%Y-%m-%dT%H:%M:%SZ")" + + if [[ -e "$HALT_FILE" ]]; then + printf '%s\t%s\t%s\t(suppressed by HALT)\n' "$iso" "$proj" "$filename" >> "$LOG_FILE" + [[ $QUIET -eq 0 ]] && echo "[$iso] $proj: $filename (suppressed by HALT)" + continue + fi + + printf '%s\t%s\t%s\n' "$iso" "$proj" "$filename" >> "$LOG_FILE" + [[ $QUIET -eq 0 ]] && echo "[$iso] $proj: $filename" + + if [[ $NO_NOTIFY -eq 0 ]]; then + notify info "Cross-agent message" "${proj}: ${filename}" 2>/dev/null || true + fi + done diff --git a/.ai/scripts/cross-agent-comms/cross-agent-watch.md b/.ai/scripts/cross-agent-comms/cross-agent-watch.md new file mode 100644 index 0000000..7192f46 --- /dev/null +++ b/.ai/scripts/cross-agent-comms/cross-agent-watch.md @@ -0,0 +1,128 @@ +# cross-agent-watch + +**Purpose.** Long-running watcher that fires desktop notifications when new +cross-agent messages land in any project's `inbox/from-agents/` directory. +This is the primary cold-start mechanism: messages get noticed even when no +Claude session is active. + +## Usage + +``` +cross-agent-watch [--projects-glob <glob>] [--log <path>] +``` + +No args required. Defaults: + +- Watches `~/projects/*/inbox/from-agents/` (matches every project with the + cross-agent-comms convention). +- Logs each event to `~/.local/state/cross-agent-comms/watch.log`. + +### Flags + +| Flag | Default | Purpose | +|---|---|---| +| `--projects-glob <glob>` | `~/projects/*/inbox/from-agents/` | Override which directories to watch. Useful for testing on a single project. | +| `--log <path>` | `~/.local/state/cross-agent-comms/watch.log` | Override log location. Set to `/dev/null` to disable logging. | +| `--quiet` | off | Suppress stdout output. Notifications still fire. | +| `--no-notify` | off | Skip `notify` calls. Useful for testing the watcher loop without spamming notifications. | + +## Behavior + +1. Resolves the projects-glob to a concrete list of directories at startup. + New projects added to `~/projects/` after startup are NOT picked up — restart + the watcher to re-resolve. +2. Runs `inotifywait -m -e create,moved_to --format '%w%f'` against each + watched directory. +3. For each event, calls + `notify info "Cross-agent message" "<project>: <filename>"`. +4. Appends an event line to the log: + `<ISO-8601-timestamp>\t<project>\t<filename>`. + +## Event filtering + +- Watches `create` AND `moved_to` events. The `moved_to` part is critical for + the atomic-write convention (`mktemp` + `mv` produces a `moved_to`, not a + `create`). +- Files starting with `.tmp.` are ignored — they're staging files from + in-progress writes that should never produce a notification. + +## Installation + +### Option A — tmux pane (personal, easy) + +Run in a tmux pane that survives session disconnects: + +``` +tmux new -d -s cross-agent-watch 'cross-agent-watch' +``` + +### Option B — systemd user service (production) + +Provided files: + +- `~/.config/systemd/user/cross-agent-watch.service` +- `~/.config/systemd/user/cross-agent-watch.path` + +Enable with: + +``` +systemctl --user enable --now cross-agent-watch.path +``` + +The path unit triggers the service unit on filesystem changes; the service +unit re-execs `cross-agent-watch` if it dies. Survives reboot. + +## Failure modes + +| Symptom | Likely cause | Fix | +|---|---|---| +| No notifications fire on new files | inotifywait not running, or glob resolved to zero dirs | Check `cross-agent-watch --projects-glob ... --quiet` exits non-zero immediately. Log shows `"resolved 0 directories"`. | +| Notifications fire on `.tmp.` files | Filter regression | Verify `inotifywait` events show the `.tmp.` files; if so check this script's filter logic. | +| Some files missed under rapid bursts | inotify queue overflow | Increase `fs.inotify.max_queued_events` sysctl. Default 16384 is usually fine. | +| Permission denied on a watched dir | Directory perms wrong | `chmod 700 <dir>` and confirm owner. | + +## HALT awareness + +Checks `~/.config/cross-agent-comms/HALT` on each iteration (each inotifywait +event fired). If HALT exists, the watcher continues running but **suppresses +the `notify` call**. The event is still logged, with `(suppressed by HALT)` +appended: + +``` +2026-04-27T04:42:00-05:00 career 20260427T094200Z-from-homelab-test.org (suppressed by HALT) +``` + +Logged-but-suppressed events are useful for the operator to see what would +have fired during the halt window — helpful for diagnosing whatever caused +the halt. + +When HALT clears, suppression stops; subsequent events fire normally. Backlog +events that arrived during halt are NOT replayed — they get picked up via +cold-start handling (status CLI, agent startup check, or the next agent +poll once polling resumes). + +If the HALT file exists but is unreadable, fail-closed (suppress) — safer +than fail-open. + +See `cross-agent-halt.md` for the full halt mechanism. + +## Examples + +```bash +# Watch all projects, log everything, fire notifications +cross-agent-watch + +# Test against a single project, no notifications, verbose +cross-agent-watch \ + --projects-glob "$HOME/projects/career/inbox/from-agents/" \ + --no-notify + +# Production-style: quiet stdout, log only +cross-agent-watch --quiet +``` + +## See also + +- `cross-agent-status` — point-in-time snapshot of pending messages. +- `cross-agent-send` — counterpart writer. +- `cross-agent-comms.org` — protocol spec. |
