feat(hooks): scan file-backed messages and harden rm parsing

Two audit gaps in the confirmation hooks, plus the test harness they were missing. The git-commit and gh-pr-create hooks scanned for AI attribution but only saw inline messages. A commit made with -F/--file or a PR made with --body-file slipped through, since the hook stored a placeholder instead of the file's text, and the publish flow uses -F constantly. A new read_referenced_file helper in _common.py reads the referenced local file (missing, oversized, or non-UTF-8 returns None, which means "couldn't inspect" and never "clean"), so attribution scanning now sees the real committed and posted text. An unreadable file falls through to the existing ask-anyway path. destructive-bash-confirm.py parsed rm flags by splitting on whitespace, which mangled quoted paths and missed flag variants. detect_rm_rf now tokenizes with shlex, so quoted or spaced paths and combined, separate, or reordered flags all parse. It fails toward asking (a sentinel that still fires the modal) on unbalanced quotes, or when a forced recursive rm sits alongside a pipeline, compound command, substitution, or redirect, since target attribution isn't trustworthy there. The supported and unsupported shell constructs are documented in the docstrings. These hooks had no tests and weren't in make test. Added a pytest harness under hooks/tests (an importlib-by-path loader, since the hook filenames are hyphenated) with 54 tests across the three hooks and the shared helper, and wired hooks/tests into make test. Full suite green.
author: Craig Jennings <c@cjennings.net> 2026-05-22 15:38:58 -0500
committer: Craig Jennings <c@cjennings.net> 2026-05-22 15:38:58 -0500
commit: 53d33d9cdd5c6d7fd7c4dc7315b04d225add94d6 (patch)
tree: a2cfe5331eed9e22cc9880c3b75e246d74e2239e /hooks
parent: efcc8e5ffdd09f538fd2d83824f4c632264ad96c (diff)
download: rulesets-53d33d9cdd5c6d7fd7c4dc7315b04d225add94d6.tar.gz
rulesets-53d33d9cdd5c6d7fd7c4dc7315b04d225add94d6.zip
9 files changed, 549 insertions, 13 deletions
diff --git a/hooks/_common.py b/hooks/_common.py
index d4bf520..e82f7ed 100644
--- a/hooks/_common.py
+++ b/hooks/_common.py
@@ -16,6 +16,7 @@ the tools themselves.
 """
 
 import json
+import os
 import re
 import sys
 from typing import Optional
@@ -64,6 +65,25 @@ def respond_ask(reason: str, system_message: Optional[str] = None) -> None:
     print(json.dumps(output))
 
 
+def read_referenced_file(path: str, max_bytes: int = 1_000_000) -> Optional[str]:
+    """Read a local file referenced by -F/--file/--body-file so its text can be
+    attribution-scanned. Return the text, or None if it can't be safely read
+    (missing, not a regular file, larger than max_bytes, or not valid UTF-8).
+    None means 'could not inspect', never 'clean'."""
+    if not path:
+        return None
+    try:
+        expanded = os.path.expanduser(path)
+        if not os.path.isfile(expanded):
+            return None
+        if os.path.getsize(expanded) > max_bytes:
+            return None
+        with open(expanded, "r", encoding="utf-8", errors="strict") as fh:
+            return fh.read()
+    except (OSError, UnicodeDecodeError):
+        return None
+
+
 def scan_attribution(text: str) -> list[str]:
     """Return human-readable descriptions of any AI-attribution hits."""
     if not text:
diff --git a/hooks/destructive-bash-confirm.py b/hooks/destructive-bash-confirm.py
index c1cf5f9..be8b491 100755
--- a/hooks/destructive-bash-confirm.py
+++ b/hooks/destructive-bash-confirm.py
@@ -14,9 +14,20 @@ banner via systemMessage. First match wins — a command with multiple
 destructive patterns fires on the first detected.
 
 Non-destructive Bash calls exit 0 silent.
+
+Shell-parsing scope for `rm -rf` detection: `detect_rm_rf` tokenizes the
+command with `shlex.split`, so it handles a single simple command and its
+quoting/escaping (combined flags `-rf`, separate `-r -f`, reordered `-fr`,
+quoted/spaced paths). It does NOT model pipelines, compound commands
+(`;`, `&&`, `||`, `|`), command substitution (`$(...)`, backticks),
+redirects, aliases, or variable/glob expansion. When any of those appear
+alongside a forced recursive `rm` — or when the quoting is unbalanced —
+target attribution is unreliable, so the function returns a sentinel and
+the modal fires anyway (fail toward asking) rather than silently passing.
 """
 
 import re
+import shlex
 import subprocess
 import sys
 from typing import Optional
@@ -26,6 +37,13 @@ from _common import read_payload, respond_ask
 
 PROTECTED_BRANCHES = {"main", "master", "develop", "release", "prod", "production"}
 
+# Returned as the sole target when the command is a forced recursive rm but
+# the shell is too complex to attribute targets safely. Forces the modal.
+UNPARSEABLE_RM_TARGET = "(unparsed — shell too complex to inspect safely)"
+
+# Constructs that make single-command target attribution unreliable.
+_COMPLEX_SHELL = (";", "&&", "||", "|", "$(", "`", ">", "<")
+
 
 def main() -> int:
     payload = read_payload()
@@ -90,6 +108,8 @@ def detect_destructive(cmd: str) -> Optional[tuple[str, dict]]:
             if t in ("/", "~", "$HOME", ".", "..", "*")
             or t.startswith("/")
             or t.startswith("~")
+            or t.startswith("$HOME")
+            or "*" in t
         ]
         if dangerous:
             ctx["_banner"] = (
@@ -114,25 +134,73 @@ def is_force_push(cmd: str) -> bool:
 
 
 def detect_rm_rf(cmd: str) -> Optional[list[str]]:
-    """If cmd invokes `rm` with both -r/-R and -f flags, return its targets."""
-    m = re.search(r"(?:^|[\s;&|()])rm\s+(.+)$", cmd)
-    if not m:
+    """If cmd invokes `rm` with both -r/-R and -f flags, return its targets.
+
+    Tokenizes with `shlex.split`, so quoted/spaced paths and combined
+    (`-rf`), separate (`-r -f`), or reordered (`-fr`) flags all parse. The
+    long forms `--recursive` and `--force` count too. Returns:
+
+      - the target list (without flags) for a forced recursive rm,
+      - None when the command is not a forced recursive rm (no -r, no -f,
+        or not an `rm` at all) — no modal,
+      - [UNPARSEABLE_RM_TARGET] when a forced recursive rm is present but
+        the shell is too complex to attribute targets safely (compound
+        command, pipeline, command substitution, redirect) or the quoting
+        is unbalanced — fail toward asking.
+
+    shlex models a single simple command and its quoting only. It does not
+    model pipelines, compound commands, aliases, or variable/glob
+    expansion — those fall to the [UNPARSEABLE_RM_TARGET] ask path.
+    """
+    # Quick reject: no `rm` word at all.
+    if not re.search(r"(?:^|[\s;&|()])rm\b", cmd):
+        return None
+
+    complex_shell = any(tok in cmd for tok in _COMPLEX_SHELL)
+
+    try:
+        tokens = shlex.split(cmd)
+    except ValueError:
+        # Unbalanced quotes — can't tokenize. If an rm appears at all, ask.
+        return [UNPARSEABLE_RM_TARGET]
+
+    # Walk tokens; find the first `rm` and parse the flags/targets that
+    # immediately follow it within the same simple command.
+    try:
+        rm_idx = tokens.index("rm")
+    except ValueError:
         return None
 
-    rest = m.group(1).split()
-    flag_chars = ""
+    rest = tokens[rm_idx + 1:]
+    has_r = False
+    has_f = False
     i = 0
     while i < len(rest) and rest[i].startswith("-") and rest[i] != "--":
-        flag_chars += rest[i][1:]
+        tok = rest[i]
+        if tok in ("--recursive",):
+            has_r = True
+        elif tok in ("--force",):
+            has_f = True
+        elif tok.startswith("--"):
+            pass  # some other long flag — ignore
+        else:
+            short = tok[1:]
+            if re.search(r"[rR]", short):
+                has_r = True
+            if "f" in short:
+                has_f = True
         i += 1
-    if rest[i:i+1] == ["--"]:
+    if rest[i:i + 1] == ["--"]:
         i += 1
 
-    has_r = bool(re.search(r"[rR]", flag_chars))
-    has_f = "f" in flag_chars
     if not (has_r and has_f):
         return None
 
+    # Forced recursive rm confirmed. If the surrounding shell is too complex
+    # to trust target attribution, ask anyway rather than guess.
+    if complex_shell:
+        return [UNPARSEABLE_RM_TARGET]
+
     return rest[i:]
 
 
diff --git a/hooks/gh-pr-create-confirm.py b/hooks/gh-pr-create-confirm.py
index e3c2f13..f539551 100755
--- a/hooks/gh-pr-create-confirm.py
+++ b/hooks/gh-pr-create-confirm.py
@@ -27,7 +27,12 @@ Wire in ~/.claude/settings.json alongside git-commit-confirm.py:
 import re
 import sys
 
-from _common import read_payload, respond_ask, scan_attribution
+from _common import (
+    read_payload,
+    read_referenced_file,
+    respond_ask,
+    scan_attribution,
+)
 
 
 MAX_BODY_LINES = 20
@@ -91,9 +96,16 @@ def parse_pr_create(cmd: str) -> dict:
         if b:
             fields["body"] = b.group(2).strip()
         else:
-            bf = re.search(r"--body-file\s+(\S+)", cmd)
+            bf = re.search(r"--body-file\s+(\"[^\"]+\"|'[^']+'|\S+)", cmd)
             if bf:
-                fields["body"] = f"(body read from file: {bf.group(1)})"
+                path = bf.group(1).strip("\"'")
+                text = read_referenced_file(path)
+                if text is not None:
+                    fields["body"] = text.strip()
+                else:
+                    fields["body"] = (
+                        f"(body read from file: {path} — could not inspect)"
+                    )
 
     # Base / head
     base = re.search(r"--base\s+(\S+)", cmd)
diff --git a/hooks/git-commit-confirm.py b/hooks/git-commit-confirm.py
index 2441d23..618ac20 100755
--- a/hooks/git-commit-confirm.py
+++ b/hooks/git-commit-confirm.py
@@ -42,7 +42,12 @@ import re
 import subprocess
 import sys
 
-from _common import read_payload, respond_ask, scan_attribution
+from _common import (
+    read_payload,
+    read_referenced_file,
+    respond_ask,
+    scan_attribution,
+)
 
 
 MAX_FILES_SHOWN = 25
@@ -140,6 +145,18 @@ def extract_commit_message(cmd: str) -> str:
     if long_form:
         return "\n\n".join(msg for _, msg in long_form).strip()
 
+    # File-backed message: -F <file> / --file <file> / --file=<file>.
+    # Read the file so its text is attribution-scanned. If it can't be read
+    # safely, fall through to UNPARSEABLE_MESSAGE so the hook asks (fail-safe).
+    file_flag = re.search(
+        r"(?:^|\s)(?:-F|--file)[=\s]+(\"[^\"]+\"|'[^']+'|\S+)", cmd
+    )
+    if file_flag:
+        path = file_flag.group(1).strip("\"'")
+        text = read_referenced_file(path)
+        if text is not None:
+            return text.strip()
+
     return UNPARSEABLE_MESSAGE
 
 
diff --git a/hooks/tests/conftest.py b/hooks/tests/conftest.py
new file mode 100644
index 0000000..72c7920
--- /dev/null
+++ b/hooks/tests/conftest.py
@@ -0,0 +1,41 @@
+"""Pytest harness for the hook scripts under hooks/.
+
+Hook filenames are hyphenated (git-commit-confirm.py, etc.), so they
+cannot be imported by module name. `load_hook(filename)` loads them by
+path via importlib, and inserts hooks/ onto sys.path first so each hook's
+own `from _common import ...` resolves.
+"""
+
+import importlib.util
+import sys
+from pathlib import Path
+
+import pytest
+
+HOOKS_DIR = Path(__file__).resolve().parent.parent
+
+
+def load_hook(filename: str):
+    """Load a hook script by filename and return its module object.
+
+    Inserts hooks/ onto sys.path (idempotently) so the hook's
+    `from _common import ...` works, then loads the file by path under a
+    sanitized module name.
+    """
+    hooks_dir = str(HOOKS_DIR)
+    if hooks_dir not in sys.path:
+        sys.path.insert(0, hooks_dir)
+
+    path = HOOKS_DIR / filename
+    mod_name = "hook_" + filename.replace("-", "_").replace(".py", "")
+    spec = importlib.util.spec_from_file_location(mod_name, path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"could not load hook from {path}")
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+@pytest.fixture
+def load_hook_fixture():
+    return load_hook
diff --git a/hooks/tests/test_common.py b/hooks/tests/test_common.py
new file mode 100644
index 0000000..10e6097
--- /dev/null
+++ b/hooks/tests/test_common.py
@@ -0,0 +1,74 @@
+"""Tests for hooks/_common.py — read_referenced_file and scan_attribution."""
+
+import sys
+from pathlib import Path
+
+HOOKS_DIR = Path(__file__).resolve().parent.parent
+if str(HOOKS_DIR) not in sys.path:
+    sys.path.insert(0, str(HOOKS_DIR))
+
+import _common  # noqa: E402
+
+
+# --- read_referenced_file: normal cases ------------------------------------
+
+def test_read_referenced_file_returns_content(tmp_path):
+    f = tmp_path / "msg.txt"
+    f.write_text("hello world\n")
+    assert _common.read_referenced_file(str(f)) == "hello world\n"
+
+
+def test_read_referenced_file_expands_user(tmp_path, monkeypatch):
+    # Point HOME at tmp_path so ~/msg.txt resolves under it.
+    monkeypatch.setenv("HOME", str(tmp_path))
+    monkeypatch.setattr(Path, "home", lambda: tmp_path, raising=False)
+    f = tmp_path / "msg.txt"
+    f.write_text("tilde body")
+    assert _common.read_referenced_file("~/msg.txt") == "tilde body"
+
+
+# --- read_referenced_file: error / boundary cases --------------------------
+
+def test_read_referenced_file_missing_returns_none(tmp_path):
+    assert _common.read_referenced_file(str(tmp_path / "nope.txt")) is None
+
+
+def test_read_referenced_file_directory_returns_none(tmp_path):
+    # A directory is not a regular file.
+    assert _common.read_referenced_file(str(tmp_path)) is None
+
+
+def test_read_referenced_file_oversized_returns_none(tmp_path):
+    f = tmp_path / "big.txt"
+    f.write_text("x" * 50)
+    assert _common.read_referenced_file(str(f), max_bytes=10) is None
+
+
+def test_read_referenced_file_at_limit_is_read(tmp_path):
+    f = tmp_path / "edge.txt"
+    f.write_text("12345")  # exactly 5 bytes
+    assert _common.read_referenced_file(str(f), max_bytes=5) == "12345"
+
+
+def test_read_referenced_file_invalid_utf8_returns_none(tmp_path):
+    f = tmp_path / "bin.dat"
+    f.write_bytes(b"\xff\xfe\x00bad")
+    assert _common.read_referenced_file(str(f)) is None
+
+
+def test_read_referenced_file_empty_string_path_returns_none():
+    assert _common.read_referenced_file("") is None
+
+
+# --- scan_attribution sanity (relied on by the file-backed tests) ----------
+
+def test_scan_attribution_catches_coauthor():
+    assert _common.scan_attribution("Co-Authored-By: Claude")
+
+
+def test_scan_attribution_catches_robot_emoji():
+    assert _common.scan_attribution("nice work \U0001F916")
+
+
+def test_scan_attribution_clean_text_empty():
+    assert _common.scan_attribution("fix: tidy up the parser") == []
diff --git a/hooks/tests/test_destructive_bash_confirm.py b/hooks/tests/test_destructive_bash_confirm.py
new file mode 100644
index 0000000..50302e6
--- /dev/null
+++ b/hooks/tests/test_destructive_bash_confirm.py
@@ -0,0 +1,137 @@
+"""Tests for hooks/destructive-bash-confirm.py — shlex-based rm -rf parsing."""
+
+from conftest import load_hook
+
+hook = load_hook("destructive-bash-confirm.py")
+
+SENTINEL = "(unparsed — shell too complex to inspect safely)"
+
+
+# --- detection of flag forms (combined / separate / reordered) -------------
+
+def test_rf_combined_detected():
+    assert hook.detect_rm_rf("rm -rf build") == ["build"]
+
+
+def test_r_f_separate_detected():
+    assert hook.detect_rm_rf("rm -r -f build") == ["build"]
+
+
+def test_fr_reordered_detected():
+    assert hook.detect_rm_rf("rm -fr build") == ["build"]
+
+
+def test_capital_R_detected():
+    assert hook.detect_rm_rf("rm -Rf build") == ["build"]
+
+
+def test_long_flags_detected():
+    targets = hook.detect_rm_rf("rm --recursive --force build")
+    assert targets == ["build"]
+
+
+# --- quoted / spaced paths now parse correctly -----------------------------
+
+def test_quoted_path_with_space_parsed():
+    assert hook.detect_rm_rf('rm -rf "my dir"') == ["my dir"]
+
+
+def test_multiple_targets():
+    assert hook.detect_rm_rf("rm -rf a b c") == ["a", "b", "c"]
+
+
+def test_double_dash_separates_flags_from_paths():
+    assert hook.detect_rm_rf("rm -rf -- -weird-name") == ["-weird-name"]
+
+
+# --- not-a-match cases: no modal -------------------------------------------
+
+def test_no_r_returns_none():
+    assert hook.detect_rm_rf("rm -f file") is None
+
+
+def test_no_f_returns_none():
+    assert hook.detect_rm_rf("rm -r dir") is None
+
+
+def test_not_rm_returns_none():
+    assert hook.detect_rm_rf("rmdir foo") is None
+
+
+def test_plain_rm_returns_none():
+    assert hook.detect_rm_rf("rm file.txt") is None
+
+
+# --- dangerous path banner still fires on parsed targets -------------------
+
+def test_home_var_target_flags_dangerous():
+    detection = hook.detect_destructive('rm -rf "$HOME/x"')
+    assert detection is not None
+    kind, ctx = detection
+    assert kind == "rm -rf"
+    assert "_banner" in ctx
+    assert "$HOME/x" in ctx["_banner"]
+
+
+def test_root_path_flags_dangerous():
+    detection = hook.detect_destructive("rm -rf /etc/foo")
+    assert detection is not None
+    _, ctx = detection
+    assert "_banner" in ctx
+
+
+def test_safe_relative_target_no_banner():
+    detection = hook.detect_destructive("rm -rf build/cache")
+    assert detection is not None
+    _, ctx = detection
+    assert "_banner" not in ctx
+
+
+# --- fail-toward-asking on ambiguity ---------------------------------------
+
+def test_compound_command_returns_sentinel():
+    # `ls && rm -rf foo` — the naive parser missed this; now we ask.
+    assert hook.detect_rm_rf("ls && rm -rf foo") == [SENTINEL]
+
+
+def test_pipeline_returns_sentinel():
+    assert hook.detect_rm_rf("find . -type d | xargs rm -rf") == [SENTINEL]
+
+
+def test_semicolon_returns_sentinel():
+    assert hook.detect_rm_rf("cd /tmp; rm -rf junk") == [SENTINEL]
+
+
+def test_command_substitution_returns_sentinel():
+    assert hook.detect_rm_rf("rm -rf $(echo target)") == [SENTINEL]
+
+
+def test_backtick_substitution_returns_sentinel():
+    assert hook.detect_rm_rf("rm -rf `echo target`") == [SENTINEL]
+
+
+def test_redirect_returns_sentinel():
+    assert hook.detect_rm_rf("rm -rf foo > /dev/null") == [SENTINEL]
+
+
+def test_unbalanced_quotes_returns_sentinel():
+    # shlex.split raises ValueError → ask anyway rather than silently pass.
+    assert hook.detect_rm_rf('rm -rf "unterminated') == [SENTINEL]
+
+
+def test_compound_without_rm_rf_returns_none():
+    # Compound construct but no dangerous rm — should not fire.
+    assert hook.detect_rm_rf("ls && echo done") is None
+
+
+def test_compound_with_rm_but_no_force_returns_none():
+    # `&&` present but the rm has no -f, so nothing to flag.
+    assert hook.detect_rm_rf("ls && rm -r dir") is None
+
+
+def test_sentinel_fires_modal_via_detect_destructive():
+    detection = hook.detect_destructive("ls && rm -rf foo")
+    assert detection is not None
+    kind, ctx = detection
+    assert kind == "rm -rf"
+    assert ctx["targets"] == [SENTINEL]
diff --git a/hooks/tests/test_gh_pr_create_confirm.py b/hooks/tests/test_gh_pr_create_confirm.py
new file mode 100644
index 0000000..19dde2e
--- /dev/null
+++ b/hooks/tests/test_gh_pr_create_confirm.py
@@ -0,0 +1,70 @@
+"""Tests for hooks/gh-pr-create-confirm.py — --body-file reads real content."""
+
+from conftest import load_hook
+
+hook = load_hook("gh-pr-create-confirm.py")
+
+
+# --- existing parsing still works (regression guard) -----------------------
+
+def test_parse_title_and_inline_body():
+    cmd = 'gh pr create --title "feat: thing" --body "does the thing"'
+    fields = hook.parse_pr_create(cmd)
+    assert fields["title"] == "feat: thing"
+    assert fields["body"] == "does the thing"
+
+
+def test_parse_reviewers():
+    cmd = 'gh pr create --title "x" --reviewer alice,bob'
+    fields = hook.parse_pr_create(cmd)
+    assert fields["reviewers"] == ["alice", "bob"]
+
+
+# --- new: --body-file reads the real content -------------------------------
+
+def test_body_file_reads_real_content(tmp_path):
+    f = tmp_path / "body.md"
+    f.write_text("## Problem\nthings broke\n\n## Fix\nfixed them\n")
+    fields = hook.parse_pr_create(f'gh pr create --title "x" --body-file {f}')
+    assert "things broke" in fields["body"]
+    assert "fixed them" in fields["body"]
+    # No longer the old placeholder.
+    assert not fields["body"].startswith("(body read from file")
+
+
+def test_body_file_attribution_is_caught(tmp_path):
+    f = tmp_path / "body.md"
+    f.write_text("## Summary\nshipped a feature \U0001F916 generated with Claude\n")
+    fields = hook.parse_pr_create(f'gh pr create --title "feat: x" --body-file {f}')
+    scan_text = "\n".join(
+        filter(None, [fields.get("title"), fields.get("body")])
+    )
+    hits = hook.scan_attribution(scan_text)
+    assert hits  # robot emoji + 'Generated with Claude' both leak
+
+
+def test_body_file_clean_content_no_hits(tmp_path):
+    f = tmp_path / "body.md"
+    f.write_text("## Summary\nfixed the off-by-one in the pager\n")
+    fields = hook.parse_pr_create(f'gh pr create --title "fix: pager" --body-file {f}')
+    scan_text = "\n".join(
+        filter(None, [fields.get("title"), fields.get("body")])
+    )
+    assert hook.scan_attribution(scan_text) == []
+
+
+# --- unreadable file keeps an informative could-not-inspect placeholder ----
+
+def test_body_file_missing_keeps_could_not_inspect_placeholder(tmp_path):
+    missing = tmp_path / "nope.md"
+    fields = hook.parse_pr_create(f'gh pr create --title "x" --body-file {missing}')
+    assert "could not inspect" in fields["body"]
+    assert str(missing) in fields["body"]
+
+
+def test_body_file_oversized_keeps_placeholder(tmp_path, monkeypatch):
+    f = tmp_path / "big.md"
+    f.write_text("x" * 5000)
+    monkeypatch.setattr(hook, "read_referenced_file", lambda p: None)
+    fields = hook.parse_pr_create(f'gh pr create --title "x" --body-file {f}')
+    assert "could not inspect" in fields["body"]
diff --git a/hooks/tests/test_git_commit_confirm.py b/hooks/tests/test_git_commit_confirm.py
new file mode 100644
index 0000000..83519ad
--- /dev/null
+++ b/hooks/tests/test_git_commit_confirm.py
@@ -0,0 +1,97 @@
+"""Tests for hooks/git-commit-confirm.py — file-backed commit messages."""
+
+from conftest import load_hook
+
+hook = load_hook("git-commit-confirm.py")
+
+
+# --- existing forms still parse (regression guard) -------------------------
+
+def test_extract_dash_m_simple():
+    msg = hook.extract_commit_message('git commit -m "fix: tidy parser"')
+    assert msg == "fix: tidy parser"
+
+
+def test_extract_heredoc():
+    cmd = (
+        "git commit -m \"$(cat <<'EOF'\n"
+        "feat: add thing\n"
+        "\n"
+        "body line\n"
+        "EOF\n"
+        ")\""
+    )
+    msg = hook.extract_commit_message(cmd)
+    assert msg.startswith("feat: add thing")
+    assert "body line" in msg
+
+
+def test_extract_unparseable_falls_through():
+    # Bare `git commit` would drop into $EDITOR.
+    assert hook.extract_commit_message("git commit") == hook.UNPARSEABLE_MESSAGE
+
+
+# --- new: -F / --file / --file= forms read the file ------------------------
+
+def test_extract_dash_F_reads_file(tmp_path):
+    f = tmp_path / "msg.txt"
+    f.write_text("fix: from a file\n\nsome body\n")
+    assert hook.extract_commit_message(f"git commit -F {f}") == "fix: from a file\n\nsome body"
+
+
+def test_extract_long_file_flag_reads_file(tmp_path):
+    f = tmp_path / "msg.txt"
+    f.write_text("docs: long form file flag\n")
+    assert hook.extract_commit_message(f"git commit --file {f}") == "docs: long form file flag"
+
+
+def test_extract_file_equals_form_reads_file(tmp_path):
+    f = tmp_path / "msg.txt"
+    f.write_text("chore: equals form\n")
+    assert hook.extract_commit_message(f"git commit --file={f}") == "chore: equals form"
+
+
+def test_extract_F_strips_quotes_around_path(tmp_path):
+    f = tmp_path / "my msg.txt"
+    f.write_text("feat: quoted path\n")
+    assert hook.extract_commit_message(f'git commit -F "{f}"') == "feat: quoted path"
+
+
+# --- the audit-item bug: attribution in a file-backed message is now caught -
+
+def test_file_backed_attribution_is_caught(tmp_path):
+    f = tmp_path / "msg.txt"
+    f.write_text("feat: add widget\n\nCo-Authored-By: Claude <noreply@anthropic.com>\n")
+    msg = hook.extract_commit_message(f"git commit -F {f}")
+    issues = hook.collect_issues(msg, staged=["a.py"], author="Real Dev <dev@example.com>")
+    assert any(i.startswith("AI-attribution") for i in issues)
+
+
+def test_inline_message_without_attribution_is_clean(tmp_path):
+    # Sanity: a clean file-backed message produces no attribution issue.
+    f = tmp_path / "msg.txt"
+    f.write_text("fix: handle empty input\n")
+    msg = hook.extract_commit_message(f"git commit -F {f}")
+    issues = hook.collect_issues(msg, staged=["a.py"], author="Real Dev <dev@example.com>")
+    assert not any(i.startswith("AI-attribution") for i in issues)
+
+
+# --- unreadable file falls through to UNPARSEABLE (fail-safe: ask) ---------
+
+def test_missing_file_falls_through_to_unparseable(tmp_path):
+    missing = tmp_path / "nope.txt"
+    assert hook.extract_commit_message(f"git commit -F {missing}") == hook.UNPARSEABLE_MESSAGE
+
+
+def test_oversized_file_falls_through_and_hook_asks(tmp_path, monkeypatch):
+    f = tmp_path / "big.txt"
+    f.write_text("x" * 5000)
+    # Force the read to refuse via a tiny limit (simulates oversize).
+    monkeypatch.setattr(
+        hook, "read_referenced_file", lambda p, max_bytes=10: None
+    )
+    msg = hook.extract_commit_message(f"git commit -F {f}")
+    assert msg == hook.UNPARSEABLE_MESSAGE
+    # And the hook would ask, because UNPARSEABLE_MESSAGE is a flagged issue.
+    issues = hook.collect_issues(msg, staged=["a.py"], author="Dev <d@e.com>")
+    assert any("not parseable" in i for i in issues)
author	Craig Jennings <c@cjennings.net>	2026-05-22 15:38:58 -0500
committer	Craig Jennings <c@cjennings.net>	2026-05-22 15:38:58 -0500
commit	53d33d9cdd5c6d7fd7c4dc7315b04d225add94d6 (patch)
tree	a2cfe5331eed9e22cc9880c3b75e246d74e2239e /hooks
parent	efcc8e5ffdd09f538fd2d83824f4c632264ad96c (diff)
download	rulesets-53d33d9cdd5c6d7fd7c4dc7315b04d225add94d6.tar.gz rulesets-53d33d9cdd5c6d7fd7c4dc7315b04d225add94d6.zip