feat(hooks): scan file-backed messages and harden rm parsing

Two audit gaps in the confirmation hooks, plus the test harness they were missing. The git-commit and gh-pr-create hooks scanned for AI attribution but only saw inline messages. A commit made with -F/--file or a PR made with --body-file slipped through, since the hook stored a placeholder instead of the file's text, and the publish flow uses -F constantly. A new read_referenced_file helper in _common.py reads the referenced local file (missing, oversized, or non-UTF-8 returns None, which means "couldn't inspect" and never "clean"), so attribution scanning now sees the real committed and posted text. An unreadable file falls through to the existing ask-anyway path. destructive-bash-confirm.py parsed rm flags by splitting on whitespace, which mangled quoted paths and missed flag variants. detect_rm_rf now tokenizes with shlex, so quoted or spaced paths and combined, separate, or reordered flags all parse. It fails toward asking (a sentinel that still fires the modal) on unbalanced quotes, or when a forced recursive rm sits alongside a pipeline, compound command, substitution, or redirect, since target attribution isn't trustworthy there. The supported and unsupported shell constructs are documented in the docstrings. These hooks had no tests and weren't in make test. Added a pytest harness under hooks/tests (an importlib-by-path loader, since the hook filenames are hyphenated) with 54 tests across the three hooks and the shared helper, and wired hooks/tests into make test. Full suite green.
author: Craig Jennings <c@cjennings.net> 2026-05-22 15:38:58 -0500
committer: Craig Jennings <c@cjennings.net> 2026-05-22 15:38:58 -0500
commit: 53d33d9cdd5c6d7fd7c4dc7315b04d225add94d6 (patch)
tree: a2cfe5331eed9e22cc9880c3b75e246d74e2239e /hooks/tests
parent: efcc8e5ffdd09f538fd2d83824f4c632264ad96c (diff)
download: rulesets-53d33d9cdd5c6d7fd7c4dc7315b04d225add94d6.tar.gz
rulesets-53d33d9cdd5c6d7fd7c4dc7315b04d225add94d6.zip
5 files changed, 419 insertions, 0 deletions
diff --git a/hooks/tests/conftest.py b/hooks/tests/conftest.py
new file mode 100644
index 0000000..72c7920
--- /dev/null
+++ b/hooks/tests/conftest.py
@@ -0,0 +1,41 @@
+"""Pytest harness for the hook scripts under hooks/.
+
+Hook filenames are hyphenated (git-commit-confirm.py, etc.), so they
+cannot be imported by module name. `load_hook(filename)` loads them by
+path via importlib, and inserts hooks/ onto sys.path first so each hook's
+own `from _common import ...` resolves.
+"""
+
+import importlib.util
+import sys
+from pathlib import Path
+
+import pytest
+
+HOOKS_DIR = Path(__file__).resolve().parent.parent
+
+
+def load_hook(filename: str):
+    """Load a hook script by filename and return its module object.
+
+    Inserts hooks/ onto sys.path (idempotently) so the hook's
+    `from _common import ...` works, then loads the file by path under a
+    sanitized module name.
+    """
+    hooks_dir = str(HOOKS_DIR)
+    if hooks_dir not in sys.path:
+        sys.path.insert(0, hooks_dir)
+
+    path = HOOKS_DIR / filename
+    mod_name = "hook_" + filename.replace("-", "_").replace(".py", "")
+    spec = importlib.util.spec_from_file_location(mod_name, path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"could not load hook from {path}")
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+@pytest.fixture
+def load_hook_fixture():
+    return load_hook
diff --git a/hooks/tests/test_common.py b/hooks/tests/test_common.py
new file mode 100644
index 0000000..10e6097
--- /dev/null
+++ b/hooks/tests/test_common.py
@@ -0,0 +1,74 @@
+"""Tests for hooks/_common.py — read_referenced_file and scan_attribution."""
+
+import sys
+from pathlib import Path
+
+HOOKS_DIR = Path(__file__).resolve().parent.parent
+if str(HOOKS_DIR) not in sys.path:
+    sys.path.insert(0, str(HOOKS_DIR))
+
+import _common  # noqa: E402
+
+
+# --- read_referenced_file: normal cases ------------------------------------
+
+def test_read_referenced_file_returns_content(tmp_path):
+    f = tmp_path / "msg.txt"
+    f.write_text("hello world\n")
+    assert _common.read_referenced_file(str(f)) == "hello world\n"
+
+
+def test_read_referenced_file_expands_user(tmp_path, monkeypatch):
+    # Point HOME at tmp_path so ~/msg.txt resolves under it.
+    monkeypatch.setenv("HOME", str(tmp_path))
+    monkeypatch.setattr(Path, "home", lambda: tmp_path, raising=False)
+    f = tmp_path / "msg.txt"
+    f.write_text("tilde body")
+    assert _common.read_referenced_file("~/msg.txt") == "tilde body"
+
+
+# --- read_referenced_file: error / boundary cases --------------------------
+
+def test_read_referenced_file_missing_returns_none(tmp_path):
+    assert _common.read_referenced_file(str(tmp_path / "nope.txt")) is None
+
+
+def test_read_referenced_file_directory_returns_none(tmp_path):
+    # A directory is not a regular file.
+    assert _common.read_referenced_file(str(tmp_path)) is None
+
+
+def test_read_referenced_file_oversized_returns_none(tmp_path):
+    f = tmp_path / "big.txt"
+    f.write_text("x" * 50)
+    assert _common.read_referenced_file(str(f), max_bytes=10) is None
+
+
+def test_read_referenced_file_at_limit_is_read(tmp_path):
+    f = tmp_path / "edge.txt"
+    f.write_text("12345")  # exactly 5 bytes
+    assert _common.read_referenced_file(str(f), max_bytes=5) == "12345"
+
+
+def test_read_referenced_file_invalid_utf8_returns_none(tmp_path):
+    f = tmp_path / "bin.dat"
+    f.write_bytes(b"\xff\xfe\x00bad")
+    assert _common.read_referenced_file(str(f)) is None
+
+
+def test_read_referenced_file_empty_string_path_returns_none():
+    assert _common.read_referenced_file("") is None
+
+
+# --- scan_attribution sanity (relied on by the file-backed tests) ----------
+
+def test_scan_attribution_catches_coauthor():
+    assert _common.scan_attribution("Co-Authored-By: Claude")
+
+
+def test_scan_attribution_catches_robot_emoji():
+    assert _common.scan_attribution("nice work \U0001F916")
+
+
+def test_scan_attribution_clean_text_empty():
+    assert _common.scan_attribution("fix: tidy up the parser") == []
diff --git a/hooks/tests/test_destructive_bash_confirm.py b/hooks/tests/test_destructive_bash_confirm.py
new file mode 100644
index 0000000..50302e6
--- /dev/null
+++ b/hooks/tests/test_destructive_bash_confirm.py
@@ -0,0 +1,137 @@
+"""Tests for hooks/destructive-bash-confirm.py — shlex-based rm -rf parsing."""
+
+from conftest import load_hook
+
+hook = load_hook("destructive-bash-confirm.py")
+
+SENTINEL = "(unparsed — shell too complex to inspect safely)"
+
+
+# --- detection of flag forms (combined / separate / reordered) -------------
+
+def test_rf_combined_detected():
+    assert hook.detect_rm_rf("rm -rf build") == ["build"]
+
+
+def test_r_f_separate_detected():
+    assert hook.detect_rm_rf("rm -r -f build") == ["build"]
+
+
+def test_fr_reordered_detected():
+    assert hook.detect_rm_rf("rm -fr build") == ["build"]
+
+
+def test_capital_R_detected():
+    assert hook.detect_rm_rf("rm -Rf build") == ["build"]
+
+
+def test_long_flags_detected():
+    targets = hook.detect_rm_rf("rm --recursive --force build")
+    assert targets == ["build"]
+
+
+# --- quoted / spaced paths now parse correctly -----------------------------
+
+def test_quoted_path_with_space_parsed():
+    assert hook.detect_rm_rf('rm -rf "my dir"') == ["my dir"]
+
+
+def test_multiple_targets():
+    assert hook.detect_rm_rf("rm -rf a b c") == ["a", "b", "c"]
+
+
+def test_double_dash_separates_flags_from_paths():
+    assert hook.detect_rm_rf("rm -rf -- -weird-name") == ["-weird-name"]
+
+
+# --- not-a-match cases: no modal -------------------------------------------
+
+def test_no_r_returns_none():
+    assert hook.detect_rm_rf("rm -f file") is None
+
+
+def test_no_f_returns_none():
+    assert hook.detect_rm_rf("rm -r dir") is None
+
+
+def test_not_rm_returns_none():
+    assert hook.detect_rm_rf("rmdir foo") is None
+
+
+def test_plain_rm_returns_none():
+    assert hook.detect_rm_rf("rm file.txt") is None
+
+
+# --- dangerous path banner still fires on parsed targets -------------------
+
+def test_home_var_target_flags_dangerous():
+    detection = hook.detect_destructive('rm -rf "$HOME/x"')
+    assert detection is not None
+    kind, ctx = detection
+    assert kind == "rm -rf"
+    assert "_banner" in ctx
+    assert "$HOME/x" in ctx["_banner"]
+
+
+def test_root_path_flags_dangerous():
+    detection = hook.detect_destructive("rm -rf /etc/foo")
+    assert detection is not None
+    _, ctx = detection
+    assert "_banner" in ctx
+
+
+def test_safe_relative_target_no_banner():
+    detection = hook.detect_destructive("rm -rf build/cache")
+    assert detection is not None
+    _, ctx = detection
+    assert "_banner" not in ctx
+
+
+# --- fail-toward-asking on ambiguity ---------------------------------------
+
+def test_compound_command_returns_sentinel():
+    # `ls && rm -rf foo` — the naive parser missed this; now we ask.
+    assert hook.detect_rm_rf("ls && rm -rf foo") == [SENTINEL]
+
+
+def test_pipeline_returns_sentinel():
+    assert hook.detect_rm_rf("find . -type d | xargs rm -rf") == [SENTINEL]
+
+
+def test_semicolon_returns_sentinel():
+    assert hook.detect_rm_rf("cd /tmp; rm -rf junk") == [SENTINEL]
+
+
+def test_command_substitution_returns_sentinel():
+    assert hook.detect_rm_rf("rm -rf $(echo target)") == [SENTINEL]
+
+
+def test_backtick_substitution_returns_sentinel():
+    assert hook.detect_rm_rf("rm -rf `echo target`") == [SENTINEL]
+
+
+def test_redirect_returns_sentinel():
+    assert hook.detect_rm_rf("rm -rf foo > /dev/null") == [SENTINEL]
+
+
+def test_unbalanced_quotes_returns_sentinel():
+    # shlex.split raises ValueError → ask anyway rather than silently pass.
+    assert hook.detect_rm_rf('rm -rf "unterminated') == [SENTINEL]
+
+
+def test_compound_without_rm_rf_returns_none():
+    # Compound construct but no dangerous rm — should not fire.
+    assert hook.detect_rm_rf("ls && echo done") is None
+
+
+def test_compound_with_rm_but_no_force_returns_none():
+    # `&&` present but the rm has no -f, so nothing to flag.
+    assert hook.detect_rm_rf("ls && rm -r dir") is None
+
+
+def test_sentinel_fires_modal_via_detect_destructive():
+    detection = hook.detect_destructive("ls && rm -rf foo")
+    assert detection is not None
+    kind, ctx = detection
+    assert kind == "rm -rf"
+    assert ctx["targets"] == [SENTINEL]
diff --git a/hooks/tests/test_gh_pr_create_confirm.py b/hooks/tests/test_gh_pr_create_confirm.py
new file mode 100644
index 0000000..19dde2e
--- /dev/null
+++ b/hooks/tests/test_gh_pr_create_confirm.py
@@ -0,0 +1,70 @@
+"""Tests for hooks/gh-pr-create-confirm.py — --body-file reads real content."""
+
+from conftest import load_hook
+
+hook = load_hook("gh-pr-create-confirm.py")
+
+
+# --- existing parsing still works (regression guard) -----------------------
+
+def test_parse_title_and_inline_body():
+    cmd = 'gh pr create --title "feat: thing" --body "does the thing"'
+    fields = hook.parse_pr_create(cmd)
+    assert fields["title"] == "feat: thing"
+    assert fields["body"] == "does the thing"
+
+
+def test_parse_reviewers():
+    cmd = 'gh pr create --title "x" --reviewer alice,bob'
+    fields = hook.parse_pr_create(cmd)
+    assert fields["reviewers"] == ["alice", "bob"]
+
+
+# --- new: --body-file reads the real content -------------------------------
+
+def test_body_file_reads_real_content(tmp_path):
+    f = tmp_path / "body.md"
+    f.write_text("## Problem\nthings broke\n\n## Fix\nfixed them\n")
+    fields = hook.parse_pr_create(f'gh pr create --title "x" --body-file {f}')
+    assert "things broke" in fields["body"]
+    assert "fixed them" in fields["body"]
+    # No longer the old placeholder.
+    assert not fields["body"].startswith("(body read from file")
+
+
+def test_body_file_attribution_is_caught(tmp_path):
+    f = tmp_path / "body.md"
+    f.write_text("## Summary\nshipped a feature \U0001F916 generated with Claude\n")
+    fields = hook.parse_pr_create(f'gh pr create --title "feat: x" --body-file {f}')
+    scan_text = "\n".join(
+        filter(None, [fields.get("title"), fields.get("body")])
+    )
+    hits = hook.scan_attribution(scan_text)
+    assert hits  # robot emoji + 'Generated with Claude' both leak
+
+
+def test_body_file_clean_content_no_hits(tmp_path):
+    f = tmp_path / "body.md"
+    f.write_text("## Summary\nfixed the off-by-one in the pager\n")
+    fields = hook.parse_pr_create(f'gh pr create --title "fix: pager" --body-file {f}')
+    scan_text = "\n".join(
+        filter(None, [fields.get("title"), fields.get("body")])
+    )
+    assert hook.scan_attribution(scan_text) == []
+
+
+# --- unreadable file keeps an informative could-not-inspect placeholder ----
+
+def test_body_file_missing_keeps_could_not_inspect_placeholder(tmp_path):
+    missing = tmp_path / "nope.md"
+    fields = hook.parse_pr_create(f'gh pr create --title "x" --body-file {missing}')
+    assert "could not inspect" in fields["body"]
+    assert str(missing) in fields["body"]
+
+
+def test_body_file_oversized_keeps_placeholder(tmp_path, monkeypatch):
+    f = tmp_path / "big.md"
+    f.write_text("x" * 5000)
+    monkeypatch.setattr(hook, "read_referenced_file", lambda p: None)
+    fields = hook.parse_pr_create(f'gh pr create --title "x" --body-file {f}')
+    assert "could not inspect" in fields["body"]
diff --git a/hooks/tests/test_git_commit_confirm.py b/hooks/tests/test_git_commit_confirm.py
new file mode 100644
index 0000000..83519ad
--- /dev/null
+++ b/hooks/tests/test_git_commit_confirm.py
@@ -0,0 +1,97 @@
+"""Tests for hooks/git-commit-confirm.py — file-backed commit messages."""
+
+from conftest import load_hook
+
+hook = load_hook("git-commit-confirm.py")
+
+
+# --- existing forms still parse (regression guard) -------------------------
+
+def test_extract_dash_m_simple():
+    msg = hook.extract_commit_message('git commit -m "fix: tidy parser"')
+    assert msg == "fix: tidy parser"
+
+
+def test_extract_heredoc():
+    cmd = (
+        "git commit -m \"$(cat <<'EOF'\n"
+        "feat: add thing\n"
+        "\n"
+        "body line\n"
+        "EOF\n"
+        ")\""
+    )
+    msg = hook.extract_commit_message(cmd)
+    assert msg.startswith("feat: add thing")
+    assert "body line" in msg
+
+
+def test_extract_unparseable_falls_through():
+    # Bare `git commit` would drop into $EDITOR.
+    assert hook.extract_commit_message("git commit") == hook.UNPARSEABLE_MESSAGE
+
+
+# --- new: -F / --file / --file= forms read the file ------------------------
+
+def test_extract_dash_F_reads_file(tmp_path):
+    f = tmp_path / "msg.txt"
+    f.write_text("fix: from a file\n\nsome body\n")
+    assert hook.extract_commit_message(f"git commit -F {f}") == "fix: from a file\n\nsome body"
+
+
+def test_extract_long_file_flag_reads_file(tmp_path):
+    f = tmp_path / "msg.txt"
+    f.write_text("docs: long form file flag\n")
+    assert hook.extract_commit_message(f"git commit --file {f}") == "docs: long form file flag"
+
+
+def test_extract_file_equals_form_reads_file(tmp_path):
+    f = tmp_path / "msg.txt"
+    f.write_text("chore: equals form\n")
+    assert hook.extract_commit_message(f"git commit --file={f}") == "chore: equals form"
+
+
+def test_extract_F_strips_quotes_around_path(tmp_path):
+    f = tmp_path / "my msg.txt"
+    f.write_text("feat: quoted path\n")
+    assert hook.extract_commit_message(f'git commit -F "{f}"') == "feat: quoted path"
+
+
+# --- the audit-item bug: attribution in a file-backed message is now caught -
+
+def test_file_backed_attribution_is_caught(tmp_path):
+    f = tmp_path / "msg.txt"
+    f.write_text("feat: add widget\n\nCo-Authored-By: Claude <noreply@anthropic.com>\n")
+    msg = hook.extract_commit_message(f"git commit -F {f}")
+    issues = hook.collect_issues(msg, staged=["a.py"], author="Real Dev <dev@example.com>")
+    assert any(i.startswith("AI-attribution") for i in issues)
+
+
+def test_inline_message_without_attribution_is_clean(tmp_path):
+    # Sanity: a clean file-backed message produces no attribution issue.
+    f = tmp_path / "msg.txt"
+    f.write_text("fix: handle empty input\n")
+    msg = hook.extract_commit_message(f"git commit -F {f}")
+    issues = hook.collect_issues(msg, staged=["a.py"], author="Real Dev <dev@example.com>")
+    assert not any(i.startswith("AI-attribution") for i in issues)
+
+
+# --- unreadable file falls through to UNPARSEABLE (fail-safe: ask) ---------
+
+def test_missing_file_falls_through_to_unparseable(tmp_path):
+    missing = tmp_path / "nope.txt"
+    assert hook.extract_commit_message(f"git commit -F {missing}") == hook.UNPARSEABLE_MESSAGE
+
+
+def test_oversized_file_falls_through_and_hook_asks(tmp_path, monkeypatch):
+    f = tmp_path / "big.txt"
+    f.write_text("x" * 5000)
+    # Force the read to refuse via a tiny limit (simulates oversize).
+    monkeypatch.setattr(
+        hook, "read_referenced_file", lambda p, max_bytes=10: None
+    )
+    msg = hook.extract_commit_message(f"git commit -F {f}")
+    assert msg == hook.UNPARSEABLE_MESSAGE
+    # And the hook would ask, because UNPARSEABLE_MESSAGE is a flagged issue.
+    issues = hook.collect_issues(msg, staged=["a.py"], author="Dev <d@e.com>")
+    assert any("not parseable" in i for i in issues)
author	Craig Jennings <c@cjennings.net>	2026-05-22 15:38:58 -0500
committer	Craig Jennings <c@cjennings.net>	2026-05-22 15:38:58 -0500
commit	53d33d9cdd5c6d7fd7c4dc7315b04d225add94d6 (patch)
tree	a2cfe5331eed9e22cc9880c3b75e246d74e2239e /hooks/tests
parent	efcc8e5ffdd09f538fd2d83824f4c632264ad96c (diff)
download	rulesets-53d33d9cdd5c6d7fd7c4dc7315b04d225add94d6.tar.gz rulesets-53d33d9cdd5c6d7fd7c4dc7315b04d225add94d6.zip