fix(drill-deck): cut leakage false positives and codify source/date conventions

Health ran the new leakage check on a 43-card deck and hit two false-positive classes. The check read the whole card body, so a =Source: <label> — <url>= citation line inflated the front/back overlap whenever the URL slug repeated the question's words. Range/category cards ("What are the HbA1c ranges across normal, prediabetes, and diabetes?") tripped it too, because the question's categories echo in the answer even though the recalled content is the numbers. drill-deck-stats.py now routes leakage through an is_leaky helper. It strips =Source:= and created-date lines before computing overlap, and exempts a card when the answer carries a numeric range or threshold the question lacks. leakage_ratio itself is unchanged, so the genuine-restatement case still flags. Two body conventions now hold: a =Source:= citation goes at the end of a card after two blank lines, and no created/added date goes on a card. drill-to-anki.py now strips =Created:= / =:CREATED:= lines from the back as a backstop, and the workflow's Phase C removes them from the source during the rewrite. I added tests for the source-strip, the numeric carve-out, and the created-line strip, and documented all of it in drill-deck-review.org.
author: Craig Jennings <c@cjennings.net> 2026-05-30 15:46:00 -0500
committer: Craig Jennings <c@cjennings.net> 2026-05-30 15:46:00 -0500
commit: b80a9ceb3fc9cdca9798b48fbc4f9ab9c1592b57 (patch)
tree: be0f328c6be0ddb8d999730cc6fa07277ebc2ec7 /.ai/scripts/drill-deck-stats.py
parent: 968a39bb3978e6ad499447ce173e2265dee772a2 (diff)
download: rulesets-b80a9ceb3fc9cdca9798b48fbc4f9ab9c1592b57.tar.gz
rulesets-b80a9ceb3fc9cdca9798b48fbc4f9ab9c1592b57.zip
1 files changed, 42 insertions, 2 deletions
diff --git a/.ai/scripts/drill-deck-stats.py b/.ai/scripts/drill-deck-stats.py
index d0707e2..04c3468 100755
--- a/.ai/scripts/drill-deck-stats.py
+++ b/.ai/scripts/drill-deck-stats.py
@@ -10,6 +10,8 @@ Blocking WARNs (exit 1):
 - Non-prompt headings (topic-as-heading not yet rewritten)
 - #+TITLE missing, or carrying source-tool jargon ("org-drill")
 - Answer leakage: a card whose question echoes most of its own answer
+  (Source: citation lines and created-date lines are excluded from the
+  overlap, and range/category cards that recall numbers are exempted)
 - Duplicate / near-duplicate fronts (interference between confusable cards)
 
 Non-blocking NOTEs (exit unaffected):
@@ -41,6 +43,10 @@ ID_RE = re.compile(r"^\s*:ID:\s+(\S+)\s*$")
 TITLE_RE = re.compile(r"^#\+TITLE:\s*(.+?)\s*$", re.IGNORECASE)
 SOURCE_TOOL_RE = re.compile(r"\borg[-\s]?drill\b", re.IGNORECASE)
 PLANNING_RE = re.compile(r"^\s*(SCHEDULED|DEADLINE|CLOSED):\s")
+SOURCE_LINE_RE = re.compile(r"^\s*source:\s", re.IGNORECASE)
+CREATED_LINE_RE = re.compile(r"^\s*:?created:\s", re.IGNORECASE)  # trailing colon required
+RANGE_RE = re.compile(r"\d[^\n]*[-–—]\s*\d")
+THRESHOLD_RE = re.compile(r"[<>≤≥]\s*\d")
 BULLET_RE = re.compile(r"^\s*([-+*]|\d+[.)])\s+")
 BINARY_LEAD_RE = re.compile(
     r"^\s*(is|are|was|were|does|do|did|can|could|should|would|will|has|have|had)\b",
@@ -98,6 +104,41 @@ def leakage_ratio(heading: str, body: str) -> float:
     return len(hw & content_words(body)) / len(hw)
 
 
+def prose_body(body: str) -> str:
+    """Return the body minus its Source: citation and created-date lines.
+
+    Such lines are card metadata rather than answer text, so they must not
+    feed the answer-leakage overlap (a Source URL slug often echoes the question).
+    """
+    kept = [
+        line for line in body.splitlines()
+        if not (SOURCE_LINE_RE.match(line) or CREATED_LINE_RE.match(line))
+    ]
+    return "\n".join(kept)
+
+
+def has_distinct_numeric_recall(heading: str, body: str) -> bool:
+    """Report whether the answer has numeric ranges/thresholds the question lacks.
+
+    On a range/category card the question's categories reappear in the answer,
+    yet what is recalled is the numbers; when only the answer carries them,
+    high word overlap is not leakage.
+    """
+    def carries_numbers(text: str) -> bool:
+        return bool(RANGE_RE.search(text) or THRESHOLD_RE.search(text))
+
+    return carries_numbers(body) and not carries_numbers(heading)
+
+
+def is_leaky(heading: str, body: str) -> bool:
+    """Decide whether a card leaks its answer, judged on the body with its
+    citation/created lines removed; numeric-recall cards are exempt."""
+    answer_text = prose_body(body)
+    below_threshold = leakage_ratio(heading, answer_text) < LEAKAGE_RATIO
+    # Only cards at or over the threshold reach the numeric-recall carve-out.
+    return not below_threshold and not has_distinct_numeric_recall(heading, answer_text)
+
+
 def normalize_heading(heading: str) -> str:
     """Collapse a heading to a comparison key (lowercase, alnum + single spaces)."""
     return re.sub(r"\s+", " ", re.sub(r"[^a-z0-9 ]", " ", heading.lower())).strip()
@@ -212,8 +253,7 @@ def main() -> int:
     no_id = [c["heading"] for c in cards if not c["has_id"]]
     not_prompt = [c["heading"] for c in cards if not is_prompt_form(c["heading"])]
     answer_count = sum(1 for c in cards if c["has_answer"])
-    leaky = [c["heading"] for c in cards
-             if leakage_ratio(c["heading"], c["body"]) >= LEAKAGE_RATIO]
+    leaky = [c["heading"] for c in cards if is_leaky(c["heading"], c["body"])]
     dups = find_duplicate_fronts(cards)
     overloaded = [c["heading"] for c in cards if back_word_count(c["body"]) > BACK_WORD_LIMIT]
     listy = [c["heading"] for c in cards if is_list_back(c["body"])]
author	Craig Jennings <c@cjennings.net>	2026-05-30 15:46:00 -0500
committer	Craig Jennings <c@cjennings.net>	2026-05-30 15:46:00 -0500
commit	b80a9ceb3fc9cdca9798b48fbc4f9ab9c1592b57 (patch)
tree	be0f328c6be0ddb8d999730cc6fa07277ebc2ec7 /.ai/scripts/drill-deck-stats.py
parent	968a39bb3978e6ad499447ce173e2265dee772a2 (diff)
download	rulesets-b80a9ceb3fc9cdca9798b48fbc4f9ab9c1592b57.tar.gz rulesets-b80a9ceb3fc9cdca9798b48fbc4f9ab9c1592b57.zip