1 files changed, 42 insertions, 2 deletions
diff --git a/.ai/scripts/drill-deck-stats.py b/.ai/scripts/drill-deck-stats.py
index d0707e2..04c3468 100755
--- a/.ai/scripts/drill-deck-stats.py
+++ b/.ai/scripts/drill-deck-stats.py
@@ -10,6 +10,8 @@ Blocking WARNs (exit 1):
 - Non-prompt headings (topic-as-heading not yet rewritten)
 - #+TITLE missing, or carrying source-tool jargon ("org-drill")
 - Answer leakage: a card whose question echoes most of its own answer
+  (Source: citation lines and created-date lines are excluded from the
+  overlap, and range/category cards that recall numbers are exempted)
 - Duplicate / near-duplicate fronts (interference between confusable cards)
 
 Non-blocking NOTEs (exit unaffected):
@@ -41,6 +43,10 @@ ID_RE = re.compile(r"^\s*:ID:\s+(\S+)\s*$")
 TITLE_RE = re.compile(r"^#\+TITLE:\s*(.+?)\s*$", re.IGNORECASE)
 SOURCE_TOOL_RE = re.compile(r"\borg[-\s]?drill\b", re.IGNORECASE)
 PLANNING_RE = re.compile(r"^\s*(SCHEDULED|DEADLINE|CLOSED):\s")
+SOURCE_LINE_RE = re.compile(r"^\s*source:\s", re.IGNORECASE)  # "Source: <citation>" lines
+CREATED_LINE_RE = re.compile(r"^\s*:?created(?::\s|\s+[\[<]?\d)", re.IGNORECASE)  # needs a colon or a date, so prose like "Created by ..." is kept
+RANGE_RE = re.compile(r"\d[^\n]*[-–—]\s*\d")  # digit ... dash ... digit on a single line
+THRESHOLD_RE = re.compile(r"[<>≤≥]\s*\d")  # comparison sign followed by a digit
 BULLET_RE = re.compile(r"^\s*([-+*]|\d+[.)])\s+")
 BINARY_LEAD_RE = re.compile(
     r"^\s*(is|are|was|were|does|do|did|can|could|should|would|will|has|have|had)\b",
@@ -98,6 +104,41 @@ def leakage_ratio(heading: str, body: str) -> float:
     return len(hw & content_words(body)) / len(hw)
 
 
+def prose_body(body: str) -> str:
+    """Return *body* without its ``Source:`` and created-date lines.
+
+    Both are bookkeeping rather than answer text: a citation URL's slug
+    tends to repeat the question's wording and a creation date says nothing
+    about the answer, so neither may add to the answer-leakage overlap.
+    """
+    def is_metadata(line: str) -> bool:
+        return bool(SOURCE_LINE_RE.match(line) or CREATED_LINE_RE.match(line))
+
+    return "\n".join(ln for ln in body.splitlines() if not is_metadata(ln))
+
+
+def has_distinct_numeric_recall(heading: str, body: str) -> bool:
+    """True when the answer has a numeric range/threshold the question omits.
+
+    Range/category cards (e.g. "What are the HbA1c ranges across normal,
+    prediabetes, and diabetes?") repeat their category names in the answer,
+    yet what is actually recalled is the numbers, which the question never
+    states, so a high word overlap on such a card is not answer leakage.
+    """
+    if RANGE_RE.search(heading) or THRESHOLD_RE.search(heading):
+        return False  # the question already shows a range/threshold itself
+    return bool(RANGE_RE.search(body) or THRESHOLD_RE.search(body))
+
+
+def is_leaky(heading: str, body: str) -> bool:
+    """Decide whether a card gives its answer away: word overlap (ignoring
+    citation/created lines) reaches the limit and no numeric-recall exemption."""
+    answer = prose_body(body)
+    below_limit = leakage_ratio(heading, answer) < LEAKAGE_RATIO
+    # `or` short-circuits: the exemption is only checked for overlapping cards.
+    return not (below_limit or has_distinct_numeric_recall(heading, answer))
+
+
 def normalize_heading(heading: str) -> str:
     """Collapse a heading to a comparison key (lowercase, alnum + single spaces)."""
     return re.sub(r"\s+", " ", re.sub(r"[^a-z0-9 ]", " ", heading.lower())).strip()
@@ -212,8 +253,7 @@ def main() -> int:
     no_id = [c["heading"] for c in cards if not c["has_id"]]
     not_prompt = [c["heading"] for c in cards if not is_prompt_form(c["heading"])]
     answer_count = sum(1 for c in cards if c["has_answer"])
-    leaky = [c["heading"] for c in cards
-             if leakage_ratio(c["heading"], c["body"]) >= LEAKAGE_RATIO]
+    leaky = [c["heading"] for c in cards if is_leaky(c["heading"], c["body"])]
     dups = find_duplicate_fronts(cards)
     overloaded = [c["heading"] for c in cards if back_word_count(c["body"]) > BACK_WORD_LIMIT]
     listy = [c["heading"] for c in cards if is_list_back(c["body"])]