about | summary | refs | log | tree | commit | diff
path: root/.ai/scripts/drill-deck-stats.py
diff options
context:
space:
mode:
Diffstat (limited to '.ai/scripts/drill-deck-stats.py')
-rwxr-xr-x .ai/scripts/drill-deck-stats.py 44
1 files changed, 42 insertions, 2 deletions
diff --git a/.ai/scripts/drill-deck-stats.py b/.ai/scripts/drill-deck-stats.py
index d0707e2..04c3468 100755
--- a/.ai/scripts/drill-deck-stats.py
+++ b/.ai/scripts/drill-deck-stats.py
@@ -10,6 +10,8 @@ Blocking WARNs (exit 1):
- Non-prompt headings (topic-as-heading not yet rewritten)
- #+TITLE missing, or carrying source-tool jargon ("org-drill")
- Answer leakage: a card whose question echoes most of its own answer
+ (Source: citation lines and created-date lines are excluded from the
+ overlap, and range/category cards that recall numbers are exempted)
- Duplicate / near-duplicate fronts (interference between confusable cards)
Non-blocking NOTEs (exit unaffected):
@@ -41,6 +43,10 @@ ID_RE = re.compile(r"^\s*:ID:\s+(\S+)\s*$")
TITLE_RE = re.compile(r"^#\+TITLE:\s*(.+?)\s*$", re.IGNORECASE)
SOURCE_TOOL_RE = re.compile(r"\borg[-\s]?drill\b", re.IGNORECASE)
PLANNING_RE = re.compile(r"^\s*(SCHEDULED|DEADLINE|CLOSED):\s")
+# Card-body lines that are metadata rather than answer text.
+# "Source: <citation>" -- the colon must be followed by whitespace.
+SOURCE_LINE_RE = re.compile(r"^\s*source:\s", re.IGNORECASE)
+# Creation stamps: "Created: ...", ":CREATED: [...]", or a bare "created"
+# directly followed by a (possibly bracketed) number. A colon-less
+# "Created by ..." sentence is answer prose and must not be treated as metadata.
+CREATED_LINE_RE = re.compile(r"^\s*:?created(?::\s|\s+[\[<(]?\d)", re.IGNORECASE)
+# Numeric-recall markers. RANGE_RE: a digit, anything on the same line, a
+# hyphen / en dash / em dash, then a digit ("5.7–6.4").
+# NOTE(review): that also matches dates and tokens like "type-2" when a digit
+# appears earlier on the line -- confirm this breadth is intended.
+RANGE_RE = re.compile(r"\d[^\n]*[-–—]\s*\d")
+# THRESHOLD_RE: a comparison sign followed by a digit ("≥ 6.5", "<5.7").
+THRESHOLD_RE = re.compile(r"[<>≤≥]\s*\d")
BULLET_RE = re.compile(r"^\s*([-+*]|\d+[.)])\s+")
BINARY_LEAD_RE = re.compile(
r"^\s*(is|are|was|were|does|do|did|can|could|should|would|will|has|have|had)\b",
@@ -98,6 +104,41 @@ def leakage_ratio(heading: str, body: str) -> float:
return len(hw & content_words(body)) / len(hw)
+def prose_body(body: str) -> str:
+    """Return *body* without its Source: citation and created-date lines.
+
+    Both kinds of line are card metadata, not answer text: a citation's URL
+    slug tends to restate the question's words, and a creation stamp is
+    bookkeeping. Leaving them in would inflate the answer-leakage overlap.
+    The surviving lines are rejoined with newlines.
+    """
+    kept = []
+    for line in body.splitlines():
+        if SOURCE_LINE_RE.match(line) or CREATED_LINE_RE.match(line):
+            continue  # metadata line -- not part of the answer
+        kept.append(line)
+    return "\n".join(kept)
+
+
+def has_distinct_numeric_recall(heading: str, body: str) -> bool:
+    """Report whether the answer holds ranges/thresholds absent from the question.
+
+    Range/category cards (e.g. "What are the HbA1c ranges across normal,
+    prediabetes, and diabetes?") repeat their category names in the answer,
+    yet what the learner must recall is the numbers. When only the body
+    matches RANGE_RE or THRESHOLD_RE, heavy word overlap is not leakage.
+
+    Returns True only if the body matches one of those patterns and the
+    heading matches neither.
+    """
+    def mentions_numbers(text: str) -> bool:
+        return any(rx.search(text) for rx in (RANGE_RE, THRESHOLD_RE))
+
+    # A question that already states a range/threshold gives the numbers away.
+    if mentions_numbers(heading):
+        return False
+    return mentions_numbers(body)
+
+
+def is_leaky(heading: str, body: str) -> bool:
+    """Decide whether a card's question gives away its own answer.
+
+    The body is first reduced with prose_body(), so citation and created-date
+    lines do not count. The card is leaky when the heading/answer overlap
+    reaches LEAKAGE_RATIO, unless has_distinct_numeric_recall() exempts it as
+    a range/category card.
+    """
+    answer_text = prose_body(body)
+    overlap_too_high = leakage_ratio(heading, answer_text) >= LEAKAGE_RATIO
+    # The numeric-recall check only runs once the overlap threshold is met.
+    return overlap_too_high and not has_distinct_numeric_recall(heading, answer_text)
+
+
def normalize_heading(heading: str) -> str:
"""Collapse a heading to a comparison key (lowercase, alnum + single spaces).

The heading is lowercased, every character other than a-z, 0-9 or a space
(punctuation, and non-ASCII letters too) is replaced by a space, runs of
whitespace shrink to one space, and leading/trailing spaces are stripped.
"""
return re.sub(r"\s+", " ", re.sub(r"[^a-z0-9 ]", " ", heading.lower())).strip()
@@ -212,8 +253,7 @@ def main() -> int:
no_id = [c["heading"] for c in cards if not c["has_id"]]
not_prompt = [c["heading"] for c in cards if not is_prompt_form(c["heading"])]
answer_count = sum(1 for c in cards if c["has_answer"])
- leaky = [c["heading"] for c in cards
- if leakage_ratio(c["heading"], c["body"]) >= LEAKAGE_RATIO]
+ leaky = [c["heading"] for c in cards if is_leaky(c["heading"], c["body"])]
dups = find_duplicate_fronts(cards)
overloaded = [c["heading"] for c in cards if back_word_count(c["body"]) > BACK_WORD_LIMIT]
listy = [c["heading"] for c in cards if is_list_back(c["body"])]