diff options
Diffstat (limited to '.ai/scripts/drill-deck-stats.py')
| -rwxr-xr-x | .ai/scripts/drill-deck-stats.py | 327 |
1 files changed, 0 insertions, 327 deletions
#!/usr/bin/env python3
"""Inventory + authoring-quality checks for an org-drill deck source file.

Reports counts and flags two tiers of issue.

Blocking WARNs (exit 1):
- PROPERTIES drawer count not matching card count
- Cards missing :ID: (risks SRS-state loss across rewrites)
- `*** Answer` sub-headers (should be 0 per drill-deck-review.org)
- Non-prompt headings (topic-as-heading not yet rewritten)
- #+TITLE missing, or carrying source-tool jargon ("org-drill")
- Answer leakage: a card whose question's key words mostly reappear in its
  own answer (Source: citation lines and created-date lines are excluded from
  the overlap, and range/category cards that recall numbers are exempted)
- Duplicate / near-duplicate fronts (interference between confusable cards)

Non-blocking NOTEs (exit unaffected):
- Overloaded backs (long answer — candidate to split into atomic cards)
- List-shaped backs (enumeration — candidate to split or use overlapping cloze)
- Binary yes/no prompts (low retrieval effort — candidate to reformulate)

Exits 0 when no blocking warnings are present, 1 otherwise, 2 on bad usage
or when the file cannot be read.
Use as a gate before regenerating the Anki deck or running drill-deck-sync.

The fuzzy checks (leakage, duplicate, overloaded) are tuned by the LEAKAGE_*
and BACK_WORD_LIMIT constants below; loosen them if a real deck trips false
positives.

Usage:
    drill-deck-stats.py <file.org>
"""
from __future__ import annotations

import re
import sys
from pathlib import Path

CARD_RE = re.compile(r"^\*\*\s+(.+?)\s+:drill:\s*$")
ANSWER_RE = re.compile(r"^\*\*\*\s+Answer\b")
PROP_START_RE = re.compile(r"^\s*:PROPERTIES:\s*$")
PROP_END_RE = re.compile(r"^\s*:END:\s*$")
ID_RE = re.compile(r"^\s*:ID:\s+(\S+)\s*$")
TITLE_RE = re.compile(r"^#\+TITLE:\s*(.+?)\s*$", re.IGNORECASE)
SOURCE_TOOL_RE = re.compile(r"\borg[-\s]?drill\b", re.IGNORECASE)
PLANNING_RE = re.compile(r"^\s*(SCHEDULED|DEADLINE|CLOSED):\s")
SOURCE_LINE_RE = re.compile(r"^\s*source:\s", re.IGNORECASE)
CREATED_LINE_RE = re.compile(r"^\s*:?created:?\s", re.IGNORECASE)
RANGE_RE = re.compile(r"\d[^\n]*[-–—]\s*\d")
THRESHOLD_RE = re.compile(r"[<>≤≥]\s*\d")
BULLET_RE = re.compile(r"^\s*([-+*]|\d+[.)])\s+")
BINARY_LEAD_RE = re.compile(
    r"^\s*(is|are|was|were|does|do|did|can|could|should|would|will|has|have|had)\b",
    re.IGNORECASE,
)
# Stand-alone "or", used to spot "A or B?" prompts.
OR_RE = re.compile(r"\bor\b", re.IGNORECASE)

# A heading qualifies as "prompt form" if it contains `?` or starts with one of
# these imperative verbs (directive prompts like "Spell these out" and
# "Introduce yourself" are valid even without `?`).
IMPERATIVE_VERBS = frozenset({
    "spell", "describe", "explain", "name", "list", "give",
    "show", "tell", "define", "compare", "identify", "outline",
    "introduce", "walk", "state", "recite", "recall", "summarize",
})

# Function words ignored when comparing a question against its answer.
STOPWORDS = frozenset({
    "the", "a", "an", "is", "are", "was", "were", "of", "to", "in", "on",
    "for", "and", "or", "with", "what", "who", "whom", "when", "where", "why",
    "how", "which", "does", "do", "did", "tell", "me", "about", "their", "this",
    "that", "it", "as", "at", "by", "be", "your", "you", "they", "them",
})

# Tuning knobs for the fuzzy checks.
LEAKAGE_RATIO = 0.8  # share of a question's content words echoed in its answer
LEAKAGE_MIN_WORDS = 3  # ignore very short questions, where overlap is noise
BACK_WORD_LIMIT = 60  # words on a card back before it's flagged as overloaded

# How many offending headings are listed under each WARN/NOTE.
MAX_LISTED = 5


def is_prompt_form(heading: str) -> bool:
    """True if the heading reads as a question or imperative prompt.

    An empty or whitespace-only heading is not a prompt (and must not crash:
    CARD_RE can yield one for a line like ``**   :drill:``).
    """
    if "?" in heading:
        return True
    words = heading.split(None, 1)
    if not words:
        return False
    first_word = words[0].lower().rstrip(":,;")
    return first_word in IMPERATIVE_VERBS


def content_words(text: str) -> set[str]:
    """Lowercased alphanumeric tokens of length >= 3, minus stopwords."""
    return {w for w in re.findall(r"[a-z0-9]+", text.lower())
            if len(w) >= 3 and w not in STOPWORDS}


def leakage_ratio(heading: str, body: str) -> float:
    """Fraction of the question's content words that reappear in the answer.

    A high ratio means the answer is largely restated in the question, so the
    card can be answered by recognition rather than recall. Returns 0.0 for a
    question with fewer than LEAKAGE_MIN_WORDS content words, where overlap is
    just noise.
    """
    hw = content_words(heading)
    if len(hw) < LEAKAGE_MIN_WORDS:
        return 0.0
    return len(hw & content_words(body)) / len(hw)


def prose_body(body: str) -> str:
    """Body with Source: citation and created-date lines removed.

    Those lines are metadata, not the answer. A Source line's URL slug often
    repeats the question's words, and a created date is bookkeeping — neither
    should count toward answer-leakage overlap.
    """
    return "\n".join(
        ln for ln in body.splitlines()
        if not SOURCE_LINE_RE.match(ln) and not CREATED_LINE_RE.match(ln)
    )


def has_distinct_numeric_recall(heading: str, body: str) -> bool:
    """True if the answer carries numeric ranges/thresholds the question lacks.

    A range/category card ("What are the HbA1c ranges across normal,
    prediabetes, and diabetes?") echoes its categories in the answer, but the
    recalled content is the numbers, which the question doesn't give away — so
    high word overlap isn't leakage.
    """
    body_nums = bool(RANGE_RE.search(body) or THRESHOLD_RE.search(body))
    head_nums = bool(RANGE_RE.search(heading) or THRESHOLD_RE.search(heading))
    return body_nums and not head_nums


def is_leaky(heading: str, body: str) -> bool:
    """True if a card leaks its answer, after excluding citation lines and
    numeric-recall (range/category) cards."""
    prose = prose_body(body)
    if leakage_ratio(heading, prose) < LEAKAGE_RATIO:
        return False
    return not has_distinct_numeric_recall(heading, prose)


def normalize_heading(heading: str) -> str:
    """Collapse a heading to a comparison key (lowercase, alnum + single spaces)."""
    return re.sub(r"\s+", " ", re.sub(r"[^a-z0-9 ]", " ", heading.lower())).strip()


def is_binary_prompt(heading: str) -> bool:
    """True for yes/no or 'A or B' prompts, which need little retrieval effort."""
    if BINARY_LEAD_RE.match(heading):
        return True
    return bool(OR_RE.search(heading)) and heading.rstrip().endswith("?")


def back_word_count(body: str) -> int:
    """Number of whitespace-separated words on the card back."""
    return len(body.split())


def is_list_back(body: str) -> bool:
    """True if the answer body is mostly an org list (an enumeration card)."""
    lines = [ln for ln in body.splitlines() if ln.strip()]
    if len(lines) < 2:
        return False
    bullets = sum(1 for ln in lines if BULLET_RE.match(ln))
    # At least two bullets, and bullets make up half or more of the lines.
    return bullets >= 2 and bullets * 2 >= len(lines)


def parse_cards(lines: list[str]) -> tuple[list[dict], int]:
    """Parse :drill: cards from org lines.

    Returns (cards, prop_count). Each card is a dict with heading, has_id,
    has_answer, and body (the answer text with PROPERTIES drawers, planning
    lines, and `*** Answer` headers removed, approximating the rendered back).
    prop_count is the number of PROPERTIES drawers opened inside card regions.
    """
    cards: list[dict] = []
    prop_count = 0
    i = 0
    n = len(lines)
    while i < n:
        m = CARD_RE.match(lines[i])
        if not m:
            i += 1
            continue
        heading = m.group(1).strip()
        i += 1
        has_id = False
        has_answer = False
        in_drawer = False
        body_lines: list[str] = []
        # Consume the card's region: everything up to the next top-level
        # heading or the next :drill: card. The terminating line is left for
        # the outer loop to re-examine.
        while i < n:
            line = lines[i]
            if line.startswith("* ") or CARD_RE.match(line):
                break
            if PROP_START_RE.match(line):
                prop_count += 1
                in_drawer = True
            elif in_drawer and PROP_END_RE.match(line):
                in_drawer = False
            elif in_drawer:
                # Drawer contents never reach the body; only :ID: matters.
                if ID_RE.match(line):
                    has_id = True
            elif ANSWER_RE.match(line):
                has_answer = True
            elif PLANNING_RE.match(line):
                pass  # SCHEDULED/DEADLINE/CLOSED lines are not answer text
            else:
                body_lines.append(line)
            i += 1
        cards.append({
            "heading": heading,
            "has_id": has_id,
            "has_answer": has_answer,
            "body": "\n".join(body_lines).strip(),
        })
    return cards, prop_count


def find_duplicate_fronts(cards: list[dict]) -> list[tuple[str, str]]:
    """Return (first, dup) heading pairs that normalize to the same key."""
    seen: dict[str, str] = {}
    dups: list[tuple[str, str]] = []
    for c in cards:
        key = normalize_heading(c["heading"])
        if not key:
            continue
        if key in seen:
            dups.append((seen[key], c["heading"]))
        else:
            seen[key] = c["heading"]
    return dups


def main(argv: list[str] | None = None) -> int:
    """Run the checks on one org file and print the report.

    argv defaults to sys.argv and must be ``[program, file.org]``.

    Returns the process exit status: 0 when no blocking warnings were
    emitted, 1 when at least one was, 2 on bad usage or an unreadable file.
    """
    args = sys.argv if argv is None else argv
    if len(args) != 2:
        prog = args[0] if args else "drill-deck-stats.py"
        print(f"usage: {prog} <file.org>", file=sys.stderr)
        return 2

    path = Path(args[1]).expanduser().resolve()
    if not path.is_file():
        print(f"error: {path} not found", file=sys.stderr)
        return 2

    # A read/decode failure must not surface as a traceback: that exits with
    # status 1, which callers would misread as "blocking warnings found".
    try:
        text = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        print(f"error: cannot read {path}: {exc}", file=sys.stderr)
        return 2
    lines = text.splitlines()

    # Only the first 20 lines are searched for the #+TITLE keyword.
    title: str | None = None
    for line in lines[:20]:
        m = TITLE_RE.match(line)
        if m:
            title = m.group(1).strip()
            break

    cards, prop_count = parse_cards(lines)

    no_id = [c["heading"] for c in cards if not c["has_id"]]
    not_prompt = [c["heading"] for c in cards if not is_prompt_form(c["heading"])]
    answer_count = sum(1 for c in cards if c["has_answer"])
    leaky = [c["heading"] for c in cards if is_leaky(c["heading"], c["body"])]
    dups = find_duplicate_fronts(cards)
    overloaded = [c["heading"] for c in cards if back_word_count(c["body"]) > BACK_WORD_LIMIT]
    listy = [c["heading"] for c in cards if is_list_back(c["body"])]
    binary = [c["heading"] for c in cards if is_binary_prompt(c["heading"])]

    print(f"{path.name} — drill deck stats")
    print()
    print(f"Deck title: {title if title else '(no #+TITLE)'}")
    print(f"Cards: {len(cards)}")
    drawer_status = "match" if prop_count == len(cards) else f"mismatch (expected {len(cards)})"
    print(f"PROPERTIES drawers: {prop_count} ({drawer_status})")
    print(f"*** Answer sub-headers: {answer_count} ({'clean' if answer_count == 0 else 'workflow violation'})")
    print(f"Cards missing :ID:: {len(no_id)}")
    print(f"Cards with non-prompt heading: {len(not_prompt)}")
    print(f"Cards with possible answer leakage: {len(leaky)}")
    print(f"Duplicate / near-duplicate fronts: {len(dups)}")
    print()

    warnings = 0

    def emit_list(items: list[str]) -> None:
        # Cap the listing so one systemic problem doesn't flood the report.
        for h in items[:MAX_LISTED]:
            print(f" - {h}")
        if len(items) > MAX_LISTED:
            print(f" - ... and {len(items) - MAX_LISTED} more")

    def warn(msg: str, items: list[str] | None = None) -> None:
        # Blocking: every call bumps the counter that drives exit status 1.
        nonlocal warnings
        warnings += 1
        print(f"WARN: {msg}")
        if items:
            emit_list(items)

    def note(msg: str, items: list[str] | None = None) -> None:
        # Non-blocking: printed but never affects the exit status.
        print(f"NOTE: {msg}")
        if items:
            emit_list(items)

    if title is None:
        warn("no #+TITLE: line found; deck name will fall back to the file basename")
    elif SOURCE_TOOL_RE.search(title):
        warn(f"#+TITLE contains source-tool jargon ('{title}'); the deck name shows in Anki — drop 'Org-Drill' for a name that reads well on the consumption side")
    if answer_count:
        warn(f"{answer_count} cards have *** Answer sub-headers (drop per drill-deck-review.org)")
    if prop_count != len(cards):
        warn(f"PROPERTIES count {prop_count} does not match card count {len(cards)}")
    if no_id:
        warn(f"{len(no_id)} cards missing :ID:; losing identity risks SRS-state loss across rewrites", no_id)
    if not_prompt:
        warn(f"{len(not_prompt)} cards have non-prompt headings (no '?' and no imperative-verb start); likely topic-as-heading not yet rewritten", not_prompt)
    if leaky:
        # The ratio is measured over the QUESTION's key words (see
        # leakage_ratio), so the message says exactly that.
        warn(f"{len(leaky)} cards may leak their answer (>= {LEAKAGE_RATIO:.0%} of the question's key words reappear in its answer); reformulate so the answer is recalled, not recognized", leaky)
    if dups:
        warn(f"{len(dups)} duplicate / near-duplicate fronts (interference between confusable cards); disambiguate or merge",
             [f"{a} == {b}" for a, b in dups])

    if overloaded:
        note(f"{len(overloaded)} cards have a long answer (> {BACK_WORD_LIMIT} words); candidates to split into atomic cards", overloaded)
    if listy:
        note(f"{len(listy)} cards have a list-shaped answer; enumeration cards recall poorly — candidates to split or use overlapping cloze", listy)
    if binary:
        note(f"{len(binary)} cards are binary (yes/no or 'A or B'); low retrieval effort — candidates to reformulate open-ended", binary)

    if warnings == 0:
        print("clean (with non-blocking notes above)" if (overloaded or listy or binary) else "clean")
        return 0
    return 1


if __name__ == "__main__":
    raise SystemExit(main())
