#!/usr/bin/env python3
"""Inventory + authoring-quality checks for an org-drill deck source file.

Reports counts and flags two tiers of issue.

Blocking WARNs (exit 1):
- PROPERTIES drawer count not matching card count
- Cards missing :ID: (risks SRS-state loss across rewrites)
- `*** Answer` sub-headers (should be 0 per flashcard-review.org)
- Non-prompt headings (topic-as-heading not yet rewritten)
- #+TITLE missing, or carrying source-tool jargon ("org-drill")
- Answer leakage: a card whose question echoes most of its own answer
  (Source: citation lines and created-date lines are excluded from the
  overlap, and range/category cards that recall numbers are exempted)
- Duplicate / near-duplicate fronts (interference between confusable cards)

Non-blocking NOTEs (exit unaffected):
- Overloaded backs (long answer — candidate to split into atomic cards)
- List-shaped backs (enumeration — candidate to split or use overlapping cloze)
- Binary yes/no prompts (low retrieval effort — candidate to reformulate)

Exits 0 when no blocking warnings are present, 1 otherwise, 2 on bad usage
or when the deck file cannot be read.
Use as a gate before regenerating the Anki deck or running flashcard-sync.

The fuzzy checks (leakage, duplicate, overloaded) are tuned by the LEAKAGE_*
and BACK_WORD_LIMIT constants below; loosen them if a real deck trips false
positives.

Usage:
    flashcard-stats.py <deck.org>
"""

from __future__ import annotations

import re
import sys
from pathlib import Path

# Level-2 heading tagged :drill: -- group 1 is the card front.
CARD_RE = re.compile(r"^\*\*\s+(.+?)\s+:drill:\s*$")
ANSWER_RE = re.compile(r"^\*\*\*\s+Answer\b")
PROP_START_RE = re.compile(r"^\s*:PROPERTIES:\s*$")
PROP_END_RE = re.compile(r"^\s*:END:\s*$")
ID_RE = re.compile(r"^\s*:ID:\s+(\S+)\s*$")
TITLE_RE = re.compile(r"^#\+TITLE:\s*(.+?)\s*$", re.IGNORECASE)
SOURCE_TOOL_RE = re.compile(r"\borg[-\s]?drill\b", re.IGNORECASE)
PLANNING_RE = re.compile(r"^\s*(SCHEDULED|DEADLINE|CLOSED):\s")
SOURCE_LINE_RE = re.compile(r"^\s*source:\s", re.IGNORECASE)
CREATED_LINE_RE = re.compile(r"^\s*:?created:?\s", re.IGNORECASE)
RANGE_RE = re.compile(r"\d[^\n]*[-–—]\s*\d")
THRESHOLD_RE = re.compile(r"[<>≤≥]\s*\d")
BULLET_RE = re.compile(r"^\s*([-+*]|\d+[.)])\s+")
BINARY_LEAD_RE = re.compile(
    r"^\s*(is|are|was|were|does|do|did|can|could|should|would|will|has|have|had)\b",
    re.IGNORECASE,
)

# Unicode-aware tokenizers. `[^\W_]` is "any alphanumeric character", so
# accented and non-Latin words survive tokenization instead of being deleted
# (which used to make distinct non-ASCII fronts compare as duplicates).
_WORD_RE = re.compile(r"[^\W_]+")
_SEPARATOR_RE = re.compile(r"[\W_]+")

# A heading qualifies as "prompt form" if it contains `?` or starts with one of
# these imperative verbs (directive prompts like "Spell these out" and
# "Introduce yourself" are valid even without `?`).
IMPERATIVE_VERBS = frozenset({
    "spell", "describe", "explain", "name", "list", "give", "show", "tell",
    "define", "compare", "identify", "outline", "introduce", "walk", "state",
    "recite", "recall", "summarize",
})

# Function words ignored when comparing a question against its answer.
STOPWORDS = frozenset({
    "the", "a", "an", "is", "are", "was", "were", "of", "to", "in", "on",
    "for", "and", "or", "with", "what", "who", "whom", "when", "where", "why",
    "how", "which", "does", "do", "did", "tell", "me", "about", "their",
    "this", "that", "it", "as", "at", "by", "be", "your", "you", "they",
    "them",
})

# Tuning knobs for the fuzzy checks.
LEAKAGE_RATIO = 0.8  # share of a question's content words echoed in its answer
LEAKAGE_MIN_WORDS = 3  # ignore very short questions, where overlap is noise
BACK_WORD_LIMIT = 60  # words on a card back before it's flagged as overloaded


def is_prompt_form(heading: str) -> bool:
    """True if the heading reads as a question or imperative prompt.

    An empty or whitespace-only heading is not a prompt; it returns False
    rather than raising, so one malformed card cannot crash the whole run.
    """
    if "?" in heading:
        return True
    words = heading.split(None, 1)
    if not words:
        return False
    # Tolerate a directive followed by punctuation, e.g. "Describe: ...".
    first_word = words[0].lower().rstrip(":,;")
    return first_word in IMPERATIVE_VERBS


def content_words(text: str) -> set[str]:
    """Lowercased alphanumeric tokens of length >= 3, minus stopwords.

    Tokens are runs of Unicode letters/digits, so non-ASCII words are kept
    whole instead of being split at every accented character.
    """
    return {
        w
        for w in _WORD_RE.findall(text.lower())
        if len(w) >= 3 and w not in STOPWORDS
    }


def leakage_ratio(heading: str, body: str) -> float:
    """Fraction of the question's content words that reappear in the answer.

    A high ratio means the answer is largely restated in the question, so the
    card can be answered by recognition rather than recall. Returns 0.0 for a
    question with fewer than LEAKAGE_MIN_WORDS content words, where overlap is
    just noise.
    """
    hw = content_words(heading)
    if len(hw) < LEAKAGE_MIN_WORDS:
        return 0.0
    return len(hw & content_words(body)) / len(hw)


def prose_body(body: str) -> str:
    """Body with Source: citation and created-date lines removed.

    Those lines are metadata, not the answer. A Source line's URL slug often
    repeats the question's words, and a created date is bookkeeping — neither
    should count toward answer-leakage overlap.
    """
    return "\n".join(
        ln
        for ln in body.splitlines()
        if not SOURCE_LINE_RE.match(ln) and not CREATED_LINE_RE.match(ln)
    )


def has_distinct_numeric_recall(heading: str, body: str) -> bool:
    """True if the answer carries numeric ranges/thresholds the question lacks.

    A range/category card ("What are the HbA1c ranges across normal,
    prediabetes, and diabetes?") echoes its categories in the answer, but the
    recalled content is the numbers, which the question doesn't give away — so
    high word overlap isn't leakage.
    """
    body_nums = bool(RANGE_RE.search(body) or THRESHOLD_RE.search(body))
    head_nums = bool(RANGE_RE.search(heading) or THRESHOLD_RE.search(heading))
    return body_nums and not head_nums


def is_leaky(heading: str, body: str) -> bool:
    """True if a card leaks its answer, after excluding citation lines and
    numeric-recall (range/category) cards."""
    prose = prose_body(body)
    if leakage_ratio(heading, prose) < LEAKAGE_RATIO:
        return False
    return not has_distinct_numeric_recall(heading, prose)


def normalize_heading(heading: str) -> str:
    """Collapse a heading to a comparison key (lowercase, alnum + single spaces).

    Every run of punctuation/whitespace becomes one space. Letters and digits
    of any script are kept, so fronts that differ only in a non-ASCII word do
    not collapse to the same key.
    """
    return _SEPARATOR_RE.sub(" ", heading.lower()).strip()


def is_binary_prompt(heading: str) -> bool:
    """True for yes/no or 'A or B' prompts, which need little retrieval effort."""
    if BINARY_LEAD_RE.match(heading):
        return True
    has_or = bool(re.search(r"\bor\b", heading, re.IGNORECASE))
    return has_or and heading.rstrip().endswith("?")


def back_word_count(body: str) -> int:
    """Number of whitespace-separated words on the card back."""
    return len(body.split())


def is_list_back(body: str) -> bool:
    """True if the answer body is mostly an org list (an enumeration card)."""
    lines = [ln for ln in body.splitlines() if ln.strip()]
    if len(lines) < 2:
        return False
    bullets = sum(1 for ln in lines if BULLET_RE.match(ln))
    # "Mostly" = at least two bullets making up at least half the lines.
    return bullets >= 2 and bullets * 2 >= len(lines)


def parse_cards(lines: list[str]) -> tuple[list[dict], int]:
    """Parse :drill: cards from org lines. Returns (cards, prop_count).

    Each card is a dict with heading, has_id, has_answer, and body (the answer
    text with PROPERTIES drawers, planning lines, and `*** Answer` headers
    removed, approximating the rendered back). prop_count is the number of
    :PROPERTIES: lines seen inside card regions.
    """
    cards: list[dict] = []
    prop_count = 0
    i = 0
    n = len(lines)
    while i < n:
        m = CARD_RE.match(lines[i])
        if not m:
            i += 1
            continue
        heading = m.group(1).strip()
        i += 1
        has_id = False
        has_answer = False
        in_drawer = False
        body_lines: list[str] = []
        while i < n:
            line = lines[i]
            # A card runs until the next top-level heading or the next card.
            if line.startswith("* ") or CARD_RE.match(line):
                break
            if PROP_START_RE.match(line):
                prop_count += 1
                in_drawer = True
            elif in_drawer and PROP_END_RE.match(line):
                in_drawer = False
            elif in_drawer:
                # Drawer contents never reach the body; only :ID: matters.
                if ID_RE.match(line):
                    has_id = True
            elif ANSWER_RE.match(line):
                has_answer = True
            elif PLANNING_RE.match(line):
                pass  # SCHEDULED/DEADLINE/CLOSED lines are not answer text
            else:
                body_lines.append(line)
            i += 1
        cards.append({
            "heading": heading,
            "has_id": has_id,
            "has_answer": has_answer,
            "body": "\n".join(body_lines).strip(),
        })
    return cards, prop_count


def find_duplicate_fronts(cards: list[dict]) -> list[tuple[str, str]]:
    """Return (first, dup) heading pairs that normalize to the same key."""
    seen: dict[str, str] = {}
    dups: list[tuple[str, str]] = []
    for c in cards:
        key = normalize_heading(c["heading"])
        if not key:
            # Nothing comparable in this heading; skip rather than pair up
            # every such card with the first one.
            continue
        if key in seen:
            dups.append((seen[key], c["heading"]))
        else:
            seen[key] = c["heading"]
    return dups


def main() -> int:
    """Run all checks on the deck named in sys.argv[1] and print a report.

    Returns the process exit code: 0 when no blocking warning fired, 1 when at
    least one did, 2 on bad usage or when the file is missing or unreadable.
    """
    if len(sys.argv) != 2:
        print(f"usage: {sys.argv[0]} <deck.org>", file=sys.stderr)
        return 2

    path = Path(sys.argv[1]).expanduser().resolve()
    if not path.is_file():
        print(f"error: {path} not found", file=sys.stderr)
        return 2

    try:
        # utf-8-sig drops a leading BOM, which would otherwise stop a
        # first-line #+TITLE: from matching.
        text = path.read_text(encoding="utf-8-sig")
    except (OSError, UnicodeDecodeError) as exc:
        print(f"error: cannot read {path}: {exc}", file=sys.stderr)
        return 2
    lines = text.splitlines()

    # Only the file header is searched for the title.
    title: str | None = None
    for line in lines[:20]:
        m = TITLE_RE.match(line)
        if m:
            title = m.group(1).strip()
            break

    cards, prop_count = parse_cards(lines)
    no_id = [c["heading"] for c in cards if not c["has_id"]]
    not_prompt = [c["heading"] for c in cards if not is_prompt_form(c["heading"])]
    answer_count = sum(1 for c in cards if c["has_answer"])
    leaky = [c["heading"] for c in cards if is_leaky(c["heading"], c["body"])]
    dups = find_duplicate_fronts(cards)
    overloaded = [
        c["heading"] for c in cards if back_word_count(c["body"]) > BACK_WORD_LIMIT
    ]
    listy = [c["heading"] for c in cards if is_list_back(c["body"])]
    binary = [c["heading"] for c in cards if is_binary_prompt(c["heading"])]

    print(f"{path.name} — drill deck stats")
    print()
    print(f"Deck title: {title if title else '(no #+TITLE)'}")
    print(f"Cards: {len(cards)}")
    drawer_status = (
        "match" if prop_count == len(cards) else f"mismatch (expected {len(cards)})"
    )
    print(f"PROPERTIES drawers: {prop_count} ({drawer_status})")
    print(
        f"*** Answer sub-headers: {answer_count} "
        f"({'clean' if answer_count == 0 else 'workflow violation'})"
    )
    print(f"Cards missing :ID:: {len(no_id)}")
    print(f"Cards with non-prompt heading: {len(not_prompt)}")
    print(f"Cards with possible answer leakage: {len(leaky)}")
    print(f"Duplicate / near-duplicate fronts: {len(dups)}")
    print()

    warnings = 0

    def emit_list(items: list[str]) -> None:
        # Show at most five offenders so the report stays scannable.
        for h in items[:5]:
            print(f" - {h}")
        if len(items) > 5:
            print(f" - ... and {len(items) - 5} more")

    def warn(msg: str, items: list[str] | None = None) -> None:
        # Blocking: every call bumps the counter that decides the exit code.
        nonlocal warnings
        warnings += 1
        print(f"WARN: {msg}")
        if items:
            emit_list(items)

    def note(msg: str, items: list[str] | None = None) -> None:
        # Non-blocking: printed but never affects the exit code.
        print(f"NOTE: {msg}")
        if items:
            emit_list(items)

    if title is None:
        warn("no #+TITLE: line found; deck name will fall back to the file basename")
    elif SOURCE_TOOL_RE.search(title):
        warn(
            f"#+TITLE contains source-tool jargon ('{title}'); "
            "the deck name shows in Anki — drop 'Org-Drill' for a name "
            "that reads well on the consumption side"
        )
    if answer_count:
        warn(
            f"{answer_count} cards have *** Answer sub-headers "
            "(drop per flashcard-review.org)"
        )
    if prop_count != len(cards):
        warn(f"PROPERTIES count {prop_count} does not match card count {len(cards)}")
    if no_id:
        warn(
            f"{len(no_id)} cards missing :ID:; "
            "losing identity risks SRS-state loss across rewrites",
            no_id,
        )
    if not_prompt:
        warn(
            f"{len(not_prompt)} cards have non-prompt headings "
            "(no '?' and no imperative-verb start); "
            "likely topic-as-heading not yet rewritten",
            not_prompt,
        )
    if leaky:
        warn(
            f"{len(leaky)} cards may leak their answer "
            f"(question echoes >= {int(LEAKAGE_RATIO * 100)}% of its own "
            "answer's key words); "
            "reformulate so the answer is recalled, not recognized",
            leaky,
        )
    if dups:
        warn(
            f"{len(dups)} duplicate / near-duplicate fronts "
            "(interference between confusable cards); disambiguate or merge",
            [f"{a} == {b}" for a, b in dups],
        )

    if overloaded:
        note(
            f"{len(overloaded)} cards have a long answer "
            f"(> {BACK_WORD_LIMIT} words); candidates to split into atomic cards",
            overloaded,
        )
    if listy:
        note(
            f"{len(listy)} cards have a list-shaped answer; "
            "enumeration cards recall poorly — "
            "candidates to split or use overlapping cloze",
            listy,
        )
    if binary:
        note(
            f"{len(binary)} cards are binary (yes/no or 'A or B'); "
            "low retrieval effort — candidates to reformulate open-ended",
            binary,
        )

    if warnings == 0:
        print(
            "clean (with non-blocking notes above)"
            if (overloaded or listy or binary)
            else "clean"
        )
        return 0
    return 1


if __name__ == "__main__":
    raise SystemExit(main())