#!/usr/bin/env python3
"""Inventory + authoring-quality checks for an org-drill deck source file.

Reports counts and flags two tiers of issue.

Blocking WARNs (exit 1):
- PROPERTIES drawer count not matching card count
- Cards missing :ID: (risks SRS-state loss across rewrites)
- `*** Answer` sub-headers (should be 0 per flashcard-review.org)
- Non-prompt headings (topic-as-heading not yet rewritten)
- #+TITLE missing, or carrying source-tool jargon ("org-drill")
- Answer leakage: a card where most of the question's content words reappear
  in its own answer (Source: citation lines and created-date lines are
  excluded from the overlap, and range/category cards that recall numbers
  are exempted)
- Duplicate / near-duplicate fronts (interference between confusable cards)

Non-blocking NOTEs (exit unaffected):
- Overloaded backs (long answer — candidate to split into atomic cards)
- List-shaped backs (enumeration — candidate to split or use overlapping cloze)
- Binary yes/no prompts (low retrieval effort — candidate to reformulate)

Exits 0 when no blocking warnings are present, 1 otherwise, 2 on bad usage.
Use as a gate before regenerating the Anki deck or running flashcard-sync.

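The fuzzy checks are tuned by the constants below: LEAKAGE_RATIO and
LEAKAGE_MIN_WORDS for answer leakage, BACK_WORD_LIMIT for overloaded backs.
Loosen them if a real deck trips false positives. The duplicate check is an
exact match on the normalized heading and has no tuning constant.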

Usage:
  flashcard-stats.py <file.org>
"""
from __future__ import annotations

import re
import sys
from pathlib import Path

# Level-2 headline whose tag field is exactly `:drill:` (nothing but optional
# whitespace may follow it). Group 1 is the heading text.
# NOTE(review): a headline carrying extra tags (e.g. `:drill:leech:`) does not
# match and is therefore not treated as a card — confirm this is intended.
CARD_RE = re.compile(r"^\*\*\s+(.+?)\s+:drill:\s*$")
# Level-3 `*** Answer` sub-header.
ANSWER_RE = re.compile(r"^\*\*\*\s+Answer\b")
# Opening and closing lines of a PROPERTIES drawer (indentation allowed).
PROP_START_RE = re.compile(r"^\s*:PROPERTIES:\s*$")
PROP_END_RE = re.compile(r"^\s*:END:\s*$")
# `:ID:` property line with a non-empty, whitespace-free value (group 1).
ID_RE = re.compile(r"^\s*:ID:\s+(\S+)\s*$")
# `#+TITLE:` keyword line, any letter case; group 1 is the trimmed title.
TITLE_RE = re.compile(r"^#\+TITLE:\s*(.+?)\s*$", re.IGNORECASE)
# "org-drill", "org drill" or "orgdrill" as a whole word, any letter case.
SOURCE_TOOL_RE = re.compile(r"\borg[-\s]?drill\b", re.IGNORECASE)
# Planning line starting with SCHEDULED:, DEADLINE: or CLOSED:.
PLANNING_RE = re.compile(r"^\s*(SCHEDULED|DEADLINE|CLOSED):\s")
# Line starting with "source:" (any letter case) followed by whitespace.
SOURCE_LINE_RE = re.compile(r"^\s*source:\s", re.IGNORECASE)
# Line starting with "created", optionally wrapped in colons, then whitespace.
# NOTE(review): this also matches ordinary prose that begins with the word
# "Created " — such a line is dropped from the leakage comparison too.
CREATED_LINE_RE = re.compile(r"^\s*:?created:?\s", re.IGNORECASE)
# A digit, then later on the same line a hyphen / en dash / em dash followed
# (after optional whitespace) by another digit, e.g. "5.7–6.4".
RANGE_RE = re.compile(r"\d[^\n]*[-–—]\s*\d")
# A comparison sign followed by a digit, e.g. "< 5.7" or "≥6.5".
THRESHOLD_RE = re.compile(r"[<>≤≥]\s*\d")
# List item: `-`, `+` or `*` bullet, or a number followed by `.` or `)`, then
# whitespace.
BULLET_RE = re.compile(r"^\s*([-+*]|\d+[.)])\s+")
# Heading that opens with an auxiliary verb, which is how is_binary_prompt()
# recognises yes/no questions.
BINARY_LEAD_RE = re.compile(
    r"^\s*(is|are|was|were|does|do|did|can|could|should|would|will|has|have|had)\b",
    re.IGNORECASE,
)

# Headings count as "prompt form" when they contain `?` or open with one of
# the imperative verbs below; directive prompts such as "Spell these out" or
# "Introduce yourself" are valid cards even though they carry no `?`.
IMPERATIVE_VERBS = frozenset((
    "compare",
    "define",
    "describe",
    "explain",
    "give",
    "identify",
    "introduce",
    "list",
    "name",
    "outline",
    "recall",
    "recite",
    "show",
    "spell",
    "state",
    "summarize",
    "tell",
    "walk",
))

# Function words that carry no recall content; they are dropped before a
# question is compared against its answer.
STOPWORDS = frozenset((
    "a", "about", "an", "and", "are", "as", "at",
    "be", "by",
    "did", "do", "does",
    "for",
    "how",
    "in", "is", "it",
    "me",
    "of", "on", "or",
    "tell", "that", "the", "their", "them", "they", "this", "to",
    "was", "were", "what", "when", "where", "which", "who", "whom", "why",
    "with",
    "you", "your",
))

# Tuning knobs for the fuzzy checks.
# - is_leaky() flags a card when leakage_ratio() is >= LEAKAGE_RATIO.
# - leakage_ratio() returns 0.0 for a question with fewer than
#   LEAKAGE_MIN_WORDS content words, so such cards are never flagged.
# - main() lists a card as overloaded when its back has more than
#   BACK_WORD_LIMIT whitespace-separated words.
LEAKAGE_RATIO = 0.8     # share of a question's content words echoed in its answer
LEAKAGE_MIN_WORDS = 3   # ignore very short questions, where overlap is noise
BACK_WORD_LIMIT = 60    # words on a card back before it's flagged as overloaded


def is_prompt_form(heading: str) -> bool:
    """Return True if the heading reads as a question or imperative prompt.

    A heading qualifies when it contains ``?`` anywhere, or when its first
    word (lowercased, with trailing ``:``, ``,`` or ``;`` removed) is one of
    IMPERATIVE_VERBS.

    An empty or whitespace-only heading is not a prompt and returns False.
    Such a heading can reach this function: a headline like ``**   :drill:``
    matches CARD_RE and is stripped down to an empty string.
    """
    if "?" in heading:
        return True
    words = heading.split(None, 1)
    if not words:
        # Nothing to inspect; indexing [0] here would raise IndexError.
        return False
    first_word = words[0].lower().rstrip(":,;")
    return first_word in IMPERATIVE_VERBS


def content_words(text: str) -> set[str]:
    """Return the lowercased content words of *text*.

    A content word is a run of letters and digits at least three characters
    long that is not in STOPWORDS. Tokenizing is Unicode-aware: ``[^\\W_]``
    matches any alphanumeric character, so accented words ("résumé") stay
    whole and words in non-Latin scripts are kept. An ASCII-only pattern would
    split the former into fragments and drop the latter. Pure-ASCII text
    produces the same tokens either way.
    """
    return {
        word
        for word in re.findall(r"[^\W_]+", text.lower())
        if len(word) >= 3 and word not in STOPWORDS
    }


def leakage_ratio(heading: str, body: str) -> float:
    """Share of the question's content words that also occur in the answer.

    The closer the value is to 1.0, the more of the question is simply
    restated by the answer, which lets the card be answered by recognition
    instead of recall. Questions with fewer than LEAKAGE_MIN_WORDS content
    words are too short for the overlap to mean anything and score 0.0.
    """
    question_terms = content_words(heading)
    total = len(question_terms)
    if total < LEAKAGE_MIN_WORDS:
        return 0.0
    echoed = question_terms.intersection(content_words(body))
    return len(echoed) / total


def prose_body(body: str) -> str:
    """Return *body* without its Source: citation and created-date lines.

    Both kinds of line are bookkeeping rather than answer text. A citation's
    URL slug frequently repeats the question's wording and a created date says
    nothing about the answer, so neither may contribute to the leakage
    overlap. The remaining lines are re-joined with newlines.
    """
    kept: list[str] = []
    for line in body.splitlines():
        if SOURCE_LINE_RE.match(line) or CREATED_LINE_RE.match(line):
            continue
        kept.append(line)
    return "\n".join(kept)


def has_distinct_numeric_recall(heading: str, body: str) -> bool:
    """True when the answer holds numeric ranges/thresholds the question lacks.

    Range/category cards (for example "What are the HbA1c ranges across
    normal, prediabetes, and diabetes?") repeat their categories in the
    answer, yet what is actually recalled is the numbers. The question does
    not reveal those, so heavy word overlap on such a card is not leakage.
    """
    def mentions_numbers(text: str) -> bool:
        # Either a "low–high" range or a "< n" style threshold counts.
        return (RANGE_RE.search(text) is not None
                or THRESHOLD_RE.search(text) is not None)

    if mentions_numbers(heading):
        return False
    return mentions_numbers(body)


def is_leaky(heading: str, body: str) -> bool:
    """True if the card gives its answer away in the question.

    Citation and created-date lines are stripped from the answer first, and a
    card whose answer recalls numbers absent from the question (a
    range/category card) is exempt.
    """
    answer_text = prose_body(body)
    echoes_answer = leakage_ratio(heading, answer_text) >= LEAKAGE_RATIO
    if not echoes_answer:
        return False
    if has_distinct_numeric_recall(heading, answer_text):
        return False
    return True


def normalize_heading(heading: str) -> str:
    """Collapse a heading to a comparison key for duplicate detection.

    The key is the lowercased heading with every run of punctuation,
    underscores and whitespace replaced by a single space, and no leading or
    trailing space. Letters and digits of any script are kept (``\\W`` is
    Unicode-aware for ``str`` patterns), so headings that differ only in
    non-ASCII characters, e.g. "Spell 猫" and "Spell 犬", get different keys
    and are not reported as duplicates. Pure-ASCII headings produce the same
    key as an ASCII-only ``[a-z0-9]`` filter would.

    Returns an empty string when the heading has no letters or digits.
    """
    separated = re.sub(r"[\W_]+", " ", heading.lower())
    return " ".join(separated.split())


def is_binary_prompt(heading: str) -> bool:
    """True for yes/no or 'A or B' prompts, which take little retrieval effort.

    A heading is binary when it opens with an auxiliary verb (BINARY_LEAD_RE),
    or when it ends with ``?`` and contains the standalone word "or".
    """
    if BINARY_LEAD_RE.match(heading) is not None:
        return True
    if not heading.rstrip().endswith("?"):
        return False
    return re.search(r"\bor\b", heading, re.IGNORECASE) is not None


def back_word_count(body: str) -> int:
    """Number of whitespace-separated words on a card back."""
    words = body.split()
    return len(words)


def is_list_back(body: str) -> bool:
    """True if the answer body is mostly an org list (an enumeration card).

    Blank lines are ignored. The body counts as list-shaped when it has at
    least two list items and they make up at least half of the non-blank
    lines.
    """
    nonblank = [row for row in body.splitlines() if row.strip()]
    total = len(nonblank)
    if total < 2:
        return False
    bullet_rows = 0
    for row in nonblank:
        if BULLET_RE.match(row):
            bullet_rows += 1
    return bullet_rows >= 2 and 2 * bullet_rows >= total


# Any level-2 headline (`** ...`), tagged or not. Inside a card it marks an
# untagged sibling, i.e. the point where the card's own content ends.
_LEVEL2_HEADING_RE = re.compile(r"^\*\*\s")


def parse_cards(lines: list[str]) -> tuple[list[dict], int]:
    """Parse :drill: cards from org lines.

    Args:
        lines: The file content, one string per line, without line endings.

    Returns:
        ``(cards, prop_count)``. Each card is a dict with ``heading``,
        ``has_id``, ``has_answer`` and ``body``. The body is the answer text
        with PROPERTIES drawers, planning lines and ``*** Answer`` headers
        removed, approximating the rendered back. ``prop_count`` is the
        number of ``:PROPERTIES:`` lines seen from a card headline up to the
        next card or level-1 headline.

    A card's own content ends at the next level-2 headline, whether or not
    that headline is tagged ``:drill:``. Text, ``:ID:`` properties and
    ``*** Answer`` headers that follow an untagged ``** `` sibling belong to
    that sibling, so they do not feed the card's ``body``, ``has_id`` or
    ``has_answer``. Otherwise a card with no ``:ID:`` could look fine because
    the next heading has one. PROPERTIES drawers under such a sibling are
    still counted in ``prop_count``, so the drawer/card mismatch warning
    keeps firing when a card has lost its ``:drill:`` tag.
    """
    cards: list[dict] = []
    prop_count = 0
    i = 0
    n = len(lines)
    while i < n:
        m = CARD_RE.match(lines[i])
        if not m:
            i += 1
            continue
        heading = m.group(1).strip()
        i += 1
        has_id = False
        has_answer = False
        in_drawer = False
        past_card = False  # True once an untagged level-2 sibling was seen
        body_lines: list[str] = []
        while i < n:
            line = lines[i]
            if line.startswith("* ") or CARD_RE.match(line):
                break
            if _LEVEL2_HEADING_RE.match(line):
                # Untagged sibling headline: the card's own content is over.
                past_card = True
                in_drawer = False
            elif PROP_START_RE.match(line):
                prop_count += 1
                in_drawer = True
            elif in_drawer and PROP_END_RE.match(line):
                in_drawer = False
            elif in_drawer:
                # Only an :ID: in the card's own drawer identifies the card.
                if not past_card and ID_RE.match(line):
                    has_id = True
            elif past_card:
                pass  # content of the untagged sibling, not of this card
            elif ANSWER_RE.match(line):
                has_answer = True
            elif PLANNING_RE.match(line):
                pass
            else:
                body_lines.append(line)
            i += 1
        cards.append({
            "heading": heading,
            "has_id": has_id,
            "has_answer": has_answer,
            "body": "\n".join(body_lines).strip(),
        })
    return cards, prop_count


def find_duplicate_fronts(cards: list[dict]) -> list[tuple[str, str]]:
    """Return (first, dup) heading pairs whose normalized keys collide.

    The first heading seen for a key is kept as the reference; every later
    card with the same key is paired with it. Headings that normalize to an
    empty key are skipped.
    """
    first_by_key: dict[str, str] = {}
    pairs: list[tuple[str, str]] = []
    for card in cards:
        front = card["heading"]
        key = normalize_heading(front)
        if not key:
            continue
        if key not in first_by_key:
            first_by_key[key] = front
            continue
        pairs.append((first_by_key[key], front))
    return pairs


def main() -> int:
    """Run every check on the deck named on the command line and report.

    Reads ``sys.argv[1]`` as a UTF-8 org file, prints the inventory followed
    by WARN (blocking) and NOTE (non-blocking) lines to stdout, and prints
    usage and read errors to stderr.

    Returns:
        0 when no blocking warning fired, 1 when at least one did, and 2 when
        the arguments are wrong or the file is missing or cannot be read as
        UTF-8 text.
    """
    if len(sys.argv) != 2:
        print(f"usage: {sys.argv[0]} <file.org>", file=sys.stderr)
        return 2

    path = Path(sys.argv[1]).expanduser().resolve()
    if not path.is_file():
        print(f"error: {path} not found", file=sys.stderr)
        return 2

    # An unreadable or non-UTF-8 file is an input problem, not a deck-quality
    # finding. Report it and return 2 rather than letting a traceback exit
    # with status 1, which callers read as "blocking warnings".
    try:
        lines = path.read_text(encoding="utf-8").splitlines()
    except (OSError, UnicodeDecodeError) as exc:
        print(f"error: cannot read {path}: {exc}", file=sys.stderr)
        return 2

    # Only the first 20 lines are searched for the #+TITLE keyword.
    title: str | None = None
    for line in lines[:20]:
        m = TITLE_RE.match(line)
        if m:
            title = m.group(1).strip()
            break

    cards, prop_count = parse_cards(lines)

    no_id = [c["heading"] for c in cards if not c["has_id"]]
    not_prompt = [c["heading"] for c in cards if not is_prompt_form(c["heading"])]
    answer_count = sum(1 for c in cards if c["has_answer"])
    leaky = [c["heading"] for c in cards if is_leaky(c["heading"], c["body"])]
    dups = find_duplicate_fronts(cards)
    overloaded = [c["heading"] for c in cards if back_word_count(c["body"]) > BACK_WORD_LIMIT]
    listy = [c["heading"] for c in cards if is_list_back(c["body"])]
    binary = [c["heading"] for c in cards if is_binary_prompt(c["heading"])]

    print(f"{path.name} — drill deck stats")
    print()
    print(f"Deck title: {title if title else '(no #+TITLE)'}")
    print(f"Cards: {len(cards)}")
    drawer_status = "match" if prop_count == len(cards) else f"mismatch (expected {len(cards)})"
    print(f"PROPERTIES drawers: {prop_count} ({drawer_status})")
    print(f"*** Answer sub-headers: {answer_count} ({'clean' if answer_count == 0 else 'workflow violation'})")
    print(f"Cards missing :ID:: {len(no_id)}")
    print(f"Cards with non-prompt heading: {len(not_prompt)}")
    print(f"Cards with possible answer leakage: {len(leaky)}")
    print(f"Duplicate / near-duplicate fronts: {len(dups)}")
    print()

    warnings = 0

    def emit_list(items: list[str]) -> None:
        # Show at most five offenders, then a count of the rest.
        for h in items[:5]:
            print(f"      - {h}")
        if len(items) > 5:
            print(f"      - ... and {len(items) - 5} more")

    def warn(msg: str, items: list[str] | None = None) -> None:
        # Blocking finding: counted, so the exit status becomes 1.
        nonlocal warnings
        warnings += 1
        print(f"WARN: {msg}")
        if items:
            emit_list(items)

    def note(msg: str, items: list[str] | None = None) -> None:
        # Non-blocking finding: printed only, exit status unaffected.
        print(f"NOTE: {msg}")
        if items:
            emit_list(items)

    if title is None:
        warn("no #+TITLE: line found; deck name will fall back to the file basename")
    elif SOURCE_TOOL_RE.search(title):
        warn(f"#+TITLE contains source-tool jargon ('{title}'); the deck name shows in Anki — drop 'Org-Drill' for a name that reads well on the consumption side")
    if answer_count:
        warn(f"{answer_count} cards have *** Answer sub-headers (drop per flashcard-review.org)")
    if prop_count != len(cards):
        warn(f"PROPERTIES count {prop_count} does not match card count {len(cards)}")
    if no_id:
        warn(f"{len(no_id)} cards missing :ID:; losing identity risks SRS-state loss across rewrites", no_id)
    if not_prompt:
        warn(f"{len(not_prompt)} cards have non-prompt headings (no '?' and no imperative-verb start); likely topic-as-heading not yet rewritten", not_prompt)
    if leaky:
        # round(), not int(): 0.57 * 100 is 56.99999999999999 in floating
        # point and would otherwise be reported as 56%.
        warn(f"{len(leaky)} cards may leak their answer (question echoes >= {round(LEAKAGE_RATIO * 100)}% of its own answer's key words); reformulate so the answer is recalled, not recognized", leaky)
    if dups:
        warn(f"{len(dups)} duplicate / near-duplicate fronts (interference between confusable cards); disambiguate or merge",
             [f"{a}  ==  {b}" for a, b in dups])

    if overloaded:
        note(f"{len(overloaded)} cards have a long answer (> {BACK_WORD_LIMIT} words); candidates to split into atomic cards", overloaded)
    if listy:
        note(f"{len(listy)} cards have a list-shaped answer; enumeration cards recall poorly — candidates to split or use overlapping cloze", listy)
    if binary:
        note(f"{len(binary)} cards are binary (yes/no or 'A or B'); low retrieval effort — candidates to reformulate open-ended", binary)

    if warnings == 0:
        print("clean (with non-blocking notes above)" if (overloaded or listy or binary) else "clean")
        return 0
    return 1


# Script entry point: the value returned by main() becomes the exit status.
if __name__ == "__main__":
    sys.exit(main())