#!/usr/bin/env python3
"""Inventory + authoring-quality checks for an org-drill deck source file.

Reports counts and flags two tiers of issue.

Blocking WARNs (exit 1):
- PROPERTIES drawer count not matching card count
- Cards missing :ID: (risks SRS-state loss across rewrites)
- `*** Answer` sub-headers (should be 0 per drill-deck-review.org)
- Non-prompt headings (topic-as-heading not yet rewritten)
- #+TITLE missing, or carrying source-tool jargon ("org-drill")
- Answer leakage: a card whose question echoes most of its own answer
- Duplicate / near-duplicate fronts (interference between confusable cards)

Non-blocking NOTEs (exit unaffected):
- Overloaded backs (long answer — candidate to split into atomic cards)
- List-shaped backs (enumeration — candidate to split or use overlapping cloze)
- Binary yes/no prompts (low retrieval effort — candidate to reformulate)

Exits 0 when no blocking warnings are present, 1 otherwise, 2 on bad usage.
Use as a gate before regenerating the Anki deck or running drill-deck-sync.

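The fuzzy checks (leakage, overloaded) are tuned by the LEAKAGE_* and
BACK_WORD_LIMIT constants below; loosen them if a real deck trips false
positives. Duplicate detection has no knob: two fronts match when they are
identical after lowercasing and stripping punctuation.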

Usage:
  drill-deck-stats.py <file.org>
"""
from __future__ import annotations

import re
import sys
from pathlib import Path

# Level-2 heading (`** ...`) ending in the `:drill:` tag; group 1 is the
# heading text without the tag.
# NOTE(review): `:drill:` must be the last thing on the line and be preceded by
# whitespace, so a heading tagged `:drill:leech:` or `:foo:drill:` is not
# recognised as a card — confirm that matches what the deck generator accepts.
CARD_RE = re.compile(r"^\*\*\s+(.+?)\s+:drill:\s*$")
# Level-3 heading whose first word is "Answer".
ANSWER_RE = re.compile(r"^\*\*\*\s+Answer\b")
# Opening and closing lines of a property drawer (leading indent allowed).
PROP_START_RE = re.compile(r"^\s*:PROPERTIES:\s*$")
PROP_END_RE = re.compile(r"^\s*:END:\s*$")
# `:ID: <value>` property line; group 1 is the whitespace-free value.
ID_RE = re.compile(r"^\s*:ID:\s+(\S+)\s*$")
# `#+TITLE:` keyword in any letter case; group 1 is the trimmed title.
TITLE_RE = re.compile(r"^#\+TITLE:\s*(.+?)\s*$", re.IGNORECASE)
# "org-drill", "org drill" or "orgdrill" as a whole word, any letter case.
SOURCE_TOOL_RE = re.compile(r"\borg[-\s]?drill\b", re.IGNORECASE)
# Planning lines (SCHEDULED / DEADLINE / CLOSED), which are not answer text.
PLANNING_RE = re.compile(r"^\s*(SCHEDULED|DEADLINE|CLOSED):\s")
# List item: `-`, `+`, `*`, or a number followed by `.` or `)`, then whitespace.
BULLET_RE = re.compile(r"^\s*([-+*]|\d+[.)])\s+")
# Heading that opens with an auxiliary verb, i.e. reads as a yes/no question.
BINARY_LEAD_RE = re.compile(
    r"^\s*(is|are|was|were|does|do|did|can|could|should|would|will|has|have|had)\b",
    re.IGNORECASE,
)

# Verbs that mark a heading as an imperative prompt. A heading counts as
# "prompt form" when it contains `?` or opens with one of these, so directive
# prompts such as "Spell these out" or "Introduce yourself" pass without a `?`.
IMPERATIVE_VERBS = frozenset(
    "spell describe explain name list give "
    "show tell define compare identify outline "
    "introduce walk state recite recall summarize".split()
)

# Function words that are left out when a question is compared against its
# answer, so that only content-bearing words count towards the overlap.
STOPWORDS = frozenset(
    "the a an is are was were of to in on "
    "for and or with what who whom when where why "
    "how which does do did tell me about their this "
    "that it as at by be your you they them".split()
)

# Tuning knobs for the fuzzy checks.
# - LEAKAGE_RATIO: a card is flagged for leakage when leakage_ratio() is at or
#   above this value.
# - LEAKAGE_MIN_WORDS: questions with fewer content words than this always
#   score 0.0 and are therefore never flagged.
# - BACK_WORD_LIMIT: a back is "overloaded" when its whitespace-separated word
#   count is strictly greater than this.
LEAKAGE_RATIO = 0.8     # share of a question's content words echoed in its answer
LEAKAGE_MIN_WORDS = 3   # ignore very short questions, where overlap is noise
BACK_WORD_LIMIT = 60    # words on a card back before it's flagged as overloaded


def is_prompt_form(heading: str) -> bool:
    """Return True if the heading reads as a question or imperative prompt.

    A heading qualifies when it contains `?` anywhere, or when its first
    whitespace-separated word (lowercased, trailing `:`, `,` or `;` removed)
    is one of IMPERATIVE_VERBS. An empty or whitespace-only heading is not a
    prompt and yields False.
    """
    if "?" in heading:
        return True
    words = heading.split(None, 1)
    if not words:
        # CARD_RE can capture a whitespace-only heading (e.g. `**   :drill:`),
        # which strips to ""; treat it as non-prompt instead of raising
        # IndexError on the missing first word.
        return False
    first_word = words[0].lower().rstrip(":,;")
    return first_word in IMPERATIVE_VERBS


def content_words(text: str) -> set[str]:
    """Return the distinct content-bearing words of *text*.

    The text is lowercased and split into runs of alphanumeric characters;
    tokens shorter than 3 characters and tokens listed in STOPWORDS are
    dropped.

    Tokenization is Unicode-aware: `[^\\W_]` is "any word character except
    underscore", so accented and non-Latin letters stay inside their word
    rather than splitting it into fragments (an ASCII-only class would turn
    "corazón" into "coraz" and "n"). For pure-ASCII text the result is the
    same as matching `[a-z0-9]+`.
    """
    tokens = re.findall(r"[^\W_]+", text.lower())
    return {w for w in tokens if len(w) >= 3 and w not in STOPWORDS}


def leakage_ratio(heading: str, body: str) -> float:
    """Share of the question's content words that also occur in the answer.

    Both texts are reduced with content_words(). The result is the size of
    the overlap divided by the number of content words in the question, so it
    lies between 0.0 and 1.0. Questions with fewer than LEAKAGE_MIN_WORDS
    content words are too short for the overlap to mean anything and always
    score 0.0.
    """
    question_terms = content_words(heading)
    total = len(question_terms)
    if total < LEAKAGE_MIN_WORDS:
        return 0.0
    echoed = question_terms.intersection(content_words(body))
    return len(echoed) / total


def normalize_heading(heading: str) -> str:
    """Collapse a heading to a comparison key.

    The heading is lowercased, every run of non-alphanumeric characters
    (punctuation, underscores, whitespace) becomes a single space, and the
    ends are trimmed. Headings that differ only in case, punctuation or
    spacing therefore share a key.

    Alphanumeric is meant in the Unicode sense: accented and non-Latin
    letters are kept. Stripping them would make fronts that differ only in
    such characters (e.g. "What is 水?" vs "What is 火?") collide and be
    reported as duplicates. For pure-ASCII headings the key is the same as
    keeping only `[a-z0-9]`.
    """
    # `\W` also covers whitespace, so one substitution both replaces
    # punctuation and collapses runs; `_` is listed because `\w` includes it.
    return re.sub(r"[\W_]+", " ", heading.lower()).strip()


def is_binary_prompt(heading: str) -> bool:
    """Return True for prompts that can be answered with little retrieval.

    Two shapes qualify: a heading that opens with an auxiliary verb
    (BINARY_LEAD_RE, the yes/no form), and a heading that ends in `?` and
    contains the standalone word "or" (the 'A or B' form).
    """
    if BINARY_LEAD_RE.match(heading) is not None:
        return True
    if not heading.rstrip().endswith("?"):
        return False
    return re.search(r"\bor\b", heading, re.IGNORECASE) is not None


def back_word_count(body: str) -> int:
    """Number of whitespace-separated words in a card's answer text."""
    return sum(1 for _ in body.split())


def is_list_back(body: str) -> bool:
    """Return True when the answer is predominantly an org list.

    Blank lines are ignored. The body counts as list-shaped when it has at
    least two non-blank lines, at least two of them are list items
    (BULLET_RE), and list items make up half or more of the non-blank lines.
    """
    nonblank = [row for row in body.splitlines() if row.strip()]
    total = len(nonblank)
    if total < 2:
        return False
    bullet_rows = [row for row in nonblank if BULLET_RE.match(row)]
    count = len(bullet_rows)
    return count >= 2 and 2 * count >= total


def parse_cards(lines: list[str]) -> tuple[list[dict], int]:
    """Parse :drill: cards from org lines.

    A card starts at a heading matched by CARD_RE. Scanning for that card
    continues until the next card heading or the next level-1 heading
    (`* ...`). Within that span, the card's own content ends at the first
    level-2 heading (`** ...`) that is not a card: lines below such a sibling
    heading belong to a different subtree, so they are not added to the card's
    body and cannot set its has_id / has_answer flags. Otherwise an `:ID:`
    under an untagged neighbour would hide a missing ID on the card, and the
    neighbour's text would inflate the card's back.

    Returns (cards, prop_count).

    Each card is a dict with:
      heading     heading text without the `:drill:` tag
      has_id      True if a property drawer of the card holds an `:ID:` line
      has_answer  True if the card has a `*** Answer` sub-header
      body        the answer text with PROPERTIES drawers, planning lines and
                  `*** Answer` headers removed, approximating the rendered back

    prop_count is the number of `:PROPERTIES:` lines found between a card
    heading and the next card heading or level-1 heading. Drawers below an
    untagged `**` heading inside that span are still counted, so a card that
    lost its `:drill:` tag keeps showing up as a drawer/card mismatch.
    """
    cards: list[dict] = []
    prop_count = 0
    i = 0
    n = len(lines)
    while i < n:
        m = CARD_RE.match(lines[i])
        if not m:
            # Not a card heading: nothing outside a card span is inspected.
            i += 1
            continue
        heading = m.group(1).strip()
        i += 1
        has_id = False
        has_answer = False
        in_drawer = False
        in_card = True  # False once a sibling `**` heading has been passed
        body_lines: list[str] = []
        while i < n:
            line = lines[i]
            if line.startswith("* ") or CARD_RE.match(line):
                # Next section or next card; leave the line for the outer loop.
                break
            i += 1
            if PROP_START_RE.match(line):
                # Counted for the whole span, including sibling subtrees.
                prop_count += 1
                in_drawer = True
                continue
            if not in_card:
                # Below a sibling heading only drawers are of interest.
                continue
            if line.startswith("** "):
                # Same-level heading without :drill: ends this card's subtree.
                in_card = False
                continue
            if in_drawer:
                if PROP_END_RE.match(line):
                    in_drawer = False
                elif ID_RE.match(line):
                    has_id = True
            elif ANSWER_RE.match(line):
                has_answer = True
            elif not PLANNING_RE.match(line):
                body_lines.append(line)
        cards.append({
            "heading": heading,
            "has_id": has_id,
            "has_answer": has_answer,
            "body": "\n".join(body_lines).strip(),
        })
    return cards, prop_count


def find_duplicate_fronts(cards: list[dict]) -> list[tuple[str, str]]:
    """Pair every repeated front with the first card that used the same key.

    Headings are compared through normalize_heading(). For each card whose
    key was already seen, one (first_heading, this_heading) tuple is emitted,
    in card order. Headings that normalize to an empty key are skipped.
    """
    first_by_key: dict[str, str] = {}
    pairs: list[tuple[str, str]] = []
    for card in cards:
        front = card["heading"]
        key = normalize_heading(front)
        if key == "":
            continue
        earlier = first_by_key.get(key)
        if earlier is None:
            first_by_key[key] = front
        else:
            pairs.append((earlier, front))
    return pairs


def main() -> int:
    """Check the deck named on the command line and print a report.

    Reads the single path argument from sys.argv, parses the file, prints the
    summary counts to stdout, then one WARN per failed blocking check and one
    NOTE per advisory finding.

    Returns the process exit status:
      0  no blocking WARN was emitted
      1  at least one blocking WARN was emitted
      2  bad usage: wrong argument count, the path is not a file, or the file
         cannot be read or is not valid UTF-8
    """
    if len(sys.argv) != 2:
        print(f"usage: {sys.argv[0]} <file.org>", file=sys.stderr)
        return 2

    path = Path(sys.argv[1]).expanduser().resolve()
    if not path.is_file():
        print(f"error: {path} not found", file=sys.stderr)
        return 2

    # An unreadable or non-UTF-8 file must not surface as a traceback: that
    # exits with status 1, which callers interpret as "blocking warnings".
    try:
        text = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        print(f"error: cannot read {path}: {exc}", file=sys.stderr)
        return 2
    lines = text.splitlines()

    # The title keyword is only looked for in the first 20 lines of the file.
    title: str | None = None
    for line in lines[:20]:
        m = TITLE_RE.match(line)
        if m:
            title = m.group(1).strip()
            break

    cards, prop_count = parse_cards(lines)

    # Blocking findings.
    no_id = [c["heading"] for c in cards if not c["has_id"]]
    not_prompt = [c["heading"] for c in cards if not is_prompt_form(c["heading"])]
    answer_count = sum(1 for c in cards if c["has_answer"])
    leaky = [c["heading"] for c in cards
             if leakage_ratio(c["heading"], c["body"]) >= LEAKAGE_RATIO]
    dups = find_duplicate_fronts(cards)
    # Advisory findings.
    overloaded = [c["heading"] for c in cards if back_word_count(c["body"]) > BACK_WORD_LIMIT]
    listy = [c["heading"] for c in cards if is_list_back(c["body"])]
    binary = [c["heading"] for c in cards if is_binary_prompt(c["heading"])]

    print(f"{path.name} — drill deck stats")
    print()
    print(f"Deck title: {title if title else '(no #+TITLE)'}")
    print(f"Cards: {len(cards)}")
    drawer_status = "match" if prop_count == len(cards) else f"mismatch (expected {len(cards)})"
    print(f"PROPERTIES drawers: {prop_count} ({drawer_status})")
    print(f"*** Answer sub-headers: {answer_count} ({'clean' if answer_count == 0 else 'workflow violation'})")
    print(f"Cards missing :ID:: {len(no_id)}")
    print(f"Cards with non-prompt heading: {len(not_prompt)}")
    print(f"Cards with possible answer leakage: {len(leaky)}")
    print(f"Duplicate / near-duplicate fronts: {len(dups)}")
    print()

    warnings = 0

    def emit_list(items: list[str]) -> None:
        # Show at most five offenders, then a count of the remainder.
        for h in items[:5]:
            print(f"      - {h}")
        if len(items) > 5:
            print(f"      - ... and {len(items) - 5} more")

    def warn(msg: str, items: list[str] | None = None) -> None:
        # Blocking finding: counted, so it turns the exit status into 1.
        nonlocal warnings
        warnings += 1
        print(f"WARN: {msg}")
        if items:
            emit_list(items)

    def note(msg: str, items: list[str] | None = None) -> None:
        # Advisory finding: printed only, never affects the exit status.
        print(f"NOTE: {msg}")
        if items:
            emit_list(items)

    if title is None:
        warn("no #+TITLE: line found; deck name will fall back to the file basename")
    elif SOURCE_TOOL_RE.search(title):
        warn(f"#+TITLE contains source-tool jargon ('{title}'); the deck name shows in Anki — drop 'Org-Drill' for a name that reads well on the consumption side")
    if answer_count:
        warn(f"{answer_count} cards have *** Answer sub-headers (drop per drill-deck-review.org)")
    if prop_count != len(cards):
        warn(f"PROPERTIES count {prop_count} does not match card count {len(cards)}")
    if no_id:
        warn(f"{len(no_id)} cards missing :ID:; losing identity risks SRS-state loss across rewrites", no_id)
    if not_prompt:
        warn(f"{len(not_prompt)} cards have non-prompt headings (no '?' and no imperative-verb start); likely topic-as-heading not yet rewritten", not_prompt)
    if leaky:
        warn(f"{len(leaky)} cards may leak their answer (question echoes >= {int(LEAKAGE_RATIO * 100)}% of its own answer's key words); reformulate so the answer is recalled, not recognized", leaky)
    if dups:
        warn(f"{len(dups)} duplicate / near-duplicate fronts (interference between confusable cards); disambiguate or merge",
             [f"{a}  ==  {b}" for a, b in dups])

    if overloaded:
        note(f"{len(overloaded)} cards have a long answer (> {BACK_WORD_LIMIT} words); candidates to split into atomic cards", overloaded)
    if listy:
        note(f"{len(listy)} cards have a list-shaped answer; enumeration cards recall poorly — candidates to split or use overlapping cloze", listy)
    if binary:
        note(f"{len(binary)} cards are binary (yes/no or 'A or B'); low retrieval effort — candidates to reformulate open-ended", binary)

    if warnings == 0:
        print("clean (with non-blocking notes above)" if (overloaded or listy or binary) else "clean")
        return 0
    return 1


# Run the checks only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    sys.exit(main())