diff options
Diffstat (limited to '.ai/scripts/drill-deck-stats.py')
| -rwxr-xr-x | .ai/scripts/drill-deck-stats.py | 44 |
1 files changed, 42 insertions, 2 deletions
diff --git a/.ai/scripts/drill-deck-stats.py b/.ai/scripts/drill-deck-stats.py
index d0707e2..04c3468 100755
--- a/.ai/scripts/drill-deck-stats.py
+++ b/.ai/scripts/drill-deck-stats.py
@@ -10,6 +10,8 @@ Blocking WARNs (exit 1):
   - Non-prompt headings (topic-as-heading not yet rewritten)
   - #+TITLE missing, or carrying source-tool jargon ("org-drill")
   - Answer leakage: a card whose question echoes most of its own answer
+    (Source: citation lines and created-date lines are excluded from the
+    overlap, and range/category cards that recall numbers are exempted)
   - Duplicate / near-duplicate fronts (interference between confusable cards)
 
 Non-blocking NOTEs (exit unaffected):
@@ -41,6 +43,10 @@ ID_RE = re.compile(r"^\s*:ID:\s+(\S+)\s*$")
 TITLE_RE = re.compile(r"^#\+TITLE:\s*(.+?)\s*$", re.IGNORECASE)
 SOURCE_TOOL_RE = re.compile(r"\borg[-\s]?drill\b", re.IGNORECASE)
 PLANNING_RE = re.compile(r"^\s*(SCHEDULED|DEADLINE|CLOSED):\s")
+SOURCE_LINE_RE = re.compile(r"^\s*source:\s", re.IGNORECASE)
+CREATED_LINE_RE = re.compile(r"^\s*:?created:?\s", re.IGNORECASE)
+RANGE_RE = re.compile(r"\d[^\n]*[-–—]\s*\d")
+THRESHOLD_RE = re.compile(r"[<>≤≥]\s*\d")
 BULLET_RE = re.compile(r"^\s*([-+*]|\d+[.)])\s+")
 BINARY_LEAD_RE = re.compile(
     r"^\s*(is|are|was|were|does|do|did|can|could|should|would|will|has|have|had)\b",
@@ -98,6 +104,41 @@ def leakage_ratio(heading: str, body: str) -> float:
     return len(hw & content_words(body)) / len(hw)
 
 
+def prose_body(body: str) -> str:
+    """Body with Source: citation and created-date lines removed.
+
+    Those lines are metadata, not the answer. A Source line's URL slug often
+    repeats the question's words, and a created date is bookkeeping — neither
+    should count toward answer-leakage overlap.
+    """
+    return "\n".join(
+        ln for ln in body.splitlines()
+        if not SOURCE_LINE_RE.match(ln) and not CREATED_LINE_RE.match(ln)
+    )
+
+
+def has_distinct_numeric_recall(heading: str, body: str) -> bool:
+    """True if the answer carries numeric ranges/thresholds the question lacks.
+
+    A range/category card ("What are the HbA1c ranges across normal,
+    prediabetes, and diabetes?") echoes its categories in the answer, but the
+    recalled content is the numbers, which the question doesn't give away — so
+    high word overlap isn't leakage.
+    """
+    body_nums = bool(RANGE_RE.search(body) or THRESHOLD_RE.search(body))
+    head_nums = bool(RANGE_RE.search(heading) or THRESHOLD_RE.search(heading))
+    return body_nums and not head_nums
+
+
+def is_leaky(heading: str, body: str) -> bool:
+    """True if a card leaks its answer, after excluding citation lines and
+    numeric-recall (range/category) cards."""
+    prose = prose_body(body)
+    if leakage_ratio(heading, prose) < LEAKAGE_RATIO:
+        return False
+    return not has_distinct_numeric_recall(heading, prose)
+
+
 def normalize_heading(heading: str) -> str:
     """Collapse a heading to a comparison key (lowercase, alnum + single spaces)."""
     return re.sub(r"\s+", " ", re.sub(r"[^a-z0-9 ]", " ", heading.lower())).strip()
@@ -212,8 +253,7 @@ def main() -> int:
     no_id = [c["heading"] for c in cards if not c["has_id"]]
     not_prompt = [c["heading"] for c in cards if not is_prompt_form(c["heading"])]
     answer_count = sum(1 for c in cards if c["has_answer"])
-    leaky = [c["heading"] for c in cards
-             if leakage_ratio(c["heading"], c["body"]) >= LEAKAGE_RATIO]
+    leaky = [c["heading"] for c in cards if is_leaky(c["heading"], c["body"])]
     dups = find_duplicate_fronts(cards)
     overloaded = [c["heading"] for c in cards if back_word_count(c["body"]) > BACK_WORD_LIMIT]
     listy = [c["heading"] for c in cards if is_list_back(c["body"])]
