about | summary | refs | log | tree | commit | diff
path: root/.ai/scripts
diff options
context:
space:
mode:
Diffstat (limited to '.ai/scripts')
-rwxr-xr-x  .ai/scripts/drill-deck-stats.py  44
-rwxr-xr-x  .ai/scripts/drill-to-anki.py  9
-rw-r--r--  .ai/scripts/tests/test_drill_deck_stats.py  74
-rw-r--r--  .ai/scripts/tests/test_drill_to_anki.py  5
4 files changed, 127 insertions, 5 deletions
diff --git a/.ai/scripts/drill-deck-stats.py b/.ai/scripts/drill-deck-stats.py
index d0707e2..04c3468 100755
--- a/.ai/scripts/drill-deck-stats.py
+++ b/.ai/scripts/drill-deck-stats.py
@@ -10,6 +10,8 @@ Blocking WARNs (exit 1):
- Non-prompt headings (topic-as-heading not yet rewritten)
- #+TITLE missing, or carrying source-tool jargon ("org-drill")
- Answer leakage: a card whose question echoes most of its own answer
+ (Source: citation lines and created-date lines are excluded from the
+ overlap, and range/category cards that recall numbers are exempted)
- Duplicate / near-duplicate fronts (interference between confusable cards)
Non-blocking NOTEs (exit unaffected):
@@ -41,6 +43,10 @@ ID_RE = re.compile(r"^\s*:ID:\s+(\S+)\s*$")
TITLE_RE = re.compile(r"^#\+TITLE:\s*(.+?)\s*$", re.IGNORECASE)
SOURCE_TOOL_RE = re.compile(r"\borg[-\s]?drill\b", re.IGNORECASE)
PLANNING_RE = re.compile(r"^\s*(SCHEDULED|DEADLINE|CLOSED):\s")
+SOURCE_LINE_RE = re.compile(r"^\s*source:\s", re.IGNORECASE)
+CREATED_LINE_RE = re.compile(r"^\s*:?created:?\s", re.IGNORECASE)
+RANGE_RE = re.compile(r"\d[^\n]*[-–—]\s*\d")
+THRESHOLD_RE = re.compile(r"[<>≤≥]\s*\d")
BULLET_RE = re.compile(r"^\s*([-+*]|\d+[.)])\s+")
BINARY_LEAD_RE = re.compile(
r"^\s*(is|are|was|were|does|do|did|can|could|should|would|will|has|have|had)\b",
@@ -98,6 +104,41 @@ def leakage_ratio(heading: str, body: str) -> float:
return len(hw & content_words(body)) / len(hw)
+def prose_body(body: str) -> str:
+ """Body with Source: citation and created-date lines removed.
+
+ Those lines are metadata, not the answer. A Source line's URL slug often
+ repeats the question's words, and a created date is bookkeeping — neither
+ should count toward answer-leakage overlap.
+ """
+ return "\n".join(
+ ln for ln in body.splitlines()
+ if not SOURCE_LINE_RE.match(ln) and not CREATED_LINE_RE.match(ln)
+ )
+
+
+def has_distinct_numeric_recall(heading: str, body: str) -> bool:
+ """True if the answer carries numeric ranges/thresholds the question lacks.
+
+ A range/category card ("What are the HbA1c ranges across normal,
+ prediabetes, and diabetes?") echoes its categories in the answer, but the
+ recalled content is the numbers, which the question doesn't give away — so
+ high word overlap isn't leakage.
+ """
+ body_nums = bool(RANGE_RE.search(body) or THRESHOLD_RE.search(body))
+ head_nums = bool(RANGE_RE.search(heading) or THRESHOLD_RE.search(heading))
+ return body_nums and not head_nums
+
+
+def is_leaky(heading: str, body: str) -> bool:
+ """True if a card leaks its answer, after excluding citation lines and
+ numeric-recall (range/category) cards."""
+ prose = prose_body(body)
+ if leakage_ratio(heading, prose) < LEAKAGE_RATIO:
+ return False
+ return not has_distinct_numeric_recall(heading, prose)
+
+
def normalize_heading(heading: str) -> str:
"""Collapse a heading to a comparison key (lowercase, alnum + single spaces)."""
return re.sub(r"\s+", " ", re.sub(r"[^a-z0-9 ]", " ", heading.lower())).strip()
@@ -212,8 +253,7 @@ def main() -> int:
no_id = [c["heading"] for c in cards if not c["has_id"]]
not_prompt = [c["heading"] for c in cards if not is_prompt_form(c["heading"])]
answer_count = sum(1 for c in cards if c["has_answer"])
- leaky = [c["heading"] for c in cards
- if leakage_ratio(c["heading"], c["body"]) >= LEAKAGE_RATIO]
+ leaky = [c["heading"] for c in cards if is_leaky(c["heading"], c["body"])]
dups = find_duplicate_fronts(cards)
overloaded = [c["heading"] for c in cards if back_word_count(c["body"]) > BACK_WORD_LIMIT]
listy = [c["heading"] for c in cards if is_list_back(c["body"])]
diff --git a/.ai/scripts/drill-to-anki.py b/.ai/scripts/drill-to-anki.py
index 1050021..9fe954e 100755
--- a/.ai/scripts/drill-to-anki.py
+++ b/.ai/scripts/drill-to-anki.py
@@ -90,15 +90,18 @@ def escape_html(s: str) -> str:
def strip_org_metadata(body_lines: list[str]) -> list[str]:
- """Drop :PROPERTIES: drawers and SCHEDULED/DEADLINE/CLOSED planning lines.
+ """Drop :PROPERTIES: drawers, planning lines, and created-date lines.
Org-drill needs these in the source file (SRS state lives in the
PROPERTIES drawer; SCHEDULED carries the next-review date), but they
- are noise on the back of an Anki card.
+ are noise on the back of an Anki card. A created/added date never
+ belongs on a card, so a stray "Created:" or ":CREATED:" body line is
+ dropped too.
"""
cleaned: list[str] = []
in_drawer = False
planning_re = re.compile(r"^\s*(SCHEDULED|DEADLINE|CLOSED):\s")
+ created_re = re.compile(r"^\s*:?created:?\s", re.IGNORECASE)
drawer_start_re = re.compile(r"^\s*:PROPERTIES:\s*$")
drawer_end_re = re.compile(r"^\s*:END:\s*$")
for line in body_lines:
@@ -109,7 +112,7 @@ def strip_org_metadata(body_lines: list[str]) -> list[str]:
if drawer_start_re.match(line):
in_drawer = True
continue
- if planning_re.match(line):
+ if planning_re.match(line) or created_re.match(line):
continue
cleaned.append(line)
return cleaned
diff --git a/.ai/scripts/tests/test_drill_deck_stats.py b/.ai/scripts/tests/test_drill_deck_stats.py
index 80b9913..d60084d 100644
--- a/.ai/scripts/tests/test_drill_deck_stats.py
+++ b/.ai/scripts/tests/test_drill_deck_stats.py
@@ -303,3 +303,77 @@ def test_cli_non_blocking_notes_keep_exit_zero(tmp_path):
r = _run(f)
assert r.returncode == 0
assert "NOTE" in r.stdout
+
+
+# --- leakage refinements: source-line strip + numeric carve-out ---
+
+def test_prose_body_strips_source_and_created_lines(stats):
+ body = "The real answer here.\nCreated: 2026-05-30\nSource: AHA — https://heart.org/x"
+ assert stats.prose_body(body) == "The real answer here."
+
+
+def test_has_distinct_numeric_recall_true_for_range_card(stats):
+ assert stats.has_distinct_numeric_recall(
+ "What are the HbA1c ranges across normal, prediabetes, and diabetes?",
+ "Normal: <5.7%. Prediabetes: 5.7-6.4%. Diabetes: >=6.5%.",
+ ) is True
+
+
+def test_has_distinct_numeric_recall_false_without_numbers(stats):
+ assert stats.has_distinct_numeric_recall("What is LEO?", "Low Earth Orbit.") is False
+
+
+def test_is_leaky_false_when_overlap_is_only_in_the_source_line(stats):
+ heading = "What blood pressure constitutes a hypertensive crisis?"
+ body = ("A reading at or above 180/120.\n"
+ "Source: AHA — https://heart.org/high-blood-pressure/hypertensive-crisis")
+ assert stats.is_leaky(heading, body) is False
+
+
+def test_is_leaky_false_for_numeric_range_card(stats):
+ heading = "What are the HbA1c ranges across normal, prediabetes, and diabetes?"
+ body = "HbA1c ranges. Normal: <5.7%. Prediabetes: 5.7-6.4%. Diabetes: >=6.5%."
+ assert stats.is_leaky(heading, body) is False
+
+
+def test_is_leaky_true_for_genuine_restatement(stats):
+ heading = "primary orbital regimes satellites classification"
+ body = "The primary orbital regimes satellites classification scheme."
+ assert stats.is_leaky(heading, body) is True
+
+
+SOURCE_LINE_DECK = """#+TITLE: Test Flashcards
+
+* Section
+** What blood pressure constitutes a hypertensive crisis? :drill:
+:PROPERTIES:
+:ID: c1
+:END:
+A reading at or above 180/120.
+
+Source: AHA — https://heart.org/high-blood-pressure/hypertensive-crisis-blood-pressure
+"""
+
+RANGE_CARD_DECK = """#+TITLE: Test Flashcards
+
+* Section
+** What are the HbA1c ranges across normal, prediabetes, and diabetes? :drill:
+:PROPERTIES:
+:ID: c1
+:END:
+HbA1c ranges. Normal: <5.7%. Prediabetes: 5.7-6.4%. Diabetes: >=6.5%.
+"""
+
+
+def test_cli_source_line_overlap_is_not_flagged(tmp_path):
+ f = tmp_path / "source.org"
+ f.write_text(SOURCE_LINE_DECK)
+ r = _run(f)
+ assert r.returncode == 0
+
+
+def test_cli_numeric_range_card_is_not_flagged(tmp_path):
+ f = tmp_path / "range.org"
+ f.write_text(RANGE_CARD_DECK)
+ r = _run(f)
+ assert r.returncode == 0
diff --git a/.ai/scripts/tests/test_drill_to_anki.py b/.ai/scripts/tests/test_drill_to_anki.py
index 6c5ef9b..fc17817 100644
--- a/.ai/scripts/tests/test_drill_to_anki.py
+++ b/.ai/scripts/tests/test_drill_to_anki.py
@@ -121,6 +121,11 @@ def test_strip_org_metadata_unclosed_drawer_swallows_the_rest(drill):
assert drill.strip_org_metadata(body) == []
+def test_strip_org_metadata_drops_created_date_line(drill):
+ # A created/added date never belongs on a card back.
+ assert drill.strip_org_metadata(["Created: 2026-05-30", "real answer"]) == ["real answer"]
+
+
# --- parse (pure, core parser) ---
SECTIONED = """* Orbital Regimes