aboutsummaryrefslogtreecommitdiff
path: root/docs/scripts
diff options
context:
space:
mode:
Diffstat (limited to 'docs/scripts')
-rw-r--r--docs/scripts/eml-view-and-extract-attachments-readme.org47
-rw-r--r--docs/scripts/eml-view-and-extract-attachments.py401
-rw-r--r--docs/scripts/tests/conftest.py77
-rw-r--r--docs/scripts/tests/fixtures/empty-body.eml16
-rw-r--r--docs/scripts/tests/fixtures/html-only.eml20
-rw-r--r--docs/scripts/tests/fixtures/multiple-received-headers.eml12
-rw-r--r--docs/scripts/tests/fixtures/no-received-headers.eml9
-rw-r--r--docs/scripts/tests/fixtures/plain-text.eml15
-rw-r--r--docs/scripts/tests/fixtures/with-attachment.eml27
-rw-r--r--docs/scripts/tests/test_extract_body.py96
-rw-r--r--docs/scripts/tests/test_extract_metadata.py65
-rw-r--r--docs/scripts/tests/test_generate_filenames.py157
-rw-r--r--docs/scripts/tests/test_integration_stdout.py68
-rw-r--r--docs/scripts/tests/test_parse_received_headers.py105
-rw-r--r--docs/scripts/tests/test_process_eml.py129
-rw-r--r--docs/scripts/tests/test_save_attachments.py97
16 files changed, 1313 insertions, 28 deletions
diff --git a/docs/scripts/eml-view-and-extract-attachments-readme.org b/docs/scripts/eml-view-and-extract-attachments-readme.org
new file mode 100644
index 0000000..c132df8
--- /dev/null
+++ b/docs/scripts/eml-view-and-extract-attachments-readme.org
@@ -0,0 +1,47 @@
+#+TITLE: eml-view-and-extract-attachments.py
+
+Extract email content and attachments from EML files with auto-renaming.
+
+* Usage
+
+#+begin_src bash
+# View mode — print metadata and body to stdout, extract attachments alongside EML
+python3 docs/scripts/eml-view-and-extract-attachments.py inbox/message.eml
+
+# Pipeline mode — extract, auto-rename, refile to output dir, clean up
+python3 docs/scripts/eml-view-and-extract-attachments.py inbox/message.eml --output-dir assets/
+#+end_src
+
+* Naming Convention
+
+Files are auto-renamed as =YYYY-MM-DD-HHMM-Sender-TYPE-Description.ext=:
+
+- =2026-02-05-1136-Jonathan-EMAIL-Re-Fw-4319-Danneel-Street.eml=
+- =2026-02-05-1136-Jonathan-EMAIL-Re-Fw-4319-Danneel-Street.txt=
+- =2026-02-05-1136-Jonathan-ATTACH-Ltr-Carrollton.pdf=
+
+Date and sender are parsed from email headers. Falls back to "unknown" for missing values.
+
+* Dependencies
+
+- Python 3 (stdlib only for core functionality)
+- =html2text= (optional — used for HTML-only emails, falls back to tag stripping)
+
+* Pipeline Mode Behavior
+
+1. Creates a temp directory alongside the source EML
+2. Copies and renames the EML, writes a =.txt= of the body, extracts attachments
+3. Checks for filename collisions in the output directory
+4. Moves all files to the output directory
+5. Cleans up the temp directory
+6. Prints a summary of created files
+
+Source EML is never modified or moved.
+
+* Tests
+
+#+begin_src bash
+python3 -m pytest docs/scripts/tests/ -v
+#+end_src
+
+48 tests: unit tests for parsing, filename generation, and attachment saving; integration tests for both pipeline and stdout modes. Requires =pytest=.
diff --git a/docs/scripts/eml-view-and-extract-attachments.py b/docs/scripts/eml-view-and-extract-attachments.py
index f498b83..3201c99 100644
--- a/docs/scripts/eml-view-and-extract-attachments.py
+++ b/docs/scripts/eml-view-and-extract-attachments.py
@@ -1,34 +1,343 @@
#!/usr/bin/env python3
+"""Extract email content and attachments from EML files.
+
+Without --output-dir: print metadata and body to stdout and extract attachments alongside the EML (backwards compatible).
+With --output-dir: full pipeline — extract, auto-rename, refile, clean up.
+"""
+
+import argparse
import email
-import sys
+import email.utils
import os
+import re
+import shutil
+import sys
+import tempfile
-def extract_attachments(eml_file):
- with open(eml_file, 'rb') as f:
- msg = email.message_from_binary_file(f)
- # Extract plain text body
- body_text = ""
+# ---------------------------------------------------------------------------
+# Parsing functions (no I/O beyond reading the input file)
+# ---------------------------------------------------------------------------
+
def parse_received_headers(msg):
    """Summarise one delivery hop from a message's Received headers.

    Headers are scanned in the order ``msg.get_all('Received')`` returns
    them.  The first header that names both a ``from`` host and a ``by``
    host supplies all four values: the ``by`` host is the receiving server,
    the ``from`` host is the sending server, and that header's timestamp is
    used for both times.  If no header names both hosts, only the receive
    side is filled in, using the first header.

    Args:
        msg: A parsed ``email.message.Message`` (or subclass).

    Returns:
        dict with keys ``sent_time``, ``sent_server``, ``received_time`` and
        ``received_server``.  Values are strings or ``None``;
        ``received_server`` is ``"unknown"`` when the fallback header has no
        ``by`` clause.
    """
    def normalize(raw):
        # Folded headers carry newlines/tabs; collapse them to single spaces.
        # str() also copes with Header objects returned for non-ASCII data.
        return ' '.join(str(raw).split())

    def timestamp_of(header):
        # The date-time follows the *final* semicolon; earlier semicolons can
        # occur inside comments such as "(cipher=AES; bits=128)".
        _, sep, tail = header.rpartition(';')
        tail = tail.strip()
        return tail if sep and tail else None

    # \b stops "from"/"by" matching inside longer words (e.g. "standby").
    from_re = re.compile(r'\bfrom\s+([\w.-]+)')
    by_re = re.compile(r'\bby\s+([\w.-]+)')

    headers = [normalize(raw) for raw in msg.get_all('Received', [])]
    result = {
        'sent_time': None,
        'sent_server': None,
        'received_time': None,
        'received_server': None,
    }

    for header in headers:
        from_match = from_re.search(header)
        by_match = by_re.search(header)
        if from_match and by_match:
            stamp = timestamp_of(header)
            result['sent_time'] = stamp
            result['sent_server'] = from_match.group(1)
            result['received_time'] = stamp
            result['received_server'] = by_match.group(1)
            return result

    if headers:
        # No header had both hosts: report what the first header offers.
        first = headers[0]
        by_match = by_re.search(first)
        result['received_time'] = timestamp_of(first)
        result['received_server'] = by_match.group(1) if by_match else "unknown"

    return result
+
+
def extract_body(msg):
    """Return the message body as text, preferring text/plain over text/html.

    The MIME tree is walked once.  The first ``text/plain`` part and the
    first ``text/html`` part that are not attachments are remembered.  Plain
    text wins; otherwise the HTML is converted with ``html2text`` when that
    package is importable, or has its tags stripped with a regex when it is
    not.

    Args:
        msg: A parsed ``email.message.Message`` (or subclass).

    Returns:
        The body text, or ``""`` when the message has no usable text part.
    """
    def decode_part(part):
        payload = part.get_payload(decode=True)
        if payload is None:
            return None
        # Honour the charset declared on the part; many mails are not UTF-8.
        charset = part.get_content_charset() or 'utf-8'
        try:
            return payload.decode(charset, errors='ignore')
        except LookupError:
            # Unknown/misspelt charset label: fall back to UTF-8.
            return payload.decode('utf-8', errors='ignore')

    plain_text = None
    html_text = None

    for part in msg.walk():
        # A text file sent as an attachment is not the message body.
        if part.get_content_disposition() == 'attachment':
            continue
        content_type = part.get_content_type()
        if content_type == "text/plain" and plain_text is None:
            plain_text = decode_part(part)
        elif content_type == "text/html" and html_text is None:
            html_text = decode_part(part)

    if plain_text is not None:
        return plain_text

    if html_text is not None:
        try:
            import html2text
        except ImportError:
            # Strip HTML tags as fallback if html2text not installed
            return re.sub(r'<[^>]+>', '', html_text)
        converter = html2text.HTML2Text()
        converter.body_width = 0  # 0 disables html2text's line wrapping
        return converter.handle(html_text)

    return ""
+
+
def extract_metadata(msg):
    """Collect the headline headers and delivery timing of a message.

    From, To, Subject and Date are passed through RFC 2047 decoding, so an
    encoded word such as ``=?UTF-8?Q?Caf=C3=A9?=`` comes back as readable
    text instead of leaking its raw form into output and filenames.  A
    header that cannot be decoded is returned as its raw string.

    Args:
        msg: A parsed ``email.message.Message`` (or subclass).

    Returns:
        dict with keys ``from``, ``to``, ``subject``, ``date`` (each a string,
        or ``None`` when the header is absent) and ``timing`` (the dict
        produced by ``parse_received_headers``).
    """
    # Imported here because the file's top-level imports only bring in
    # ``email`` and ``email.utils``.
    from email.errors import HeaderParseError
    from email.header import decode_header, make_header

    def decoded(name):
        raw = msg.get(name)
        if raw is None:
            return None
        raw = str(raw)
        try:
            return str(make_header(decode_header(raw)))
        except (HeaderParseError, ValueError, LookupError):
            # Malformed encoded word or unknown charset: keep the raw value.
            return raw

    return {
        'from': decoded('From'),
        'to': decoded('To'),
        'subject': decoded('Subject'),
        'date': decoded('Date'),
        'timing': parse_received_headers(msg),
    }
+
+
def generate_basename(metadata):
    """Build the ``<date>-<sender>`` prefix shared by all generated filenames.

    The date comes from ``metadata['date']`` formatted as ``YYYY-MM-DD-HHMM``
    in the timezone given by the header.  The sender is the first word of the
    From display name, or the local part of the address when there is no
    usable display name.  The sender is reduced to filename-safe characters
    (word characters, ``.`` and ``-``) so punctuation or path separators in a
    display name cannot reach the filesystem.

    Args:
        metadata: dict with optional ``date`` and ``from`` entries.

    Returns:
        e.g. ``"2026-02-05-1136-Jonathan"``.  Either half is ``"unknown"``
        when the corresponding header is missing or unusable.
    """
    date_prefix = "unknown"
    date_str = metadata.get('date')
    if date_str:
        try:
            parsed = email.utils.parsedate_to_datetime(str(date_str))
            date_prefix = parsed.strftime('%Y-%m-%d-%H%M')
        except (ValueError, TypeError):
            # Unparseable Date header: deliberately keep "unknown".
            pass

    sender = "unknown"
    from_str = metadata.get('from')
    if from_str:
        display_name, addr = email.utils.parseaddr(str(from_str))
        name_tokens = display_name.split()
        candidates = []
        if name_tokens:
            candidates.append(name_tokens[0])
        if addr:
            candidates.append(addr.split('@')[0])
        for candidate in candidates:
            safe = re.sub(r'[^\w.-]', '', candidate).strip('.-')
            if safe:
                sender = safe
                break

    return f"{date_prefix}-{sender}"
+
+
def _clean_for_filename(text, max_length=80):
    """Reduce arbitrary text to a filename-safe fragment.

    Every run of whitespace (including the tabs/newlines left behind by
    folded headers) becomes one hyphen, characters other than word
    characters, ``-`` and ``.`` are dropped, repeated hyphens are collapsed,
    and leading/trailing hyphens are removed.

    Args:
        text: The text to clean.
        max_length: Maximum length of the returned fragment.

    Returns:
        The cleaned fragment; may be empty if nothing safe remains.
    """
    text = re.sub(r'\s+', '-', text.strip())
    # Keep alphanumeric, hyphens, dots, underscores
    text = re.sub(r'[^\w\-.]', '', text)
    text = re.sub(r'-{2,}', '-', text).strip('-')
    if len(text) > max_length:
        # Truncation can expose a hyphen at the cut; trim it again.
        text = text[:max_length].rstrip('-')
    return text


def generate_email_filename(basename, subject):
    """Build the extension-less filename stem for the email itself.

    Args:
        basename: Prefix from ``generate_basename``.
        subject: Subject header text, or ``None``.

    Returns:
        e.g. ``"2026-02-05-1136-Jonathan-EMAIL-Re-Fw-4319-Danneel-Street"``.
        The caller appends ``.eml`` or ``.txt``.  ``"no-subject"`` is used
        when the subject is missing or contains no filename-safe characters.
    """
    clean_subject = _clean_for_filename(subject) if subject else ""
    return f"{basename}-EMAIL-{clean_subject or 'no-subject'}"


def generate_attachment_filename(basename, original_filename):
    """Build the renamed filename for one attachment.

    Args:
        basename: Prefix from ``generate_basename``.
        original_filename: Filename declared by the MIME part, or ``None``.

    Returns:
        e.g. ``"2026-02-05-1136-Jonathan-ATTACH-Ltr-Carrollton.pdf"``.  The
        original extension is kept, minus any characters that are not word
        characters or dots.  ``"unnamed"`` stands in for a missing name or a
        name with no filename-safe characters.
    """
    if not original_filename:
        return f"{basename}-ATTACH-unnamed"

    name, ext = os.path.splitext(original_filename)
    clean_name = _clean_for_filename(name) or "unnamed"
    # The extension comes from untrusted input too; keep it filename-safe.
    clean_ext = re.sub(r'[^\w.]', '', ext)
    return f"{basename}-ATTACH-{clean_name}{clean_ext}"
+
+
+# ---------------------------------------------------------------------------
+# I/O functions (file operations)
+# ---------------------------------------------------------------------------
+
def save_attachments(msg, output_dir, basename):
    """Write every named attachment of ``msg`` into ``output_dir``.

    A part counts as an attachment when it is not a multipart container, has
    a Content-Disposition header and declares a filename.  Each file is
    renamed with ``generate_attachment_filename``.  When two attachments map
    to the same name, later ones get a ``-2``, ``-3``, ... suffix instead of
    overwriting the earlier file.

    Args:
        msg: A parsed ``email.message.Message`` (or subclass).
        output_dir: Existing directory to write into.
        basename: Prefix from ``generate_basename``.

    Returns:
        List of dicts with keys ``original_name``, ``renamed_name`` and
        ``path``, in the order the attachments were written.
    """
    results = []
    taken = set()  # case-folded names already written by this call

    for part in msg.walk():
        if part.get_content_maintype() == 'multipart':
            continue
        if part.get('Content-Disposition') is None:
            continue

        filename = part.get_filename()
        if not filename:
            continue

        payload = part.get_payload(decode=True)
        if payload is None:
            # decode=True yields None for parts whose payload is a list of
            # messages (an attached email).  Serialise the attached message;
            # skip anything else that has no bytes to write.
            if not part.is_multipart():
                continue
            payload = part.get_payload(0).as_bytes()

        renamed = generate_attachment_filename(basename, filename)
        stem, ext = os.path.splitext(renamed)
        counter = 2
        while renamed.casefold() in taken:
            renamed = f"{stem}-{counter}{ext}"
            counter += 1
        taken.add(renamed.casefold())

        filepath = os.path.join(output_dir, renamed)
        with open(filepath, 'wb') as f:
            f.write(payload)
        results.append({
            'original_name': filename,
            'renamed_name': renamed,
            'path': filepath,
        })

    return results
+
+
def save_text(text, filepath):
    """Store ``text`` at ``filepath`` as UTF-8, replacing any existing file."""
    with open(filepath, mode='w', encoding='utf-8') as out:
        out.write(text)
+
+
+# ---------------------------------------------------------------------------
+# Pipeline function
+# ---------------------------------------------------------------------------
+
def process_eml(eml_path, output_dir):
    """Run the extract / rename / refile pipeline for one EML file.

    Work happens in a scratch directory created next to the source EML: the
    EML is copied there and renamed, the body is written as ``.txt``, and
    attachments are extracted.  Only when none of the generated names already
    exists in ``output_dir`` are the files moved there.  The scratch
    directory is removed on every exit path; the source EML is only read.

    Args:
        eml_path: Path to the source EML file.
        output_dir: Destination directory (created if missing).

    Returns:
        dict with ``metadata`` (from ``extract_metadata``), ``body`` (text)
        and ``files`` (list of ``{'type', 'name', 'path'}`` dicts, EML first,
        then TXT, then attachments).

    Raises:
        FileExistsError: A generated filename already exists in
            ``output_dir``.
    """
    source = os.path.abspath(eml_path)
    destination = os.path.abspath(output_dir)
    os.makedirs(destination, exist_ok=True)

    # Scratch space is a sibling of the source EML.
    scratch = tempfile.mkdtemp(prefix='extract-', dir=os.path.dirname(source))

    try:
        staged_copy = os.path.join(scratch, os.path.basename(source))
        shutil.copy2(source, staged_copy)

        with open(source, 'rb') as handle:
            msg = email.message_from_binary_file(handle)

        metadata = extract_metadata(msg)
        body = extract_body(msg)
        basename = generate_basename(metadata)
        stem = generate_email_filename(basename, metadata['subject'])

        eml_name = f"{stem}.eml"
        txt_name = f"{stem}.txt"

        os.rename(staged_copy, os.path.join(scratch, eml_name))
        save_text(body, os.path.join(scratch, txt_name))
        saved = save_attachments(msg, scratch, basename)

        files = [
            {'type': 'eml', 'name': eml_name, 'path': None},
            {'type': 'txt', 'name': txt_name, 'path': None},
        ]
        files.extend(
            {'type': 'attach', 'name': item['renamed_name'], 'path': None}
            for item in saved
        )

        # Refuse to overwrite: check every target before moving anything.
        for entry in files:
            if os.path.exists(os.path.join(destination, entry['name'])):
                raise FileExistsError(
                    f"Collision: '{entry['name']}' already exists in {destination}"
                )

        for entry in files:
            target = os.path.join(destination, entry['name'])
            shutil.move(os.path.join(scratch, entry['name']), target)
            entry['path'] = target

        return {
            'metadata': metadata,
            'body': body,
            'files': files,
        }

    finally:
        if os.path.exists(scratch):
            shutil.rmtree(scratch)
+
+
+# ---------------------------------------------------------------------------
+# Stdout display (backwards-compatible mode)
+# ---------------------------------------------------------------------------
+
+def print_email(eml_path):
+ """Parse and print email to stdout. Extract attachments alongside EML.
+
+ This preserves the original script behavior when --output-dir is not given.
+ """
+ with open(eml_path, 'rb') as f:
+ msg = email.message_from_binary_file(f)
+
+ metadata = extract_metadata(msg)
+ body = extract_body(msg)
+ timing = metadata['timing']
+
+ print(f"From: {metadata['from']}")
+ print(f"To: {metadata['to']}")
+ print(f"Subject: {metadata['subject']}")
+ print(f"Date: {metadata['date']}")
+ print(f"Sent: {timing['sent_time']} (via {timing['sent_server']})")
+ print(f"Received: {timing['received_time']} (at {timing['received_server']})")
print()
- print(body_text)
+ print(body)
print()
- # Extract attachments
- attachments = []
+ # Extract attachments alongside the EML file
for part in msg.walk():
if part.get_content_maintype() == 'multipart':
continue
@@ -37,17 +346,53 @@ def extract_attachments(eml_file):
filename = part.get_filename()
if filename:
- filepath = os.path.join(os.path.dirname(eml_file), filename)
+ filepath = os.path.join(os.path.dirname(eml_path), filename)
with open(filepath, 'wb') as f:
f.write(part.get_payload(decode=True))
- attachments.append(filename)
print(f"Extracted attachment: {filename}")
- return attachments
+
def print_pipeline_summary(result):
    """Print the header block and the list of files produced by the pipeline.

    Args:
        result: dict shaped like the return value of ``process_eml``, with
            ``metadata`` (including ``timing``) and a non-empty ``files``
            list whose entries carry ``type``, ``name`` and ``path``.
    """
    meta = result['metadata']
    hop = meta['timing']

    for label, key in (('From', 'from'), ('To', 'to'),
                       ('Subject', 'subject'), ('Date', 'date')):
        print(f"{label}: {meta[key]}")
    print(f"Sent: {hop['sent_time']} (via {hop['sent_server']})")
    print(f"Received: {hop['received_time']} (at {hop['received_server']})")
    print()
    print("Files created:")
    for entry in result['files']:
        # Type label is right-aligned in a six-character column.
        print(f" [{entry['type']:>6}] {entry['name']}")
    first_path = result['files'][0]['path']
    print(f"\nOutput directory: {os.path.dirname(first_path)}")
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Command-line entry point: view mode by default, pipeline mode when
    # --output-dir is supplied.
    parser = argparse.ArgumentParser(
        description="Extract email content and attachments from EML files."
    )
    parser.add_argument('eml_path', help="Path to source EML file")
    parser.add_argument(
        '--output-dir',
        help="Destination directory for extracted files. "
             "Without this flag, the email is printed to stdout and "
             "attachments are extracted alongside the EML (backwards compatible)."
    )

    args = parser.parse_args()

    if not os.path.isfile(args.eml_path):
        print(f"Error: '{args.eml_path}' not found or is not a file.", file=sys.stderr)
        sys.exit(1)

    if args.output_dir:
        try:
            result = process_eml(args.eml_path, args.output_dir)
        except FileExistsError as exc:
            # A name collision is an expected user-facing condition; report
            # it like the missing-file error instead of dumping a traceback.
            print(f"Error: {exc}", file=sys.stderr)
            sys.exit(1)
        print_pipeline_summary(result)
    else:
        print_email(args.eml_path)
diff --git a/docs/scripts/tests/conftest.py b/docs/scripts/tests/conftest.py
new file mode 100644
index 0000000..8d965ab
--- /dev/null
+++ b/docs/scripts/tests/conftest.py
@@ -0,0 +1,77 @@
+"""Shared fixtures for EML extraction tests."""
+
+import os
+from email.message import EmailMessage
+from email.mime.application import MIMEApplication
+from email.mime.multipart import MIMEMultipart
+from email.mime.text import MIMEText
+
+import pytest
+
+
@pytest.fixture
def fixtures_dir():
    """Path of the ``fixtures`` directory that sits next to this file."""
    tests_dir = os.path.dirname(__file__)
    return os.path.join(tests_dir, 'fixtures')
+
+
def make_plain_message(body="Test body", from_="Jonathan Smith <jsmith@example.com>",
                       to="Craig <craig@example.com>",
                       subject="Test Subject",
                       date="Wed, 05 Feb 2026 11:36:00 -0600"):
    """Build an ``EmailMessage`` whose only content is a text/plain body."""
    message = EmailMessage()
    header_values = (
        ('From', from_),
        ('To', to),
        ('Subject', subject),
        ('Date', date),
    )
    for header_name, header_value in header_values:
        message[header_name] = header_value
    message.set_content(body)
    return message
+
+
def make_html_message(html_body="<p>Test body</p>",
                      from_="Jonathan Smith <jsmith@example.com>",
                      to="Craig <craig@example.com>",
                      subject="Test Subject",
                      date="Wed, 05 Feb 2026 11:36:00 -0600"):
    """Build an ``EmailMessage`` whose only content is a text/html body."""
    message = EmailMessage()
    header_values = (
        ('From', from_),
        ('To', to),
        ('Subject', subject),
        ('Date', date),
    )
    for header_name, header_value in header_values:
        message[header_name] = header_value
    message.set_content(html_body, subtype='html')
    return message
+
+
def make_message_with_attachment(body="Test body",
                                 from_="Jonathan Smith <jsmith@example.com>",
                                 to="Craig <craig@example.com>",
                                 subject="Test Subject",
                                 date="Wed, 05 Feb 2026 11:36:00 -0600",
                                 attachment_filename="document.pdf",
                                 attachment_content=b"fake pdf content"):
    """Build a multipart message: one text/plain body plus one attachment."""
    container = MIMEMultipart()
    for header_name, header_value in (('From', from_), ('To', to),
                                      ('Subject', subject), ('Date', date)):
        container[header_name] = header_value

    text_part = MIMEText(body, 'plain')
    container.attach(text_part)

    file_part = MIMEApplication(attachment_content, Name=attachment_filename)
    file_part['Content-Disposition'] = f'attachment; filename="{attachment_filename}"'
    container.attach(file_part)

    return container
+
+
def add_received_headers(msg, headers):
    """Append one Received header per entry of ``headers`` and return ``msg``.

    Entries are added in list order, so the first entry plays the role of
    the most recent hop.
    """
    for received_value in headers:
        msg['Received'] = received_value
    return msg
diff --git a/docs/scripts/tests/fixtures/empty-body.eml b/docs/scripts/tests/fixtures/empty-body.eml
new file mode 100644
index 0000000..cf008df
--- /dev/null
+++ b/docs/scripts/tests/fixtures/empty-body.eml
@@ -0,0 +1,16 @@
+From: Jonathan Smith <jsmith@example.com>
+To: Craig Jennings <craig@example.com>
+Subject: Empty Body Test
+Date: Thu, 05 Feb 2026 11:36:00 -0600
+MIME-Version: 1.0
+Content-Type: multipart/mixed; boundary="boundary456"
+Received: from mail-sender.example.com by mx.receiver.example.com with ESMTP; Thu, 05 Feb 2026 11:36:05 -0600
+
+--boundary456
+Content-Type: application/octet-stream; name="data.bin"
+Content-Disposition: attachment; filename="data.bin"
+Content-Transfer-Encoding: base64
+
+AQIDBA==
+
+--boundary456--
diff --git a/docs/scripts/tests/fixtures/html-only.eml b/docs/scripts/tests/fixtures/html-only.eml
new file mode 100644
index 0000000..4db7645
--- /dev/null
+++ b/docs/scripts/tests/fixtures/html-only.eml
@@ -0,0 +1,20 @@
+From: Jonathan Smith <jsmith@example.com>
+To: Craig Jennings <craig@example.com>
+Subject: HTML Update
+Date: Thu, 05 Feb 2026 11:36:00 -0600
+MIME-Version: 1.0
+Content-Type: text/html; charset="utf-8"
+Content-Transfer-Encoding: 7bit
+Received: from mail-sender.example.com by mx.receiver.example.com with ESMTP; Thu, 05 Feb 2026 11:36:05 -0600
+
+<html>
+<body>
+<p>Hi Craig,</p>
+<p>Here is the <strong>HTML</strong> update.</p>
+<ul>
+<li>Item one</li>
+<li>Item two</li>
+</ul>
+<p>Best,<br>Jonathan</p>
+</body>
+</html>
diff --git a/docs/scripts/tests/fixtures/multiple-received-headers.eml b/docs/scripts/tests/fixtures/multiple-received-headers.eml
new file mode 100644
index 0000000..1b8d6a7
--- /dev/null
+++ b/docs/scripts/tests/fixtures/multiple-received-headers.eml
@@ -0,0 +1,12 @@
+From: Jonathan Smith <jsmith@example.com>
+To: Craig Jennings <craig@example.com>
+Subject: Multiple Received Headers Test
+Date: Thu, 05 Feb 2026 11:36:00 -0600
+MIME-Version: 1.0
+Content-Type: text/plain; charset="utf-8"
+Content-Transfer-Encoding: 7bit
+Received: by internal.example.com with SMTP; Thu, 05 Feb 2026 11:36:10 -0600
+Received: from mail-sender.example.com by mx.receiver.example.com with ESMTP; Thu, 05 Feb 2026 11:36:05 -0600
+Received: from originator.example.com by relay.example.com with SMTP; Thu, 05 Feb 2026 11:35:58 -0600
+
+Test body with multiple received headers.
diff --git a/docs/scripts/tests/fixtures/no-received-headers.eml b/docs/scripts/tests/fixtures/no-received-headers.eml
new file mode 100644
index 0000000..8a05dc7
--- /dev/null
+++ b/docs/scripts/tests/fixtures/no-received-headers.eml
@@ -0,0 +1,9 @@
+From: Jonathan Smith <jsmith@example.com>
+To: Craig Jennings <craig@example.com>
+Subject: No Received Headers
+Date: Thu, 05 Feb 2026 11:36:00 -0600
+MIME-Version: 1.0
+Content-Type: text/plain; charset="utf-8"
+Content-Transfer-Encoding: 7bit
+
+Test body with no received headers at all.
diff --git a/docs/scripts/tests/fixtures/plain-text.eml b/docs/scripts/tests/fixtures/plain-text.eml
new file mode 100644
index 0000000..8cc9d9c
--- /dev/null
+++ b/docs/scripts/tests/fixtures/plain-text.eml
@@ -0,0 +1,15 @@
+From: Jonathan Smith <jsmith@example.com>
+To: Craig Jennings <craig@example.com>
+Subject: Re: Fw: 4319 Danneel Street
+Date: Thu, 05 Feb 2026 11:36:00 -0600
+MIME-Version: 1.0
+Content-Type: text/plain; charset="utf-8"
+Content-Transfer-Encoding: 7bit
+Received: from mail-sender.example.com by mx.receiver.example.com with ESMTP; Thu, 05 Feb 2026 11:36:05 -0600
+
+Hi Craig,
+
+Here is the update on 4319 Danneel Street.
+
+Best,
+Jonathan
diff --git a/docs/scripts/tests/fixtures/with-attachment.eml b/docs/scripts/tests/fixtures/with-attachment.eml
new file mode 100644
index 0000000..ac49c5d
--- /dev/null
+++ b/docs/scripts/tests/fixtures/with-attachment.eml
@@ -0,0 +1,27 @@
+From: Jonathan Smith <jsmith@example.com>
+To: Craig Jennings <craig@example.com>
+Subject: Ltr from Carrollton
+Date: Thu, 05 Feb 2026 11:36:00 -0600
+MIME-Version: 1.0
+Content-Type: multipart/mixed; boundary="boundary123"
+Received: from mail-sender.example.com by mx.receiver.example.com with ESMTP; Thu, 05 Feb 2026 11:36:05 -0600
+
+--boundary123
+Content-Type: text/plain; charset="utf-8"
+Content-Transfer-Encoding: 7bit
+
+Hi Craig,
+
+Please find the letter attached.
+
+Best,
+Jonathan
+
+--boundary123
+Content-Type: application/octet-stream; name="Ltr Carrollton.pdf"
+Content-Disposition: attachment; filename="Ltr Carrollton.pdf"
+Content-Transfer-Encoding: base64
+
+ZmFrZSBwZGYgY29udGVudA==
+
+--boundary123--
diff --git a/docs/scripts/tests/test_extract_body.py b/docs/scripts/tests/test_extract_body.py
new file mode 100644
index 0000000..7b53cda
--- /dev/null
+++ b/docs/scripts/tests/test_extract_body.py
@@ -0,0 +1,96 @@
+"""Tests for extract_body()."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from conftest import make_plain_message, make_html_message, make_message_with_attachment
+from email.message import EmailMessage
+from email.mime.multipart import MIMEMultipart
+from email.mime.text import MIMEText
+from email.mime.application import MIMEApplication
+
+import importlib.util
+spec = importlib.util.spec_from_file_location(
+ "eml_script",
+ os.path.join(os.path.dirname(__file__), '..', 'eml-view-and-extract-attachments.py')
+)
+eml_script = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(eml_script)
+
+extract_body = eml_script.extract_body
+
+
class TestPlainText:
    """A text/plain message yields its body text."""

    def test_returns_plain_text(self):
        body = extract_body(make_plain_message(body="Hello, this is plain text."))
        assert "Hello, this is plain text." in body
+
+
class TestHtmlOnly:
    """An HTML-only message is converted to text without raw tags."""

    def test_returns_converted_html(self):
        message = make_html_message(html_body="<p>Hello <strong>world</strong></p>")
        body = extract_body(message)
        for word in ("Hello", "world"):
            assert word in body
        # Should not contain raw HTML tags
        for tag in ("<p>", "<strong>"):
            assert tag not in body
+
+
class TestBothPlainAndHtml:
    """With both alternatives present, the plain-text part wins."""

    def test_prefers_plain_text(self):
        message = MIMEMultipart('alternative')
        for name, value in (
            ('From', 'test@example.com'),
            ('To', 'dest@example.com'),
            ('Subject', 'Test'),
            ('Date', 'Thu, 05 Feb 2026 11:36:00 -0600'),
        ):
            message[name] = value
        message.attach(MIMEText("Plain text version", 'plain'))
        message.attach(MIMEText("<p>HTML version</p>", 'html'))

        body = extract_body(message)

        assert "Plain text version" in body
        assert "HTML version" not in body
+
+
class TestEmptyBody:
    """A message carrying only an attachment has an empty body."""

    def test_returns_empty_string(self):
        # Multipart with only attachments, no text parts
        container = MIMEMultipart()
        container['From'] = 'test@example.com'
        binary_part = MIMEApplication(b"binary data", Name="file.bin")
        binary_part['Content-Disposition'] = 'attachment; filename="file.bin"'
        container.attach(binary_part)
        assert extract_body(container) == ""
+
+
class TestNonUtf8Encoding:
    """Bodies whose bytes are not valid UTF-8 must not break extraction."""

    def test_decodes_with_errors_ignore(self):
        # ISO-8859-1 encodes "é" as the single byte 0xE9, which is not valid
        # UTF-8, so this message really exercises the decode-error path.
        msg = MIMEText("Valid text with special: café", 'plain', 'iso-8859-1')
        msg['From'] = 'test@example.com'
        # Guard the premise: the decoded payload contains the non-UTF-8 byte.
        assert b'\xe9' in msg.get_payload(decode=True)
        result = extract_body(msg)
        assert "Valid text" in result
+
+
class TestHtmlWithStructure:
    """List items in an HTML body survive conversion to text."""

    def test_preserves_list_structure(self):
        markup = "<ul><li>Item one</li><li>Item two</li></ul>"
        body = extract_body(make_html_message(html_body=markup))
        for item in ("Item one", "Item two"):
            assert item in body
+
+
class TestNoTextParts:
    """A multipart message without any text part has an empty body."""

    def test_returns_empty_string(self):
        container = MIMEMultipart()
        container['From'] = 'test@example.com'
        image_part = MIMEApplication(b"data", Name="image.png")
        image_part['Content-Disposition'] = 'attachment; filename="image.png"'
        container.attach(image_part)
        assert extract_body(container) == ""
diff --git a/docs/scripts/tests/test_extract_metadata.py b/docs/scripts/tests/test_extract_metadata.py
new file mode 100644
index 0000000..d5ee52e
--- /dev/null
+++ b/docs/scripts/tests/test_extract_metadata.py
@@ -0,0 +1,65 @@
+"""Tests for extract_metadata()."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from conftest import make_plain_message, add_received_headers
+from email.message import EmailMessage
+
+import importlib.util
+spec = importlib.util.spec_from_file_location(
+ "eml_script",
+ os.path.join(os.path.dirname(__file__), '..', 'eml-view-and-extract-attachments.py')
+)
+eml_script = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(eml_script)
+
+extract_metadata = eml_script.extract_metadata
+
+
class TestAllHeadersPresent:
    """All four headline headers are reported, along with timing."""

    def test_complete_dict(self):
        expected = {
            'from': "Jonathan Smith <jsmith@example.com>",
            'to': "Craig <craig@example.com>",
            'subject': "Test Subject",
            'date': "Thu, 05 Feb 2026 11:36:00 -0600",
        }
        message = make_plain_message(
            from_=expected['from'],
            to=expected['to'],
            subject=expected['subject'],
            date=expected['date'],
        )
        metadata = extract_metadata(message)
        for key in ('from', 'to', 'subject', 'date'):
            assert metadata[key] == expected[key]
        assert 'timing' in metadata
+
+
class TestMissingFrom:
    """A message without a From header reports ``None`` for it."""

    def test_from_is_none(self):
        message = EmailMessage()
        for name, value in (
            ('To', 'craig@example.com'),
            ('Subject', 'Test'),
            ('Date', 'Thu, 05 Feb 2026 11:36:00 -0600'),
        ):
            message[name] = value
        message.set_content("body")
        assert extract_metadata(message)['from'] is None
+
+
class TestMissingDate:
    """A message without a Date header reports ``None`` for it."""

    def test_date_is_none(self):
        message = EmailMessage()
        for name, value in (
            ('From', 'test@example.com'),
            ('To', 'craig@example.com'),
            ('Subject', 'Test'),
        ):
            message[name] = value
        message.set_content("body")
        assert extract_metadata(message)['date'] is None
+
+
class TestLongSubject:
    """A subject long enough to be folded is returned in full."""

    def test_full_subject_returned(self):
        long_subject = "Re: Fw: This is a very long subject line that spans many words and might be folded"
        metadata = extract_metadata(make_plain_message(subject=long_subject))
        assert metadata['subject'] == long_subject
diff --git a/docs/scripts/tests/test_generate_filenames.py b/docs/scripts/tests/test_generate_filenames.py
new file mode 100644
index 0000000..07c8f84
--- /dev/null
+++ b/docs/scripts/tests/test_generate_filenames.py
@@ -0,0 +1,157 @@
+"""Tests for generate_basename(), generate_email_filename(), generate_attachment_filename()."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+import importlib.util
+spec = importlib.util.spec_from_file_location(
+ "eml_script",
+ os.path.join(os.path.dirname(__file__), '..', 'eml-view-and-extract-attachments.py')
+)
+eml_script = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(eml_script)
+
+generate_basename = eml_script.generate_basename
+generate_email_filename = eml_script.generate_email_filename
+generate_attachment_filename = eml_script.generate_attachment_filename
+
+
+# --- generate_basename ---
+
class TestGenerateBasename:
    """generate_basename(): date/sender prefix and its fallbacks."""

    DATE = 'Wed, 05 Feb 2026 11:36:00 -0600'

    def test_standard_from_and_date(self):
        metadata = {'from': 'Jonathan Smith <jsmith@example.com>', 'date': self.DATE}
        assert generate_basename(metadata) == "2026-02-05-1136-Jonathan"

    def test_from_with_display_name_first_token(self):
        metadata = {'from': 'C Ciarm <cciarm@example.com>', 'date': self.DATE}
        assert generate_basename(metadata) == "2026-02-05-1136-C"

    def test_from_without_display_name(self):
        metadata = {'from': 'jsmith@example.com', 'date': self.DATE}
        assert generate_basename(metadata) == "2026-02-05-1136-jsmith"

    def test_missing_date(self):
        metadata = {'from': 'Jonathan Smith <jsmith@example.com>', 'date': None}
        assert generate_basename(metadata) == "unknown-Jonathan"

    def test_missing_from(self):
        metadata = {'from': None, 'date': self.DATE}
        assert generate_basename(metadata) == "2026-02-05-1136-unknown"

    def test_both_missing(self):
        assert generate_basename({'from': None, 'date': None}) == "unknown-unknown"

    def test_unparseable_date(self):
        metadata = {'from': 'Jonathan <j@example.com>', 'date': 'not a real date'}
        assert generate_basename(metadata) == "unknown-Jonathan"

    def test_none_date_no_crash(self):
        # Should not raise
        prefix = generate_basename({'from': 'Test <t@e.com>', 'date': None})
        assert "unknown" in prefix
+
+
+# --- generate_email_filename ---
+
class TestGenerateEmailFilename:
    """generate_email_filename(): subject cleaning, fallback and truncation."""

    BASE = "2026-02-05-1136-Jonathan"

    def test_standard_subject(self):
        stem = generate_email_filename(self.BASE, "Re: Fw: 4319 Danneel Street")
        assert stem == "2026-02-05-1136-Jonathan-EMAIL-Re-Fw-4319-Danneel-Street"

    def test_subject_with_special_chars(self):
        stem = generate_email_filename(self.BASE, "Update: Meeting (draft) & notes!")
        assert "EMAIL" in stem
        # Colons, parens, ampersands, exclamation stripped
        for unsafe in (":", "(", ")", "&", "!"):
            assert unsafe not in stem

    def test_none_subject(self):
        stem = generate_email_filename(self.BASE, None)
        assert stem == "2026-02-05-1136-Jonathan-EMAIL-no-subject"

    def test_empty_subject(self):
        stem = generate_email_filename(self.BASE, "")
        assert stem == "2026-02-05-1136-Jonathan-EMAIL-no-subject"

    def test_very_long_subject(self):
        long_subject = "A" * 100 + " " + "B" * 100
        stem = generate_email_filename(self.BASE, long_subject)
        # Only the part after "-EMAIL-" is length-limited (80 characters,
        # enforced by _clean_for_filename); the basename prefix is not.
        subject_part = stem.split("-EMAIL-")[1]
        assert len(subject_part) <= 80
+
+
+# --- generate_attachment_filename ---
+
class TestGenerateAttachmentFilename:
    """generate_attachment_filename(): cleaning, extension and fallbacks."""

    BASE = "2026-02-05-1136-Jonathan"

    def test_standard_attachment(self):
        renamed = generate_attachment_filename(self.BASE, "Ltr Carrollton.pdf")
        assert renamed == "2026-02-05-1136-Jonathan-ATTACH-Ltr-Carrollton.pdf"

    def test_filename_with_spaces_and_parens(self):
        renamed = generate_attachment_filename(self.BASE, "Document (final copy).pdf")
        for unsafe in (" ", "(", ")"):
            assert unsafe not in renamed
        assert renamed.endswith(".pdf")

    def test_preserves_extension(self):
        renamed = generate_attachment_filename(self.BASE, "photo.jpg")
        assert renamed.endswith(".jpg")

    def test_none_filename(self):
        renamed = generate_attachment_filename(self.BASE, None)
        assert renamed == "2026-02-05-1136-Jonathan-ATTACH-unnamed"

    def test_empty_filename(self):
        renamed = generate_attachment_filename(self.BASE, "")
        assert renamed == "2026-02-05-1136-Jonathan-ATTACH-unnamed"
diff --git a/docs/scripts/tests/test_integration_stdout.py b/docs/scripts/tests/test_integration_stdout.py
new file mode 100644
index 0000000..d87478e
--- /dev/null
+++ b/docs/scripts/tests/test_integration_stdout.py
@@ -0,0 +1,68 @@
+"""Integration tests for backwards-compatible stdout mode (no --output-dir)."""
+
+import os
+import shutil
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+import importlib.util
+spec = importlib.util.spec_from_file_location(
+ "eml_script",
+ os.path.join(os.path.dirname(__file__), '..', 'eml-view-and-extract-attachments.py')
+)
+eml_script = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(eml_script)
+
+print_email = eml_script.print_email
+
+FIXTURES = os.path.join(os.path.dirname(__file__), 'fixtures')
+
+
+class TestPlainTextStdout:
+ def test_metadata_and_body_printed(self, tmp_path, capsys):
+ eml_src = os.path.join(FIXTURES, 'plain-text.eml')
+ working_eml = tmp_path / "message.eml"
+ shutil.copy2(eml_src, working_eml)
+
+ print_email(str(working_eml))
+ captured = capsys.readouterr()
+
+ assert "From: Jonathan Smith <jsmith@example.com>" in captured.out
+ assert "To: Craig Jennings <craig@example.com>" in captured.out
+ assert "Subject: Re: Fw: 4319 Danneel Street" in captured.out
+ assert "Date:" in captured.out
+ assert "Sent:" in captured.out
+ assert "Received:" in captured.out
+ assert "4319 Danneel Street" in captured.out
+
+
+class TestHtmlFallbackStdout:
+ def test_html_converted_on_stdout(self, tmp_path, capsys):
+ eml_src = os.path.join(FIXTURES, 'html-only.eml')
+ working_eml = tmp_path / "message.eml"
+ shutil.copy2(eml_src, working_eml)
+
+ print_email(str(working_eml))
+ captured = capsys.readouterr()
+
+ # Should see converted text, not raw HTML
+ assert "HTML" in captured.out
+ assert "<p>" not in captured.out
+
+
+class TestAttachmentsStdout:
+ def test_attachment_extracted_alongside_eml(self, tmp_path, capsys):
+ eml_src = os.path.join(FIXTURES, 'with-attachment.eml')
+ working_eml = tmp_path / "message.eml"
+ shutil.copy2(eml_src, working_eml)
+
+ print_email(str(working_eml))
+ captured = capsys.readouterr()
+
+ assert "Extracted attachment:" in captured.out
+ assert "Ltr Carrollton.pdf" in captured.out
+
+ # File should exist alongside the EML
+ extracted = tmp_path / "Ltr Carrollton.pdf"
+ assert extracted.exists()
diff --git a/docs/scripts/tests/test_parse_received_headers.py b/docs/scripts/tests/test_parse_received_headers.py
new file mode 100644
index 0000000..e12e1fb
--- /dev/null
+++ b/docs/scripts/tests/test_parse_received_headers.py
@@ -0,0 +1,105 @@
+"""Tests for parse_received_headers()."""
+
+import email
+import sys
+import os
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from conftest import make_plain_message, add_received_headers
+from email.message import EmailMessage
+
+# Import the function under test
+import importlib.util
+spec = importlib.util.spec_from_file_location(
+ "eml_script",
+ os.path.join(os.path.dirname(__file__), '..', 'eml-view-and-extract-attachments.py')
+)
+eml_script = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(eml_script)
+
+parse_received_headers = eml_script.parse_received_headers
+
+
+class TestSingleHeader:
+ def test_header_with_from_and_by(self):
+ msg = EmailMessage()
+ msg['Received'] = (
+ 'from mail-sender.example.com by mx.receiver.example.com '
+ 'with ESMTP; Thu, 05 Feb 2026 11:36:05 -0600'
+ )
+ result = parse_received_headers(msg)
+ assert result['sent_server'] == 'mail-sender.example.com'
+ assert result['received_server'] == 'mx.receiver.example.com'
+ assert result['sent_time'] == 'Thu, 05 Feb 2026 11:36:05 -0600'
+ assert result['received_time'] == 'Thu, 05 Feb 2026 11:36:05 -0600'
+
+
+class TestMultipleHeaders:
+ def test_uses_first_with_both_from_and_by(self):
+ msg = EmailMessage()
+ # Most recent first (by only)
+ msg['Received'] = 'by internal.example.com with SMTP; Thu, 05 Feb 2026 11:36:10 -0600'
+ # Next: has both from and by — this should be selected
+ msg['Received'] = (
+ 'from mail-sender.example.com by mx.receiver.example.com '
+ 'with ESMTP; Thu, 05 Feb 2026 11:36:05 -0600'
+ )
+ # Oldest
+ msg['Received'] = (
+ 'from originator.example.com by relay.example.com '
+ 'with SMTP; Thu, 05 Feb 2026 11:35:58 -0600'
+ )
+ result = parse_received_headers(msg)
+ assert result['sent_server'] == 'mail-sender.example.com'
+ assert result['received_server'] == 'mx.receiver.example.com'
+
+
+class TestNoReceivedHeaders:
+ def test_all_values_none(self):
+ msg = EmailMessage()
+ result = parse_received_headers(msg)
+ assert result['sent_time'] is None
+ assert result['sent_server'] is None
+ assert result['received_time'] is None
+ assert result['received_server'] is None
+
+
+class TestByButNoFrom:
+ def test_falls_back_to_first_header(self):
+ msg = EmailMessage()
+ msg['Received'] = 'by internal.example.com with SMTP; Thu, 05 Feb 2026 11:36:10 -0600'
+ result = parse_received_headers(msg)
+ assert result['received_server'] == 'internal.example.com'
+ assert result['received_time'] == 'Thu, 05 Feb 2026 11:36:10 -0600'
+ # No from in any header, so sent_server stays None
+ assert result['sent_server'] is None
+
+
+class TestMultilineFoldedHeader:
+ def test_normalizes_whitespace(self):
+ # Use email.message_from_string to parse raw folded headers
+ # (EmailMessage policy rejects embedded CRLF in set values)
+ raw = (
+ "From: test@example.com\r\n"
+ "Received: from mail-sender.example.com\r\n"
+ " by mx.receiver.example.com\r\n"
+ " with ESMTP; Thu, 05 Feb 2026 11:36:05 -0600\r\n"
+ "\r\n"
+ "body\r\n"
+ )
+ msg = email.message_from_string(raw)
+ result = parse_received_headers(msg)
+ assert result['sent_server'] == 'mail-sender.example.com'
+ assert result['received_server'] == 'mx.receiver.example.com'
+
+
+class TestMalformedTimestamp:
+ def test_no_semicolon(self):
+ msg = EmailMessage()
+ msg['Received'] = 'from sender.example.com by receiver.example.com with SMTP'
+ result = parse_received_headers(msg)
+ assert result['sent_server'] == 'sender.example.com'
+ assert result['received_server'] == 'receiver.example.com'
+ assert result['sent_time'] is None
+ assert result['received_time'] is None
diff --git a/docs/scripts/tests/test_process_eml.py b/docs/scripts/tests/test_process_eml.py
new file mode 100644
index 0000000..26c5ad5
--- /dev/null
+++ b/docs/scripts/tests/test_process_eml.py
@@ -0,0 +1,129 @@
+"""Integration tests for process_eml() — full pipeline with --output-dir."""
+
+import os
+import shutil
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+import importlib.util
+spec = importlib.util.spec_from_file_location(
+ "eml_script",
+ os.path.join(os.path.dirname(__file__), '..', 'eml-view-and-extract-attachments.py')
+)
+eml_script = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(eml_script)
+
+process_eml = eml_script.process_eml
+
+import pytest
+
+
+FIXTURES = os.path.join(os.path.dirname(__file__), 'fixtures')
+
+
+class TestPlainTextPipeline:
+ def test_creates_eml_and_txt(self, tmp_path):
+ eml_src = os.path.join(FIXTURES, 'plain-text.eml')
+ # Copy fixture to tmp_path so temp dir can be created as sibling
+ working_eml = tmp_path / "inbox" / "message.eml"
+ working_eml.parent.mkdir()
+ shutil.copy2(eml_src, working_eml)
+
+ output_dir = tmp_path / "output"
+ result = process_eml(str(working_eml), str(output_dir))
+
+ # Should have exactly 2 files: .eml and .txt
+ assert len(result['files']) == 2
+ eml_file = result['files'][0]
+ txt_file = result['files'][1]
+
+ assert eml_file['type'] == 'eml'
+ assert txt_file['type'] == 'txt'
+ assert eml_file['name'].endswith('.eml')
+ assert txt_file['name'].endswith('.txt')
+
+ # Files exist in output dir
+ assert os.path.isfile(eml_file['path'])
+ assert os.path.isfile(txt_file['path'])
+
+ # Filenames contain expected components
+ assert 'Jonathan' in eml_file['name']
+ assert 'EMAIL' in eml_file['name']
+ assert '2026-02-05' in eml_file['name']
+
+ # Temp dir cleaned up (no extract-* dirs in inbox)
+ inbox_contents = os.listdir(str(tmp_path / "inbox"))
+ assert not any(d.startswith('extract-') for d in inbox_contents)
+
+
+class TestHtmlFallbackPipeline:
+ def test_txt_contains_converted_html(self, tmp_path):
+ eml_src = os.path.join(FIXTURES, 'html-only.eml')
+ working_eml = tmp_path / "inbox" / "message.eml"
+ working_eml.parent.mkdir()
+ shutil.copy2(eml_src, working_eml)
+
+ output_dir = tmp_path / "output"
+ result = process_eml(str(working_eml), str(output_dir))
+
+ txt_file = result['files'][1]
+ with open(txt_file['path'], 'r') as f:
+ content = f.read()
+
+ # Should be converted, not raw HTML
+ assert '<p>' not in content
+ assert '<strong>' not in content
+ assert 'HTML' in content
+
+
+class TestAttachmentPipeline:
+ def test_eml_txt_and_attachment_created(self, tmp_path):
+ eml_src = os.path.join(FIXTURES, 'with-attachment.eml')
+ working_eml = tmp_path / "inbox" / "message.eml"
+ working_eml.parent.mkdir()
+ shutil.copy2(eml_src, working_eml)
+
+ output_dir = tmp_path / "output"
+ result = process_eml(str(working_eml), str(output_dir))
+
+ assert len(result['files']) == 3
+ types = [f['type'] for f in result['files']]
+ assert types == ['eml', 'txt', 'attach']
+
+ # Attachment is auto-renamed
+ attach_file = result['files'][2]
+ assert 'ATTACH' in attach_file['name']
+ assert attach_file['name'].endswith('.pdf')
+ assert os.path.isfile(attach_file['path'])
+
+
+class TestCollisionDetection:
+ def test_raises_on_existing_file(self, tmp_path):
+ eml_src = os.path.join(FIXTURES, 'plain-text.eml')
+ working_eml = tmp_path / "inbox" / "message.eml"
+ working_eml.parent.mkdir()
+ shutil.copy2(eml_src, working_eml)
+
+ output_dir = tmp_path / "output"
+ # Run once to create files
+ result = process_eml(str(working_eml), str(output_dir))
+
+ # Run again — should raise FileExistsError
+ with pytest.raises(FileExistsError, match="Collision"):
+ process_eml(str(working_eml), str(output_dir))
+
+
+class TestMissingOutputDir:
+ def test_creates_directory(self, tmp_path):
+ eml_src = os.path.join(FIXTURES, 'plain-text.eml')
+ working_eml = tmp_path / "inbox" / "message.eml"
+ working_eml.parent.mkdir()
+ shutil.copy2(eml_src, working_eml)
+
+ output_dir = tmp_path / "new" / "nested" / "output"
+ assert not output_dir.exists()
+
+ result = process_eml(str(working_eml), str(output_dir))
+ assert output_dir.exists()
+ assert len(result['files']) == 2
diff --git a/docs/scripts/tests/test_save_attachments.py b/docs/scripts/tests/test_save_attachments.py
new file mode 100644
index 0000000..32f02a6
--- /dev/null
+++ b/docs/scripts/tests/test_save_attachments.py
@@ -0,0 +1,97 @@
+"""Tests for save_attachments()."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from conftest import make_plain_message, make_message_with_attachment
+from email.mime.multipart import MIMEMultipart
+from email.mime.text import MIMEText
+from email.mime.application import MIMEApplication
+
+import importlib.util
+spec = importlib.util.spec_from_file_location(
+ "eml_script",
+ os.path.join(os.path.dirname(__file__), '..', 'eml-view-and-extract-attachments.py')
+)
+eml_script = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(eml_script)
+
+save_attachments = eml_script.save_attachments
+
+
+class TestSingleAttachment:
+ def test_file_written_and_returned(self, tmp_path):
+ msg = make_message_with_attachment(
+ attachment_filename="report.pdf",
+ attachment_content=b"pdf bytes here"
+ )
+ result = save_attachments(msg, str(tmp_path), "2026-02-05-1136-Jonathan")
+
+ assert len(result) == 1
+ assert result[0]['original_name'] == "report.pdf"
+ assert "ATTACH" in result[0]['renamed_name']
+ assert result[0]['renamed_name'].endswith(".pdf")
+
+ # File actually exists and has correct content
+ written_path = result[0]['path']
+ assert os.path.isfile(written_path)
+ with open(written_path, 'rb') as f:
+ assert f.read() == b"pdf bytes here"
+
+
+class TestMultipleAttachments:
+ def test_all_written_and_returned(self, tmp_path):
+ msg = MIMEMultipart()
+ msg['From'] = 'test@example.com'
+ msg['Date'] = 'Thu, 05 Feb 2026 11:36:00 -0600'
+ msg.attach(MIMEText("body", 'plain'))
+
+ for name, content in [("doc1.pdf", b"pdf1"), ("image.png", b"png1")]:
+ att = MIMEApplication(content, Name=name)
+ att['Content-Disposition'] = f'attachment; filename="{name}"'
+ msg.attach(att)
+
+ result = save_attachments(msg, str(tmp_path), "2026-02-05-1136-Jonathan")
+
+ assert len(result) == 2
+ for r in result:
+ assert os.path.isfile(r['path'])
+
+
+class TestNoAttachments:
+ def test_empty_list(self, tmp_path):
+ msg = make_plain_message()
+ result = save_attachments(msg, str(tmp_path), "2026-02-05-1136-Jonathan")
+ assert result == []
+
+
+class TestFilenameWithSpaces:
+ def test_cleaned_filename(self, tmp_path):
+ msg = make_message_with_attachment(
+ attachment_filename="My Document (1).pdf",
+ attachment_content=b"data"
+ )
+ result = save_attachments(msg, str(tmp_path), "2026-02-05-1136-Jonathan")
+
+ assert len(result) == 1
+ assert " " not in result[0]['renamed_name']
+ assert os.path.isfile(result[0]['path'])
+
+
+class TestNoContentDisposition:
+ def test_skipped(self, tmp_path):
+ msg = MIMEMultipart()
+ msg['From'] = 'test@example.com'
+ msg.attach(MIMEText("body", 'plain'))
+
+ # Add a part without Content-Disposition
+ part = MIMEApplication(b"data", Name="file.bin")
+ # Explicitly remove Content-Disposition if present
+ if 'Content-Disposition' in part:
+ del part['Content-Disposition']
+ msg.attach(part)
+
+ result = save_attachments(msg, str(tmp_path), "2026-02-05-1136-Jonathan")
+ assert result == []