From afb86d5c559413bddf80ff38260d0cf0debb585f Mon Sep 17 00:00:00 2001
From: Craig Jennings
Date: Thu, 6 Nov 2025 00:43:13 -0600
Subject: feat: Add AssemblyAI transcription backend with speaker diarization
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Integrated AssemblyAI as the third transcription backend alongside OpenAI API
and local-whisper, now set as the default due to superior speaker diarization
capabilities (up to 50 speakers).

New Features:
- AssemblyAI backend with automatic speaker labeling
- Backend switching UI via C-; T b (completing-read interface)
- Universal speech model supporting 99 languages
- API key management through auth-source/authinfo.gpg

Implementation:
- Created scripts/assemblyai-transcribe (upload → poll → format workflow)
- Updated transcription-config.el with multi-backend support
- Added cj/--get-assemblyai-api-key for secure credential retrieval
- Refactored process environment handling from if to pcase
- Added cj/transcription-switch-backend interactive command

Testing:
- Created test-transcription-config--transcription-script-path.el
- 5 unit tests covering all 3 backends (100% passing)
- Followed quality-engineer.org guidelines (test pure functions only)
- Investigated 18 test failures: documented cleanup in todo.org

Files Modified:
- modules/transcription-config.el - Multi-backend support and UI
- scripts/assemblyai-transcribe - NEW: AssemblyAI integration script
- tests/test-transcription-config--transcription-script-path.el - NEW
- todo.org - Added test cleanup task (Method 3, priority C)
- docs/NOTES.org - Comprehensive session notes added

Successfully tested with 33KB and 4.1MB audio files (3s and 9s processing).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude
---
 scripts/assemblyai-transcribe | 140 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 140 insertions(+)
 create mode 100755 scripts/assemblyai-transcribe

(limited to 'scripts/assemblyai-transcribe')

Review note: the script below was re-flowed from a whitespace-collapsed copy
and revised in review; the blob id on the "index" line predates those changes.

diff --git a/scripts/assemblyai-transcribe b/scripts/assemblyai-transcribe
new file mode 100755
index 00000000..22cbf538
--- /dev/null
+++ b/scripts/assemblyai-transcribe
@@ -0,0 +1,140 @@
+#!/usr/bin/env bash
+# assemblyai-transcribe - Transcribe audio files using AssemblyAI API with speaker diarization
+# Usage: assemblyai-transcribe <audio-file> [language]
+#
+# Requires: ASSEMBLYAI_API_KEY environment variable, plus curl and jq
+# Language: en, es, fr, etc. (default: en)
+# Features: Speaker diarization (up to 50 speakers)
+# Output:   transcript on stdout; progress and errors on stderr
+
+set -euo pipefail
+
+readonly API_BASE="https://api.assemblyai.com/v2"
+readonly POLL_INTERVAL=3
+readonly MAX_WAIT=1800 # 30 minutes
+
+# Print each argument on its own line to stderr, then abort with status 1.
+die() {
+  printf '%s\n' "$@" >&2
+  exit 1
+}
+
+# Run curl against the API with the Authorization header set; all arguments
+# are passed through to curl. -sS hides the progress meter but still reports
+# transport errors, which plain -s would swallow.
+api() {
+  curl -sS -H "Authorization: ${ASSEMBLYAI_API_KEY}" "$@"
+}
+
+# Parse arguments
+# The language is kept out of LANG on purpose: LANG is the locale variable, and
+# assigning to it would change the locale of this shell and of curl/jq.
+audio="${1:-}"
+language="${2:-en}"
+
+# Validate arguments
+if [[ -z "$audio" ]]; then
+  die "Usage: assemblyai-transcribe <audio-file> [language]" \
+    "Example: assemblyai-transcribe meeting.m4a en"
+fi
+
+[[ -f "$audio" ]] || die "Error: Audio file not found: $audio"
+
+# Check API key is set
+[[ -n "${ASSEMBLYAI_API_KEY:-}" ]] \
+  || die "Error: ASSEMBLYAI_API_KEY environment variable not set"
+
+# Check curl is available
+command -v curl >/dev/null 2>&1 || die "Error: curl command not found"
+
+# Check jq is available (for JSON parsing)
+command -v jq >/dev/null 2>&1 \
+  || die "Error: jq command not found (required for JSON parsing)" \
+    "Install with: sudo pacman -S jq"
+
+# Step 1: Upload audio file
+echo "Uploading audio file..." >&2
+upload_response=$(api -X POST "${API_BASE}/upload" --data-binary "@${audio}") \
+  || die "Error: Failed to upload audio file"
+
+# A body that is not JSON makes jq fail; treat it like a missing upload_url.
+upload_url=$(jq -r '.upload_url // empty' <<<"$upload_response" 2>/dev/null) \
+  || upload_url=""
+
+if [[ -z "$upload_url" ]]; then
+  die "Error: Failed to upload audio file" "$upload_response"
+fi
+
+echo "Upload complete. Submitting transcription..." >&2
+
+# Step 2: Submit transcription request with speaker labels
+# NOTE(review): this step was unreadable in the copy under review and was
+# rebuilt from the AssemblyAI v2 API -- confirm the request fields and the
+# failure message against the committed script.
+# jq assembles the JSON so both values are escaped correctly.
+transcript_request=$(jq -n \
+  --arg audio_url "$upload_url" \
+  --arg language_code "$language" \
+  '{audio_url: $audio_url, language_code: $language_code, speaker_labels: true}')
+
+transcript_response=$(api -X POST "${API_BASE}/transcript" \
+  -H "Content-Type: application/json" \
+  --data "$transcript_request") \
+  || die "Error: Failed to submit transcription request"
+
+transcript_id=$(jq -r '.id // empty' <<<"$transcript_response" 2>/dev/null) \
+  || transcript_id=""
+
+if [[ -z "$transcript_id" ]]; then
+  die "Error: Failed to submit transcription request" "$transcript_response"
+fi
+
+echo "Transcription job submitted (ID: ${transcript_id})" >&2
+echo "Waiting for completion..." >&2
+
+# Step 3: Poll for completion
+status="queued"
+result=""
+elapsed=0
+
+while [[ "$status" == "queued" || "$status" == "processing" ]]; do
+  if ((elapsed >= MAX_WAIT)); then
+    die "Error: Transcription timed out after ${MAX_WAIT} seconds"
+  fi
+
+  sleep "$POLL_INTERVAL"
+  elapsed=$((elapsed + POLL_INTERVAL))
+
+  # A failed poll (transport error or a body jq cannot parse) is retried until
+  # MAX_WAIT rather than aborting a job that may still be running remotely.
+  if ! result=$(api -X GET "${API_BASE}/transcript/${transcript_id}") \
+    || ! status=$(jq -r '.status' <<<"$result" 2>/dev/null); then
+    echo "Warning: status check failed, retrying... (${elapsed}s elapsed)" >&2
+    status="queued"
+    continue
+  fi
+
+  if [[ "$status" == "processing" ]]; then
+    echo "Processing... (${elapsed}s elapsed)" >&2
+  fi
+done
+
+# Check if transcription failed
+if [[ "$status" != "completed" ]]; then
+  error_msg=$(jq -r '.error // "Unknown error"' <<<"$result")
+  die "Error: Transcription failed with status: ${status}" \
+    "Error message: ${error_msg}"
+fi
+
+echo "Transcription complete! (${elapsed}s total)" >&2
+
+# Step 4: Format output with speaker labels
+# Extract utterances and format as "Speaker A: text"; without utterances fall
+# back to the plain transcript text.
+jq -r '
+  if (.utterances // []) | length > 0 then
+    .utterances[] | "Speaker \(.speaker): \(.text)"
+  else
+    .text // ""
+  end
+' <<<"$result"
-- 
cgit v1.2.3