diff options
Diffstat (limited to 'scripts')
| -rwxr-xr-x | scripts/assemblyai-transcribe | 134 | ||||
| -rwxr-xr-x | scripts/languagetool-flycheck | 82 |
2 files changed, 216 insertions, 0 deletions
diff --git a/scripts/assemblyai-transcribe b/scripts/assemblyai-transcribe
new file mode 100755
index 00000000..22cbf538
--- /dev/null
+++ b/scripts/assemblyai-transcribe
@@ -0,0 +1,134 @@
+#!/usr/bin/env bash
+# assemblyai-transcribe - Transcribe audio files using AssemblyAI API with speaker diarization
+# Usage: assemblyai-transcribe <audio-file> [language]
+#
+# Requires: ASSEMBLYAI_API_KEY environment variable
+# Language: en, es, fr, etc. (default: en)
+# Features: Speaker diarization (up to 50 speakers)
+
+set -euo pipefail
+
+# Parse arguments (LANG is the locale variable; reusing it would change the locale of curl/jq)
+AUDIO="${1:-}"
+LANGUAGE_CODE="${2:-en}"
+
+# Validate arguments
+if [[ -z "$AUDIO" ]]; then
+    echo "Usage: assemblyai-transcribe <audio-file> [language]" >&2
+    echo "Example: assemblyai-transcribe meeting.m4a en" >&2
+    exit 1
+fi
+
+if [[ ! -f "$AUDIO" ]]; then
+    echo "Error: Audio file not found: $AUDIO" >&2
+    exit 1
+fi
+
+# Check API key is set
+if [[ -z "${ASSEMBLYAI_API_KEY:-}" ]]; then
+    echo "Error: ASSEMBLYAI_API_KEY environment variable not set" >&2
+    exit 1
+fi
+
+# Check curl is available
+if ! command -v curl &> /dev/null; then
+    echo "Error: curl command not found" >&2
+    exit 1
+fi
+
+# Check jq is available (for JSON parsing)
+if ! command -v jq &> /dev/null; then
+    echo "Error: jq command not found (required for JSON parsing)" >&2
+    echo "Install with: sudo pacman -S jq" >&2
+    exit 1
+fi
+
+API_BASE="https://api.assemblyai.com/v2"
+
+# Step 1: Upload audio file (-sS hides the progress meter but still prints curl errors)
+echo "Uploading audio file..." >&2
+UPLOAD_RESPONSE=$(curl -sS -X POST "${API_BASE}/upload" \
+    -H "Authorization: ${ASSEMBLYAI_API_KEY}" \
+    --data-binary "@${AUDIO}")
+
+UPLOAD_URL=$(jq -r '.upload_url' <<<"$UPLOAD_RESPONSE")
+
+if [[ -z "$UPLOAD_URL" ]] || [[ "$UPLOAD_URL" == "null" ]]; then
+    echo "Error: Failed to upload audio file" >&2
+    echo "$UPLOAD_RESPONSE" >&2
+    exit 1
+fi
+
+echo "Upload complete. Submitting transcription..." >&2
+
+# Step 2: Submit transcription request with speaker labels (jq escapes the values)
+TRANSCRIPT_REQUEST=$(jq -n \
+    --arg audio_url "$UPLOAD_URL" \
+    --arg language_code "$LANGUAGE_CODE" \
+    '{
+        audio_url: $audio_url,
+        language_code: $language_code,
+        speech_model: "universal",
+        speaker_labels: true
+    }')
+
+TRANSCRIPT_RESPONSE=$(curl -sS -X POST "${API_BASE}/transcript" \
+    -H "Authorization: ${ASSEMBLYAI_API_KEY}" \
+    -H "Content-Type: application/json" \
+    -d "$TRANSCRIPT_REQUEST")
+
+TRANSCRIPT_ID=$(jq -r '.id' <<<"$TRANSCRIPT_RESPONSE")
+
+if [[ -z "$TRANSCRIPT_ID" ]] || [[ "$TRANSCRIPT_ID" == "null" ]]; then
+    echo "Error: Failed to submit transcription" >&2
+    echo "$TRANSCRIPT_RESPONSE" >&2
+    exit 1
+fi
+
+echo "Transcription job submitted (ID: ${TRANSCRIPT_ID})" >&2
+echo "Waiting for completion..." >&2
+
+# Step 3: Poll for completion
+STATUS="queued"
+POLL_INTERVAL=3
+MAX_WAIT=1800  # 30 minutes
+ELAPSED=0
+
+while [[ "$STATUS" == "queued" ]] || [[ "$STATUS" == "processing" ]]; do
+    if [[ $ELAPSED -ge $MAX_WAIT ]]; then
+        echo "Error: Transcription timed out after ${MAX_WAIT} seconds" >&2
+        exit 1
+    fi
+
+    sleep "$POLL_INTERVAL"
+    ELAPSED=$((ELAPSED + POLL_INTERVAL))
+
+    RESULT=$(curl -sS -X GET "${API_BASE}/transcript/${TRANSCRIPT_ID}" \
+        -H "Authorization: ${ASSEMBLYAI_API_KEY}")
+
+    STATUS=$(jq -r '.status' <<<"$RESULT")
+
+    if [[ "$STATUS" == "processing" ]]; then
+        echo "Processing... (${ELAPSED}s elapsed)" >&2
+    fi
+done
+
+# Check if transcription failed
+if [[ "$STATUS" != "completed" ]]; then
+    ERROR_MSG=$(jq -r '.error // "Unknown error"' <<<"$RESULT")
+    echo "Error: Transcription failed with status: ${STATUS}" >&2
+    echo "Error message: ${ERROR_MSG}" >&2
+    exit 1
+fi
+
+echo "Transcription complete! (${ELAPSED}s total)" >&2
+
+# Step 4: Format output with speaker labels
+# Extract utterances and format as "Speaker A: text"
+jq -r '
+    if .utterances then
+        .utterances[] | "Speaker \(.speaker): \(.text)"
+    else
+        .text
+    end
+' <<<"$RESULT"
diff --git a/scripts/languagetool-flycheck b/scripts/languagetool-flycheck
new file mode 100755
index 00000000..ecbc900f
--- /dev/null
+++ b/scripts/languagetool-flycheck
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+"""
+Wrapper for LanguageTool to produce flycheck-compatible output.
+Output format: filename:line:column: message
+"""
+
+import json
+import sys
+import subprocess
+
+def main():
+    """Run LanguageTool on the file named in argv[1]; print one line per match."""
+    if len(sys.argv) < 2:
+        print("Usage: languagetool-flycheck FILE", file=sys.stderr)
+        sys.exit(1)
+
+    filename = sys.argv[1]
+
+    # Run languagetool with JSON output
+    try:
+        result = subprocess.run(
+            ['languagetool', '-l', 'en-US', '--json', filename],
+            capture_output=True,
+            text=True,
+            timeout=30
+        )
+    except subprocess.TimeoutExpired:
+        print(f"{filename}:1:1: LanguageTool timeout", file=sys.stderr)
+        sys.exit(1)
+    except Exception as e:
+        print(f"{filename}:1:1: LanguageTool error: {e}", file=sys.stderr)
+        sys.exit(1)
+
+    # Parse JSON output
+    try:
+        # Find the JSON in the output (skip warning lines)
+        output_lines = result.stdout.split('\n')
+        json_output = next((ln for ln in output_lines if ln.startswith('{')), None)
+
+        if not json_output:
+            # No JSON and a non-zero exit: the checker itself failed, not a clean file
+            if result.returncode != 0:
+                detail = result.stderr.strip() or f"exit code {result.returncode}"
+                print(f"{filename}:1:1: LanguageTool failed: {detail}", file=sys.stderr)
+                sys.exit(1)
+            sys.exit(0)  # Nothing to report
+
+        data = json.loads(json_output)
+
+        # Read file to calculate line numbers from character offsets
+        with open(filename, 'r', encoding='utf-8') as f:
+            content = f.read()
+
+        # Convert matches to flycheck format
+        for match in data.get('matches', []):
+            offset = match['offset']
+            rule_id = match['rule']['id']
+
+            # Calculate 1-based line and column from offset
+            # NOTE(review): treats offset as a Python str index - confirm for non-BMP text
+            line = content[:offset].count('\n') + 1
+            line_start = content.rfind('\n', 0, offset) + 1
+            column = offset - line_start + 1
+
+            # Append the first suggestion if available
+            message = f"{rule_id}: {match['message']}"
+            suggestions = match.get('replacements', [])
+            if suggestions:
+                message += f" Suggestion: {suggestions[0]['value']}"
+
+            # Output in flycheck format
+            print(f"{filename}:{line}:{column}: {message}")
+
+    except json.JSONDecodeError as e:
+        print(f"{filename}:1:1: Failed to parse LanguageTool JSON: {e}", file=sys.stderr)
+        sys.exit(1)
+    except Exception as e:
+        print(f"{filename}:1:1: Error processing LanguageTool output: {e}", file=sys.stderr)
+        sys.exit(1)
+
+if __name__ == '__main__':
+    main()
