""" Local Whisper transcription service. Uses faster-whisper (a CTranslate2-based port of OpenAI Whisper) for transcribing short audio clips (WhatsApp voice notes) on the CPU. Runs fully offline after the first model download. No API keys, no per-minute cost. Model is lazy-loaded on first call and cached in memory for the lifetime of the process. Default model: 'tiny' — the smallest and fastest variant (~75 MB), good enough for conversational Spanish. Change WHISPER_MODEL below to 'base' (150 MB, slightly better accuracy) or 'small' (500 MB, noticeably better) if you have the RAM and don't mind 2-3x slower inference. """ import base64 as _b64 import os import subprocess import tempfile import threading # ─── Config ────────────────────────────────────────────────────────────── # 'base' is the sweet spot for Mexican Spanish voice notes on CPU: # tiny (75 MB) — too small, misses words in noisy/robot audio # base (150 MB) — good accuracy, ~2s per 30s clip on a modern CPU ← default # small (500 MB) — best accuracy, ~5s per 30s clip, worth it if RAM permits WHISPER_MODEL = "base" WHISPER_DEVICE = "cpu" WHISPER_COMPUTE = "int8" # int8 quantization — CPU-friendly, minimal quality loss # ─── Lazy singleton model loader ───────────────────────────────────────── _model = None _model_lock = threading.Lock() def _get_model(): """Load the Whisper model on first use. Thread-safe.""" global _model if _model is not None: return _model with _model_lock: if _model is not None: return _model from faster_whisper import WhisperModel print(f"[whisper] Loading {WHISPER_MODEL} model ({WHISPER_DEVICE}, {WHISPER_COMPUTE})...") _model = WhisperModel( WHISPER_MODEL, device=WHISPER_DEVICE, compute_type=WHISPER_COMPUTE, ) print("[whisper] Model ready.") return _model # ─── Public API ────────────────────────────────────────────────────────── def transcribe_audio_base64(audio_base64: str, mimetype: str = "audio/ogg", language: str = "es") -> str | None: """Transcribe a base64-encoded audio blob to text. Args: audio_base64: Raw base64 string (no data: prefix). mimetype: MIME type from the sender (e.g. 'audio/ogg' for WA voice notes). language: ISO 639-1 code to bias the model. 'es' for Spanish MX. Returns: The transcribed text, or None if transcription fails or is empty. """ if not audio_base64: return None # Decode base64 → write to a temp file with the right extension so # ffmpeg (invoked by faster-whisper/CTranslate2) picks the decoder. ext = _extension_for_mimetype(mimetype) try: audio_bytes = _b64.b64decode(audio_base64) except Exception as e: print(f"[whisper] base64 decode failed: {e}") return None tmp_in = None tmp_wav = None try: # Write the original audio to a temp file tmp_in = tempfile.NamedTemporaryFile(suffix=ext, delete=False) tmp_in.write(audio_bytes) tmp_in.close() # WhatsApp voice notes are OGG/Opus — faster-whisper can handle it # via its pyav decoder, but converting to 16kHz mono WAV first is # more reliable across formats and ~2x faster. tmp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False) tmp_wav.close() rc = subprocess.run( ["ffmpeg", "-y", "-i", tmp_in.name, "-ar", "16000", "-ac", "1", "-f", "wav", tmp_wav.name], capture_output=True, ) if rc.returncode != 0: print(f"[whisper] ffmpeg conversion failed: {rc.stderr.decode()[:200]}") return None # Run Whisper # - beam_size=5 for better accuracy on short/noisy clips # - no VAD filter (was trimming real speech in some tests) # - condition_on_previous_text=False for short independent clips model = _get_model() segments, info = model.transcribe( tmp_wav.name, language=language, beam_size=5, vad_filter=False, condition_on_previous_text=False, ) text = " ".join(s.text.strip() for s in segments if s.text.strip()) text = text.strip() if not text: return None print(f"[whisper] ({info.language}, {info.duration:.1f}s) → {text[:100]}") return text except Exception as e: print(f"[whisper] transcription error: {e}") return None finally: for f in (tmp_in, tmp_wav): if f: try: os.unlink(f.name) except Exception: pass def _extension_for_mimetype(mimetype: str) -> str: """Map a MIME type to a file extension ffmpeg understands.""" m = (mimetype or "").lower() if "opus" in m or "ogg" in m: return ".ogg" if "mp3" in m or "mpeg" in m: return ".mp3" if "mp4" in m or "aac" in m: return ".m4a" if "wav" in m: return ".wav" if "webm" in m: return ".webm" return ".ogg" # WhatsApp voice notes are usually OGG/Opus