Autoparts-DB/pos/services/whisper_local.py

"""
Local Whisper transcription service.

Uses faster-whisper (a CTranslate2-based port of OpenAI Whisper) for
transcribing short audio clips (WhatsApp voice notes) on the CPU.

Runs fully offline after the first model download. No API keys, no
per-minute cost. Model is lazy-loaded on first call and cached in memory
for the lifetime of the process.

Default model: 'base' (~150 MB) — a good accuracy/speed trade-off for
conversational Spanish on CPU. Change WHISPER_MODEL below to 'tiny'
(~75 MB, faster but misses words in noisy audio) or 'small' (~500 MB,
noticeably better accuracy) if you have the RAM and don't mind 2-3x
slower inference.
"""

import base64 as _b64
import os
import subprocess
import tempfile
import threading

# ─── Config ──────────────────────────────────────────────────────────────
# These three values are passed verbatim to WhisperModel(...) in _get_model().
#
# 'base' is the sweet spot for Mexican Spanish voice notes on CPU:
#   tiny  (75 MB) — too small, misses words in noisy/robot audio
#   base  (150 MB) — good accuracy, ~2s per 30s clip on a modern CPU  ← default
#   small (500 MB) — best accuracy, ~5s per 30s clip, worth it if RAM permits
WHISPER_MODEL = "base"
WHISPER_DEVICE = "cpu"
WHISPER_COMPUTE = "int8"     # int8 quantization — CPU-friendly, minimal quality loss

# ─── Lazy singleton model loader ─────────────────────────────────────────
# _model stays None until the first _get_model() call, then holds the loaded
# WhisperModel for the rest of the process. _model_lock guards that one-time
# load so concurrent first callers do not each load the model.
_model = None
_model_lock = threading.Lock()


def _get_model():
    """Return the process-wide Whisper model, loading it on the first call.

    The unlocked check keeps the common (already loaded) path free of lock
    contention; the second check under ``_model_lock`` ensures that when
    several threads race on the first call only one of them performs the load.
    """
    global _model
    if _model is None:
        with _model_lock:
            # Re-check: another thread may have finished loading while we
            # were waiting for the lock.
            if _model is None:
                # Imported lazily so importing this module stays cheap and
                # does not require faster-whisper until it is actually used.
                from faster_whisper import WhisperModel
                print(f"[whisper] Loading {WHISPER_MODEL} model ({WHISPER_DEVICE}, {WHISPER_COMPUTE})...")
                _model = WhisperModel(
                    WHISPER_MODEL,
                    device=WHISPER_DEVICE,
                    compute_type=WHISPER_COMPUTE,
                )
                print("[whisper] Model ready.")
    return _model


# ─── Public API ──────────────────────────────────────────────────────────

def transcribe_audio_base64(audio_base64: str, mimetype: str = "audio/ogg",
                             language: str = "es") -> str | None:
    """Transcribe a base64-encoded audio blob to text.

    Args:
        audio_base64: Raw base64 string (no data: prefix).
        mimetype: MIME type from the sender (e.g. 'audio/ogg' for WA voice notes).
        language: ISO 639-1 code to bias the model. 'es' for Spanish MX.

    Returns:
        The transcribed text, or None if transcription fails or is empty.
    """
    if not audio_base64:
        return None

    # Decode base64 → write to a temp file with the right extension so
    # ffmpeg (invoked by faster-whisper/CTranslate2) picks the decoder.
    ext = _extension_for_mimetype(mimetype)
    try:
        audio_bytes = _b64.b64decode(audio_base64)
    except Exception as e:
        print(f"[whisper] base64 decode failed: {e}")
        return None

    tmp_in = None
    tmp_wav = None
    try:
        # Write the original audio to a temp file
        tmp_in = tempfile.NamedTemporaryFile(suffix=ext, delete=False)
        tmp_in.write(audio_bytes)
        tmp_in.close()

        # WhatsApp voice notes are OGG/Opus — faster-whisper can handle it
        # via its pyav decoder, but converting to 16kHz mono WAV first is
        # more reliable across formats and ~2x faster.
        tmp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        tmp_wav.close()
        rc = subprocess.run(
            ["ffmpeg", "-y", "-i", tmp_in.name,
             "-ar", "16000", "-ac", "1",
             "-f", "wav", tmp_wav.name],
            capture_output=True,
        )
        if rc.returncode != 0:
            print(f"[whisper] ffmpeg conversion failed: {rc.stderr.decode()[:200]}")
            return None

        # Run Whisper
        # - beam_size=5 for better accuracy on short/noisy clips
        # - no VAD filter (was trimming real speech in some tests)
        # - condition_on_previous_text=False for short independent clips
        model = _get_model()
        segments, info = model.transcribe(
            tmp_wav.name,
            language=language,
            beam_size=5,
            vad_filter=False,
            condition_on_previous_text=False,
        )
        text = " ".join(s.text.strip() for s in segments if s.text.strip())
        text = text.strip()

        if not text:
            return None

        print(f"[whisper] ({info.language}, {info.duration:.1f}s) → {text[:100]}")
        return text

    except Exception as e:
        print(f"[whisper] transcription error: {e}")
        return None
    finally:
        for f in (tmp_in, tmp_wav):
            if f:
                try:
                    os.unlink(f.name)
                except Exception:
                    pass


def _extension_for_mimetype(mimetype: str) -> str:
    """Map a MIME type to a file extension ffmpeg understands.

    The base type (the part before any ``;`` parameters) is matched first,
    so the container decides the extension: ``audio/webm;codecs=opus`` maps
    to ``.webm`` rather than ``.ogg``. Only when the base type matches
    nothing is the full string, including parameters such as ``codecs=``,
    consulted.

    Args:
        mimetype: MIME type string, optionally with parameters. ``None`` or
            an empty string is accepted.

    Returns:
        One of ``.ogg``, ``.mp3``, ``.m4a``, ``.wav`` or ``.webm``. Falls
        back to ``.ogg`` when nothing matches.
    """
    full = (mimetype or "").lower()
    base_type = full.partition(";")[0].strip()

    # (substrings to look for, extension) — checked in this order.
    rules = (
        (("opus", "ogg"), ".ogg"),
        (("mp3", "mpeg"), ".mp3"),
        (("mp4", "aac"), ".m4a"),
        (("wav",), ".wav"),
        (("webm",), ".webm"),
    )
    for candidate in (base_type, full):
        for needles, ext in rules:
            if any(needle in candidate for needle in needles):
                return ext
    return ".ogg"  # WhatsApp voice notes are usually OGG/Opus