Files
Autoparts-DB/pos/services/whisper_local.py
consultoria-as e95f7cf684 feat: complete session — catalog, marketplace, WhatsApp, peer-to-peer, install scripts
Major features:
- Pixel-Perfect glassmorphism design (landing + POS + public catalog)
- OEM/Local catalog toggle with Nexpart taxonomy (14 groups, 108 subgroups, 558 part types)
- Marketplace B2B Phase 1 (bodegas, POs, status machine, WA+email notifications)
- Peer-to-peer inventory (multi-instance, LAN discovery)
- WhatsApp: photo→Vision AI, voice→Whisper, conversational quotations
- Smart unified search (VIN/plate/part_number/keyword auto-detect)
- Shop Supplies tab (vehicle-independent parts)
- Chatbot AI fallback chain (5 models) + response cache
- CSV inventory import tool + setup_instance.sh installer
- Tablet-responsive CSS + sidebar toggle
- Filters, export CSV, employee edit, business data save
- Quotation system (WA→POS) with auto-print on confirmation
- Live stats on landing page

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-18 05:35:53 +00:00

152 lines
5.5 KiB
Python

"""
Local Whisper transcription service.
Uses faster-whisper (a CTranslate2-based port of OpenAI Whisper) for
transcribing short audio clips (WhatsApp voice notes) on the CPU.
Runs fully offline after the first model download. No API keys, no
per-minute cost. Model is lazy-loaded on first call and cached in memory
for the lifetime of the process.
Default model: 'base' (~150 MB) — good accuracy for conversational Spanish
at a reasonable speed on CPU. Change WHISPER_MODEL below to 'tiny' (~75 MB,
the smallest and fastest variant, but it misses words in noisy audio) or to
'small' (~500 MB, noticeably better accuracy) if you have the RAM and don't
mind 2-3x slower inference.
"""
import base64 as _b64
import os
import subprocess
import tempfile
import threading
# ─── Config ──────────────────────────────────────────────────────────────
# Model size name handed to WhisperModel() in _get_model().
# 'base' is the sweet spot for Mexican Spanish voice notes on CPU:
#   tiny  (75 MB)  — too small, misses words in noisy/robot audio
#   base  (150 MB) — good accuracy, ~2s per 30s clip on a modern CPU  ← default
#   small (500 MB) — best accuracy, ~5s per 30s clip, worth it if RAM permits
WHISPER_MODEL = "base"
# Passed to WhisperModel() as the `device` argument.
WHISPER_DEVICE = "cpu"
# Passed to WhisperModel() as the `compute_type` argument.
WHISPER_COMPUTE = "int8" # int8 quantization — CPU-friendly, minimal quality loss
# ─── Lazy singleton model loader ─────────────────────────────────────────
# Process-wide model instance; stays None until the first transcription.
_model = None
# Serializes the one-time model construction across threads.
_model_lock = threading.Lock()


def _get_model():
    """Return the shared Whisper model, building it on the first call.

    The model is constructed at most once per process. The unlocked check
    keeps later calls cheap; the second check under the lock stops two
    threads that both saw ``None`` from each loading their own copy.
    """
    global _model
    if _model is None:
        with _model_lock:
            if _model is None:
                # Imported here so the dependency is only needed once a
                # transcription is actually requested.
                from faster_whisper import WhisperModel

                print(f"[whisper] Loading {WHISPER_MODEL} model ({WHISPER_DEVICE}, {WHISPER_COMPUTE})...")
                _model = WhisperModel(
                    WHISPER_MODEL,
                    device=WHISPER_DEVICE,
                    compute_type=WHISPER_COMPUTE,
                )
                print("[whisper] Model ready.")
    return _model
# ─── Public API ──────────────────────────────────────────────────────────
def transcribe_audio_base64(audio_base64: str, mimetype: str = "audio/ogg",
language: str = "es") -> str | None:
"""Transcribe a base64-encoded audio blob to text.
Args:
audio_base64: Raw base64 string (no data: prefix).
mimetype: MIME type from the sender (e.g. 'audio/ogg' for WA voice notes).
language: ISO 639-1 code to bias the model. 'es' for Spanish MX.
Returns:
The transcribed text, or None if transcription fails or is empty.
"""
if not audio_base64:
return None
# Decode base64 → write to a temp file with the right extension so
# ffmpeg (invoked by faster-whisper/CTranslate2) picks the decoder.
ext = _extension_for_mimetype(mimetype)
try:
audio_bytes = _b64.b64decode(audio_base64)
except Exception as e:
print(f"[whisper] base64 decode failed: {e}")
return None
tmp_in = None
tmp_wav = None
try:
# Write the original audio to a temp file
tmp_in = tempfile.NamedTemporaryFile(suffix=ext, delete=False)
tmp_in.write(audio_bytes)
tmp_in.close()
# WhatsApp voice notes are OGG/Opus — faster-whisper can handle it
# via its pyav decoder, but converting to 16kHz mono WAV first is
# more reliable across formats and ~2x faster.
tmp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
tmp_wav.close()
rc = subprocess.run(
["ffmpeg", "-y", "-i", tmp_in.name,
"-ar", "16000", "-ac", "1",
"-f", "wav", tmp_wav.name],
capture_output=True,
)
if rc.returncode != 0:
print(f"[whisper] ffmpeg conversion failed: {rc.stderr.decode()[:200]}")
return None
# Run Whisper
# - beam_size=5 for better accuracy on short/noisy clips
# - no VAD filter (was trimming real speech in some tests)
# - condition_on_previous_text=False for short independent clips
model = _get_model()
segments, info = model.transcribe(
tmp_wav.name,
language=language,
beam_size=5,
vad_filter=False,
condition_on_previous_text=False,
)
text = " ".join(s.text.strip() for s in segments if s.text.strip())
text = text.strip()
if not text:
return None
print(f"[whisper] ({info.language}, {info.duration:.1f}s) → {text[:100]}")
return text
except Exception as e:
print(f"[whisper] transcription error: {e}")
return None
finally:
for f in (tmp_in, tmp_wav):
if f:
try:
os.unlink(f.name)
except Exception:
pass
def _extension_for_mimetype(mimetype: str) -> str:
    """Map a MIME type to a file extension ffmpeg understands.

    The media type proper (the part before any ';' parameters) is matched
    first, so a codec parameter cannot override the container: for example
    'audio/webm;codecs=opus' maps to '.webm', not '.ogg'. If the media type
    matches nothing, the full string including parameters is tried.

    Args:
        mimetype: MIME type string; None or '' is accepted.

    Returns:
        One of '.ogg', '.mp3', '.m4a', '.wav', '.webm'. Unrecognized or
        missing types fall back to '.ogg'.
    """
    full = (mimetype or "").lower()
    media_type = full.split(";", 1)[0]

    # Checked in order; the first rule with a matching substring wins.
    rules = (
        (("opus", "ogg"), ".ogg"),
        (("mp3", "mpeg"), ".mp3"),
        (("mp4", "aac"), ".m4a"),
        (("wav",), ".wav"),
        (("webm",), ".webm"),
    )
    for candidate in (media_type, full):
        for needles, extension in rules:
            if any(needle in candidate for needle in needles):
                return extension
    return ".ogg"  # WhatsApp voice notes are usually OGG/Opus