Major features: - Pixel-Perfect glassmorphism design (landing + POS + public catalog) - OEM/Local catalog toggle with Nexpart taxonomy (14 groups, 108 subgroups, 558 part types) - Marketplace B2B Phase 1 (bodegas, POs, status machine, WA+email notifications) - Peer-to-peer inventory (multi-instance, LAN discovery) - WhatsApp: photo→Vision AI, voice→Whisper, conversational quotations - Smart unified search (VIN/plate/part_number/keyword auto-detect) - Shop Supplies tab (vehicle-independent parts) - Chatbot AI fallback chain (5 models) + response cache - CSV inventory import tool + setup_instance.sh installer - Tablet-responsive CSS + sidebar toggle - Filters, export CSV, employee edit, business data save - Quotation system (WA→POS) with auto-print on confirmation - Live stats on landing page Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
152 lines
5.5 KiB
Python
152 lines
5.5 KiB
Python
"""
|
|
Local Whisper transcription service.
|
|
|
|
Uses faster-whisper (a CTranslate2-based port of OpenAI Whisper) for
|
|
transcribing short audio clips (WhatsApp voice notes) on the CPU.
|
|
|
|
Runs fully offline after the first model download. No API keys, no
|
|
per-minute cost. Model is lazy-loaded on first call and cached in memory
|
|
for the lifetime of the process.
|
|
|
|
Default model: 'tiny' — the smallest and fastest variant (~75 MB), good
|
|
enough for conversational Spanish. Change WHISPER_MODEL below to 'base'
|
|
(150 MB, slightly better accuracy) or 'small' (500 MB, noticeably better)
|
|
if you have the RAM and don't mind 2-3x slower inference.
|
|
"""
|
|
|
|
import base64 as _b64
|
|
import os
|
|
import subprocess
|
|
import tempfile
|
|
import threading
|
|
|
|
# ─── Config ──────────────────────────────────────────────────────────────
# Model size trade-offs for Mexican Spanish voice notes on CPU:
#   tiny  (75 MB)  — too small, misses words in noisy/robot audio
#   base  (150 MB) — good accuracy, ~2s per 30s clip on a modern CPU  ← default
#   small (500 MB) — best accuracy, ~5s per 30s clip, worth it if RAM permits
WHISPER_MODEL = "base"    # model size name handed to WhisperModel
WHISPER_DEVICE = "cpu"    # device WhisperModel runs inference on
WHISPER_COMPUTE = "int8"  # int8 quantization — CPU-friendly, minimal quality loss
|
|
|
|
# ─── Lazy singleton model loader ─────────────────────────────────────────
# Process-wide cache for the loaded model; stays None until the first call
# to _get_model() performs the load.
_model = None
# Held while loading so concurrent first callers do not load the model twice.
_model_lock = threading.Lock()
|
|
|
|
|
|
def _get_model():
    """Return the shared Whisper model, loading it on first use. Thread-safe.

    The model is built from WHISPER_MODEL / WHISPER_DEVICE / WHISPER_COMPUTE
    and stored in the module-level ``_model`` so later calls return the same
    instance. If loading raises, ``_model`` stays None and the next call
    tries again.

    Returns:
        The cached ``faster_whisper.WhisperModel`` instance.
    """
    global _model

    # Fast path: once loaded, skip the lock entirely.
    if _model is not None:
        return _model

    with _model_lock:
        # Re-check under the lock: another thread may have finished loading
        # while this one was waiting (double-checked locking).
        if _model is None:
            # Imported lazily so merely importing this module does not pull
            # in faster-whisper.
            from faster_whisper import WhisperModel

            print(f"[whisper] Loading {WHISPER_MODEL} model ({WHISPER_DEVICE}, {WHISPER_COMPUTE})...")
            _model = WhisperModel(
                WHISPER_MODEL,
                device=WHISPER_DEVICE,
                compute_type=WHISPER_COMPUTE,
            )
            print("[whisper] Model ready.")
        return _model
|
|
|
|
|
|
# ─── Public API ──────────────────────────────────────────────────────────
|
|
|
|
def transcribe_audio_base64(audio_base64: str, mimetype: str = "audio/ogg",
|
|
language: str = "es") -> str | None:
|
|
"""Transcribe a base64-encoded audio blob to text.
|
|
|
|
Args:
|
|
audio_base64: Raw base64 string (no data: prefix).
|
|
mimetype: MIME type from the sender (e.g. 'audio/ogg' for WA voice notes).
|
|
language: ISO 639-1 code to bias the model. 'es' for Spanish MX.
|
|
|
|
Returns:
|
|
The transcribed text, or None if transcription fails or is empty.
|
|
"""
|
|
if not audio_base64:
|
|
return None
|
|
|
|
# Decode base64 → write to a temp file with the right extension so
|
|
# ffmpeg (invoked by faster-whisper/CTranslate2) picks the decoder.
|
|
ext = _extension_for_mimetype(mimetype)
|
|
try:
|
|
audio_bytes = _b64.b64decode(audio_base64)
|
|
except Exception as e:
|
|
print(f"[whisper] base64 decode failed: {e}")
|
|
return None
|
|
|
|
tmp_in = None
|
|
tmp_wav = None
|
|
try:
|
|
# Write the original audio to a temp file
|
|
tmp_in = tempfile.NamedTemporaryFile(suffix=ext, delete=False)
|
|
tmp_in.write(audio_bytes)
|
|
tmp_in.close()
|
|
|
|
# WhatsApp voice notes are OGG/Opus — faster-whisper can handle it
|
|
# via its pyav decoder, but converting to 16kHz mono WAV first is
|
|
# more reliable across formats and ~2x faster.
|
|
tmp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
|
|
tmp_wav.close()
|
|
rc = subprocess.run(
|
|
["ffmpeg", "-y", "-i", tmp_in.name,
|
|
"-ar", "16000", "-ac", "1",
|
|
"-f", "wav", tmp_wav.name],
|
|
capture_output=True,
|
|
)
|
|
if rc.returncode != 0:
|
|
print(f"[whisper] ffmpeg conversion failed: {rc.stderr.decode()[:200]}")
|
|
return None
|
|
|
|
# Run Whisper
|
|
# - beam_size=5 for better accuracy on short/noisy clips
|
|
# - no VAD filter (was trimming real speech in some tests)
|
|
# - condition_on_previous_text=False for short independent clips
|
|
model = _get_model()
|
|
segments, info = model.transcribe(
|
|
tmp_wav.name,
|
|
language=language,
|
|
beam_size=5,
|
|
vad_filter=False,
|
|
condition_on_previous_text=False,
|
|
)
|
|
text = " ".join(s.text.strip() for s in segments if s.text.strip())
|
|
text = text.strip()
|
|
|
|
if not text:
|
|
return None
|
|
|
|
print(f"[whisper] ({info.language}, {info.duration:.1f}s) → {text[:100]}")
|
|
return text
|
|
|
|
except Exception as e:
|
|
print(f"[whisper] transcription error: {e}")
|
|
return None
|
|
finally:
|
|
for f in (tmp_in, tmp_wav):
|
|
if f:
|
|
try:
|
|
os.unlink(f.name)
|
|
except Exception:
|
|
pass
|
|
|
|
|
|
def _extension_for_mimetype(mimetype: str) -> str:
    """Map a MIME type to a file extension ffmpeg understands.

    The type/subtype part (everything before the first ``;``) is matched
    first, because it names the container; parameters such as
    ``codecs=opus`` only name the codec. So ``audio/webm;codecs=opus`` maps
    to ``.webm`` rather than ``.ogg``. If the type/subtype gives no hint, the
    whole string, parameters included, is searched as a fallback.

    Args:
        mimetype: MIME type string; None or empty is accepted.

    Returns:
        One of '.ogg', '.mp3', '.m4a', '.wav', '.webm'. Unknown, empty or
        None input yields '.ogg'.
    """
    full = (mimetype or "").lower()
    essence = full.partition(";")[0]

    # Ordered (substring, extension) hints; first match wins.
    hints = (
        ("opus", ".ogg"),
        ("ogg", ".ogg"),
        ("mp3", ".mp3"),
        ("mpeg", ".mp3"),
        ("mp4", ".m4a"),
        ("m4a", ".m4a"),
        ("aac", ".m4a"),
        ("wav", ".wav"),
        ("webm", ".webm"),
    )
    for candidate in (essence, full):
        for needle, extension in hints:
            if needle in candidate:
                return extension
    return ".ogg"  # WhatsApp voice notes are usually OGG/Opus
|