feat(whatsapp): QWEN primary AI backend, Hermes fallback, conversation history, vehicle persistence, demo prompts

- Add QWEN (qwen3.6) as primary AI backend with short system prompt
- Hermes remains as fallback with 45s timeout
- Increase QWEN timeout to 35s, max_tokens to 4000
- Add conversation history loading from whatsapp_messages (last 4 msgs)
- Persist detected vehicle in whatsapp_sessions table
- Add 'limpiar chat' / 'nuevo chat' / 'reset' commands to clear history
- Fix CSS conflict: rename whatsapp chat-panel classes to wa-chat-panel
- Fix JS ID conflicts with chat.js widget (waChatPanel, waChatMessages, etc.)
- Improve no-stock response: conversational with alternatives
- Split search_query by | for multi-part lookups
- Add DEMO_PROMPTS.md and DEMO_PROMPTS_V2.md
2026-05-06 20:27:14 +00:00
parent 371d72887e
commit ff45905b49
33 changed files with 3040 additions and 445 deletions
--- a/pos/services/ai_chat.py
+++ b/pos/services/ai_chat.py
@@ -3,9 +3,15 @@

 import requests
 import json
-from config import OPENROUTER_API_KEY
+from config import OPENROUTER_API_KEY, HERMES_API_URL, HERMES_API_KEY
+from config import QWEN_API_URL, QWEN_API_KEY, QWEN_MODEL

 OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"
+HERMES_ENABLED = bool(HERMES_API_KEY and HERMES_API_URL)
+HERMES_CHAT_URL = (HERMES_API_URL.rstrip('/') + '/chat/completions') if HERMES_API_URL else None
+
+QWEN_ENABLED = bool(QWEN_API_KEY and QWEN_API_URL)
+QWEN_CHAT_URL = (QWEN_API_URL.rstrip('/') + '/chat/completions') if QWEN_API_URL else None

 # ⚠️ SOLO MODELOS GRATUITOS — No cambiar a modelos de pago.
 # El modelo DEBE terminar en ":free" para garantizar costo $0.
@@ -24,11 +30,69 @@ FALLBACK_MODELS = [
    "meta-llama/llama-3.3-70b-instruct:free", # Meta — último fallback
 ]

+# Hermes Agent model (OpenAI-compatible API server)
+HERMES_MODEL = "hermes-agent"
+
 def _validate_model(model_id):
-    """Ensure only free models are used. Raises if model is not free."""
+    """Ensure only free models are used. Raises if model is not free.
+    
+    Skips validation for Hermes Agent and QWEN models (self-hosted / private API).
+    """
+    if model_id == HERMES_MODEL:
+        return
+    if model_id == QWEN_MODEL:
+        return
    if not model_id.endswith(':free'):
        raise ValueError(f"BLOQUEADO: Solo se permiten modelos gratuitos (:free). Modelo '{model_id}' no es gratuito.")

+
+def _post_chat_completion(url, api_key, model_id, messages, max_tokens=800, temperature=0.3, timeout=25):
+    """Generic OpenAI-compatible chat completion POST.
+    
+    Returns {"content", "finish_reason", "model"} on success; None on HTTP error (incl. 429), empty content, or any exception.
+    """
+    try:
+        resp = requests.post(
+            url,
+            headers={
+                "Authorization": f"Bearer {api_key}",
+                "Content-Type": "application/json",
+            },
+            json={
+                "model": model_id,
+                "messages": messages,
+                "max_tokens": max_tokens,
+                "temperature": temperature,
+            },
+            timeout=timeout,
+        )
+        if resp.status_code == 429:
+            print(f"[AI] Rate limited on {model_id} ({url})")
+            return None
+        if resp.status_code >= 400:
+            print(f"[AI] HTTP {resp.status_code} on {model_id} ({url}): {resp.text[:200]}")
+            return None
+        data = resp.json()
+        choice = data.get("choices", [{}])[0]
+        content = choice.get("message", {}).get("content") or ""
+        content = content.strip()
+        finish = choice.get("finish_reason", "")
+        if not content:
+            print(f"[AI] Empty response from {model_id} (finish={finish})")
+            return None
+        return {"content": content, "finish_reason": finish, "model": model_id}
+    except Exception as e:
+        print(f"[AI] Error with {model_id} ({url}): {e}")
+        return None
+
+
+SYSTEM_PROMPT_SHORT = """Eres un asistente de refaccionaria automotriz mexicana. Ayuda a encontrar autopartes.
+Responde SIEMPRE en formato JSON: {"message":"...","search_query":"...","vehicle":{"brand":"...","model":"...","year":...}}
+search_query va EN INGLES cuando el usuario pide una parte. Traducciones: Balatas=Brake Pad, Disco de freno=Brake Disc, Amortiguador=Shock Absorber, Filtro de aceite=Oil Filter, Filtro de aire=Air Filter, Bujias=Spark Plug, Banda=V-Belt, Bomba de agua=Water Pump, Alternador=Alternator, Radiador=Radiator, Sensor de oxigeno=Oxygen Sensor, Terminal de direccion=Tie Rod End, Bomba de gasolina=Fuel Pump, Clutch=Clutch Kit, Mofle=Exhaust, Inyector=Injector.
+No preguntes mas si ya puedes buscar. Si el usuario describe un sintoma, diagnostica y sugiere partes.
+Cuando pida cotizacion o multiples partes, search_query DEBE usar | para separar cada parte: "Brake Pad|Air Filter|Oil Filter|Spark Plug".
+"""
+
 SYSTEM_PROMPT = """Eres un asistente de refaccionaria automotriz mexicana. Tu trabajo es ayudar a encontrar autopartes.

 IMPORTANTE: Responde SIEMPRE en formato JSON valido con esta estructura:
@@ -161,6 +225,7 @@ def get_inventory_context(tenant_conn, branch_id=None):


 VISION_MODEL = "google/gemma-3-27b-it:free"
+HERMES_VISION_MODEL = "hermes-agent"

 VISION_SYSTEM_PROMPT = """Eres un experto en identificación de autopartes. El usuario te envía una foto de una parte automotriz.
 Tu trabajo es:
@@ -219,54 +284,41 @@ def chat_with_image(user_message, image_base64, conversation_history=None, inven
    ]
    messages.append({"role": "user", "content": user_content})

-    import time
-    max_retries = 3
-
-    for attempt in range(max_retries):
+    # Try Hermes first for vision (if enabled), fallback to OpenRouter
+    backends = []
+    if HERMES_ENABLED:
+        backends.append((HERMES_CHAT_URL, HERMES_API_KEY, HERMES_VISION_MODEL))
+    if OPENROUTER_API_KEY:
+        backends.append((OPENROUTER_URL, OPENROUTER_API_KEY, VISION_MODEL))
+    
+    last_error = None
+    for url, key, model_id in backends:
+        _validate_model(model_id)
+        result = _post_chat_completion(url, key, model_id, messages, max_tokens=500, temperature=0.3, timeout=30)
+        if result is None:
+            last_error = "api_error"
+            continue
+        content = result["content"]
        try:
-            resp = requests.post(
-                OPENROUTER_URL,
-                headers={
-                    "Authorization": f"Bearer {OPENROUTER_API_KEY}",
-                    "Content-Type": "application/json",
-                },
-                json={
-                    "model": VISION_MODEL,
-                    "messages": messages,
-                    "max_tokens": 500,
-                    "temperature": 0.3,
-                },
-                timeout=30,
-            )
-            if resp.status_code == 429:
-                wait = (attempt + 1) * 5
-                if attempt < max_retries - 1:
-                    time.sleep(wait)
-                    continue
-                return {"message": "El asistente esta ocupado. Intenta de nuevo en unos segundos.", "search_query": None, "vehicle": None}
-            resp.raise_for_status()
-            data = resp.json()
-            content = data["choices"][0]["message"]["content"]
-
-            try:
-                stripped = content.strip()
-                if stripped.startswith("```"):
-                    lines = stripped.split("\n")
-                    json_str = "\n".join(lines[1:-1])
-                    parsed = json.loads(json_str)
-                else:
-                    parsed = json.loads(stripped)
+            stripped = content.strip()
+            if stripped.startswith("```"):
+                lines = stripped.split("\n")
+                json_str = "\n".join(lines[1:-1])
+                parsed = json.loads(json_str)
                return parsed
-            except (json.JSONDecodeError, IndexError):
-                return {"message": content, "search_query": None, "vehicle": None}
-        except Exception as e:
-            if attempt < max_retries - 1:
-                continue
-            return {
-                "message": f"Error al analizar imagen: {str(e)}",
-                "search_query": None,
-                "vehicle": None,
-            }
+            else:
+                parsed = json.loads(stripped)
+                return parsed
+        except (json.JSONDecodeError, IndexError):
+            return {"message": content, "search_query": None, "vehicle": None}
+    
+    if last_error == "api_error":
+        return {"message": "El asistente esta ocupado. Intenta de nuevo en unos segundos.", "search_query": None, "vehicle": None}
+    return {
+        "message": f"Error al analizar imagen: {last_error}",
+        "search_query": None,
+        "vehicle": None,
+    }


 def classify_part(part_number):
@@ -287,47 +339,32 @@ def classify_part(part_number):
        {"role": "user", "content": prompt}
    ]

-    import time
-    max_retries = 3
-
-    for attempt in range(max_retries):
+    # Try Hermes first (if enabled), fallback to OpenRouter
+    backends = []
+    if HERMES_ENABLED:
+        backends.append((HERMES_CHAT_URL, HERMES_API_KEY, HERMES_MODEL))
+    if OPENROUTER_API_KEY:
+        backends.append((OPENROUTER_URL, OPENROUTER_API_KEY, MODEL))
+    
+    for url, key, model_id in backends:
+        _validate_model(model_id)
+        result = _post_chat_completion(url, key, model_id, messages, max_tokens=300, temperature=0.2, timeout=15)
+        if result is None:
+            continue
+        content = result["content"]
        try:
-            resp = requests.post(
-                OPENROUTER_URL,
-                headers={
-                    "Authorization": f"Bearer {OPENROUTER_API_KEY}",
-                    "Content-Type": "application/json",
-                },
-                json={
-                    "model": MODEL,
-                    "messages": messages,
-                    "max_tokens": 300,
-                    "temperature": 0.2,
-                },
-                timeout=15,
-            )
-            if resp.status_code == 429:
-                wait = (attempt + 1) * 5
-                if attempt < max_retries - 1:
-                    time.sleep(wait)
-                    continue
-                return {"name": None, "brand": None, "vehicle": None, "category": None}
-            resp.raise_for_status()
-            data = resp.json()
-            content = data["choices"][0]["message"]["content"]
-
            stripped = content.strip()
            if stripped.startswith("```"):
                lines = stripped.split("\n")
                json_str = "\n".join(lines[1:-1])
                parsed = json.loads(json_str)
+                return parsed
            else:
                parsed = json.loads(stripped)
-            return parsed
+                return parsed
        except Exception:
-            if attempt < max_retries - 1:
-                continue
-            return {"name": None, "brand": None, "vehicle": None, "category": None}
+            continue
+    return {"name": None, "brand": None, "vehicle": None, "category": None}


 # ═══════════════════════════════════════════════════════════════════════════
@@ -491,74 +528,71 @@ def chat(user_message, conversation_history=None, inventory_context=None):

    last_error = None

-    # Try each model in the fallback chain on 429 (rate limit)
-    for model_id in FALLBACK_MODELS:
-        _validate_model(model_id)  # Block paid models
-        try:
-            resp = requests.post(
-                OPENROUTER_URL,
-                headers={
-                    "Authorization": f"Bearer {OPENROUTER_API_KEY}",
-                    "Content-Type": "application/json",
-                },
-                json={
-                    "model": model_id,
-                    "messages": messages,
-                    "max_tokens": 800,
-                    "temperature": 0.3,
-                },
-                timeout=25,
-            )
-            if resp.status_code == 429:
+    # Build backend list: QWEN first (fast, ~1s), then Hermes (specialized, ~30s), then OpenRouter
+    backends = []
+    if QWEN_ENABLED:
+        backends.append((QWEN_CHAT_URL, QWEN_API_KEY, QWEN_MODEL, 35, SYSTEM_PROMPT_SHORT, 4000))
+    if HERMES_ENABLED:
+        backends.append((HERMES_CHAT_URL, HERMES_API_KEY, HERMES_MODEL, 45, SYSTEM_PROMPT, 800))
+    if OPENROUTER_API_KEY:
+        for m in FALLBACK_MODELS:
+            backends.append((OPENROUTER_URL, OPENROUTER_API_KEY, m, 25, SYSTEM_PROMPT, 800))
+
+    for url, key, model_id, timeout_sec, sys_prompt, max_tok in backends:
+        _validate_model(model_id)
+        # Rebuild the message list per backend: its own system prompt (+ inventory context), then history, then the user message
+        sys_content = sys_prompt
+        if inventory_context:
+            sys_content = sys_prompt + "\n\n" + inventory_context
+        msgs = [{"role": "system", "content": sys_content}]
+        if conversation_history:
+            msgs.extend(conversation_history)
+        msgs.append({"role": "user", "content": user_message})
+        result = _post_chat_completion(url, key, model_id, msgs, max_tokens=max_tok, temperature=0.3, timeout=timeout_sec)
+        if result is None:
+            if url == QWEN_CHAT_URL:
+                print(f"[AI] QWEN failed, trying Hermes fallback...")
+                last_error = "qwen_failed"
+            elif url == HERMES_CHAT_URL:
+                print(f"[AI] Hermes failed, trying OpenRouter fallback...")
+                last_error = "hermes_timeout"
+            else:
                print(f"[AI] Rate limited on {model_id}, trying next model...")
                last_error = "rate_limit"
-                continue
-            if resp.status_code >= 400:
-                print(f"[AI] HTTP {resp.status_code} on {model_id}: {resp.text[:200]}")
-                last_error = f"http_{resp.status_code}"
-                continue
-            data = resp.json()
-            choice = data.get("choices", [{}])[0]
-            content = choice.get("message", {}).get("content", "").strip()
-            finish = choice.get("finish_reason", "")
-
-            if not content:
-                print(f"[AI] Empty response from {model_id} (finish={finish})")
-                last_error = "empty_response"
-                continue
-
-            print(f"[AI] Response from {model_id} (finish={finish}, {len(content)} chars)")
-
-            # Try to parse JSON response
-            try:
-                stripped = content.strip()
-                if stripped.startswith("```"):
-                    lines = stripped.split("\n")
-                    json_str = "\n".join(lines[1:-1])
-                    parsed = json.loads(json_str)
-                else:
-                    parsed = json.loads(stripped)
-                # Successful JSON response — cache it
-                if cache_key:
-                    _cache_set(cache_key, parsed)
-                return parsed
-            except (json.JSONDecodeError, IndexError):
-                fallback = {"message": content, "search_query": None, "vehicle": None}
-                # Cache the fallback too — the model gave us a real answer,
-                # it just wasn't JSON. Next hit saves the API call.
-                if cache_key:
-                    _cache_set(cache_key, fallback)
-                return fallback
-        except Exception as e:
-            print(f"[AI] Error with {model_id}: {e}")
-            last_error = str(e)
            continue
+        
+        content = result["content"]
+        finish = result["finish_reason"]
+        print(f"[AI] Response from {model_id} (finish={finish}, {len(content)} chars)")
+
+        # Try to parse JSON response
+        try:
+            stripped = content.strip()
+            if stripped.startswith("```"):
+                lines = stripped.split("\n")
+                json_str = "\n".join(lines[1:-1])
+                parsed = json.loads(json_str)
+            else:
+                parsed = json.loads(stripped)
+            # Successful JSON response — cache it
+            if cache_key:
+                _cache_set(cache_key, parsed)
+            return parsed
+        except (json.JSONDecodeError, IndexError):
+            fallback = {"message": content, "search_query": None, "vehicle": None}
+            # Cache the fallback too — the model gave us a real answer,
+            # it just wasn't JSON. Next hit saves the API call.
+            if cache_key:
+                _cache_set(cache_key, fallback)
+            return fallback

    # All models exhausted — DON'T cache errors, we want retries next time
    if last_error == "rate_limit":
        return {"message": "El asistente está ocupado. Intenta de nuevo en unos segundos.", "search_query": None, "vehicle": None}
+    if last_error == "hermes_timeout":
+        return {"message": "El asistente tardó mucho en responder. Intenta de nuevo en un momento.", "search_query": None, "vehicle": None}
    return {
-        "message": f"Error de conexion: {last_error}",
+        "message": "El asistente no está disponible en este momento. Intenta de nuevo en unos segundos.",
        "search_query": None,
        "vehicle": None,
    }