feat(whatsapp): QWEN primary AI backend, Hermes fallback, conversation history, vehicle persistence, demo prompts

- Add QWEN (qwen3.6) as primary AI backend with short system prompt
- Hermes remains as fallback with 45s timeout
- Increase QWEN timeout to 35s, max_tokens to 4000
- Add conversation history loading from whatsapp_messages (last 4 msgs)
- Persist detected vehicle in whatsapp_sessions table
- Add 'limpiar chat' / 'nuevo chat' / 'reset' commands to clear history
- Fix CSS conflict: rename whatsapp chat-panel classes to wa-chat-panel
- Fix JS ID conflicts with chat.js widget (waChatPanel, waChatMessages, etc.)
- Improve no-stock response: conversational with alternatives
- Split search_query by | for multi-part lookups
- Add DEMO_PROMPTS.md and DEMO_PROMPTS_V2.md
This commit is contained in:
2026-05-06 20:27:14 +00:00
parent 371d72887e
commit ff45905b49
33 changed files with 3040 additions and 445 deletions

View File

@@ -3,9 +3,15 @@
import requests
import json
from config import OPENROUTER_API_KEY
from config import OPENROUTER_API_KEY, HERMES_API_URL, HERMES_API_KEY
from config import QWEN_API_URL, QWEN_API_KEY, QWEN_MODEL
OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"
HERMES_ENABLED = bool(HERMES_API_KEY and HERMES_API_URL)
HERMES_CHAT_URL = (HERMES_API_URL.rstrip('/') + '/chat/completions') if HERMES_API_URL else None
QWEN_ENABLED = bool(QWEN_API_KEY and QWEN_API_URL)
QWEN_CHAT_URL = (QWEN_API_URL.rstrip('/') + '/chat/completions') if QWEN_API_URL else None
# ⚠️ SOLO MODELOS GRATUITOS — No cambiar a modelos de pago.
# El modelo DEBE terminar en ":free" para garantizar costo $0.
@@ -24,11 +30,69 @@ FALLBACK_MODELS = [
"meta-llama/llama-3.3-70b-instruct:free", # Meta — último fallback
]
# Hermes Agent model (OpenAI-compatible API server)
HERMES_MODEL = "hermes-agent"
def _validate_model(model_id):
"""Ensure only free models are used. Raises if model is not free."""
"""Ensure only free models are used. Raises if model is not free.
Skips validation for Hermes Agent and QWEN models (self-hosted / private API).
"""
if model_id == HERMES_MODEL:
return
if model_id == QWEN_MODEL:
return
if not model_id.endswith(':free'):
raise ValueError(f"BLOQUEADO: Solo se permiten modelos gratuitos (:free). Modelo '{model_id}' no es gratuito.")
def _post_chat_completion(url, api_key, model_id, messages, max_tokens=800, temperature=0.3, timeout=25):
"""Generic OpenAI-compatible chat completion POST.
Returns the parsed response dict on success, None on failure.
"""
try:
resp = requests.post(
url,
headers={
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json",
},
json={
"model": model_id,
"messages": messages,
"max_tokens": max_tokens,
"temperature": temperature,
},
timeout=timeout,
)
if resp.status_code == 429:
print(f"[AI] Rate limited on {model_id} ({url})")
return None
if resp.status_code >= 400:
print(f"[AI] HTTP {resp.status_code} on {model_id} ({url}): {resp.text[:200]}")
return None
data = resp.json()
choice = data.get("choices", [{}])[0]
content = choice.get("message", {}).get("content") or ""
content = content.strip()
finish = choice.get("finish_reason", "")
if not content:
print(f"[AI] Empty response from {model_id} (finish={finish})")
return None
return {"content": content, "finish_reason": finish, "model": model_id}
except Exception as e:
print(f"[AI] Error with {model_id} ({url}): {e}")
return None
SYSTEM_PROMPT_SHORT = """Eres un asistente de refaccionaria automotriz mexicana. Ayuda a encontrar autopartes.
Responde SIEMPRE en formato JSON: {"message":"...","search_query":"...","vehicle":{"brand":"...","model":"...","year":...}}
search_query va EN INGLES cuando el usuario pide una parte. Traducciones: Balatas=Brake Pad, Disco de freno=Brake Disc, Amortiguador=Shock Absorber, Filtro de aceite=Oil Filter, Filtro de aire=Air Filter, Bujias=Spark Plug, Banda=V-Belt, Bomba de agua=Water Pump, Alternador=Alternator, Radiador=Radiator, Sensor de oxigeno=Oxygen Sensor, Terminal de direccion=Tie Rod End, Bomba de gasolina=Fuel Pump, Clutch=Clutch Kit, Mofle=Exhaust, Inyector=Injector.
No preguntes mas si ya puedes buscar. Si el usuario describe un sintoma, diagnostica y sugiere partes.
Cuando pida cotizacion o multiples partes, search_query DEBE usar | para separar cada parte: "Brake Pad|Air Filter|Oil Filter|Spark Plug".
"""
SYSTEM_PROMPT = """Eres un asistente de refaccionaria automotriz mexicana. Tu trabajo es ayudar a encontrar autopartes.
IMPORTANTE: Responde SIEMPRE en formato JSON valido con esta estructura:
@@ -161,6 +225,7 @@ def get_inventory_context(tenant_conn, branch_id=None):
VISION_MODEL = "google/gemma-3-27b-it:free"
HERMES_VISION_MODEL = "hermes-agent"
VISION_SYSTEM_PROMPT = """Eres un experto en identificación de autopartes. El usuario te envía una foto de una parte automotriz.
Tu trabajo es:
@@ -219,54 +284,41 @@ def chat_with_image(user_message, image_base64, conversation_history=None, inven
]
messages.append({"role": "user", "content": user_content})
import time
max_retries = 3
for attempt in range(max_retries):
# Try Hermes first for vision (if enabled), fallback to OpenRouter
backends = []
if HERMES_ENABLED:
backends.append((HERMES_CHAT_URL, HERMES_API_KEY, HERMES_VISION_MODEL))
if OPENROUTER_API_KEY:
backends.append((OPENROUTER_URL, OPENROUTER_API_KEY, VISION_MODEL))
last_error = None
for url, key, model_id in backends:
_validate_model(model_id)
result = _post_chat_completion(url, key, model_id, messages, max_tokens=500, temperature=0.3, timeout=30)
if result is None:
last_error = "api_error"
continue
content = result["content"]
try:
resp = requests.post(
OPENROUTER_URL,
headers={
"Authorization": f"Bearer {OPENROUTER_API_KEY}",
"Content-Type": "application/json",
},
json={
"model": VISION_MODEL,
"messages": messages,
"max_tokens": 500,
"temperature": 0.3,
},
timeout=30,
)
if resp.status_code == 429:
wait = (attempt + 1) * 5
if attempt < max_retries - 1:
time.sleep(wait)
continue
return {"message": "El asistente esta ocupado. Intenta de nuevo en unos segundos.", "search_query": None, "vehicle": None}
resp.raise_for_status()
data = resp.json()
content = data["choices"][0]["message"]["content"]
try:
stripped = content.strip()
if stripped.startswith("```"):
lines = stripped.split("\n")
json_str = "\n".join(lines[1:-1])
parsed = json.loads(json_str)
else:
parsed = json.loads(stripped)
stripped = content.strip()
if stripped.startswith("```"):
lines = stripped.split("\n")
json_str = "\n".join(lines[1:-1])
parsed = json.loads(json_str)
return parsed
except (json.JSONDecodeError, IndexError):
return {"message": content, "search_query": None, "vehicle": None}
except Exception as e:
if attempt < max_retries - 1:
continue
return {
"message": f"Error al analizar imagen: {str(e)}",
"search_query": None,
"vehicle": None,
}
else:
parsed = json.loads(stripped)
return parsed
except (json.JSONDecodeError, IndexError):
return {"message": content, "search_query": None, "vehicle": None}
if last_error == "api_error":
return {"message": "El asistente esta ocupado. Intenta de nuevo en unos segundos.", "search_query": None, "vehicle": None}
return {
"message": f"Error al analizar imagen: {last_error}",
"search_query": None,
"vehicle": None,
}
def classify_part(part_number):
@@ -287,47 +339,32 @@ def classify_part(part_number):
{"role": "user", "content": prompt}
]
import time
max_retries = 3
for attempt in range(max_retries):
# Try Hermes first (if enabled), fallback to OpenRouter
backends = []
if HERMES_ENABLED:
backends.append((HERMES_CHAT_URL, HERMES_API_KEY, HERMES_MODEL))
if OPENROUTER_API_KEY:
backends.append((OPENROUTER_URL, OPENROUTER_API_KEY, MODEL))
for url, key, model_id in backends:
_validate_model(model_id)
result = _post_chat_completion(url, key, model_id, messages, max_tokens=300, temperature=0.2, timeout=15)
if result is None:
continue
content = result["content"]
try:
resp = requests.post(
OPENROUTER_URL,
headers={
"Authorization": f"Bearer {OPENROUTER_API_KEY}",
"Content-Type": "application/json",
},
json={
"model": MODEL,
"messages": messages,
"max_tokens": 300,
"temperature": 0.2,
},
timeout=15,
)
if resp.status_code == 429:
wait = (attempt + 1) * 5
if attempt < max_retries - 1:
time.sleep(wait)
continue
return {"name": None, "brand": None, "vehicle": None, "category": None}
resp.raise_for_status()
data = resp.json()
content = data["choices"][0]["message"]["content"]
stripped = content.strip()
if stripped.startswith("```"):
lines = stripped.split("\n")
json_str = "\n".join(lines[1:-1])
parsed = json.loads(json_str)
return parsed
else:
parsed = json.loads(stripped)
return parsed
return parsed
except Exception:
if attempt < max_retries - 1:
continue
return {"name": None, "brand": None, "vehicle": None, "category": None}
continue
return {"name": None, "brand": None, "vehicle": None, "category": None}
# ═══════════════════════════════════════════════════════════════════════════
@@ -491,74 +528,71 @@ def chat(user_message, conversation_history=None, inventory_context=None):
last_error = None
# Try each model in the fallback chain on 429 (rate limit)
for model_id in FALLBACK_MODELS:
_validate_model(model_id) # Block paid models
try:
resp = requests.post(
OPENROUTER_URL,
headers={
"Authorization": f"Bearer {OPENROUTER_API_KEY}",
"Content-Type": "application/json",
},
json={
"model": model_id,
"messages": messages,
"max_tokens": 800,
"temperature": 0.3,
},
timeout=25,
)
if resp.status_code == 429:
# Build backend list: QWEN first (fast, ~1s), then Hermes (specialized, ~30s), then OpenRouter
backends = []
if QWEN_ENABLED:
backends.append((QWEN_CHAT_URL, QWEN_API_KEY, QWEN_MODEL, 35, SYSTEM_PROMPT_SHORT, 4000))
if HERMES_ENABLED:
backends.append((HERMES_CHAT_URL, HERMES_API_KEY, HERMES_MODEL, 45, SYSTEM_PROMPT, 800))
if OPENROUTER_API_KEY:
for m in FALLBACK_MODELS:
backends.append((OPENROUTER_URL, OPENROUTER_API_KEY, m, 25, SYSTEM_PROMPT, 800))
for url, key, model_id, timeout_sec, sys_prompt, max_tok in backends:
_validate_model(model_id)
# Use backend-specific system prompt and max_tokens
sys_content = sys_prompt
if inventory_context:
sys_content = sys_prompt + "\n\n" + inventory_context
msgs = [{"role": "system", "content": sys_content}]
if conversation_history:
msgs.extend(conversation_history)
msgs.append({"role": "user", "content": user_message})
result = _post_chat_completion(url, key, model_id, msgs, max_tokens=max_tok, temperature=0.3, timeout=timeout_sec)
if result is None:
if url == QWEN_CHAT_URL:
print(f"[AI] QWEN failed, trying Hermes fallback...")
last_error = "qwen_failed"
elif url == HERMES_CHAT_URL:
print(f"[AI] Hermes failed, trying OpenRouter fallback...")
last_error = "hermes_timeout"
else:
print(f"[AI] Rate limited on {model_id}, trying next model...")
last_error = "rate_limit"
continue
if resp.status_code >= 400:
print(f"[AI] HTTP {resp.status_code} on {model_id}: {resp.text[:200]}")
last_error = f"http_{resp.status_code}"
continue
data = resp.json()
choice = data.get("choices", [{}])[0]
content = choice.get("message", {}).get("content", "").strip()
finish = choice.get("finish_reason", "")
if not content:
print(f"[AI] Empty response from {model_id} (finish={finish})")
last_error = "empty_response"
continue
print(f"[AI] Response from {model_id} (finish={finish}, {len(content)} chars)")
# Try to parse JSON response
try:
stripped = content.strip()
if stripped.startswith("```"):
lines = stripped.split("\n")
json_str = "\n".join(lines[1:-1])
parsed = json.loads(json_str)
else:
parsed = json.loads(stripped)
# Successful JSON response — cache it
if cache_key:
_cache_set(cache_key, parsed)
return parsed
except (json.JSONDecodeError, IndexError):
fallback = {"message": content, "search_query": None, "vehicle": None}
# Cache the fallback too — the model gave us a real answer,
# it just wasn't JSON. Next hit saves the API call.
if cache_key:
_cache_set(cache_key, fallback)
return fallback
except Exception as e:
print(f"[AI] Error with {model_id}: {e}")
last_error = str(e)
continue
content = result["content"]
finish = result["finish_reason"]
print(f"[AI] Response from {model_id} (finish={finish}, {len(content)} chars)")
# Try to parse JSON response
try:
stripped = content.strip()
if stripped.startswith("```"):
lines = stripped.split("\n")
json_str = "\n".join(lines[1:-1])
parsed = json.loads(json_str)
else:
parsed = json.loads(stripped)
# Successful JSON response — cache it
if cache_key:
_cache_set(cache_key, parsed)
return parsed
except (json.JSONDecodeError, IndexError):
fallback = {"message": content, "search_query": None, "vehicle": None}
# Cache the fallback too — the model gave us a real answer,
# it just wasn't JSON. Next hit saves the API call.
if cache_key:
_cache_set(cache_key, fallback)
return fallback
# All models exhausted — DON'T cache errors, we want retries next time
if last_error == "rate_limit":
return {"message": "El asistente está ocupado. Intenta de nuevo en unos segundos.", "search_query": None, "vehicle": None}
if last_error == "hermes_timeout":
return {"message": "El asistente tardó mucho en responder. Intenta de nuevo en un momento.", "search_query": None, "vehicle": None}
return {
"message": f"Error de conexion: {last_error}",
"message": "El asistente no está disponible en este momento. Intenta de nuevo en unos segundos.",
"search_query": None,
"vehicle": None,
}