# Autoparts-DB/pos/services/ai_chat.py

# /home/Autopartes/pos/services/ai_chat.py
"""AI Chat service using OpenRouter for parts lookup assistance."""

import requests
import json
from config import OPENROUTER_API_KEY

OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"

# ⚠️ FREE MODELS ONLY — do not switch to paid models.
# The model id MUST end in ":free" to guarantee $0 cost.
# NOTE: in this module MODEL is only used by classify_part(); chat() iterates
# FALLBACK_MODELS directly and never sends a request to MODEL.
MODEL = "qwen/qwen3.6-plus:free"

# Fallback chain: if the primary model is rate limited (429) or returns 404
# (deprecated), the following models are tried. All are :free. Providers are
# deliberately mixed because rate limits apply per provider.
# List updated 2026-04-09 after qwen3.6-plus was deprecated.
FALLBACK_MODELS = [
    "openai/gpt-oss-120b:free",               # OpenInference — broad coverage
    "google/gemma-4-31b-it:free",             # Google — new, 262K ctx
    "qwen/qwen3-next-80b-a3b-instruct:free",  # Alibaba — 262K ctx
    "z-ai/glm-4.5-air:free",                  # Z.AI
    "google/gemma-3-27b-it:free",             # Google — vision backup
    "meta-llama/llama-3.3-70b-instruct:free", # Meta — last fallback
]

def _validate_model(model_id):
    """Ensure only free models are used. Raises if model is not free."""
    if not model_id.endswith(':free'):
        raise ValueError(f"BLOQUEADO: Solo se permiten modelos gratuitos (:free). Modelo '{model_id}' no es gratuito.")

# System prompt for the text chat (see chat()). It instructs the model to
# answer with a JSON object holding "message", "search_query" (always in
# English; several queries joined with "|" for quotes) and "vehicle".
# The text is sent verbatim to the model, so any edit here changes behavior.
SYSTEM_PROMPT = """Eres un asistente de refaccionaria automotriz mexicana. Tu trabajo es ayudar a encontrar autopartes.

IMPORTANTE: Responde SIEMPRE en formato JSON valido con esta estructura:
{
  "message": "Tu respuesta al usuario en español",
  "search_query": "termino de busqueda EN INGLES para el catalogo",
  "vehicle": {"brand": "TOYOTA", "model": "Corolla", "year": 2020}
}

Reglas OBLIGATORIAS:
1. "search_query" SIEMPRE debe tener un valor cuando el usuario menciona una parte. NUNCA dejes null si el usuario pide algo.
2. "search_query" debe estar EN INGLES porque el catalogo TecDoc tiene nombres en ingles. Traducciones comunes:
   - Balatas/Pastillas de freno = "Brake Pad"
   - Discos de freno = "Brake Disc"
   - Amortiguador = "Shock Absorber"
   - Filtro de aceite = "Oil Filter"
   - Filtro de aire = "Air Filter"
   - Bujias = "Spark Plug"
   - Banda serpentina = "V-Belt" o "Serpentine Belt"
   - Bomba de agua = "Water Pump"
   - Alternador = "Alternator"
   - Radiador = "Radiator"
   - Sensor de oxigeno = "Oxygen Sensor"
   - Terminal de direccion = "Tie Rod End"
   - Bomba de gasolina = "Fuel Pump"
   - Clutch/Embrague = "Clutch Kit"
   - Mofle/Escape = "Exhaust"
   - Inyector = "Injector"
3. "vehicle" extrae marca, modelo y ano. La marca en MAYUSCULAS.
4. Nombres mexicanos: Tsuru = TSURU, Aveo = AVEO, Jetta = JETTA, Pointer = POINTER, Chevy = CORSA, Vocho = BEETLE.
5. No preguntes mas info si ya puedes buscar. Si el usuario dice "balatas para Tsuru 2015", busca directo.
6. "message" es breve y directo: "Buscando balatas para Nissan Tsuru 2015..."

Cuando el usuario describe un SINTOMA del vehiculo (no una parte especifica), diagnostica el problema y sugiere las partes que podrian necesitar reemplazo.

Ejemplos de sintomas:
- "el carro vibra al frenar" → Discos de freno y/o balatas desgastadas. search_query: "Brake Disc"
- "se calienta el motor" → Termostato, bomba de agua, radiador. search_query: "Thermostat"
- "hace ruido al dar vuelta" → Juntas homocineticas. search_query: "CV Joint"
- "no arranca" → Bateria, alternador, motor de arranque. search_query: "Starter Motor"
- "gasta mucha gasolina" → Filtro de aire, bujias, inyectores. search_query: "Air Filter"
- "huele a gasolina" → Inyectores, bomba de gasolina, mangueras. search_query: "Fuel Pump"
- "se jala a un lado" → Terminales de direccion, rotulas, alineacion. search_query: "Tie Rod End"
- "hace ruido al arrancar" → Banda serpentina, tensor, marcha. search_query: "Serpentine Belt"
- "pierde aceite" → Junta de tapa de valvulas, empaques. search_query: "Gasket"
- "el aire no enfria" → Compresor de AC, gas refrigerante. search_query: "A/C Compressor"

Si detectas un sintoma, responde con:
1. Diagnostico probable
2. Lista de partes que podrian necesitar reemplazo (en orden de probabilidad)
3. search_query con la parte mas probable

Cuando el usuario pida una COTIZACION o diga "cotizame", "cuanto cuesta", "precio de":
1. Identifica TODAS las partes necesarias para el trabajo completo
2. Devuelve multiples search_queries separadas por |

Ejemplo: "cotizame frenos completos para Corolla 2020"
search_query: "Brake Pad|Brake Disc|Brake Fluid|Brake Hose"

Ejemplo: "servicio completo para Tsuru 2015"
search_query: "Oil Filter|Air Filter|Spark Plug|Coolant|Brake Fluid"

Ejemplo: "kit de distribucion para Jetta 2018"
search_query: "Timing Belt|Tensioner|Idler Pulley|Water Pump"

Detecta el idioma del usuario y responde en el mismo idioma.
Si escribe en ingles, responde en ingles.
Si escribe en espanol, responde en espanol.
El search_query SIEMPRE debe ser en ingles (el catalogo TecDoc esta en ingles).
"""


def get_inventory_context(tenant_conn, branch_id=None):
    """Build a short inventory summary to append to the AI system prompt.

    Runs three read-only queries against ``inventory`` (active rows only,
    optionally restricted to one branch): total item count, the 15 most
    frequent brands, and the number of items whose summarized stock is <= 3.

    Example of the returned text:
        CONTEXTO DEL INVENTARIO:
        Este negocio tiene 1234 productos en inventario.
        Marcas disponibles: BOSCH (45), MONROE (32), ACDelco (28), ...
        Productos con stock bajo (<=3 unidades): 15
        IMPORTANTE: Cuando busques partes, SIEMPRE prioriza ...

    Args:
        tenant_conn: DB connection exposing ``cursor()``; the queries use
            ``%s`` placeholders.
        branch_id: Optional branch filter. Falsy values (None, 0, '') mean
            "all branches".

    Returns:
        str: The summary text, a fixed "no products yet" notice when the
        count is zero, or ``""`` if any query fails (best-effort: the chat
        must keep working without inventory context).
    """
    # The WHERE clause is assembled only from fixed fragments; the branch id
    # itself always travels as a bound parameter.
    where = "i.is_active = true"
    params = []
    if branch_id:
        where += " AND i.branch_id = %s"
        params.append(branch_id)

    cur = tenant_conn.cursor()
    try:
        # Total active items
        cur.execute(f"SELECT COUNT(*) FROM inventory i WHERE {where}", params)
        total = cur.fetchone()[0] or 0

        if total == 0:
            return "CONTEXTO DEL INVENTARIO:\nEste negocio aun no tiene productos en inventario."

        # Top brands with counts
        cur.execute(f"""
            SELECT i.brand, COUNT(*) as cnt
            FROM inventory i
            WHERE {where} AND i.brand IS NOT NULL AND i.brand != ''
            GROUP BY i.brand
            ORDER BY cnt DESC
            LIMIT 15
        """, params)
        brand_list = ", ".join(
            f"{brand} ({count})" for brand, count in cur.fetchall() if brand
        )

        # Products with low stock (<=3); items without a summary row count as 0
        cur.execute(f"""
            SELECT COUNT(*) FROM inventory i
            WHERE {where}
              AND COALESCE((SELECT stock FROM inventory_stock_summary WHERE inventory_id = i.id), 0) <= 3
        """, params)
        low_stock = cur.fetchone()[0] or 0

        lines = [
            "CONTEXTO DEL INVENTARIO:",
            f"Este negocio tiene {total} productos en inventario.",
        ]
        if brand_list:
            lines.append(f"Marcas disponibles: {brand_list}")
        lines.append(f"Productos con stock bajo (<=3 unidades): {low_stock}")
        lines.append("IMPORTANTE: Cuando busques partes, SIEMPRE prioriza lo que el negocio tiene en inventario local.")

        return "\n".join(lines)
    except Exception as exc:
        # Still best-effort, but leave a trace so a broken query is not
        # silently mistaken for "no context available".
        print(f"[AI] Could not build inventory context: {exc}")
        return ""
    finally:
        cur.close()


# Model used by chat_with_image(). Must end in ":free" (checked at call time
# by _validate_model). The same id also appears in FALLBACK_MODELS.
VISION_MODEL = "google/gemma-3-27b-it:free"

# System prompt for photo identification. Asks for the same JSON shape as the
# text chat ("message", "search_query", "vehicle"), with "vehicle" set to null.
# Sent verbatim to the model, so any edit here changes behavior.
VISION_SYSTEM_PROMPT = """Eres un experto en identificación de autopartes. El usuario te envía una foto de una parte automotriz.
Tu trabajo es:
1. Identificar que parte es (nombre en español e inglés)
2. Describir características visibles (material, desgaste, marca si es visible)
3. Sugerir términos de búsqueda para encontrarla en un catálogo

IMPORTANTE: Responde SIEMPRE en formato JSON válido con esta estructura:
{
  "message": "Descripción de la parte identificada en español",
  "search_query": "término de búsqueda EN INGLÉS para el catálogo",
  "vehicle": null
}

Ejemplos de partes comunes:
- Pastillas/balatas de freno = "Brake Pad"
- Disco de freno = "Brake Disc"
- Filtro de aceite = "Oil Filter"
- Bujía = "Spark Plug"
- Amortiguador = "Shock Absorber"
- Bomba de agua = "Water Pump"
- Sensor de oxígeno = "Oxygen Sensor"
"""


def chat_with_image(user_message, image_base64, conversation_history=None, inventory_context=None):
    """Send a message with an image to the vision model and return its reply.

    Args:
        user_message: The user's chat message. When empty, a default
            "identify this part" instruction is sent instead.
        image_base64: Base64-encoded image, with or without a data URL
            prefix. Raw base64 is assumed to be JPEG.
        conversation_history: Previous messages in the conversation. Only
            entries whose ``content`` is a plain string are forwarded.
        inventory_context: Optional inventory summary appended to the
            system prompt.

    Returns:
        dict: The JSON object produced by the model when the reply parses to
        an object. Otherwise a dict with ``message`` (raw model text, a
        "busy" notice after repeated 429s, or an error description) and
        ``search_query`` / ``vehicle`` set to None.

    Raises:
        ValueError: If VISION_MODEL is not a ``:free`` model.
    """
    import time

    _validate_model(VISION_MODEL)

    system_content = VISION_SYSTEM_PROMPT
    if inventory_context:
        system_content = VISION_SYSTEM_PROMPT + "\n\n" + inventory_context

    # The API expects a data URL; raw base64 is labelled as JPEG.
    if image_base64 and not image_base64.startswith('data:'):
        image_base64 = 'data:image/jpeg;base64,' + image_base64

    messages = [{"role": "system", "content": system_content}]
    # Only text-only history is replayed; earlier multimodal turns are dropped.
    messages.extend(
        h for h in (conversation_history or [])
        if isinstance(h.get('content'), str)
    )

    # Multimodal user turn: image first, then the text instruction.
    messages.append({
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": image_base64}},
            {"type": "text", "text": user_message or "Identifica esta parte automotriz y sugiere términos de búsqueda."},
        ],
    })

    max_retries = 3

    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            resp = requests.post(
                OPENROUTER_URL,
                headers={
                    "Authorization": f"Bearer {OPENROUTER_API_KEY}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": VISION_MODEL,
                    "messages": messages,
                    "max_tokens": 500,
                    "temperature": 0.3,
                },
                timeout=30,
            )
            if resp.status_code == 429:
                if not is_last_attempt:
                    # Linear backoff: 5s, then 10s.
                    time.sleep((attempt + 1) * 5)
                    continue
                return {"message": "El asistente esta ocupado. Intenta de nuevo en unos segundos.", "search_query": None, "vehicle": None}
            resp.raise_for_status()
            content = resp.json()["choices"][0]["message"]["content"]

            # The API may return null/blank content; treat that as a failed
            # attempt (retry) instead of crashing on None.strip().
            if not content or not content.strip():
                raise ValueError("respuesta vacia del modelo")

            text = content.strip()
            if text.startswith("```"):
                # Drop the opening fence line (``` or ```json) and, when
                # present, the closing fence. A reply cut off before the
                # closing fence keeps its last JSON line.
                text = text.partition("\n")[2].rstrip()
                if text.endswith("```"):
                    text = text[:-3]
            try:
                parsed = json.loads(text)
            except json.JSONDecodeError:
                parsed = None

            # Only a JSON object honours the return contract; anything else
            # (plain prose, a bare string, a list) is wrapped as a message.
            if isinstance(parsed, dict):
                return parsed
            return {"message": content, "search_query": None, "vehicle": None}
        except Exception as e:
            if not is_last_attempt:
                continue
            return {
                "message": f"Error al analizar imagen: {str(e)}",
                "search_query": None,
                "vehicle": None,
            }


def classify_part(part_number):
    """Ask the AI to identify a part by its OEM number.

    Tries MODEL first and then every entry of FALLBACK_MODELS, moving on to
    the next model on any HTTP error (429 rate limit, 404 deprecated model,
    ...), network failure or unparseable reply. The primary-only approach
    always ended in the all-None result once the primary model was retired.

    Args:
        part_number: The part number to look up; it is interpolated into
            the prompt as-is.

    Returns:
        dict: The JSON object returned by the model (expected keys ``name``,
        ``brand``, ``vehicle``, ``category``), or those four keys set to
        None when no model produced a usable answer.

    Raises:
        ValueError: If any model in the chain is not a ``:free`` model.
    """
    _validate_model(MODEL)

    prompt = (
        f"Given auto part number '{part_number}', identify:\n"
        f"1) What part it is (name in Spanish)\n"
        f"2) Which brand makes it\n"
        f"3) What vehicle it fits\n"
        f"4) What category it belongs to (e.g. Frenos, Motor, Suspensión, Eléctrico, Filtros, Transmisión)\n"
        f"Respond ONLY in valid JSON: {{\"name\": \"...\", \"brand\": \"...\", \"vehicle\": \"...\", \"category\": \"...\"}}"
    )

    messages = [
        {"role": "system", "content": "Eres un experto en autopartes. Responde SOLO en JSON válido, sin texto adicional."},
        {"role": "user", "content": prompt}
    ]

    # dict.fromkeys keeps order and removes duplicates, in case MODEL is
    # also listed in FALLBACK_MODELS.
    for model_id in dict.fromkeys([MODEL, *FALLBACK_MODELS]):
        _validate_model(model_id)  # Block paid models
        try:
            resp = requests.post(
                OPENROUTER_URL,
                headers={
                    "Authorization": f"Bearer {OPENROUTER_API_KEY}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": model_id,
                    "messages": messages,
                    "max_tokens": 300,
                    "temperature": 0.2,
                },
                timeout=15,
            )
            if resp.status_code >= 400:
                print(f"[AI] HTTP {resp.status_code} on {model_id} (classify_part), trying next model...")
                continue

            content = resp.json()["choices"][0]["message"]["content"] or ""
            text = content.strip()
            if text.startswith("```"):
                # Drop the opening fence line and, when present, the closing
                # fence.
                text = text.partition("\n")[2].rstrip()
                if text.endswith("```"):
                    text = text[:-3]
            parsed = json.loads(text)
            if isinstance(parsed, dict):
                return parsed
            print(f"[AI] Non-object JSON from {model_id} (classify_part), trying next model...")
        except Exception as e:
            # Best-effort lookup: any failure just moves on to the next model.
            print(f"[AI] Error with {model_id} (classify_part): {e}")
            continue

    return {"name": None, "brand": None, "vehicle": None, "category": None}

# ═══════════════════════════════════════════════════════════════════════════
# RESPONSE CACHE — reduces OpenRouter calls for repeated questions
# ═══════════════════════════════════════════════════════════════════════════
# Keyed by a normalized form of the user message. TTL 1 hour. Bypasses
# caching for messages containing VINs or specific part numbers (where the
# answer depends on the exact string).

import hashlib as _hashlib
import re as _re
import time as _time_chat

# In-memory, per-process cache state. Hit/miss counters are only reported by
# chat_cache_stats(); chat_cache_clear() empties the dict but leaves them.
_RESPONSE_CACHE = {}           # key → (expires_at, response_dict)
_CACHE_TTL_SECONDS = 3600      # 1 hour
_CACHE_MAX_SIZE = 1000         # entry count above which _cache_set evicts
_CACHE_HITS = 0
_CACHE_MISSES = 0

# Stopwords that add noise but no meaning — stripped from cache keys so that
# e.g. "hola necesito balatas" and "balatas" share one entry.
_CACHE_STOPWORDS = {
    'necesito', 'necesitas', 'me', 'das', 'dame', 'tienes', 'tiene', 'hay',
    'quiero', 'quisiera', 'puedes', 'puede', 'favor', 'por', 'porfavor',
    'hola', 'buenos', 'dias', 'tardes', 'noches', 'holaa',
    'i', 'need', 'want', 'do', 'you', 'have', 'please',
}

# Patterns that disable caching — if the message contains any of these, we
# never cache the response because the answer is specific to that exact input.
# Rules designed to minimize false positives against normal Spanish queries
# like "necesito balatas para corolla 2018".
# All patterns are matched against the UPPER-CASED message.
_CACHE_BYPASS_PATTERNS = [
    # 17-char VIN (strict, no spaces, alphanumeric except I/O/Q)
    _re.compile(r'\b[A-HJ-NPR-Z0-9]{17}\b'),
    # Long numeric (12+ digits — too long to be a year/model code)
    _re.compile(r'\b\d{12,}\b'),
    # Mexican license plate: 3 letters + 3-4 digits, optionally separated by
    # a dash or whitespace. This also matches any standalone three-letter
    # word followed by a year (e.g. "KIA 2018"), so such queries bypass the
    # cache as well.
    _re.compile(r'\b[A-Z]{3}[-\s]?\d{3,4}\b'),
    # OEM-style number: at least three alphanumeric groups joined by a
    # REQUIRED dash or slash (first two groups 2+ chars each), long enough
    # that a brand+year collision is unlikely.
    # Example match: "4G0-857-951-A".
    # NOTE(review): the previous comment also listed the space-separated
    # "0 986 4B7 013" as matching "after normalizing", but the only
    # preprocessing done before matching is upper-casing, and this pattern
    # does not accept spaces — confirm whether whitespace normalization was
    # intended.
    _re.compile(r'\b[A-Z0-9]{2,}[-/][A-Z0-9]{2,}([-/][A-Z0-9]+)+\b'),
]


def _should_bypass_cache(message: str) -> bool:
    """Tell whether a message must never be cached.

    An empty message always bypasses the cache; otherwise the upper-cased
    text is checked against every pattern in ``_CACHE_BYPASS_PATTERNS``.
    """
    if not message:
        return True
    shouted = message.upper()
    return any(pattern.search(shouted) for pattern in _CACHE_BYPASS_PATTERNS)


def _normalize_for_cache(message: str) -> str:
    """Reduce a message to its cache-relevant words.

    Steps: lowercase and trim, turn listed punctuation into spaces, collapse
    whitespace runs, then drop every word found in ``_CACHE_STOPWORDS``.
    Returns ``''`` for an empty message.
    """
    if not message:
        return ''
    text = _re.sub(r'[¿?¡!.,;:()\[\]{}\'"]+', ' ', message.lower().strip())
    text = _re.sub(r'\s+', ' ', text).strip()
    kept = []
    for word in text.split():
        if word and word not in _CACHE_STOPWORDS:
            kept.append(word)
    return ' '.join(kept)


def _cache_key(user_message: str, inventory_context: str | None) -> str | None:
    """Build a stable cache key for (message, inventory_context).

    The key is ``"<normalized message>::<12 hex chars of the context hash>"``.

    Args:
        user_message: Raw user text.
        inventory_context: Inventory summary injected into the prompt, or
            None/empty when there is none.

    Returns:
        The key, or None when the message must bypass the cache (it matches
        a bypass pattern, is empty, or consists only of stopwords and
        punctuation).
    """
    if _should_bypass_cache(user_message):
        return None
    normalized = _normalize_for_cache(user_message)
    if not normalized:
        return None
    # Fingerprint the context text so that the same question under a
    # different inventory context gets its own entry. Callers passing
    # byte-identical context strings share entries.
    # md5 is used purely as a fingerprint, not for security;
    # usedforsecurity=False keeps it usable on FIPS-restricted builds and
    # produces the same digest.
    ctx_hash = _hashlib.md5(
        (inventory_context or '').encode(), usedforsecurity=False
    ).hexdigest()[:12]
    return f"{normalized}::{ctx_hash}"


def _cache_get(key: str):
    """Look up a cached response and update the hit/miss counters.

    Returns the stored response dict on a live hit. A falsy key, an unknown
    key or an expired entry (which is removed) counts as a miss and yields
    None.
    """
    global _CACHE_HITS, _CACHE_MISSES
    entry = _RESPONSE_CACHE.get(key) if key else None
    if entry:
        expires_at, data = entry
        if not _time_chat.time() > expires_at:
            _CACHE_HITS += 1
            return data
        # Expired: drop it so it does not linger until the next eviction.
        _RESPONSE_CACHE.pop(key, None)
    _CACHE_MISSES += 1
    return None


def _cache_set(key: str, data: dict):
    """Store a response under ``key`` with the standard TTL.

    Falsy keys and falsy data (None, empty dict) are ignored. When the cache
    grows past ``_CACHE_MAX_SIZE`` the 200 entries closest to expiry are
    evicted.
    """
    if not (key and data):
        return
    expires_at = _time_chat.time() + _CACHE_TTL_SECONDS
    _RESPONSE_CACHE[key] = (expires_at, data)
    if len(_RESPONSE_CACHE) <= _CACHE_MAX_SIZE:
        return
    # Bounded cache: order keys by expiry time and drop the first 200.
    by_expiry = sorted(_RESPONSE_CACHE, key=lambda k: _RESPONSE_CACHE[k][0])
    for stale_key in by_expiry[:200]:
        _RESPONSE_CACHE.pop(stale_key, None)


def chat_cache_stats() -> dict:
    """Report cache size, hit/miss counters, hit rate (percent) and TTL."""
    lookups = _CACHE_HITS + _CACHE_MISSES
    if lookups:
        hit_rate = _CACHE_HITS * 100 / lookups
    else:
        # No lookups yet: report 0 rather than dividing by zero.
        hit_rate = 0
    stats = {}
    stats['entries'] = len(_RESPONSE_CACHE)
    stats['hits'] = _CACHE_HITS
    stats['misses'] = _CACHE_MISSES
    stats['hit_rate_pct'] = round(hit_rate, 1)
    stats['ttl_seconds'] = _CACHE_TTL_SECONDS
    return stats


def chat_cache_clear():
    """Drop every cached response (e.g. after bulk inventory changes).

    Only the entries are removed; the hit/miss counters keep their values.
    """
    _RESPONSE_CACHE.clear()


def _parse_chat_reply(content):
    """Convert raw model text into the reply dict returned by ``chat``.

    Accepts bare JSON or JSON wrapped in a Markdown code fence. Anything
    that does not parse to a JSON object is wrapped as a plain message.
    """
    text = content.strip()
    if text.startswith("```"):
        # Drop the opening fence line (``` or ```json) and, when present,
        # the closing fence. A reply cut off before the closing fence keeps
        # its last JSON line.
        text = text.partition("\n")[2].rstrip()
        if text.endswith("```"):
            text = text[:-3]
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        parsed = None
    if isinstance(parsed, dict):
        return parsed
    return {"message": content, "search_query": None, "vehicle": None}


def chat(user_message, conversation_history=None, inventory_context=None):
    """Send a message to the AI and get a response with search suggestions.

    Each model in FALLBACK_MODELS is tried in order until one returns
    non-empty content. Responses are cached for stateless calls (no
    conversation history) unless the message matches a cache-bypass pattern;
    error results are never cached.

    Args:
        user_message: The user's chat message.
        conversation_history: Previous messages in the conversation.
        inventory_context: Optional inventory summary string to inject into
            the system prompt.

    Returns:
        dict: The JSON object produced by the model, or a dict with
        ``message`` (raw model text, a "busy" notice, or an error
        description) and ``search_query`` / ``vehicle`` set to None.

    Raises:
        ValueError: If a model in FALLBACK_MODELS is not a ``:free`` model.
    """
    # Cache lookup — only when there's no conversation history (stateless)
    cache_key = None
    if not conversation_history:
        cache_key = _cache_key(user_message, inventory_context)
        cached = _cache_get(cache_key)
        if cached is not None:
            print(f"[AI] Cache HIT for '{user_message[:40]}...'")
            return cached

    system_content = SYSTEM_PROMPT
    if inventory_context:
        system_content = SYSTEM_PROMPT + "\n\n" + inventory_context

    messages = [{"role": "system", "content": system_content}]
    if conversation_history:
        messages.extend(conversation_history)
    messages.append({"role": "user", "content": user_message})

    last_error = None

    # Try each model in the fallback chain; any failure moves on to the next.
    for model_id in FALLBACK_MODELS:
        _validate_model(model_id)  # Block paid models
        try:
            resp = requests.post(
                OPENROUTER_URL,
                headers={
                    "Authorization": f"Bearer {OPENROUTER_API_KEY}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": model_id,
                    "messages": messages,
                    "max_tokens": 800,
                    "temperature": 0.3,
                },
                timeout=25,
            )
            if resp.status_code == 429:
                print(f"[AI] Rate limited on {model_id}, trying next model...")
                last_error = "rate_limit"
                continue
            if resp.status_code >= 400:
                print(f"[AI] HTTP {resp.status_code} on {model_id}: {resp.text[:200]}")
                last_error = f"http_{resp.status_code}"
                continue
            data = resp.json()
            # "choices" may be missing or empty, and "message"/"content" may
            # be null; all of those are treated as an empty response.
            choice = (data.get("choices") or [{}])[0]
            content = ((choice.get("message") or {}).get("content") or "").strip()
            finish = choice.get("finish_reason", "")

            if not content:
                print(f"[AI] Empty response from {model_id} (finish={finish})")
                last_error = "empty_response"
                continue

            print(f"[AI] Response from {model_id} (finish={finish}, {len(content)} chars)")

            # Both parsed JSON and the plain-text fallback are real answers
            # from the model, so both are cached to save the next API call.
            reply = _parse_chat_reply(content)
            if cache_key:
                _cache_set(cache_key, reply)
            return reply
        except Exception as e:
            print(f"[AI] Error with {model_id}: {e}")
            last_error = str(e)
            continue

    # All models exhausted — DON'T cache errors, we want retries next time
    if last_error == "rate_limit":
        return {"message": "El asistente está ocupado. Intenta de nuevo en unos segundos.", "search_query": None, "vehicle": None}
    return {
        "message": f"Error de conexion: {last_error}",
        "search_query": None,
        "vehicle": None,
    }