feat: complete session — catalog, marketplace, WhatsApp, peer-to-peer, install scripts

Major features:
- Pixel-Perfect glassmorphism design (landing + POS + public catalog)
- OEM/Local catalog toggle with Nexpart taxonomy (14 groups, 108 subgroups, 558 part types)
- Marketplace B2B Phase 1 (bodegas, POs, status machine, WA+email notifications)
- Peer-to-peer inventory (multi-instance, LAN discovery)
- WhatsApp: photo→Vision AI, voice→Whisper, conversational quotations
- Smart unified search (VIN/plate/part_number/keyword auto-detect)
- Shop Supplies tab (vehicle-independent parts)
- Chatbot AI fallback chain (5 models) + response cache
- CSV inventory import tool + setup_instance.sh installer
- Tablet-responsive CSS + sidebar toggle
- Filters, export CSV, employee edit, business data save
- Quotation system (WA→POS) with auto-print on confirmation
- Live stats on landing page

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-18 05:35:53 +00:00
parent 6b097614a0
commit e95f7cf684
54 changed files with 11226 additions and 1422 deletions
--- a/pos/services/ai_chat.py
+++ b/pos/services/ai_chat.py
@@ -9,8 +9,20 @@ OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"

 # ⚠️ SOLO MODELOS GRATUITOS — No cambiar a modelos de pago.
 # El modelo DEBE terminar en ":free" para garantizar costo $0.
-# Alternativas gratuitas: "meta-llama/llama-4-scout:free", "google/gemma-3-27b-it:free"
-MODEL = "qwen/qwen3.6-plus-preview:free"
+MODEL = "qwen/qwen3.6-plus:free"
+
+# Fallback chain: if the primary model is rate limited (429) or returns 404
+# (deprecated), try the following ones. All of them are :free. Providers are
+# mixed on purpose because rate limits apply per provider.
+# List updated 2026-04-09 after qwen3.6-plus was deprecated.
+# NOTE(review): the chat() loop visible in this change iterates
+# FALLBACK_MODELS only and never sends MODEL — confirm MODEL is still tried
+# somewhere, and whether it should keep pointing at the model this comment
+# describes as deprecated.
+FALLBACK_MODELS = [
+    "openai/gpt-oss-120b:free",               # OpenInference — broad coverage
+    "google/gemma-4-31b-it:free",             # Google — new, 262K ctx
+    "qwen/qwen3-next-80b-a3b-instruct:free",  # Alibaba — 262K ctx
+    "z-ai/glm-4.5-air:free",                  # Z.AI
+    "google/gemma-3-27b-it:free",             # Google — vision backup
+    "meta-llama/llama-3.3-70b-instruct:free", # Meta — last-resort fallback
+]

 def _validate_model(model_id):
    """Ensure only free models are used. Raises if model is not free."""
@@ -318,15 +330,155 @@ def classify_part(part_number):
            return {"name": None, "brand": None, "vehicle": None, "category": None}


+# ═══════════════════════════════════════════════════════════════════════════
+# RESPONSE CACHE — reduces OpenRouter calls for repeated questions
+# ═══════════════════════════════════════════════════════════════════════════
+# Keyed by a normalized form of the user message. TTL 1 hour. Bypasses
+# caching for messages containing VINs or specific part numbers (where the
+# answer depends on the exact string).
+
+import hashlib as _hashlib
+import re as _re
+import time as _time_chat
+
+# In-memory response cache shared by every caller in this module.
+_RESPONSE_CACHE = {}           # key → (expires_at, response_dict)
+_CACHE_TTL_SECONDS = 3600      # 1 hour
+# Soft cap on the number of entries; _cache_set() evicts once the dict grows
+# past this size.
+_CACHE_MAX_SIZE = 1000
+# Module-level counters: incremented by _cache_get(), reported by
+# chat_cache_stats().
+_CACHE_HITS = 0
+_CACHE_MISSES = 0
+
+# Stopwords that add noise but no meaning — stripped from cache keys.
+# Spanish and English filler words, all lowercase; they are compared against
+# the lowercased tokens produced by _normalize_for_cache().
+_CACHE_STOPWORDS = {
+    'necesito', 'necesitas', 'me', 'das', 'dame', 'tienes', 'tiene', 'hay',
+    'quiero', 'quisiera', 'puedes', 'puede', 'favor', 'por', 'porfavor',
+    'hola', 'buenos', 'dias', 'tardes', 'noches', 'holaa',
+    'i', 'need', 'want', 'do', 'you', 'have', 'please',
+}
+
+# Patterns that disable caching — if the message contains any of these, we
+# never cache the response because the answer is specific to that exact input.
+# Rules designed to minimize false positives against normal Spanish queries
+# like "necesito balatas para corolla 2018".
+# The character classes are upper-case only: _should_bypass_cache() searches
+# message.upper(), so the patterns never see lower-case letters.
+_CACHE_BYPASS_PATTERNS = [
+    # 17-char VIN (strict, no spaces, alphanumeric except I/O/Q)
+    _re.compile(r'\b[A-HJ-NPR-Z0-9]{17}\b'),
+    # Long numeric (12+ digits — too long to be a year/model code)
+    _re.compile(r'\b\d{12,}\b'),
+    # Mexican license plate: 3 letters + optional dash/space + 3-4 digits.
+    # NOTE(review): this also matches any standalone three-letter word
+    # followed by 3-4 digits, e.g. "KIA 2018" or "BMW 2015", so those queries
+    # are never cached — confirm that trade-off is intended.
+    _re.compile(r'\b[A-Z]{3}[-\s]?\d{3,4}\b'),
+    # OEM part number with REQUIRED dash/slash separators: at least three
+    # alphanumeric groups, the first two with 2+ characters each, so a string
+    # with a single separator does not match. The groups are not required to
+    # mix letters and digits; an all-numeric "2024-05-12" matches as well.
+    # Example match: "4G0-857-951-A".
+    # NOTE(review): a space-separated number such as "0 986 4B7 013" matches
+    # only if spaces are rewritten to dashes beforehand; no such normalization
+    # is visible in _should_bypass_cache() — confirm where it happens.
+    _re.compile(r'\b[A-Z0-9]{2,}[-/][A-Z0-9]{2,}([-/][A-Z0-9]+)+\b'),
+]
+
+
+def _should_bypass_cache(message: str) -> bool:
+    """Decide whether *message* must skip the response cache.
+
+    Returns True for an empty or None message, or when the upper-cased text
+    matches any pattern in ``_CACHE_BYPASS_PATTERNS`` (VIN, long number,
+    plate, separator-delimited part number). Returns False otherwise.
+    """
+    if message:
+        shouted = message.upper()
+        return any(rx.search(shouted) for rx in _CACHE_BYPASS_PATTERNS)
+    # Nothing to build a key from — treat as uncacheable.
+    return True
+
+
+def _normalize_for_cache(message: str) -> str:
+    """Reduce *message* to a canonical token string for use in a cache key.
+
+    Steps, in order: lowercase, fold accents, replace punctuation with
+    spaces, split on whitespace, drop ``_CACHE_STOPWORDS``, and join the
+    remaining tokens with single spaces.
+
+    Accent folding makes "buenos días" and "buenos dias" produce the same
+    key and lets the unaccented stopword list match accented input. It is
+    lossy on purpose ("ñ" becomes "n"); the result is only ever used as a
+    lookup key, never shown to the user.
+
+    Args:
+        message: Raw user text; may be empty or None.
+
+    Returns:
+        The normalized string, or '' when nothing meaningful is left.
+    """
+    import unicodedata  # stdlib; imported locally because only this helper needs it
+
+    if not message:
+        return ''
+    lowered = message.lower()
+    # Decompose accented characters and drop the combining marks.
+    folded = ''.join(
+        ch for ch in unicodedata.normalize('NFKD', lowered)
+        if not unicodedata.combining(ch)
+    )
+    spaced = _re.sub(r'[¿?¡!.,;:()\[\]{}\'"]+', ' ', folded)
+    # str.split() with no argument trims the ends and collapses whitespace
+    # runs, so no separate strip/collapse pass is needed.
+    tokens = [tok for tok in spaced.split() if tok not in _CACHE_STOPWORDS]
+    return ' '.join(tokens)
+
+
+def _cache_key(user_message: str, inventory_context: str | None) -> str | None:
+    """Build a stable cache key for (message, inventory_context).
+
+    Args:
+        user_message: Raw user text.
+        inventory_context: Inventory summary injected into the prompt, or
+            None when there is none.
+
+    Returns:
+        ``"<normalized message>::<context digest>"``, or None when the
+        message must bypass the cache (see ``_should_bypass_cache``) or
+        normalizes to an empty string.
+    """
+    if _should_bypass_cache(user_message):
+        return None
+    normalized = _normalize_for_cache(user_message)
+    if not normalized:
+        return None
+    # Digest the inventory context so the same question with the same context
+    # hits, while the same question with a different context does not.
+    # SHA-256 is used instead of MD5 because MD5 can be unavailable on
+    # FIPS-restricted Python builds; keys live only in process memory, so
+    # changing the digest has no compatibility cost.
+    ctx_bytes = (inventory_context or '').encode('utf-8')
+    ctx_hash = _hashlib.sha256(ctx_bytes).hexdigest()[:12]
+    return f"{normalized}::{ctx_hash}"
+
+
+def _cache_get(key: str | None):
+    """Look up *key* in the response cache and update the hit/miss counters.
+
+    Args:
+        key: Key produced by ``_cache_key``; a falsy key (cache bypass) is
+            counted as a miss.
+
+    Returns:
+        A deep copy of the cached response, or None on a miss or an expired
+        entry (expired entries are removed on access). A copy is returned so
+        that a caller mutating the response cannot alter the cached entry.
+    """
+    global _CACHE_HITS, _CACHE_MISSES
+    import copy  # stdlib; imported locally to keep this block self-contained
+
+    entry = _RESPONSE_CACHE.get(key) if key else None
+    if entry is not None:
+        expires_at, data = entry
+        if _time_chat.time() <= expires_at:
+            _CACHE_HITS += 1
+            return copy.deepcopy(data)
+        # Expired: drop it so it no longer occupies a slot.
+        _RESPONSE_CACHE.pop(key, None)
+    _CACHE_MISSES += 1
+    return None
+
+
+def _cache_set(key: str, data: dict):
+    """Store *data* under *key* with a TTL of ``_CACHE_TTL_SECONDS``.
+
+    Does nothing when *key* or *data* is falsy. A deep copy is stored so
+    that later mutation of the object handed back to the caller does not
+    change the cached entry.
+
+    Args:
+        key: Key produced by ``_cache_key``.
+        data: Response to cache.
+    """
+    import copy  # stdlib; imported locally to keep this block self-contained
+
+    if not key or not data:
+        return
+    _RESPONSE_CACHE[key] = (
+        _time_chat.time() + _CACHE_TTL_SECONDS,
+        copy.deepcopy(data),
+    )
+    # Bounded cache — once past the limit, evict the 200 entries that expire
+    # soonest (already-expired ones sort first).
+    if len(_RESPONSE_CACHE) > _CACHE_MAX_SIZE:
+        soonest_first = sorted(
+            _RESPONSE_CACHE, key=lambda k: _RESPONSE_CACHE[k][0]
+        )
+        for stale_key in soonest_first[:200]:
+            _RESPONSE_CACHE.pop(stale_key, None)
+
+
+def chat_cache_stats() -> dict:
+    """Report cache diagnostics.
+
+    Returns a dict with the current entry count, the hit and miss counters,
+    the hit rate as a percentage rounded to one decimal (0 when no lookups
+    have happened yet), and the configured TTL in seconds.
+    """
+    hits, misses = _CACHE_HITS, _CACHE_MISSES
+    lookups = hits + misses
+    if lookups:
+        pct = hits * 100 / lookups
+    else:
+        pct = 0
+    return dict(
+        entries=len(_RESPONSE_CACHE),
+        hits=hits,
+        misses=misses,
+        hit_rate_pct=round(pct, 1),
+        ttl_seconds=_CACHE_TTL_SECONDS,
+    )
+
+
+def chat_cache_clear():
+    """Manual cache invalidation — e.g. after inventory bulk changes.
+
+    Empties ``_RESPONSE_CACHE`` in place. The hit/miss counters are not
+    reset here.
+    """
+    _RESPONSE_CACHE.clear()
+
+
 def chat(user_message, conversation_history=None, inventory_context=None):
    """Send a message to the AI and get a response with search suggestions.

+    Caches responses for repeated identical questions (subject to bypass
+    rules — messages with VINs / part numbers / plates are never cached).
+
    Args:
        user_message: The user's chat message.
        conversation_history: Previous messages in the conversation.
        inventory_context: Optional inventory summary string to inject into the system prompt.
    """
-    _validate_model(MODEL)  # Block paid models
+    # Cache lookup — only when there's no conversation history (stateless)
+    cache_key = None
+    if not conversation_history:
+        cache_key = _cache_key(user_message, inventory_context)
+        cached = _cache_get(cache_key)
+        if cached is not None:
+            print(f"[AI] Cache HIT for '{user_message[:40]}...'")
+            return cached

    system_content = SYSTEM_PROMPT
    if inventory_context:
@@ -337,10 +489,11 @@ def chat(user_message, conversation_history=None, inventory_context=None):
        messages.extend(conversation_history)
    messages.append({"role": "user", "content": user_message})

-    import time
-    max_retries = 3
+    last_error = None

-    for attempt in range(max_retries):
+    # Try each model in the fallback chain on 429 (rate limit)
+    for model_id in FALLBACK_MODELS:
+        _validate_model(model_id)  # Block paid models
        try:
            resp = requests.post(
                OPENROUTER_URL,
@@ -349,23 +502,32 @@ def chat(user_message, conversation_history=None, inventory_context=None):
                    "Content-Type": "application/json",
                },
                json={
-                    "model": MODEL,
+                    "model": model_id,
                    "messages": messages,
-                    "max_tokens": 500,
+                    "max_tokens": 800,
                    "temperature": 0.3,
                },
-                timeout=20,
+                timeout=25,
            )
            if resp.status_code == 429:
-                # Rate limited — wait and retry
-                wait = (attempt + 1) * 5  # 5s, 10s, 15s
-                if attempt < max_retries - 1:
-                    time.sleep(wait)
-                    continue
-                return {"message": "El asistente está ocupado. Intenta de nuevo en unos segundos.", "search_query": None, "vehicle": None}
-            resp.raise_for_status()
+                print(f"[AI] Rate limited on {model_id}, trying next model...")
+                last_error = "rate_limit"
+                continue
+            if resp.status_code >= 400:
+                print(f"[AI] HTTP {resp.status_code} on {model_id}: {resp.text[:200]}")
+                last_error = f"http_{resp.status_code}"
+                continue
            data = resp.json()
-            content = data["choices"][0]["message"]["content"]
+            choice = data.get("choices", [{}])[0]
+            content = choice.get("message", {}).get("content", "").strip()
+            finish = choice.get("finish_reason", "")
+
+            if not content:
+                print(f"[AI] Empty response from {model_id} (finish={finish})")
+                last_error = "empty_response"
+                continue
+
+            print(f"[AI] Response from {model_id} (finish={finish}, {len(content)} chars)")

            # Try to parse JSON response
            try:
@@ -376,14 +538,27 @@ def chat(user_message, conversation_history=None, inventory_context=None):
                    parsed = json.loads(json_str)
                else:
                    parsed = json.loads(stripped)
+                # Successful JSON response — cache it
+                if cache_key:
+                    _cache_set(cache_key, parsed)
                return parsed
            except (json.JSONDecodeError, IndexError):
-                return {"message": content, "search_query": None, "vehicle": None}
+                fallback = {"message": content, "search_query": None, "vehicle": None}
+                # Cache the fallback too — the model gave us a real answer,
+                # it just wasn't JSON. Next hit saves the API call.
+                if cache_key:
+                    _cache_set(cache_key, fallback)
+                return fallback
        except Exception as e:
-            if attempt < max_retries - 1:
-                continue
-            return {
-                "message": f"Error de conexion: {str(e)}",
-                "search_query": None,
-                "vehicle": None,
-            }
+            print(f"[AI] Error with {model_id}: {e}")
+            last_error = str(e)
+            continue
+
+    # All models exhausted — DON'T cache errors, we want retries next time
+    if last_error == "rate_limit":
+        return {"message": "El asistente está ocupado. Intenta de nuevo en unos segundos.", "search_query": None, "vehicle": None}
+    return {
+        "message": f"Error de conexion: {last_error}",
+        "search_query": None,
+        "vehicle": None,
+    }