feat: Implementar PWA, Analytics, Reportes PDF y mejoras OCR

FASE 1 - PWA y Frontend:
- Crear templates/base.html, dashboard.html, analytics.html, executive.html
- Crear static/css/main.css con diseño responsivo
- Agregar static/js/app.js, pwa.js, camera.js, charts.js
- Implementar manifest.json y service-worker.js para PWA
- Soporte para captura de tickets desde cámara móvil

FASE 2 - Analytics:
- Crear módulo analytics/ con predictions.py, trends.py, comparisons.py
- Implementar predicción básica con promedio móvil + tendencia lineal
- Agregar endpoints /api/analytics/trends, predictions, comparisons
- Integrar Chart.js para gráficas interactivas

FASE 3 - Reportes PDF:
- Crear módulo reports/ con pdf_generator.py
- Implementar SalesReportPDF con generar_reporte_diario y ejecutivo
- Agregar comando /reporte [diario|semanal|ejecutivo]
- Agregar endpoints /api/reports/generate y /api/reports/download

FASE 4 - Mejoras OCR:
- Crear módulo ocr/ con processor.py, preprocessor.py, patterns.py
- Implementar AmountDetector con patrones múltiples de montos
- Agregar preprocesador adaptativo con pipelines para diferentes condiciones
- Soporte para corrección de rotación (deskew) y threshold Otsu

Dependencias agregadas:
- reportlab, matplotlib (PDF)
- scipy, pandas (analytics)
- imutils, deskew, cachetools (OCR)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-19 03:26:16 +00:00
parent ed1658eb2b
commit 9936deaa90
25 changed files with 5501 additions and 282 deletions
--- a/sales-bot/ocr/patterns.py
+++ b/sales-bot/ocr/patterns.py
@@ -0,0 +1,223 @@
+"""
+Ticket format patterns for Sales Bot OCR
+Supports multiple ticket formats from different stores
+"""
+
+import re
+from typing import Dict, List, Optional
+
+# Ticket format configurations.
+#
+# Each entry maps a format name to:
+#   identificadores  - lowercase substrings used to recognise the format
+#   patron_total     - regex whose group 1 captures the ticket total
+#   patron_fecha     - regex whose group 1 captures the date (when present)
+#   prioridad        - ranking between formats; a lower number wins
+#
+# Every `patron_total` starts with the lookbehind `(?<!sub)` so the label
+# "total" is not matched inside "subtotal"; otherwise a first-match search
+# over a ticket that lists the subtotal above the total would return the
+# subtotal amount.
+TICKET_FORMATS = {
+    'oxxo': {
+        'identificadores': ['oxxo', 'femsa', 'cadena comercial'],
+        'patron_total': r'(?<!sub)total\s*\$?\s*([\d,]+\.\d{2})',
+        'patron_fecha': r'(\d{2}/\d{2}/\d{4})',
+        'patron_hora': r'(\d{2}:\d{2}:\d{2})',
+        'prioridad': 1
+    },
+    'walmart': {
+        'identificadores': ['walmart', 'walmex', 'wal-mart', 'bodega aurrera'],
+        # The currency sign is mandatory in this pattern (no `?` after `\$`).
+        'patron_total': r'(?<!sub)total\s*\$\s*([\d,]+\.\d{2})',
+        'patron_fecha': r'(\d{2}-\d{2}-\d{4})',
+        'prioridad': 2
+    },
+    'soriana': {
+        'identificadores': ['soriana', 'mega soriana', 'city club'],
+        'patron_total': r'(?<!sub)total\s*a?\s*pagar\s*\$?\s*([\d,]+\.\d{2})',
+        'patron_fecha': r'(\d{2}/\d{2}/\d{4})',
+        'prioridad': 3
+    },
+    'tienda_pintura': {
+        'identificadores': ['tinte', 'cromatique', 'oxidante', 'distribuidora',
+                          'colorante', 'pintura', 'tono', 'decolorante', 'revelador'],
+        # Decimals are optional and may be separated by a dot or whitespace.
+        'patron_total': r'(?<!sub)total\s*\$?\s*([\d,]+[\s\.]?\d{0,2})',
+        'patron_productos': r'^(.+?)\s+(\d{1,3})\s+\$?\s*([\d,]+)',
+        'patron_tubos': r'(\d+)\s*(?:tubos?|pzas?|piezas?|unid)',
+        'prioridad': 0  # Highest priority for paint stores
+    },
+    'farmacia': {
+        'identificadores': ['farmacia', 'guadalajara', 'benavides', 'similares', 'ahorro'],
+        'patron_total': r'(?<!sub)total\s*\$?\s*([\d,]+\.\d{2})',
+        'patron_fecha': r'(\d{2}/\d{2}/\d{2,4})',
+        'prioridad': 4
+    },
+    'seven_eleven': {
+        'identificadores': ['7-eleven', '7eleven', '7 eleven', 'iconn'],
+        'patron_total': r'(?<!sub)total\s*\$?\s*([\d,]+\.\d{2})',
+        'patron_fecha': r'(\d{2}/\d{2}/\d{4})',
+        'prioridad': 5
+    },
+    'generico': {
+        'identificadores': [],  # Fallback - matches everything
+        'patron_total': r'(?<!sub)total\s*\$?\s*([\d,]+[\s\.]?\d{0,2})',
+        'patron_fecha': r'(\d{2}[/-]\d{2}[/-]\d{2,4})',
+        'prioridad': 99
+    }
+}
+
+# Common patterns for amount extraction (in priority order).
+# Each entry is a (regex, label, priority) tuple; group 1 of the regex
+# captures the amount, with optional decimals separated by a dot or
+# whitespace.
+AMOUNT_PATTERNS = [
+    # Explicit total patterns.
+    # `(?<!sub)` stops the "total" label from matching inside "subtotal",
+    # which would otherwise shadow the lower-priority subtotal entry below.
+    (r'(?<!sub)total\s*a\s*pagar\s*:?\s*\$?\s*([\d,]+[\s\.]?\d{0,2})', 'total_a_pagar', 1),
+    (r'gran\s*total\s*:?\s*\$?\s*([\d,]+[\s\.]?\d{0,2})', 'gran_total', 2),
+    (r'(?<!sub)total\s*:?\s*\$?\s*([\d,]+[\s\.]?\d{0,2})', 'total', 3),
+
+    # Payment related
+    (r'a\s*cobrar\s*:?\s*\$?\s*([\d,]+[\s\.]?\d{0,2})', 'a_cobrar', 4),
+    (r'importe\s*:?\s*\$?\s*([\d,]+[\s\.]?\d{0,2})', 'importe', 5),
+    (r'monto\s*:?\s*\$?\s*([\d,]+[\s\.]?\d{0,2})', 'monto', 6),
+    (r'suma\s*:?\s*\$?\s*([\d,]+[\s\.]?\d{0,2})', 'suma', 7),
+
+    # Subtotal (lower priority)
+    (r'subtotal\s*:?\s*\$?\s*([\d,]+[\s\.]?\d{0,2})', 'subtotal', 8),
+
+    # Last resort - currency amounts at end of lines
+    (r'\$\s*([\d,]+\.\d{2})\s*$', 'monto_final', 9),
+]
+
+# Date regexes, tried in list order; group 1 of each captures the date text.
+DATE_PATTERNS = [
+    # DD/MM/YYYY
+    r"(\d{2}/\d{2}/\d{4})",
+    # DD-MM-YYYY
+    r"(\d{2}-\d{2}-\d{4})",
+    # YYYY-MM-DD
+    r"(\d{4}-\d{2}-\d{2})",
+    # DD/MM/YY
+    r"(\d{2}/\d{2}/\d{2})",
+    # D de Mes de YYYY (written-out Spanish date)
+    r"(\d{1,2}\s+de\s+\w+\s+de\s+\d{4})",
+]
+
+# Client name patterns, tried in list order; group 1 captures the rest of
+# the line after the label.
+CLIENT_PATTERNS = [
+    r'cliente\s*:?\s*(.+?)(?:\n|$)',
+    r'nombre\s*:?\s*(.+?)(?:\n|$)',
+    # Word boundaries keep "sr" from matching the start of "sra" (which
+    # would capture "a. <name>") or the inside of a word such as "israel".
+    r'\bsr\b\.?\s*(.+?)(?:\n|$)',
+    r'\bsra\b\.?\s*(.+?)(?:\n|$)',
+]
+
+
+def detectar_formato_ticket(texto: str) -> str:
+    """
+    Detect the ticket format from the identifiers found in its text.
+
+    The text is lowercased and every non-generic entry of TICKET_FORMATS is
+    checked for at least one identifier occurring as a substring. Among the
+    formats that matched, the one with the lowest 'prioridad' value wins.
+
+    Args:
+        texto: Text extracted from the ticket
+
+    Returns:
+        Name of the detected format, or 'generico' when nothing matched
+    """
+    haystack = texto.lower()
+
+    candidates = [
+        (name, spec.get('prioridad', 99))
+        for name, spec in TICKET_FORMATS.items()
+        if name != 'generico'
+        and any(token in haystack for token in spec.get('identificadores', []))
+    ]
+
+    if not candidates:
+        return 'generico'
+
+    # min() returns the first of equally ranked candidates, exactly like
+    # taking element 0 after a stable sort on the priority.
+    best_name, _ = min(candidates, key=lambda item: item[1])
+    return best_name
+
+
+def get_patron_total(formato: str) -> str:
+    """
+    Return the total-amount regex for a given ticket format.
+
+    Unknown formats, and formats without their own 'patron_total', fall back
+    to the pattern of the 'generico' entry.
+
+    Args:
+        formato: Format name
+
+    Returns:
+        Regex pattern used to extract the total
+    """
+    generic_spec = TICKET_FORMATS['generico']
+    spec = TICKET_FORMATS.get(formato, generic_spec)
+    default_pattern = generic_spec['patron_total']
+
+    if 'patron_total' in spec:
+        return spec['patron_total']
+    return default_pattern
+
+
+def extraer_fecha_ticket(texto: str) -> Optional[str]:
+    """
+    Extract the date from the ticket text.
+
+    DATE_PATTERNS are tried in list order (case-insensitively); the first
+    pattern that matches anywhere in the text decides the result.
+
+    Args:
+        texto: Ticket text
+
+    Returns:
+        The matched date string, or None when no pattern matches
+    """
+    # Lazy generator: searching stops at the first pattern that hits.
+    hits = (re.search(candidate, texto, re.IGNORECASE) for candidate in DATE_PATTERNS)
+    first_hit = next((hit for hit in hits if hit), None)
+
+    if first_hit is None:
+        return None
+    return first_hit.group(1)
+
+
+def extraer_cliente_ticket(texto: str) -> Optional[str]:
+    """
+    Extract the client name from the ticket text.
+
+    CLIENT_PATTERNS are tried in list order (case-insensitively). For each
+    match, characters other than word characters, whitespace, '-' and '.'
+    are removed and the result is trimmed. The first candidate longer than
+    two characters is returned.
+
+    Args:
+        texto: Ticket text
+
+    Returns:
+        The cleaned client name, or None when no usable name is found
+    """
+    for patron in CLIENT_PATTERNS:
+        match = re.search(patron, texto, re.IGNORECASE)
+        if not match:
+            continue
+
+        # Clean up common artifacts, then trim: removing symbols can expose
+        # leading/trailing whitespace that must neither be returned nor
+        # count toward the minimum-length check below.
+        cliente = re.sub(r'[^\w\s\-\.]', '', match.group(1)).strip()
+        if len(cliente) > 2:  # Valid name should have at least 3 chars
+            return cliente
+    return None
+
+
+def contar_tubos_texto(texto: str) -> int:
+    """
+    Count the number of tubes mentioned in the ticket text.
+
+    Explicit quantities ("3 tubos", "2 pzas de tinte", "cantidad: 4", "x5")
+    are summed first. Each number in the text is counted at most once, even
+    when several patterns match it (e.g. "x3 tubos" counts 3, not 6). When
+    no explicit quantity is found, the count is estimated from lines that
+    look like product entries.
+
+    Args:
+        texto: Ticket text
+
+    Returns:
+        Number of tubes detected
+    """
+    texto_lower = texto.lower()
+    total_tubos = 0
+
+    # Patterns for explicit tube counts; group 1 is always the quantity.
+    # The "x<n>" multiplier must not be preceded by a letter so that words
+    # ending in "x" (e.g. "fax 33") are not read as quantities.
+    patrones_tubos = [
+        r'(\d+)\s*(?:tubos?|tbs?)',
+        r'(\d+)\s*(?:pzas?|piezas?)\s*(?:de\s+)?(?:tinte|color)',
+        r'(?:cantidad|qty|cant)\s*:?\s*(\d+)',
+        r'(?<![a-z])x\s*(\d+)\s*(?:tubos?)?',
+    ]
+
+    # Positions of the numbers already added, to avoid double counting the
+    # same digits when more than one pattern matches them.
+    numeros_contados = set()
+
+    for patron in patrones_tubos:
+        for match in re.finditer(patron, texto_lower):
+            posicion = match.span(1)
+            if posicion in numeros_contados:
+                continue
+            numeros_contados.add(posicion)
+            try:
+                total_tubos += int(match.group(1))
+            except ValueError:
+                # Best effort: skip a digit run that int() refuses to parse.
+                continue
+
+    # If no explicit count found, estimate from line items
+    if total_tubos == 0:
+        # Count lines that look like product entries
+        for linea in texto_lower.split('\n'):
+            if any(word in linea for word in ['tinte', 'color', 'tubo', 'cromatique']):
+                # Check for quantity at start of line or after product name
+                qty_match = re.search(
+                    r'^(\d+)\s+|(?<![a-z])x\s*(\d+)|(\d+)\s*pza', linea
+                )
+                if qty_match:
+                    qty = next((g for g in qty_match.groups() if g), '1')
+                    total_tubos += int(qty)
+                else:
+                    total_tubos += 1  # Assume 1 if no explicit quantity
+
+    return total_tubos