feat: Implementar PWA, Analytics, Reportes PDF y mejoras OCR

FASE 1 - PWA y Frontend: - Crear templates/base.html, dashboard.html, analytics.html, executive.html - Crear static/css/main.css con diseño responsivo - Agregar static/js/app.js, pwa.js, camera.js, charts.js - Implementar manifest.json y service-worker.js para PWA - Soporte para captura de tickets desde cámara móvil FASE 2 - Analytics: - Crear módulo analytics/ con predictions.py, trends.py, comparisons.py - Implementar predicción básica con promedio móvil + tendencia lineal - Agregar endpoints /api/analytics/trends, predictions, comparisons - Integrar Chart.js para gráficas interactivas FASE 3 - Reportes PDF: - Crear módulo reports/ con pdf_generator.py - Implementar SalesReportPDF con generar_reporte_diario y ejecutivo - Agregar comando /reporte [diario|semanal|ejecutivo] - Agregar endpoints /api/reports/generate y /api/reports/download FASE 4 - Mejoras OCR: - Crear módulo ocr/ con processor.py, preprocessor.py, patterns.py - Implementar AmountDetector con patrones múltiples de montos - Agregar preprocesador adaptativo con pipelines para diferentes condiciones - Soporte para corrección de rotación (deskew) y threshold Otsu Dependencias agregadas: - reportlab, matplotlib (PDF) - scipy, pandas (analytics) - imutils, deskew, cachetools (OCR) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-19 03:26:16 +00:00
parent ed1658eb2b
commit 9936deaa90
25 changed files with 5501 additions and 282 deletions
--- a/sales-bot/ocr/processor.py
+++ b/sales-bot/ocr/processor.py
@@ -0,0 +1,294 @@
+"""
+Main OCR processor for Sales Bot
+Combines preprocessing, text extraction, and amount detection
+"""
+
+import logging
+import os
+from typing import Dict, Optional
+from io import BytesIO
+
+logger = logging.getLogger(__name__)
+
+# Optional OCR engine. pytesseract and Pillow are imported in the same guard,
+# so a missing install of either one disables OCR instead of making this
+# module fail to import.
+try:
+    import pytesseract
+    from PIL import Image
+    # Checked by OCRProcessor.process() before any OCR work is attempted.
+    TESSERACT_AVAILABLE = True
+except ImportError:
+    TESSERACT_AVAILABLE = False
+    # NOTE: the message names only pytesseract, but this branch also runs
+    # when Pillow is the package that is missing.
+    logger.warning("pytesseract not available. OCR will not work.")
+
+# Import local modules
+from .preprocessor import ImagePreprocessor, preprocess_image
+from .amount_detector import AmountDetector, detectar_monto
+from .patterns import (
+    detectar_formato_ticket,
+    extraer_fecha_ticket,
+    extraer_cliente_ticket,
+    contar_tubos_texto,
+    get_patron_total
+)
+
+
+class OCRProcessor:
+    """
+    Coordinate image preprocessing, Tesseract text extraction and ticket
+    data parsing.
+
+    Attributes:
+        preprocessor: ImagePreprocessor applied to each image before OCR.
+        amount_detector: AmountDetector used to find the amount in the text.
+        confidence_threshold: Float read from the OCR_CONFIDENCE_THRESHOLD
+            environment variable (default 0.6). It is only stored here; no
+            method of this class reads it.
+        tesseract_config: Command-line options handed to Tesseract.
+    """
+
+    def __init__(self):
+        self.preprocessor = ImagePreprocessor()
+        self.amount_detector = AmountDetector()
+        self.confidence_threshold = float(os.getenv('OCR_CONFIDENCE_THRESHOLD', '0.6'))
+
+        # Default engine mode (--oem 3), page treated as a single uniform
+        # block of text (--psm 6), Spanish language data (-l spa).
+        self.tesseract_config = '--oem 3 --psm 6 -l spa'
+
+    @staticmethod
+    def _error_result(message: str) -> Dict:
+        """Build the result dict returned when no data could be extracted."""
+        return {
+            'error': message,
+            'texto': '',
+            'monto': 0,
+            'confianza': 0
+        }
+
+    def process(self, image_bytes: bytes) -> Dict:
+        """
+        Process a ticket image and extract relevant data.
+
+        This method does not raise: any exception is logged and reported
+        through the 'error' key of the returned dict.
+
+        Args:
+            image_bytes: Raw image bytes (JPEG, PNG, etc.)
+
+        Returns:
+            On success, a dict with:
+            - texto: Full extracted text
+            - monto: Detected total amount (0 when none was found)
+            - monto_tipo: Kind of amount reported by the detector
+              ('unknown' when none was reported)
+            - cliente: Client name if found
+            - fecha: Date if found
+            - tubos: Number of tubes/items
+            - formato: Detected ticket format
+            - confianza: Confidence score between 0 and 1
+
+            On failure, a dict with 'error' (message), 'texto' (''),
+            'monto' (0) and 'confianza' (0).
+        """
+        if not TESSERACT_AVAILABLE:
+            return self._error_result('Tesseract OCR not available')
+
+        try:
+            texto = self._extract_best_text(image_bytes)
+            if not texto:
+                return self._error_result('No text could be extracted')
+
+            # Detect ticket format
+            formato = detectar_formato_ticket(texto)
+
+            # Extract amount. The detector may return nothing, or a dict
+            # whose values are None; both cases fall back to neutral values.
+            monto_result = self.amount_detector.detectar_monto(texto) or {}
+            monto = monto_result.get('monto') or 0
+            monto_confianza = monto_result.get('confianza') or 0
+            monto_tipo = monto_result.get('tipo') or 'unknown'
+
+            # Extract other data
+            cliente = extraer_cliente_ticket(texto)
+            fecha = extraer_fecha_ticket(texto)
+            tubos = contar_tubos_texto(texto)
+
+            # Calculate overall confidence
+            confianza = self._calculate_overall_confidence(
+                texto, monto, monto_confianza, cliente, fecha
+            )
+
+            return {
+                'texto': texto,
+                'monto': monto,
+                'monto_tipo': monto_tipo,
+                'cliente': cliente,
+                'fecha': fecha,
+                'tubos': tubos,
+                'formato': formato,
+                'confianza': confianza
+            }
+
+        except Exception as e:
+            # Top-level boundary: callers get an error dict, never a raise.
+            logger.error("Error processing image: %s", e, exc_info=True)
+            return self._error_result(str(e))
+
+    def _extract_best_text(self, image_bytes: bytes) -> str:
+        """
+        OCR the preprocessed image, retrying on the original image when
+        preprocessing fails or yields fewer than 10 characters of text.
+
+        Returns:
+            The longer of the two OCR results ('' when both are empty).
+        """
+        try:
+            processed_bytes = self.preprocessor.preprocess(image_bytes)
+        except Exception as e:
+            # Preprocessing is an enhancement step; OCR can still run on
+            # the untouched image.
+            logger.warning("Preprocessing failed, using original image: %s", e)
+            return self._extract_text(image_bytes)
+
+        texto = self._extract_text(processed_bytes)
+        if len(texto.strip()) >= 10:
+            return texto
+
+        # Too little text: preprocessing may have damaged the image, so try
+        # the original. Keep whichever attempt recovered more, so a short
+        # but usable result is never replaced by an empty one.
+        texto_original = self._extract_text(image_bytes)
+        if len(texto_original) >= len(texto):
+            return texto_original
+        return texto
+
+    def _extract_text(self, image_bytes: bytes) -> str:
+        """
+        Extract text from image bytes using Tesseract.
+
+        Returns:
+            The cleaned OCR text, or '' when the image cannot be decoded or
+            OCR fails (the error is logged, not raised).
+        """
+        try:
+            # The context manager releases the image once OCR is done.
+            with Image.open(BytesIO(image_bytes)) as img:
+                # Modes other than RGB and greyscale are converted to RGB
+                # before being handed to Tesseract.
+                if img.mode not in ('RGB', 'L'):
+                    img = img.convert('RGB')
+
+                texto = pytesseract.image_to_string(img, config=self.tesseract_config)
+
+            return self._clean_text(texto)
+
+        except Exception as e:
+            logger.error("Error extracting text: %s", e)
+            return ''
+
+    def _clean_text(self, texto: str) -> str:
+        """
+        Normalise the whitespace of raw OCR output.
+
+        Runs of spaces, tabs and carriage returns collapse to one space,
+        blank lines and spaces around line breaks are dropped, and the
+        result is stripped. Line breaks between non-empty lines are kept.
+
+        Returns:
+            The cleaned text, or '' for empty input.
+        """
+        if not texto:
+            return ''
+
+        import re
+
+        # Collapse horizontal whitespace only; '\s+' would also swallow the
+        # newlines and flatten the whole ticket into a single line.
+        texto = re.sub(r'[^\S\n]+', ' ', texto)
+        # Fold each line break, with any blank lines and padding around it,
+        # into a single newline.
+        texto = re.sub(r'\s*\n\s*', '\n', texto)
+
+        # No character substitutions are applied: swapping look-alike glyphs
+        # (0/O, 1/I, S/$) without context would corrupt real amounts.
+        return texto.strip()
+
+    def _calculate_overall_confidence(
+        self,
+        texto: str,
+        monto: float,
+        monto_confianza: float,
+        cliente: Optional[str],
+        fecha: Optional[str]
+    ) -> float:
+        """
+        Calculate overall extraction confidence.
+
+        Args:
+            texto: Extracted text.
+            monto: Detected amount. Part of the signature but not used in
+                the score.
+            monto_confianza: Confidence reported for the amount; None is
+                treated as 0.
+            cliente: Client name, if one was found.
+            fecha: Date, if one was found.
+
+        Returns:
+            Score capped at 1.0, built from: text length (up to 0.3),
+            amount confidence (weight 0.4), client and date found (0.1
+            each) and receipt keywords (0.05 each, up to 0.2).
+        """
+        confidence = 0.0
+
+        # Text quality (based on length)
+        if len(texto) > 50:
+            confidence += 0.2
+        if len(texto) > 200:
+            confidence += 0.1
+
+        # Amount detection confidence
+        confidence += (monto_confianza or 0) * 0.4
+
+        # Bonus for finding additional data
+        if cliente:
+            confidence += 0.1
+        if fecha:
+            confidence += 0.1
+
+        # Check for typical receipt keywords (substring match, lower-cased once)
+        keywords = ('total', 'cliente', 'fecha', 'ticket', 'venta', 'pago')
+        texto_lower = texto.lower()
+        found_keywords = sum(1 for kw in keywords if kw in texto_lower)
+        confidence += min(found_keywords * 0.05, 0.2)
+
+        return min(confidence, 1.0)
+
+    def process_multiple(self, images: list) -> Dict:
+        """
+        Process multiple images (e.g., multi-page receipt) and merge the
+        per-page results.
+
+        Args:
+            images: List of image bytes
+
+        Returns:
+            Dict with:
+            - texto: Page texts joined with a '---' separator line
+            - monto: Largest amount detected on any page (not a sum)
+            - cliente, fecha, formato: First non-empty value found
+            - tubos: Sum of the per-page counts
+            - confianza: Highest per-page confidence
+            - paginas: Number of images received
+        """
+        all_texto = []
+        max_monto = 0
+        cliente = None
+        fecha = None
+        tubos = 0
+        formato = None
+        max_confianza = 0
+
+        for img_bytes in images:
+            result = self.process(img_bytes)
+
+            if result.get('texto'):
+                all_texto.append(result['texto'])
+
+            # 'or 0' keeps a None reported by a page from breaking the
+            # comparisons and the running sum.
+            max_monto = max(max_monto, result.get('monto') or 0)
+            max_confianza = max(max_confianza, result.get('confianza') or 0)
+            tubos += result.get('tubos') or 0
+
+            # The first page that provides a value wins.
+            cliente = cliente or result.get('cliente') or None
+            fecha = fecha or result.get('fecha') or None
+            formato = formato or result.get('formato') or None
+
+        return {
+            'texto': '\n---\n'.join(all_texto),
+            'monto': max_monto,
+            'cliente': cliente,
+            'fecha': fecha,
+            'tubos': tubos,
+            'formato': formato,
+            'confianza': max_confianza,
+            'paginas': len(images)
+        }
+
+
+def procesar_ticket_imagen(image_bytes: bytes) -> Dict:
+    """
+    Run OCR on a single ticket image with a freshly created OCRProcessor.
+
+    Args:
+        image_bytes: Raw image bytes
+
+    Returns:
+        The dict produced by OCRProcessor.process()
+    """
+    return OCRProcessor().process(image_bytes)
+
+
+def procesar_multiples_imagenes(images: list) -> Dict:
+    """
+    Run OCR on several images with a freshly created OCRProcessor.
+
+    Args:
+        images: List of image bytes
+
+    Returns:
+        The combined dict produced by OCRProcessor.process_multiple()
+    """
+    return OCRProcessor().process_multiple(images)