feat: Implementar PWA, Analytics, Reportes PDF y mejoras OCR

FASE 1 - PWA y Frontend: - Crear templates/base.html, dashboard.html, analytics.html, executive.html - Crear static/css/main.css con diseño responsivo - Agregar static/js/app.js, pwa.js, camera.js, charts.js - Implementar manifest.json y service-worker.js para PWA - Soporte para captura de tickets desde cámara móvil FASE 2 - Analytics: - Crear módulo analytics/ con predictions.py, trends.py, comparisons.py - Implementar predicción básica con promedio móvil + tendencia lineal - Agregar endpoints /api/analytics/trends, predictions, comparisons - Integrar Chart.js para gráficas interactivas FASE 3 - Reportes PDF: - Crear módulo reports/ con pdf_generator.py - Implementar SalesReportPDF con generar_reporte_diario y ejecutivo - Agregar comando /reporte [diario|semanal|ejecutivo] - Agregar endpoints /api/reports/generate y /api/reports/download FASE 4 - Mejoras OCR: - Crear módulo ocr/ con processor.py, preprocessor.py, patterns.py - Implementar AmountDetector con patrones múltiples de montos - Agregar preprocesador adaptativo con pipelines para diferentes condiciones - Soporte para corrección de rotación (deskew) y threshold Otsu Dependencias agregadas: - reportlab, matplotlib (PDF) - scipy, pandas (analytics) - imutils, deskew, cachetools (OCR) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-19 03:26:16 +00:00
parent ed1658eb2b
commit 9936deaa90
25 changed files with 5501 additions and 282 deletions
--- a/sales-bot/ocr/__init__.py
+++ b/sales-bot/ocr/__init__.py
@@ -0,0 +1,17 @@
+"""
+OCR Module for Sales Bot
+Improved text extraction and amount detection from ticket images
+"""
+
+from .processor import procesar_ticket_imagen, OCRProcessor
+from .amount_detector import AmountDetector, detectar_monto
+from .patterns import detectar_formato_ticket, TICKET_FORMATS
+
+__all__ = [
+    'procesar_ticket_imagen',
+    'OCRProcessor',
+    'AmountDetector',
+    'detectar_monto',
+    'detectar_formato_ticket',
+    'TICKET_FORMATS'
+]
--- a/sales-bot/ocr/amount_detector.py
+++ b/sales-bot/ocr/amount_detector.py
@@ -0,0 +1,258 @@
+"""
+Amount detection for Sales Bot OCR
+Improved detection of total amounts from ticket text
+"""
+
+import re
+import logging
+from typing import Dict, List, Optional, Tuple
+
+logger = logging.getLogger(__name__)
+
+# Amount patterns in priority order.
+# Each entry is a (regex, tipo, prioridad) tuple: the regex has a single
+# capture group holding the amount text, `tipo` is the label reported for a
+# hit, and a lower `prioridad` number wins when results are ranked.
+# The capture `[\d,]+[\s\.]?\d{0,2}` accepts digits/commas, then optionally one
+# whitespace character or dot, then up to two more digits.
+PATTERNS = [
+    # Explicit total patterns (highest priority)
+    (r'total\s*a\s*pagar\s*:?\s*\$?\s*([\d,]+[\s\.]?\d{0,2})', 'total_a_pagar', 1),
+    (r'gran\s*total\s*:?\s*\$?\s*([\d,]+[\s\.]?\d{0,2})', 'gran_total', 2),
+    (r'total\s+final\s*:?\s*\$?\s*([\d,]+[\s\.]?\d{0,2})', 'total_final', 3),
+    # Anchored to the start of the text or of a line, so "subtotal" does not match here
+    (r'(?:^|\n)\s*total\s*:?\s*\$?\s*([\d,]+[\s\.]?\d{0,2})', 'total', 4),
+
+    # Payment related
+    (r'a\s*cobrar\s*:?\s*\$?\s*([\d,]+[\s\.]?\d{0,2})', 'a_cobrar', 5),
+    (r'importe\s*(?:total)?\s*:?\s*\$?\s*([\d,]+[\s\.]?\d{0,2})', 'importe', 6),
+    (r'monto\s*(?:total)?\s*:?\s*\$?\s*([\d,]+[\s\.]?\d{0,2})', 'monto', 7),
+    (r'suma\s*:?\s*\$?\s*([\d,]+[\s\.]?\d{0,2})', 'suma', 8),
+    (r'pago\s*:?\s*\$?\s*([\d,]+[\s\.]?\d{0,2})', 'pago', 9),
+
+    # Subtotal (lower priority - may need to add tax)
+    (r'subtotal\s*:?\s*\$?\s*([\d,]+[\s\.]?\d{0,2})', 'subtotal', 10),
+
+    # Generic currency patterns (lowest priority):
+    # a "$" amount with exactly two decimals at the end of a line or of the text
+    (r'\$\s*([\d,]+\.\d{2})\s*(?:\n|$)', 'monto_linea', 11),
+]
+
+# Words that indicate a line is NOT a total (negative patterns).
+# They are compared as plain lowercase substrings against ticket lines, so
+# every entry must itself be lowercase.
+EXCLUSION_WORDS = [
+    'cambio', 'efectivo', 'pago con', 'tarjeta', 'recibido',
+    'iva', 'impuesto', 'descuento', 'ahorro', 'puntos'
+]
+
+
+class AmountDetector:
+    """
+    Detects and extracts monetary amounts from ticket text.
+
+    Candidates are collected with the prioritized regexes in ``PATTERNS``,
+    filtered by the words in ``EXCLUSION_WORDS`` and by the
+    ``min_amount``/``max_amount`` range, then ranked by priority (lower is
+    better) and confidence (higher is better).
+    """
+
+    def __init__(self):
+        self.patterns = PATTERNS
+        self.min_amount = 1  # Minimum valid amount
+        self.max_amount = 1000000  # Maximum valid amount
+
+    def detectar_monto(self, texto: str) -> Optional[Dict]:
+        """
+        Detect the total amount of the ticket.
+
+        Args:
+            texto: Text extracted from the ticket
+
+        Returns:
+            dict with monto, tipo, patron and confianza, or None if nothing
+            is found (also None for empty/None input)
+        """
+        if not texto:
+            return None
+
+        texto_lower = texto.lower()
+        resultados = []
+
+        for patron, tipo, prioridad in self.patterns:
+            # finditer (instead of findall) keeps the position of every hit, so
+            # exclusion and confidence are evaluated on the line that actually
+            # matched rather than on any line that happens to show the same number.
+            for hit in re.finditer(patron, texto_lower, re.IGNORECASE | re.MULTILINE):
+                grupo = 1 if hit.re.groups else 0
+                match = hit.group(grupo)
+                if not match:
+                    continue
+                pos = hit.start(grupo)
+
+                # Skip if match is in an exclusion context
+                if self._is_excluded(texto_lower, match, pos):
+                    continue
+
+                monto = self._normalizar_monto(match)
+                if not (self.min_amount <= monto <= self.max_amount):
+                    continue
+
+                resultados.append({
+                    'monto': monto,
+                    'tipo': tipo,
+                    'patron': patron,
+                    'prioridad': prioridad,
+                    'confianza': self._calcular_confianza(texto_lower, match, tipo, pos)
+                })
+
+        if not resultados:
+            # Try to find the largest amount as fallback
+            return self._fallback_detection(texto)
+
+        # Best = lowest priority number, ties broken by highest confidence.
+        # min() returns the first minimal element, like a stable sort would.
+        best = min(resultados, key=lambda r: (r['prioridad'], -r['confianza']))
+        return {
+            'monto': best['monto'],
+            'tipo': best['tipo'],
+            'patron': best['patron'],
+            'confianza': best['confianza']
+        }
+
+    def _normalizar_monto(self, monto_str: str) -> float:
+        """
+        Normalize an amount string to float.
+
+        Handles various formats:
+        - 1,234.56 (US/Mexico format)
+        - 1234.56
+        - 1 234.56 (space separator)
+        - 1234,56 and 1.234,56 (European format)
+
+        Only the first line of the input is considered, because the amount
+        regexes may capture a line break followed by digits of the next line.
+
+        Returns:
+            The parsed amount, or 0.0 when the string is empty or not a number
+        """
+        if not monto_str:
+            return 0.0
+
+        # Remove currency symbols, then keep only the first line of what is left
+        lineas = monto_str.replace('$', '').strip().splitlines()
+        if not lineas:
+            return 0.0
+        monto = lineas[0].replace(' ', '')
+
+        if ',' in monto and '.' in monto:
+            # With both separators present, the one that appears last is the decimal mark
+            if monto.rfind(',') > monto.rfind('.'):
+                # European format: 1.234,56
+                monto = monto.replace('.', '').replace(',', '.')
+            else:
+                # US/Mexico format: 1,234.56
+                monto = monto.replace(',', '')
+        elif ',' in monto:
+            # Could be European (1234,56) or thousand separator (1,234)
+            parts = monto.split(',')
+            if len(parts) == 2 and len(parts[1]) == 2:
+                # European format
+                monto = monto.replace(',', '.')
+            else:
+                # Thousand separator
+                monto = monto.replace(',', '')
+
+        try:
+            return float(monto)
+        except ValueError:
+            return 0.0
+
+    def _is_excluded(self, texto: str, match: str, pos: Optional[int] = None) -> bool:
+        """
+        Check whether the match appears in an exclusion context.
+
+        Args:
+            texto: Text being analysed
+            match: Captured amount text
+            pos: Index of ``match`` inside ``texto``. When given, only the line
+                containing that index is inspected. When omitted, every line
+                containing ``match`` as a substring is inspected.
+
+        Returns:
+            True if an inspected line contains any of ``EXCLUSION_WORDS``
+        """
+        if pos is None:
+            lineas = [linea for linea in texto.split('\n') if match in linea]
+        else:
+            inicio = texto.rfind('\n', 0, pos) + 1
+            fin = texto.find('\n', pos)
+            if fin == -1:
+                fin = len(texto)
+            lineas = [texto[inicio:fin]]
+
+        return any(
+            exclusion in linea.lower()
+            for linea in lineas
+            for exclusion in EXCLUSION_WORDS
+        )
+
+    def _calcular_confianza(self, texto: str, match: str, tipo: str,
+                            pos: Optional[int] = None) -> float:
+        """
+        Calculate the confidence score for a match.
+
+        Args:
+            texto: Text being analysed
+            match: Captured amount text
+            tipo: Label of the pattern that produced the match
+            pos: Index of ``match`` inside ``texto``; when omitted, the first
+                occurrence of ``match`` is used
+
+        Returns:
+            Value between 0.0 and 1.0
+        """
+        if pos is None:
+            pos = texto.find(match)
+
+        confianza = 0.5  # Base confidence
+
+        # Higher confidence for explicit total patterns
+        if tipo in ['total_a_pagar', 'gran_total', 'total_final']:
+            confianza += 0.3
+        elif tipo == 'total':
+            confianza += 0.2
+
+        # Higher confidence if near end of text (last 40%)
+        if pos > len(texto) * 0.6:
+            confianza += 0.1
+
+        # Higher confidence if followed by payment info within 50 characters
+        fin = pos + len(match)
+        after_match = texto[fin:fin + 50]
+        if any(word in after_match.lower() for word in ['efectivo', 'tarjeta', 'cambio', 'gracias']):
+            confianza += 0.1
+
+        return min(confianza, 1.0)
+
+    def _fallback_detection(self, texto: str) -> Optional[Dict]:
+        """
+        Fallback detection when standard patterns fail.
+
+        Collects every number-like token in the text and returns the largest
+        one inside the valid range with a fixed confidence of 0.3, or None
+        when no token qualifies.
+        """
+        # Find all currency-like numbers
+        all_amounts = re.findall(r'\$?\s*([\d,]+\.?\d{0,2})', texto)
+
+        valid_amounts = []
+        for amount_str in all_amounts:
+            amount = self._normalizar_monto(amount_str)
+            if self.min_amount <= amount <= self.max_amount:
+                valid_amounts.append(amount)
+
+        if valid_amounts:
+            # Return the largest amount (likely the total)
+            return {
+                'monto': max(valid_amounts),
+                'tipo': 'fallback_max',
+                'patron': 'heuristic',
+                'confianza': 0.3
+            }
+
+        return None
+
+    def detectar_multiples_montos(self, texto: str) -> List[Dict]:
+        """
+        Detect every amount in the text.
+
+        Useful for itemized receipts.
+
+        Returns:
+            List of dicts with 'monto', 'contexto' (the stripped source line)
+            and 'es_total' (True when the line contains the word "total")
+        """
+        resultados = []
+
+        # Find all lines with amounts
+        for linea in texto.split('\n'):
+            for match in re.findall(r'\$?\s*([\d,]+\.?\d{0,2})', linea):
+                monto = self._normalizar_monto(match)
+                if self.min_amount <= monto <= self.max_amount:
+                    resultados.append({
+                        'monto': monto,
+                        'contexto': linea.strip(),
+                        'es_total': 'total' in linea.lower()
+                    })
+
+        return resultados
+
+
+def detectar_monto(texto: str) -> Optional[Dict]:
+    """
+    Detect the ticket total using a freshly created ``AmountDetector``.
+
+    Args:
+        texto: Ticket text
+
+    Returns:
+        The result of ``AmountDetector.detectar_monto`` for ``texto``:
+        a dict with monto, tipo, patron, confianza, or None
+    """
+    return AmountDetector().detectar_monto(texto)
+
+
+def normalizar_monto(monto_str: str) -> float:
+    """
+    Normalize an amount string via a freshly created ``AmountDetector``.
+
+    Args:
+        monto_str: Amount as string
+
+    Returns:
+        Amount as float, as produced by ``AmountDetector._normalizar_monto``
+    """
+    return AmountDetector()._normalizar_monto(monto_str)
--- a/sales-bot/ocr/patterns.py
+++ b/sales-bot/ocr/patterns.py
@@ -0,0 +1,223 @@
+"""
+Ticket format patterns for Sales Bot OCR
+Supports multiple ticket formats from different stores
+"""
+
+import re
+from typing import Dict, List, Optional
+
+# Ticket format configurations, keyed by format name.
+# Per-format keys:
+#   'identificadores' - lowercase substrings looked up in the lowercased ticket text
+#   'patron_total'    - regex whose single capture group is the total amount
+#   'patron_fecha' / 'patron_hora' / 'patron_productos' / 'patron_tubos'
+#                     - optional format-specific regexes
+#   'prioridad'       - tie-breaker when several formats match; lower number wins
+TICKET_FORMATS = {
+    'oxxo': {
+        'identificadores': ['oxxo', 'femsa', 'cadena comercial'],
+        'patron_total': r'total\s*\$?\s*([\d,]+\.\d{2})',
+        'patron_fecha': r'(\d{2}/\d{2}/\d{4})',
+        'patron_hora': r'(\d{2}:\d{2}:\d{2})',
+        'prioridad': 1
+    },
+    'walmart': {
+        'identificadores': ['walmart', 'walmex', 'wal-mart', 'bodega aurrera'],
+        # Unlike the other formats, the "$" sign is mandatory here
+        'patron_total': r'total\s*\$\s*([\d,]+\.\d{2})',
+        'patron_fecha': r'(\d{2}-\d{2}-\d{4})',
+        'prioridad': 2
+    },
+    'soriana': {
+        'identificadores': ['soriana', 'mega soriana', 'city club'],
+        'patron_total': r'total\s*a?\s*pagar\s*\$?\s*([\d,]+\.\d{2})',
+        'patron_fecha': r'(\d{2}/\d{2}/\d{4})',
+        'prioridad': 3
+    },
+    'tienda_pintura': {
+        'identificadores': ['tinte', 'cromatique', 'oxidante', 'distribuidora',
+                          'colorante', 'pintura', 'tono', 'decolorante', 'revelador'],
+        'patron_total': r'total\s*\$?\s*([\d,]+[\s\.]?\d{0,2})',
+        'patron_productos': r'^(.+?)\s+(\d{1,3})\s+\$?\s*([\d,]+)',
+        'patron_tubos': r'(\d+)\s*(?:tubos?|pzas?|piezas?|unid)',
+        'prioridad': 0  # Highest priority for paint stores
+    },
+    'farmacia': {
+        'identificadores': ['farmacia', 'guadalajara', 'benavides', 'similares', 'ahorro'],
+        'patron_total': r'total\s*\$?\s*([\d,]+\.\d{2})',
+        'patron_fecha': r'(\d{2}/\d{2}/\d{2,4})',
+        'prioridad': 4
+    },
+    'seven_eleven': {
+        'identificadores': ['7-eleven', '7eleven', '7 eleven', 'iconn'],
+        'patron_total': r'total\s*\$?\s*([\d,]+\.\d{2})',
+        'patron_fecha': r'(\d{2}/\d{2}/\d{4})',
+        'prioridad': 5
+    },
+    'generico': {
+        'identificadores': [],  # Fallback - matches everything
+        'patron_total': r'total\s*\$?\s*([\d,]+[\s\.]?\d{0,2})',
+        'patron_fecha': r'(\d{2}[/-]\d{2}[/-]\d{2,4})',
+        'prioridad': 99
+    }
+}
+
+# Common patterns for amount extraction (in priority order).
+# Each entry is a (regex, label, priority) tuple; the regex has a single
+# capture group holding the amount text.
+AMOUNT_PATTERNS = [
+    # Explicit total patterns
+    (r'total\s*a\s*pagar\s*:?\s*\$?\s*([\d,]+[\s\.]?\d{0,2})', 'total_a_pagar', 1),
+    (r'gran\s*total\s*:?\s*\$?\s*([\d,]+[\s\.]?\d{0,2})', 'gran_total', 2),
+    # Not anchored to a line start, so this also matches inside "subtotal"
+    (r'total\s*:?\s*\$?\s*([\d,]+[\s\.]?\d{0,2})', 'total', 3),
+
+    # Payment related
+    (r'a\s*cobrar\s*:?\s*\$?\s*([\d,]+[\s\.]?\d{0,2})', 'a_cobrar', 4),
+    (r'importe\s*:?\s*\$?\s*([\d,]+[\s\.]?\d{0,2})', 'importe', 5),
+    (r'monto\s*:?\s*\$?\s*([\d,]+[\s\.]?\d{0,2})', 'monto', 6),
+    (r'suma\s*:?\s*\$?\s*([\d,]+[\s\.]?\d{0,2})', 'suma', 7),
+
+    # Subtotal (lower priority)
+    (r'subtotal\s*:?\s*\$?\s*([\d,]+[\s\.]?\d{0,2})', 'subtotal', 8),
+
+    # Last resort - currency amounts at end of lines
+    # (`$` only matches at line ends when the regex is used with re.MULTILINE)
+    (r'\$\s*([\d,]+\.\d{2})\s*$', 'monto_final', 9),
+]
+
+# Date patterns, tried in list order; each has one capture group with the
+# whole date text.
+DATE_PATTERNS = [
+    r'(\d{2}/\d{2}/\d{4})',      # DD/MM/YYYY
+    r'(\d{2}-\d{2}-\d{4})',      # DD-MM-YYYY
+    r'(\d{4}-\d{2}-\d{2})',      # YYYY-MM-DD
+    r'(\d{2}/\d{2}/\d{2})',      # DD/MM/YY
+    r'(\d{1,2}\s+de\s+\w+\s+de\s+\d{4})',  # D de Mes de YYYY
+]
+
+# Client name patterns, tried in list order. The capture group takes the rest
+# of the line after the label (lazy match up to a newline or end of text).
+# NOTE(review): the labels are not word-anchored, so 'sr'/'sra' can also match
+# inside longer words - confirm whether that is intended.
+CLIENT_PATTERNS = [
+    r'cliente\s*:?\s*(.+?)(?:\n|$)',
+    r'nombre\s*:?\s*(.+?)(?:\n|$)',
+    r'sr\.?\s*(.+?)(?:\n|$)',
+    r'sra\.?\s*(.+?)(?:\n|$)',
+]
+
+
+def detectar_formato_ticket(texto: str) -> str:
+    """
+    Detect the ticket format from the identifiers found in its text.
+
+    A format is a candidate when any of its identifiers occurs as a substring
+    of the lowercased text. Among candidates, the one with the lowest
+    'prioridad' value wins.
+
+    Args:
+        texto: Text extracted from the ticket
+
+    Returns:
+        Name of the detected format, or 'generico' when nothing matches
+    """
+    contenido = texto.lower()
+
+    candidatos = [
+        (nombre, cfg.get('prioridad', 99))
+        for nombre, cfg in TICKET_FORMATS.items()
+        if nombre != 'generico'
+        and any(ident in contenido for ident in cfg.get('identificadores', []))
+    ]
+
+    if not candidatos:
+        return 'generico'
+
+    # min() keeps the first candidate on ties, like a stable sort would
+    ganador, _ = min(candidatos, key=lambda par: par[1])
+    return ganador
+
+
+def get_patron_total(formato: str) -> str:
+    """
+    Return the total-amount regex for a given format.
+
+    Unknown formats, and formats without a 'patron_total' entry, fall back to
+    the 'generico' pattern.
+
+    Args:
+        formato: Format name
+
+    Returns:
+        Regex pattern used to extract the total
+    """
+    generico = TICKET_FORMATS['generico']
+    elegido = TICKET_FORMATS.get(formato, generico)
+    return elegido.get('patron_total', generico['patron_total'])
+
+
+def extraer_fecha_ticket(texto: str) -> Optional[str]:
+    """
+    Extract the ticket date.
+
+    The patterns in ``DATE_PATTERNS`` are tried in order and the first one
+    that matches anywhere in the text decides the result.
+
+    Args:
+        texto: Ticket text
+
+    Returns:
+        The date text found, or None
+    """
+    busquedas = (re.search(patron, texto, re.IGNORECASE) for patron in DATE_PATTERNS)
+    primera = next((hallazgo for hallazgo in busquedas if hallazgo), None)
+    if primera is None:
+        return None
+    return primera.group(1)
+
+
+def extraer_cliente_ticket(texto: str) -> Optional[str]:
+    """
+    Extract the client name from the ticket.
+
+    The patterns in ``CLIENT_PATTERNS`` are tried in order. A candidate is
+    stripped of every character that is not a word character, whitespace,
+    hyphen or dot, and is accepted only if at least 3 characters remain.
+
+    Args:
+        texto: Ticket text
+
+    Returns:
+        Client name, or None
+    """
+    for patron in CLIENT_PATTERNS:
+        encontrado = re.search(patron, texto, re.IGNORECASE)
+        if encontrado is None:
+            continue
+
+        # Clean up common artifacts
+        nombre = re.sub(r'[^\w\s\-\.]', '', encontrado.group(1).strip())
+        if len(nombre) >= 3:
+            return nombre
+
+    return None
+
+
+def contar_tubos_texto(texto: str) -> int:
+    """
+    Count the tubes mentioned in the ticket.
+
+    Explicit quantities ("3 tubos", "cant: 2", "x4", ...) are summed first.
+    A number is counted once even when several patterns capture it, e.g.
+    "x 3 tubos" contributes 3, not 6. If no explicit quantity is found, the
+    count is estimated from lines that look like product entries.
+
+    Args:
+        texto: Ticket text
+
+    Returns:
+        Number of tubes detected
+    """
+    texto_lower = texto.lower()
+    total_tubos = 0
+
+    # Patterns for explicit tube counts
+    patrones_tubos = [
+        r'(\d+)\s*(?:tubos?|tbs?)',
+        r'(\d+)\s*(?:pzas?|piezas?)\s*(?:de\s+)?(?:tinte|color)',
+        r'(?:cantidad|qty|cant)\s*:?\s*(\d+)',
+        r'x\s*(\d+)\s*(?:tubos?)?',
+    ]
+
+    # Spans of the numbers already added, so overlapping patterns
+    # do not count the same digits twice.
+    contados = set()
+    for patron in patrones_tubos:
+        for hit in re.finditer(patron, texto_lower):
+            span = hit.span(1)
+            if span in contados:
+                continue
+            contados.add(span)
+            try:
+                total_tubos += int(hit.group(1))
+            except ValueError:
+                continue
+
+    # If no explicit count found, estimate from line items
+    if total_tubos == 0:
+        # Count lines that look like product entries
+        for linea in texto_lower.split('\n'):
+            if any(word in linea for word in ['tinte', 'color', 'tubo', 'cromatique']):
+                # Check for quantity at start of line or after product name
+                qty_match = re.search(r'^(\d+)\s+|x\s*(\d+)|(\d+)\s*pza', linea)
+                if qty_match:
+                    qty = next((g for g in qty_match.groups() if g), '1')
+                    total_tubos += int(qty)
+                else:
+                    total_tubos += 1  # Assume 1 if no explicit quantity
+
+    return total_tubos
--- a/sales-bot/ocr/preprocessor.py
+++ b/sales-bot/ocr/preprocessor.py
@@ -0,0 +1,305 @@
+"""
+Image preprocessing for Sales Bot OCR
+Adaptive preprocessing pipelines for different image conditions
+"""
+
+import logging
+import os
+from typing import Tuple, Optional, List
+from io import BytesIO
+
+logger = logging.getLogger(__name__)
+
+# Try to import image processing libraries
+try:
+    import cv2
+    import numpy as np
+    CV2_AVAILABLE = True
+except ImportError:
+    CV2_AVAILABLE = False
+    logger.warning("OpenCV not available. Image preprocessing will be limited.")
+
+try:
+    from PIL import Image, ImageEnhance, ImageFilter
+    PIL_AVAILABLE = True
+except ImportError:
+    PIL_AVAILABLE = False
+    logger.warning("PIL not available. Image preprocessing will be limited.")
+
+try:
+    from deskew import determine_skew
+    DESKEW_AVAILABLE = True
+except ImportError:
+    DESKEW_AVAILABLE = False
+    logger.warning("deskew library not available. Rotation correction disabled.")
+
+try:
+    import imutils
+    IMUTILS_AVAILABLE = True
+except ImportError:
+    IMUTILS_AVAILABLE = False
+    logger.warning("imutils not available. Some rotations may not work.")
+
+
+class ImagePreprocessor:
+    """
+    Preprocesses ticket images for better OCR accuracy.
+
+    Configuration is read from environment variables at construction time:
+    ``OCR_ENABLE_DESKEW`` (default 'true'), ``OCR_MAX_ROTATION_ANGLE``
+    (default '15') and ``OCR_USE_ADAPTIVE_PIPELINE`` (default 'true').
+
+    Each named pipeline is an ordered list of step names; step ``foo`` is
+    implemented by the method ``_step_foo``.
+    """
+
+    def __init__(self):
+        self.enable_deskew = os.getenv('OCR_ENABLE_DESKEW', 'true').lower() == 'true'
+        self.max_rotation = float(os.getenv('OCR_MAX_ROTATION_ANGLE', '15'))
+        self.use_adaptive = os.getenv('OCR_USE_ADAPTIVE_PIPELINE', 'true').lower() == 'true'
+
+        # Define preprocessing pipelines
+        self.pipelines = {
+            'standard': ['grayscale', 'contrast', 'otsu'],
+            'low_contrast': ['grayscale', 'clahe', 'adaptive_threshold'],
+            'noisy': ['grayscale', 'denoise', 'sharpen', 'otsu'],
+            'rotated': ['deskew', 'grayscale', 'contrast', 'otsu'],
+            'dark': ['grayscale', 'brighten', 'contrast', 'otsu'],
+            'light': ['grayscale', 'darken', 'contrast', 'otsu'],
+        }
+
+    def preprocess(self, image_bytes: bytes) -> bytes:
+        """
+        Preprocess image bytes for OCR.
+
+        Uses the adaptive pipeline when it is enabled and OpenCV is
+        available; otherwise uses the basic PIL-only path.
+
+        Args:
+            image_bytes: Raw image bytes
+
+        Returns:
+            Preprocessed image bytes
+        """
+        if self.use_adaptive and CV2_AVAILABLE:
+            return self.preprocess_adaptive(image_bytes)
+        return self.preprocess_basic(image_bytes)
+
+    def preprocess_basic(self, image_bytes: bytes) -> bytes:
+        """
+        Basic preprocessing using PIL only: grayscale, contrast x1.5, sharpen.
+
+        Returns:
+            PNG-encoded bytes, or the input unchanged when PIL is not
+            available or any step fails
+        """
+        if not PIL_AVAILABLE:
+            return image_bytes
+
+        try:
+            # Load image
+            img = Image.open(BytesIO(image_bytes))
+
+            # Convert to grayscale
+            img = img.convert('L')
+
+            # Enhance contrast
+            enhancer = ImageEnhance.Contrast(img)
+            img = enhancer.enhance(1.5)
+
+            # Sharpen
+            img = img.filter(ImageFilter.SHARPEN)
+
+            # Save to bytes
+            output = BytesIO()
+            img.save(output, format='PNG')
+            return output.getvalue()
+
+        except Exception as e:
+            logger.error(f"Error in basic preprocessing: {e}")
+            return image_bytes
+
+    def preprocess_adaptive(self, image_bytes: bytes) -> bytes:
+        """
+        Adaptive preprocessing: analyse the image, pick one pipeline and
+        apply it.
+
+        Returns:
+            PNG-encoded bytes of the processed image. Returns the input
+            unchanged when it cannot be decoded, and falls back to
+            ``preprocess_basic`` when OpenCV is missing, encoding fails or
+            any error is raised.
+        """
+        if not CV2_AVAILABLE:
+            return self.preprocess_basic(image_bytes)
+
+        try:
+            # Decode image
+            nparr = np.frombuffer(image_bytes, np.uint8)
+            image = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
+
+            if image is None:
+                logger.error("Could not decode image")
+                return image_bytes
+
+            # Analyze image to determine best pipeline
+            pipeline_name = self._determine_best_pipeline(image)
+            logger.info(f"Using preprocessing pipeline: {pipeline_name}")
+
+            # Apply pipeline
+            processed = self._apply_pipeline(image, pipeline_name)
+
+            # Encode back to bytes; imencode reports failure through its first return value
+            encoded_ok, buffer = cv2.imencode('.png', processed)
+            if not encoded_ok:
+                logger.error("Could not encode preprocessed image")
+                return self.preprocess_basic(image_bytes)
+            return buffer.tobytes()
+
+        except Exception as e:
+            logger.error(f"Error in adaptive preprocessing: {e}")
+            return self.preprocess_basic(image_bytes)
+
+    def _determine_best_pipeline(self, image: 'np.ndarray') -> str:
+        """
+        Analyse the image and return the name of the pipeline to apply.
+
+        Rotation is checked first (when deskew is enabled and available);
+        otherwise the choice depends on the mean and standard deviation of
+        the grayscale pixel values.
+        """
+        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+
+        # Calculate image statistics
+        mean_brightness = np.mean(gray)
+        std_brightness = np.std(gray)
+
+        # Check for rotation if deskew is enabled
+        if self.enable_deskew and DESKEW_AVAILABLE:
+            try:
+                angle = determine_skew(gray)
+            except Exception as e:
+                # Best effort: a failed skew estimate must not block preprocessing
+                logger.debug(f"Skew detection failed: {e}")
+                angle = None
+
+            # determine_skew may yield None when no angle can be estimated
+            if angle is not None and 1.0 < abs(angle) <= self.max_rotation:
+                return 'rotated'
+
+        # Determine based on brightness/contrast
+        if mean_brightness < 80:
+            return 'dark'
+        elif mean_brightness > 180:
+            return 'light'
+        elif std_brightness < 40:
+            return 'low_contrast'
+        elif std_brightness > 80:
+            return 'noisy'
+        else:
+            return 'standard'
+
+    def _apply_pipeline(self, image: 'np.ndarray', pipeline_name: str) -> 'np.ndarray':
+        """
+        Apply a preprocessing pipeline to a copy of the image.
+
+        Unknown pipeline names use the 'standard' pipeline. A step that is
+        unknown or raises is logged and skipped; the remaining steps still run.
+        """
+        pipeline = self.pipelines.get(pipeline_name, self.pipelines['standard'])
+        result = image.copy()
+
+        for step in pipeline:
+            # Resolve the step first so an AttributeError raised inside a step
+            # is not misreported as an unknown step.
+            step_fn = getattr(self, f'_step_{step}', None)
+            if step_fn is None:
+                logger.warning(f"Unknown preprocessing step: {step}")
+                continue
+
+            try:
+                result = step_fn(result)
+            except Exception as e:
+                logger.warning(f"Error in step {step}: {e}")
+
+        return result
+
+    # Pipeline steps
+
+    def _step_grayscale(self, image: 'np.ndarray') -> 'np.ndarray':
+        """Convert a 3-dimensional (BGR) image to grayscale; pass others through."""
+        if len(image.shape) == 3:
+            return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+        return image
+
+    def _step_contrast(self, image: 'np.ndarray') -> 'np.ndarray':
+        """Enhance contrast using histogram equalization."""
+        if len(image.shape) == 3:
+            image = self._step_grayscale(image)
+        return cv2.equalizeHist(image)
+
+    def _step_otsu(self, image: 'np.ndarray') -> 'np.ndarray':
+        """Apply Otsu's thresholding."""
+        if len(image.shape) == 3:
+            image = self._step_grayscale(image)
+        _, binary = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+        return binary
+
+    def _step_adaptive_threshold(self, image: 'np.ndarray') -> 'np.ndarray':
+        """Apply Gaussian adaptive thresholding (block size 11, constant 2)."""
+        if len(image.shape) == 3:
+            image = self._step_grayscale(image)
+        return cv2.adaptiveThreshold(
+            image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+            cv2.THRESH_BINARY, 11, 2
+        )
+
+    def _step_clahe(self, image: 'np.ndarray') -> 'np.ndarray':
+        """Apply CLAHE (Contrast Limited Adaptive Histogram Equalization)."""
+        if len(image.shape) == 3:
+            image = self._step_grayscale(image)
+        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
+        return clahe.apply(image)
+
+    def _step_denoise(self, image: 'np.ndarray') -> 'np.ndarray':
+        """Denoise with non-local means (colored variant for 3-dimensional input)."""
+        if len(image.shape) == 3:
+            return cv2.fastNlMeansDenoisingColored(image, None, 10, 10, 7, 21)
+        return cv2.fastNlMeansDenoising(image, None, 10, 7, 21)
+
+    def _step_sharpen(self, image: 'np.ndarray') -> 'np.ndarray':
+        """Sharpen the image with a 3x3 kernel."""
+        kernel = np.array([[-1, -1, -1],
+                          [-1,  9, -1],
+                          [-1, -1, -1]])
+        return cv2.filter2D(image, -1, kernel)
+
+    def _step_brighten(self, image: 'np.ndarray') -> 'np.ndarray':
+        """Increase image brightness."""
+        return cv2.convertScaleAbs(image, alpha=1.2, beta=30)
+
+    def _step_darken(self, image: 'np.ndarray') -> 'np.ndarray':
+        """Decrease image brightness."""
+        return cv2.convertScaleAbs(image, alpha=0.8, beta=-20)
+
+    def _step_deskew(self, image: 'np.ndarray') -> 'np.ndarray':
+        """
+        Detect and correct image rotation.
+
+        Returns the image unchanged when the deskew library is unavailable,
+        no angle can be estimated, the angle exceeds ``max_rotation``, the
+        angle is below 0.5 degrees, or an error occurs.
+        """
+        if not DESKEW_AVAILABLE:
+            return image
+
+        try:
+            if len(image.shape) == 3:
+                gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+            else:
+                gray = image
+
+            angle = determine_skew(gray)
+
+            if angle is None:
+                # No skew estimate available; nothing to correct
+                return image
+
+            if abs(angle) > self.max_rotation:
+                logger.info(f"Rotation angle {angle} exceeds max {self.max_rotation}, skipping")
+                return image
+
+            if abs(angle) < 0.5:
+                return image  # No significant rotation
+
+            logger.info(f"Correcting rotation: {angle} degrees")
+
+            # NOTE(review): the two branches pass opposite signs, presumably
+            # because rotate_bound and getRotationMatrix2D use opposite
+            # rotation directions - confirm against the imutils/OpenCV docs.
+            if IMUTILS_AVAILABLE:
+                return imutils.rotate_bound(image, -angle)
+            else:
+                # Manual rotation
+                (h, w) = image.shape[:2]
+                center = (w // 2, h // 2)
+                M = cv2.getRotationMatrix2D(center, angle, 1.0)
+                return cv2.warpAffine(image, M, (w, h),
+                                      flags=cv2.INTER_CUBIC,
+                                      borderMode=cv2.BORDER_REPLICATE)
+
+        except Exception as e:
+            logger.error(f"Error in deskew: {e}")
+            return image
+
+
+def preprocess_image(image_bytes: bytes) -> bytes:
+    """
+    Preprocess image bytes with a freshly created ``ImagePreprocessor``.
+
+    Args:
+        image_bytes: Raw image bytes
+
+    Returns:
+        Preprocessed image bytes
+    """
+    return ImagePreprocessor().preprocess(image_bytes)
+
+
+def preprocess_for_ocr(image_bytes: bytes) -> bytes:
+    """
+    Alias for ``preprocess_image``: same argument, same result.
+    """
+    resultado = preprocess_image(image_bytes)
+    return resultado
--- a/sales-bot/ocr/processor.py
+++ b/sales-bot/ocr/processor.py
@@ -0,0 +1,294 @@
+"""
+Main OCR processor for Sales Bot
+Combines preprocessing, text extraction, and amount detection
+"""
+
+import logging
+import os
+from typing import Dict, Optional
+from io import BytesIO
+
+logger = logging.getLogger(__name__)
+
+# Try to import OCR engine
+try:
+    import pytesseract
+    from PIL import Image
+    TESSERACT_AVAILABLE = True
+except ImportError:
+    TESSERACT_AVAILABLE = False
+    logger.warning("pytesseract not available. OCR will not work.")
+
+# Import local modules
+from .preprocessor import ImagePreprocessor, preprocess_image
+from .amount_detector import AmountDetector, detectar_monto
+from .patterns import (
+    detectar_formato_ticket,
+    extraer_fecha_ticket,
+    extraer_cliente_ticket,
+    contar_tubos_texto,
+    get_patron_total
+)
+
+
+class OCRProcessor:
+    """
+    Main OCR processor that coordinates image preprocessing,
+    text extraction, and data parsing.
+    """
+
+    def __init__(self):
+        self.preprocessor = ImagePreprocessor()
+        self.amount_detector = AmountDetector()
+        self.confidence_threshold = float(os.getenv('OCR_CONFIDENCE_THRESHOLD', '0.6'))
+
+        # Tesseract configuration for Spanish
+        self.tesseract_config = '--oem 3 --psm 6 -l spa'
+
+    def process(self, image_bytes: bytes) -> Dict:
+        """
+        Process a ticket image and extract relevant data.
+
+        Args:
+            image_bytes: Raw image bytes (JPEG, PNG, etc.)
+
+        Returns:
+            Dict with extracted data:
+            - texto: Full extracted text
+            - monto: Detected total amount
+            - cliente: Client name if found
+            - fecha: Date if found
+            - tubos: Number of tubes/items
+            - formato: Detected ticket format
+            - confianza: Confidence score
+        """
+        if not TESSERACT_AVAILABLE:
+            return {
+                'error': 'Tesseract OCR not available',
+                'texto': '',
+                'monto': 0,
+                'confianza': 0
+            }
+
+        try:
+            # Preprocess image
+            processed_bytes = self.preprocessor.preprocess(image_bytes)
+
+            # Extract text
+            texto = self._extract_text(processed_bytes)
+
+            if not texto or len(texto.strip()) < 10:
+                # Try again with original image
+                texto = self._extract_text(image_bytes)
+
+            if not texto:
+                return {
+                    'error': 'No text could be extracted',
+                    'texto': '',
+                    'monto': 0,
+                    'confianza': 0
+                }
+
+            # Detect ticket format
+            formato = detectar_formato_ticket(texto)
+
+            # Extract amount
+            monto_result = self.amount_detector.detectar_monto(texto)
+            monto = monto_result.get('monto', 0) if monto_result else 0
+            monto_confianza = monto_result.get('confianza', 0) if monto_result else 0
+            monto_tipo = monto_result.get('tipo', 'unknown') if monto_result else 'unknown'
+
+            # Extract other data
+            cliente = extraer_cliente_ticket(texto)
+            fecha = extraer_fecha_ticket(texto)
+            tubos = contar_tubos_texto(texto)
+
+            # Calculate overall confidence
+            confianza = self._calculate_overall_confidence(
+                texto, monto, monto_confianza, cliente, fecha
+            )
+
+            return {
+                'texto': texto,
+                'monto': monto,
+                'monto_tipo': monto_tipo,
+                'cliente': cliente,
+                'fecha': fecha,
+                'tubos': tubos,
+                'formato': formato,
+                'confianza': confianza
+            }
+
+        except Exception as e:
+            logger.error(f"Error processing image: {e}", exc_info=True)
+            return {
+                'error': str(e),
+                'texto': '',
+                'monto': 0,
+                'confianza': 0
+            }
+
+    def _extract_text(self, image_bytes: bytes) -> str:
+        """
+        Extract text from image bytes using Tesseract.
+        """
+        try:
+            # Load image
+            img = Image.open(BytesIO(image_bytes))
+
+            # Convert to RGB if necessary
+            if img.mode != 'RGB' and img.mode != 'L':
+                img = img.convert('RGB')
+
+            # Run OCR
+            texto = pytesseract.image_to_string(img, config=self.tesseract_config)
+
+            # Clean up text
+            texto = self._clean_text(texto)
+
+            return texto
+
+        except Exception as e:
+            logger.error(f"Error extracting text: {e}")
+            return ''
+
+    def _clean_text(self, texto: str) -> str:
+        """
+        Clean up OCR output text.
+        """
+        if not texto:
+            return ''
+
+        # Remove excessive whitespace
+        import re
+        texto = re.sub(r'\s+', ' ', texto)
+        texto = re.sub(r'\n\s*\n', '\n', texto)
+
+        # Fix common OCR errors
+        replacements = {
+            '|': 'l',
+            '0': 'O',  # Only in certain contexts
+            '1': 'I',  # Only in certain contexts
+            'S': '$',  # Only at start of amounts
+        }
+
+        # Apply selective replacements
+        # (Being careful not to corrupt actual numbers)
+
+        return texto.strip()
+
+    def _calculate_overall_confidence(
+        self,
+        texto: str,
+        monto: float,
+        monto_confianza: float,
+        cliente: Optional[str],
+        fecha: Optional[str]
+    ) -> float:
+        """
+        Calculate overall extraction confidence.
+        """
+        confidence = 0.0
+
+        # Text quality (based on length and structure)
+        if len(texto) > 50:
+            confidence += 0.2
+        if len(texto) > 200:
+            confidence += 0.1
+
+        # Amount detection confidence
+        confidence += monto_confianza * 0.4
+
+        # Bonus for finding additional data
+        if cliente:
+            confidence += 0.1
+        if fecha:
+            confidence += 0.1
+
+        # Check for typical receipt keywords
+        keywords = ['total', 'cliente', 'fecha', 'ticket', 'venta', 'pago']
+        found_keywords = sum(1 for kw in keywords if kw in texto.lower())
+        confidence += min(found_keywords * 0.05, 0.2)
+
+        return min(confidence, 1.0)
+
+    def process_multiple(self, images: list) -> Dict:
+        """
+        Process multiple images (e.g., multi-page receipt).
+        Combines results from all images.
+
+        Args:
+            images: List of image bytes
+
+        Returns:
+            Combined results
+        """
+        all_texto = []
+        total_monto = 0
+        cliente = None
+        fecha = None
+        tubos = 0
+        formato = None
+        max_confianza = 0
+
+        for img_bytes in images:
+            result = self.process(img_bytes)
+
+            if result.get('texto'):
+                all_texto.append(result['texto'])
+
+            if result.get('monto', 0) > total_monto:
+                total_monto = result['monto']
+
+            if not cliente and result.get('cliente'):
+                cliente = result['cliente']
+
+            if not fecha and result.get('fecha'):
+                fecha = result['fecha']
+
+            tubos += result.get('tubos', 0)
+
+            if not formato and result.get('formato'):
+                formato = result['formato']
+
+            if result.get('confianza', 0) > max_confianza:
+                max_confianza = result['confianza']
+
+        return {
+            'texto': '\n---\n'.join(all_texto),
+            'monto': total_monto,
+            'cliente': cliente,
+            'fecha': fecha,
+            'tubos': tubos,
+            'formato': formato,
+            'confianza': max_confianza,
+            'paginas': len(images)
+        }
+
+
+def procesar_ticket_imagen(image_bytes: bytes) -> Dict:
+    """
+    Convenience function to process a ticket image.
+
+    Args:
+        image_bytes: Raw image bytes
+
+    Returns:
+        Dict with extracted data
+    """
+    processor = OCRProcessor()
+    return processor.process(image_bytes)
+
+
+def procesar_multiples_imagenes(images: list) -> Dict:
+    """
+    Convenience function to process multiple images.
+
+    Args:
+        images: List of image bytes
+
+    Returns:
+        Combined results
+    """
+    processor = OCRProcessor()
+    return processor.process_multiple(images)