# sales-bot-stacks/sales-bot/ocr/processor.py

"""
Main OCR processor for Sales Bot
Combines preprocessing, text extraction, and amount detection
"""

import logging
import os
import re
from io import BytesIO
from typing import Dict, Optional

logger = logging.getLogger(__name__)

# Try to import OCR engine
try:
    import pytesseract
    from PIL import Image
    TESSERACT_AVAILABLE = True
except ImportError:
    TESSERACT_AVAILABLE = False
    logger.warning("pytesseract not available. OCR will not work.")

# Import local modules
from .preprocessor import ImagePreprocessor, preprocess_image
from .amount_detector import AmountDetector, detectar_monto
from .patterns import (
    detectar_formato_ticket,
    extraer_fecha_ticket,
    extraer_cliente_ticket,
    contar_tubos_texto,
    get_patron_total
)


class OCRProcessor:
    """
    Main OCR processor that coordinates image preprocessing,
    text extraction, and data parsing.

    Attributes:
        preprocessor: ImagePreprocessor applied before the first OCR pass.
        amount_detector: AmountDetector used to find the ticket total.
        confidence_threshold: Value read from the OCR_CONFIDENCE_THRESHOLD
            environment variable (default 0.6). It is only stored here;
            no method of this class compares against it.
        tesseract_config: Command-line options passed to Tesseract.
    """

    # Stripped text shorter than this is treated as a poor extraction and
    # triggers a second OCR pass on the unprocessed image.
    MIN_TEXT_LENGTH = 10

    # Used when OCR_CONFIDENCE_THRESHOLD is unset or not a valid float.
    DEFAULT_CONFIDENCE_THRESHOLD = 0.6

    # Words whose presence in the text raises the confidence score.
    RECEIPT_KEYWORDS = ('total', 'cliente', 'fecha', 'ticket', 'venta', 'pago')

    # Runs of whitespace that do not contain a line break.
    _HORIZONTAL_WS = re.compile(r'[^\S\n]+')
    # Any whitespace run that contains at least one line break.
    _LINE_BREAK_RUN = re.compile(r'\s*\n\s*')

    def __init__(self):
        self.preprocessor = ImagePreprocessor()
        self.amount_detector = AmountDetector()
        self.confidence_threshold = self._read_confidence_threshold()

        # Tesseract configuration for Spanish
        self.tesseract_config = '--oem 3 --psm 6 -l spa'

    @classmethod
    def _read_confidence_threshold(cls) -> float:
        """Read OCR_CONFIDENCE_THRESHOLD, falling back to the default if invalid."""
        raw = os.getenv('OCR_CONFIDENCE_THRESHOLD')
        if raw is None:
            return cls.DEFAULT_CONFIDENCE_THRESHOLD
        try:
            return float(raw)
        except ValueError:
            # A malformed setting should not prevent the processor from
            # being constructed.
            logger.warning(
                "Invalid OCR_CONFIDENCE_THRESHOLD %r; using %s",
                raw, cls.DEFAULT_CONFIDENCE_THRESHOLD
            )
            return cls.DEFAULT_CONFIDENCE_THRESHOLD

    @staticmethod
    def _error_result(message: str) -> Dict:
        """Build the dict returned by ``process`` when extraction fails."""
        return {
            'error': message,
            'texto': '',
            'monto': 0,
            'confianza': 0
        }

    def process(self, image_bytes: bytes) -> Dict:
        """
        Process a ticket image and extract relevant data.

        OCR runs on the preprocessed image first. If that yields fewer than
        MIN_TEXT_LENGTH characters, OCR runs again on the original image and
        the longer of the two texts is kept.

        Args:
            image_bytes: Raw image bytes (JPEG, PNG, etc.)

        Returns:
            Dict with extracted data:
            - texto: Full extracted text
            - monto: Detected total amount
            - monto_tipo: 'tipo' reported by the amount detector
              ('unknown' when absent)
            - cliente: Client name if found
            - fecha: Date if found
            - tubos: Number of tubes/items
            - formato: Detected ticket format
            - confianza: Confidence score

            On failure the dict instead holds only 'error' (message),
            'texto' (''), 'monto' (0) and 'confianza' (0). Exceptions are
            logged and reported this way rather than raised.
        """
        if not TESSERACT_AVAILABLE:
            return self._error_result('Tesseract OCR not available')

        try:
            # Preprocess image
            processed_bytes = self.preprocessor.preprocess(image_bytes)

            # Extract text
            texto = self._extract_text(processed_bytes)

            if len(texto.strip()) < self.MIN_TEXT_LENGTH:
                # Try again with the original image, but only adopt that
                # result when it actually recovered more text; otherwise a
                # short-but-usable first pass would be thrown away.
                fallback = self._extract_text(image_bytes)
                if len(fallback.strip()) > len(texto.strip()):
                    texto = fallback

            if not texto:
                return self._error_result('No text could be extracted')

            # Detect ticket format
            formato = detectar_formato_ticket(texto)

            # Extract amount (a falsy detector result means "nothing found")
            monto_result = self.amount_detector.detectar_monto(texto) or {}
            monto = monto_result.get('monto', 0)
            monto_confianza = monto_result.get('confianza', 0)
            monto_tipo = monto_result.get('tipo', 'unknown')

            # Extract other data
            cliente = extraer_cliente_ticket(texto)
            fecha = extraer_fecha_ticket(texto)
            tubos = contar_tubos_texto(texto)

            # Calculate overall confidence
            confianza = self._calculate_overall_confidence(
                texto, monto, monto_confianza, cliente, fecha
            )

            return {
                'texto': texto,
                'monto': monto,
                'monto_tipo': monto_tipo,
                'cliente': cliente,
                'fecha': fecha,
                'tubos': tubos,
                'formato': formato,
                'confianza': confianza
            }

        except Exception as e:
            # Top-level boundary: report the failure in the result dict.
            logger.error("Error processing image: %s", e, exc_info=True)
            return self._error_result(str(e))

    def _extract_text(self, image_bytes: bytes) -> str:
        """
        Extract text from image bytes using Tesseract.

        Args:
            image_bytes: Encoded image data readable by PIL.

        Returns:
            Cleaned OCR text, or '' if the image could not be read or
            OCR failed (the error is logged, not raised).
        """
        try:
            # The context manager releases the image resources once OCR
            # has finished.
            with Image.open(BytesIO(image_bytes)) as img:
                # Modes other than RGB and grayscale are converted to RGB
                # before being handed to Tesseract.
                if img.mode in ('RGB', 'L'):
                    ocr_input = img
                else:
                    ocr_input = img.convert('RGB')

                raw_text = pytesseract.image_to_string(
                    ocr_input, config=self.tesseract_config
                )

            return self._clean_text(raw_text)

        except Exception as e:
            logger.error("Error extracting text: %s", e)
            return ''

    def _clean_text(self, texto: str) -> str:
        """
        Clean up OCR output text.

        Horizontal whitespace runs are collapsed to a single space and
        blank lines are removed, while the line breaks between non-empty
        lines are preserved.

        No character-level substitutions (such as '0' -> 'O') are applied:
        blanket replacements would corrupt real numbers.

        Args:
            texto: Raw OCR output; None or '' is accepted.

        Returns:
            The normalized text with leading/trailing whitespace removed.
        """
        if not texto:
            return ''

        # Collapse spaces/tabs/form feeds first, leaving '\n' untouched so
        # the line structure survives.
        texto = self._HORIZONTAL_WS.sub(' ', texto)
        # Then squeeze blank lines and trim the spaces around each break.
        texto = self._LINE_BREAK_RUN.sub('\n', texto)

        return texto.strip()

    def _calculate_overall_confidence(
        self,
        texto: str,
        monto: float,
        monto_confianza: float,
        cliente: Optional[str],
        fecha: Optional[str]
    ) -> float:
        """
        Calculate overall extraction confidence.

        The score is the sum of: up to 0.3 for text length, 0.4 times the
        amount-detection confidence, 0.1 each for a client and a date, and
        0.05 per receipt keyword found (at most 0.2). It is capped at 1.0.

        Args:
            texto: Extracted text.
            monto: Detected amount (accepted but not used in the score).
            monto_confianza: Confidence reported by the amount detector.
            cliente: Client name, if one was found.
            fecha: Date, if one was found.

        Returns:
            Confidence score, never greater than 1.0.
        """
        confidence = 0.0

        # Text quality (based on length)
        if len(texto) > 50:
            confidence += 0.2
        if len(texto) > 200:
            confidence += 0.1

        # Amount detection confidence
        confidence += monto_confianza * 0.4

        # Bonus for finding additional data
        if cliente:
            confidence += 0.1
        if fecha:
            confidence += 0.1

        # Check for typical receipt keywords (case-insensitive substrings)
        lowered = texto.lower()
        found_keywords = sum(1 for kw in self.RECEIPT_KEYWORDS if kw in lowered)
        confidence += min(found_keywords * 0.05, 0.2)

        return min(confidence, 1.0)

    def process_multiple(self, images: list) -> Dict:
        """
        Process multiple images (e.g., multi-page receipt).
        Combines results from all images.

        Combination rules: texts are joined with a '---' separator line,
        'monto' and 'confianza' are the maximum over all pages, 'tubos' is
        the sum, and 'cliente', 'fecha' and 'formato' take the first
        non-empty value. Pages that failed contribute nothing.

        Args:
            images: List of image bytes (None is treated as empty)

        Returns:
            Combined results, plus 'paginas' with the number of images
        """
        pages = list(images or [])

        all_texto = []
        total_monto = 0
        cliente = None
        fecha = None
        tubos = 0
        formato = None
        max_confianza = 0

        for img_bytes in pages:
            result = self.process(img_bytes)

            if result.get('texto'):
                all_texto.append(result['texto'])

            # "or 0" guards against a page reporting None for a numeric
            # field, which would otherwise break the comparison/addition.
            total_monto = max(total_monto, result.get('monto') or 0)

            if not cliente and result.get('cliente'):
                cliente = result['cliente']

            if not fecha and result.get('fecha'):
                fecha = result['fecha']

            tubos += result.get('tubos') or 0

            if not formato and result.get('formato'):
                formato = result['formato']

            max_confianza = max(max_confianza, result.get('confianza') or 0)

        return {
            'texto': '\n---\n'.join(all_texto),
            'monto': total_monto,
            'cliente': cliente,
            'fecha': fecha,
            'tubos': tubos,
            'formato': formato,
            'confianza': max_confianza,
            'paginas': len(pages)
        }


def procesar_ticket_imagen(image_bytes: bytes) -> Dict:
    """
    Run the OCR pipeline on a single ticket image.

    A new OCRProcessor is created for the call and its ``process``
    method does the work.

    Args:
        image_bytes: Raw image bytes

    Returns:
        Dict with extracted data, as returned by ``OCRProcessor.process``
    """
    return OCRProcessor().process(image_bytes)


def procesar_multiples_imagenes(images: list) -> Dict:
    """
    Run the OCR pipeline on several images and merge the results.

    A new OCRProcessor is created for the call and its
    ``process_multiple`` method does the work.

    Args:
        images: List of image bytes

    Returns:
        Combined results, as returned by ``OCRProcessor.process_multiple``
    """
    return OCRProcessor().process_multiple(images)