# sales-bot-stacks/sales-bot/ocr/amount_detector.py

"""
Amount detection for Sales Bot OCR
Improved detection of total amounts from ticket text
"""

import re
import logging
from typing import Dict, List, Optional, Tuple

logger = logging.getLogger(__name__)

# Every keyword pattern ends the same way: optional colon, optional "$",
# then the captured amount (digits/commas, optionally one whitespace or dot
# followed by up to two more digits).
_AMOUNT_TAIL = r'\s*:?\s*\$?\s*([\d,]+[\s\.]?\d{0,2})'

# (keyword regex, label) listed from highest to lowest priority; the
# priority number stored in PATTERNS is the 1-based position in this list.
_KEYWORD_PATTERNS = [
    # Explicit total patterns (highest priority)
    (r'total\s*a\s*pagar', 'total_a_pagar'),
    (r'gran\s*total', 'gran_total'),
    (r'total\s+final', 'total_final'),
    (r'(?:^|\n)\s*total', 'total'),

    # Payment related
    (r'a\s*cobrar', 'a_cobrar'),
    (r'importe\s*(?:total)?', 'importe'),
    (r'monto\s*(?:total)?', 'monto'),
    (r'suma', 'suma'),
    (r'pago', 'pago'),

    # Subtotal (lower priority - may need to add tax)
    (r'subtotal', 'subtotal'),
]

# Amount patterns in priority order: (regex, label, priority)
PATTERNS = [
    (keyword + _AMOUNT_TAIL, label, priority)
    for priority, (keyword, label) in enumerate(_KEYWORD_PATTERNS, start=1)
]

# Generic currency pattern (lowest priority): a "$" amount with exactly two
# decimals that closes its line.
PATTERNS.append((r'\$\s*([\d,]+\.\d{2})\s*(?:\n|$)', 'monto_linea', 11))

# Words that indicate a line is NOT a total (negative patterns)
EXCLUSION_WORDS = (
    'cambio efectivo|pago con|tarjeta recibido '
    'iva impuesto descuento ahorro puntos'
).replace('|', '\n').replace(' ', '\n').replace('pago\ncon', 'pago con').split('\n')


class AmountDetector:
    """
    Detects and extracts monetary amounts from ticket text.

    Candidate amounts are collected with a prioritised list of regex
    patterns. A candidate is dropped when the line it was found on mentions
    an exclusion word (change, cash, tax, ...). The surviving candidate with
    the best priority (and, on ties, the highest confidence) is reported.
    When no pattern yields a candidate, the largest plausible number in the
    text is returned with low confidence.
    """

    # Loose "anything that looks like a number" matcher shared by the
    # fallback heuristic and the itemised listing.
    _AMOUNT_RE = re.compile(r'\$?\s*([\d,]+\.?\d{0,2})')

    # Pattern labels that explicitly name the grand total.
    _EXPLICIT_TOTAL_TYPES = ('total_a_pagar', 'gran_total', 'total_final')

    # Words that typically follow the total on a ticket (payment section).
    _PAYMENT_HINTS = ('efectivo', 'tarjeta', 'cambio', 'gracias')

    def __init__(
        self,
        patterns: Optional[List[Tuple[str, str, int]]] = None,
        exclusion_words: Optional[List[str]] = None,
        min_amount: float = 1,
        max_amount: float = 1000000,
    ):
        """
        Args:
            patterns: ``(regex, label, priority)`` tuples, lower priority
                number wins. Each regex should capture the amount in group 1.
                Defaults to the module-level ``PATTERNS``.
            exclusion_words: Lower-case substrings that disqualify a line.
                Defaults to the module-level ``EXCLUSION_WORDS``.
            min_amount: Minimum valid amount (inclusive).
            max_amount: Maximum valid amount (inclusive).
        """
        # Module defaults are looked up at call time so the no-argument
        # constructor behaves exactly as before.
        self.patterns = PATTERNS if patterns is None else patterns
        self.exclusion_words = (
            EXCLUSION_WORDS if exclusion_words is None else exclusion_words
        )
        self.min_amount = min_amount
        self.max_amount = max_amount

    def detectar_monto(self, texto: str) -> Optional[Dict]:
        """
        Detect the total amount of the ticket.

        Args:
            texto: Text extracted from the ticket.

        Returns:
            Dict with ``monto`` (float), ``tipo`` (pattern label), ``patron``
            (the regex that matched, or ``'heuristic'`` for the fallback) and
            ``confianza`` (0.0-1.0), or None if no amount is found.
        """
        if not texto:
            return None

        texto_lower = texto.lower()
        flags = re.IGNORECASE | re.MULTILINE
        resultados = []

        for patron, tipo, prioridad in self.patterns:
            # finditer (rather than findall) keeps the offset of each match,
            # so exclusion and confidence are evaluated where the amount was
            # actually found, not at the first place the same digits appear.
            for m in re.finditer(patron, texto_lower, flags):
                grupo = 1 if m.re.groups else 0
                crudo = m.group(grupo)
                if crudo is None:
                    continue
                posicion = m.start(grupo)

                # Skip if the match is in an exclusion context
                if self._is_excluded(texto_lower, crudo, posicion):
                    continue

                monto = self._normalizar_monto(crudo)
                if not (self.min_amount <= monto <= self.max_amount):
                    continue

                resultados.append({
                    'monto': monto,
                    'tipo': tipo,
                    'patron': patron,
                    'prioridad': prioridad,
                    'confianza': self._calcular_confianza(
                        texto_lower, crudo, tipo, posicion
                    ),
                })

        if not resultados:
            # Fall back to the largest plausible amount in the text
            return self._fallback_detection(texto)

        # Lowest priority number wins; ties go to the higher confidence and
        # then to the earliest candidate (min returns the first minimum).
        best = min(resultados, key=lambda r: (r['prioridad'], -r['confianza']))
        return {
            'monto': best['monto'],
            'tipo': best['tipo'],
            'patron': best['patron'],
            'confianza': best['confianza'],
        }

    def _normalizar_monto(self, monto_str: str) -> float:
        """
        Normalize an amount string to a float.

        Handles various formats:
        - 1,234.56 (US/Mexico format)
        - 1234.56
        - 1 234.56 (space separator)
        - 1234,56 (European format)
        - 1.234,56 (European format with thousands separator)

        Returns:
            The parsed amount, or 0.0 when the string is empty or cannot be
            parsed.
        """
        if not monto_str:
            return 0.0

        # Remove currency symbols and surrounding whitespace
        monto = monto_str.replace('$', '').strip()

        # The capture patterns allow one whitespace character before the
        # decimals, so a capture can run across a line break or tab into the
        # next field ("150\n20"). Only the first field is the amount.
        monto = re.split(r'[\t\n\r\f\v]', monto, maxsplit=1)[0]
        monto = monto.replace(' ', '')

        ultimo_punto = monto.rfind('.')
        ultima_coma = monto.rfind(',')

        if ultimo_punto != -1 and ultima_coma != -1:
            # Both separators present: whichever comes last is the decimal
            # mark, the other one groups thousands.
            if ultima_coma > ultimo_punto:
                # European format: 1.234,56
                monto = monto.replace('.', '').replace(',', '.')
            else:
                # US/Mexico format: 1,234.56
                monto = monto.replace(',', '')
        elif ultima_coma != -1:
            # Could be European (1234,56) or thousand separator (1,234)
            partes = monto.split(',')
            if len(partes) == 2 and len(partes[1]) == 2:
                monto = monto.replace(',', '.')
            else:
                monto = monto.replace(',', '')

        try:
            return float(monto)
        except ValueError:
            return 0.0

    def _is_excluded(
        self, texto: str, match: str, posicion: Optional[int] = None
    ) -> bool:
        """
        Check whether a matched amount appears in an exclusion context.

        Args:
            texto: Full (lower-cased) ticket text.
            match: The captured amount string.
            posicion: Offset of ``match`` inside ``texto``. When given, only
                the line containing that offset is inspected. When omitted,
                every line containing ``match`` as a substring is inspected
                (legacy behaviour, prone to false positives such as "50.00"
                inside "150.00").

        Returns:
            True if an inspected line contains any exclusion word.
        """
        if posicion is None:
            lineas = [linea for linea in texto.split('\n') if match in linea]
        else:
            inicio = texto.rfind('\n', 0, posicion) + 1
            fin = texto.find('\n', posicion)
            if fin == -1:
                fin = len(texto)
            lineas = [texto[inicio:fin]]

        return any(
            palabra in linea.lower()
            for linea in lineas
            for palabra in self.exclusion_words
        )

    def _calcular_confianza(
        self, texto: str, match: str, tipo: str, posicion: Optional[int] = None
    ) -> float:
        """
        Calculate a confidence score for a match.

        Args:
            texto: Full (lower-cased) ticket text.
            match: The captured amount string.
            tipo: Label of the pattern that produced the match.
            posicion: Offset of ``match`` inside ``texto``. When omitted, the
                first occurrence of ``match`` is used.

        Returns:
            Value between 0.0 and 1.0.
        """
        confianza = 0.5  # Base confidence

        # Higher confidence for explicit total patterns
        if tipo in self._EXPLICIT_TOTAL_TYPES:
            confianza += 0.3
        elif tipo == 'total':
            confianza += 0.2

        if posicion is None:
            posicion = texto.find(match)

        # Positional bonuses only make sense when the match can be located.
        if posicion >= 0:
            # Higher confidence if in the last 40% of the text
            if posicion > len(texto) * 0.6:
                confianza += 0.1

            # Higher confidence if followed by payment info
            fin = posicion + len(match)
            siguiente = texto[fin:fin + 50].lower()
            if any(palabra in siguiente for palabra in self._PAYMENT_HINTS):
                confianza += 0.1

        return min(confianza, 1.0)

    def _fallback_detection(self, texto: str) -> Optional[Dict]:
        """
        Fallback detection when the standard patterns fail.

        Looks for the largest number in the text that lies within
        ``[min_amount, max_amount]``.

        Returns:
            Dict with ``tipo='fallback_max'``, ``patron='heuristic'`` and a
            fixed confidence of 0.3, or None if no number qualifies.
        """
        if not texto:
            return None

        candidatos = (
            self._normalizar_monto(crudo)
            for crudo in self._AMOUNT_RE.findall(texto)
        )
        validos = [
            monto for monto in candidatos
            if self.min_amount <= monto <= self.max_amount
        ]
        if not validos:
            return None

        # The largest amount is the most likely total
        return {
            'monto': max(validos),
            'tipo': 'fallback_max',
            'patron': 'heuristic',
            'confianza': 0.3,
        }

    def detectar_multiples_montos(self, texto: str) -> List[Dict]:
        """
        Detect every amount in the text, line by line.

        Useful for itemized receipts.

        Args:
            texto: Text extracted from the ticket.

        Returns:
            List of dicts with ``monto`` (float), ``contexto`` (the stripped
            line the amount was found on) and ``es_total`` (True when that
            line contains the word "total", case-insensitively).
        """
        if not texto:
            return []

        resultados = []
        for linea in texto.split('\n'):
            contexto = linea.strip()
            es_total = 'total' in linea.lower()
            for crudo in self._AMOUNT_RE.findall(linea):
                monto = self._normalizar_monto(crudo)
                if self.min_amount <= monto <= self.max_amount:
                    resultados.append({
                        'monto': monto,
                        'contexto': contexto,
                        'es_total': es_total,
                    })

        return resultados


def detectar_monto(texto: str) -> Optional[Dict]:
    """
    Detect the ticket total using a freshly built ``AmountDetector``.

    Args:
        texto: Ticket text

    Returns:
        Whatever ``AmountDetector.detectar_monto`` returns for ``texto``:
        a dict with monto, tipo, patron, confianza, or None
    """
    return AmountDetector().detectar_monto(texto)


def normalizar_monto(monto_str: str) -> float:
    """
    Normalize an amount string using a freshly built ``AmountDetector``.

    Args:
        monto_str: Amount as string

    Returns:
        Amount as float, as produced by ``AmountDetector._normalizar_monto``
    """
    return AmountDetector()._normalizar_monto(monto_str)