"""
Amount detection for Sales Bot OCR.

Improved detection of total amounts from ticket text.
"""

import logging
import re
from typing import Dict, List, Optional, Tuple

logger = logging.getLogger(__name__)

# Amount patterns in priority order: (regex, type label, priority).
# Lower priority number wins.  Every pattern has exactly one capturing group
# holding the amount text.  The optional separator inside the amount is
# "[ .]" (space or dot) rather than "\s" so that an amount can never run
# across a line break and swallow the first digits of the next line.
PATTERNS: List[Tuple[str, str, int]] = [
    # Explicit total patterns (highest priority)
    (r'total\s*a\s*pagar\s*:?\s*\$?\s*([\d,]+[ .]?\d{0,2})', 'total_a_pagar', 1),
    (r'gran\s*total\s*:?\s*\$?\s*([\d,]+[ .]?\d{0,2})', 'gran_total', 2),
    (r'total\s+final\s*:?\s*\$?\s*([\d,]+[ .]?\d{0,2})', 'total_final', 3),
    (r'(?:^|\n)\s*total\s*:?\s*\$?\s*([\d,]+[ .]?\d{0,2})', 'total', 4),

    # Payment related
    (r'a\s*cobrar\s*:?\s*\$?\s*([\d,]+[ .]?\d{0,2})', 'a_cobrar', 5),
    (r'importe\s*(?:total)?\s*:?\s*\$?\s*([\d,]+[ .]?\d{0,2})', 'importe', 6),
    (r'monto\s*(?:total)?\s*:?\s*\$?\s*([\d,]+[ .]?\d{0,2})', 'monto', 7),
    (r'suma\s*:?\s*\$?\s*([\d,]+[ .]?\d{0,2})', 'suma', 8),
    (r'pago\s*:?\s*\$?\s*([\d,]+[ .]?\d{0,2})', 'pago', 9),

    # Subtotal (lower priority - may need to add tax)
    (r'subtotal\s*:?\s*\$?\s*([\d,]+[ .]?\d{0,2})', 'subtotal', 10),

    # Generic currency patterns (lowest priority)
    (r'\$\s*([\d,]+\.\d{2})\s*(?:\n|$)', 'monto_linea', 11),
]

# Words that indicate a line is NOT a total (negative patterns)
EXCLUSION_WORDS = [
    'cambio', 'efectivo', 'pago con', 'tarjeta', 'recibido',
    'iva', 'impuesto', 'descuento', 'ahorro', 'puntos'
]

# Any currency-like number, optionally preceded by "$".
_GENERIC_AMOUNT_RE = re.compile(r'\$?\s*([\d,]+\.?\d{0,2})')

# "total" as a standalone word: not glued to other letters (so "subtotal"
# does not count), but digits may follow directly ("total150.00").
_TOTAL_WORD_RE = re.compile(r'(?<![^\W\d_])total(?![^\W\d_])')


class AmountDetector:
    """
    Detects and extracts monetary amounts from ticket text.

    Uses multiple patterns and heuristics to find the most likely total.
    """

    def __init__(self):
        self.patterns = PATTERNS
        self.min_amount = 1  # Minimum valid amount
        self.max_amount = 1000000  # Maximum valid amount

    def detectar_monto(self, texto: str) -> Optional[Dict]:
        """
        Detect the total amount of the ticket.

        Args:
            texto: Text extracted from the ticket.

        Returns:
            Dict with the keys 'monto', 'tipo', 'patron' and 'confianza',
            or None if no amount is found (including empty/None input).
        """
        if not texto:
            return None

        texto_lower = texto.lower()
        resultados = []

        for patron, tipo, prioridad in self.patterns:
            for m in re.finditer(patron, texto_lower, re.IGNORECASE | re.MULTILINE):
                # Use the capturing group when the pattern has one, otherwise
                # the whole match (custom patterns may omit the group).
                grupo = 1 if m.re.groups else 0
                crudo = m.group(grupo)
                if crudo is None:
                    continue
                # Offset of the amount itself; the overall match may begin on
                # the previous line because of a leading "\n" in the pattern.
                posicion = m.start(grupo)

                # Skip if the amount sits on a line with an exclusion word
                if self._is_excluded(texto_lower, crudo, posicion):
                    continue

                monto = self._normalizar_monto(crudo)
                if not self.min_amount <= monto <= self.max_amount:
                    continue

                resultados.append({
                    'monto': monto,
                    'tipo': tipo,
                    'patron': patron,
                    'prioridad': prioridad,
                    'confianza': self._calcular_confianza(
                        texto_lower, crudo, tipo, posicion
                    ),
                })

        if not resultados:
            # Try to find the largest amount as fallback
            return self._fallback_detection(texto)

        # Best match: lowest priority number, then highest confidence.
        # min() keeps the first of equal candidates, like a stable sort.
        best = min(resultados, key=lambda r: (r['prioridad'], -r['confianza']))
        return {
            'monto': best['monto'],
            'tipo': best['tipo'],
            'patron': best['patron'],
            'confianza': best['confianza'],
        }

    def _normalizar_monto(self, monto_str: str) -> float:
        """
        Normalize an amount string to a float.

        Handles various formats:
        - 1,234.56 (US/Mexico format)
        - 1234.56
        - 1 234.56 (space separator)
        - 1234,56 (European format)

        Returns:
            The parsed amount, or 0.0 when the string is empty or cannot
            be parsed.
        """
        if not monto_str:
            return 0.0

        # Remove currency symbols and whitespace
        monto = monto_str.strip().replace('$', '').replace(' ', '')

        if ',' in monto and '.' in monto:
            # US/Mexico format: 1,234.56 -> comma is the thousands separator
            monto = monto.replace(',', '')
        elif ',' in monto:
            # Could be European (1234,56) or thousand separator (1,234)
            parts = monto.split(',')
            if len(parts) == 2 and len(parts[1]) == 2:
                # Exactly two digits after a single comma: decimal comma
                monto = monto.replace(',', '.')
            else:
                # Thousand separator
                monto = monto.replace(',', '')

        try:
            return float(monto)
        except ValueError:
            logger.debug("Could not parse amount %r", monto_str)
            return 0.0

    def _is_excluded(self, texto: str, match: str,
                     posicion: Optional[int] = None) -> bool:
        """
        Check whether the match appears in an exclusion context.

        Args:
            texto: Full text that was searched.
            match: The matched amount text.
            posicion: Offset of the amount inside ``texto``.  When given,
                only the line containing that offset is inspected.  When
                omitted, every line that contains ``match`` as a substring
                is inspected (legacy behaviour, prone to false positives
                when the same digits appear on several lines).

        Returns:
            True if an inspected line contains any of EXCLUSION_WORDS.
        """
        if posicion is None:
            lineas = [linea for linea in texto.split('\n') if match in linea]
        else:
            inicio = texto.rfind('\n', 0, posicion) + 1
            fin = texto.find('\n', posicion)
            if fin == -1:
                fin = len(texto)
            lineas = [texto[inicio:fin]]

        return any(
            exclusion in linea.lower()
            for linea in lineas
            for exclusion in EXCLUSION_WORDS
        )

    def _calcular_confianza(self, texto: str, match: str, tipo: str,
                            posicion: Optional[int] = None) -> float:
        """
        Calculate the confidence score for a match.

        Args:
            texto: Full text that was searched.
            match: The matched amount text.
            tipo: Pattern type label of the match.
            posicion: Offset of the amount inside ``texto``.  When omitted,
                the first occurrence of ``match`` in ``texto`` is used.

        Returns:
            Value between 0.0 and 1.0.
        """
        confianza = 0.5  # Base confidence

        # Higher confidence for explicit total patterns
        if tipo in ('total_a_pagar', 'gran_total', 'total_final'):
            confianza += 0.3
        elif tipo == 'total':
            confianza += 0.2

        if posicion is None:
            posicion = texto.find(match)

        # Higher confidence if near end of text (last 40%)
        if posicion > len(texto) * 0.6:
            confianza += 0.1

        # Higher confidence if followed by payment info within 50 characters
        fin = posicion + len(match)
        after_match = texto[fin:fin + 50].lower()
        if any(word in after_match
               for word in ('efectivo', 'tarjeta', 'cambio', 'gracias')):
            confianza += 0.1

        return min(confianza, 1.0)

    def _fallback_detection(self, texto: str) -> Optional[Dict]:
        """
        Fallback detection when standard patterns fail.

        Looks for the largest amount in the text that lies within
        [min_amount, max_amount].

        Returns:
            Dict with 'monto', 'tipo' ('fallback_max'), 'patron'
            ('heuristic') and 'confianza' (0.3), or None if no number in
            range is found.
        """
        candidatos = (
            self._normalizar_monto(amount_str)
            for amount_str in _GENERIC_AMOUNT_RE.findall(texto)
        )
        valid_amounts = [
            amount for amount in candidatos
            if self.min_amount <= amount <= self.max_amount
        ]
        if not valid_amounts:
            return None

        # Return the largest amount (likely the total)
        return {
            'monto': max(valid_amounts),
            'tipo': 'fallback_max',
            'patron': 'heuristic',
            'confianza': 0.3,
        }

    def detectar_multiples_montos(self, texto: str) -> List[Dict]:
        """
        Detect every amount in the text.

        Useful for itemized receipts.

        Args:
            texto: Text extracted from the ticket.

        Returns:
            List of dicts with 'monto' (float), 'contexto' (the stripped
            line the amount was found on) and 'es_total' (True when the
            line contains "total" as a standalone word; "subtotal" lines
            are not flagged).
        """
        resultados = []

        for linea in texto.split('\n'):
            es_total = _TOTAL_WORD_RE.search(linea.lower()) is not None
            for match in _GENERIC_AMOUNT_RE.findall(linea):
                monto = self._normalizar_monto(match)
                if self.min_amount <= monto <= self.max_amount:
                    resultados.append({
                        'monto': monto,
                        'contexto': linea.strip(),
                        'es_total': es_total,
                    })

        return resultados


def detectar_monto(texto: str) -> Optional[Dict]:
    """
    Convenience function to detect amount from text.

    Args:
        texto: Ticket text

    Returns:
        Dict with monto, tipo, patron, confianza or None
    """
    detector = AmountDetector()
    return detector.detectar_monto(texto)


def normalizar_monto(monto_str: str) -> float:
    """
    Convenience function to normalize amount string.

    Args:
        monto_str: Amount as string

    Returns:
        Amount as float
    """
    detector = AmountDetector()
    return detector._normalizar_monto(monto_str)