""" Main OCR processor for Sales Bot Combines preprocessing, text extraction, and amount detection """ import logging import os from typing import Dict, Optional from io import BytesIO logger = logging.getLogger(__name__) # Try to import OCR engine try: import pytesseract from PIL import Image TESSERACT_AVAILABLE = True except ImportError: TESSERACT_AVAILABLE = False logger.warning("pytesseract not available. OCR will not work.") # Import local modules from .preprocessor import ImagePreprocessor, preprocess_image from .amount_detector import AmountDetector, detectar_monto from .patterns import ( detectar_formato_ticket, extraer_fecha_ticket, extraer_cliente_ticket, contar_tubos_texto, get_patron_total ) class OCRProcessor: """ Main OCR processor that coordinates image preprocessing, text extraction, and data parsing. """ def __init__(self): self.preprocessor = ImagePreprocessor() self.amount_detector = AmountDetector() self.confidence_threshold = float(os.getenv('OCR_CONFIDENCE_THRESHOLD', '0.6')) # Tesseract configuration for Spanish self.tesseract_config = '--oem 3 --psm 6 -l spa' def process(self, image_bytes: bytes) -> Dict: """ Process a ticket image and extract relevant data. Args: image_bytes: Raw image bytes (JPEG, PNG, etc.) Returns: Dict with extracted data: - texto: Full extracted text - monto: Detected total amount - cliente: Client name if found - fecha: Date if found - tubos: Number of tubes/items - formato: Detected ticket format - confianza: Confidence score """ if not TESSERACT_AVAILABLE: return { 'error': 'Tesseract OCR not available', 'texto': '', 'monto': 0, 'confianza': 0 } try: # Preprocess image processed_bytes = self.preprocessor.preprocess(image_bytes) # Extract text texto = self._extract_text(processed_bytes) if not texto or len(texto.strip()) < 10: # Try again with original image texto = self._extract_text(image_bytes) if not texto: return { 'error': 'No text could be extracted', 'texto': '', 'monto': 0, 'confianza': 0 } # Detect ticket format formato = detectar_formato_ticket(texto) # Extract amount monto_result = self.amount_detector.detectar_monto(texto) monto = monto_result.get('monto', 0) if monto_result else 0 monto_confianza = monto_result.get('confianza', 0) if monto_result else 0 monto_tipo = monto_result.get('tipo', 'unknown') if monto_result else 'unknown' # Extract other data cliente = extraer_cliente_ticket(texto) fecha = extraer_fecha_ticket(texto) tubos = contar_tubos_texto(texto) # Calculate overall confidence confianza = self._calculate_overall_confidence( texto, monto, monto_confianza, cliente, fecha ) return { 'texto': texto, 'monto': monto, 'monto_tipo': monto_tipo, 'cliente': cliente, 'fecha': fecha, 'tubos': tubos, 'formato': formato, 'confianza': confianza } except Exception as e: logger.error(f"Error processing image: {e}", exc_info=True) return { 'error': str(e), 'texto': '', 'monto': 0, 'confianza': 0 } def _extract_text(self, image_bytes: bytes) -> str: """ Extract text from image bytes using Tesseract. """ try: # Load image img = Image.open(BytesIO(image_bytes)) # Convert to RGB if necessary if img.mode != 'RGB' and img.mode != 'L': img = img.convert('RGB') # Run OCR texto = pytesseract.image_to_string(img, config=self.tesseract_config) # Clean up text texto = self._clean_text(texto) return texto except Exception as e: logger.error(f"Error extracting text: {e}") return '' def _clean_text(self, texto: str) -> str: """ Clean up OCR output text. """ if not texto: return '' # Remove excessive whitespace import re texto = re.sub(r'\s+', ' ', texto) texto = re.sub(r'\n\s*\n', '\n', texto) # Fix common OCR errors replacements = { '|': 'l', '0': 'O', # Only in certain contexts '1': 'I', # Only in certain contexts 'S': '$', # Only at start of amounts } # Apply selective replacements # (Being careful not to corrupt actual numbers) return texto.strip() def _calculate_overall_confidence( self, texto: str, monto: float, monto_confianza: float, cliente: Optional[str], fecha: Optional[str] ) -> float: """ Calculate overall extraction confidence. """ confidence = 0.0 # Text quality (based on length and structure) if len(texto) > 50: confidence += 0.2 if len(texto) > 200: confidence += 0.1 # Amount detection confidence confidence += monto_confianza * 0.4 # Bonus for finding additional data if cliente: confidence += 0.1 if fecha: confidence += 0.1 # Check for typical receipt keywords keywords = ['total', 'cliente', 'fecha', 'ticket', 'venta', 'pago'] found_keywords = sum(1 for kw in keywords if kw in texto.lower()) confidence += min(found_keywords * 0.05, 0.2) return min(confidence, 1.0) def process_multiple(self, images: list) -> Dict: """ Process multiple images (e.g., multi-page receipt). Combines results from all images. Args: images: List of image bytes Returns: Combined results """ all_texto = [] total_monto = 0 cliente = None fecha = None tubos = 0 formato = None max_confianza = 0 for img_bytes in images: result = self.process(img_bytes) if result.get('texto'): all_texto.append(result['texto']) if result.get('monto', 0) > total_monto: total_monto = result['monto'] if not cliente and result.get('cliente'): cliente = result['cliente'] if not fecha and result.get('fecha'): fecha = result['fecha'] tubos += result.get('tubos', 0) if not formato and result.get('formato'): formato = result['formato'] if result.get('confianza', 0) > max_confianza: max_confianza = result['confianza'] return { 'texto': '\n---\n'.join(all_texto), 'monto': total_monto, 'cliente': cliente, 'fecha': fecha, 'tubos': tubos, 'formato': formato, 'confianza': max_confianza, 'paginas': len(images) } def procesar_ticket_imagen(image_bytes: bytes) -> Dict: """ Convenience function to process a ticket image. Args: image_bytes: Raw image bytes Returns: Dict with extracted data """ processor = OCRProcessor() return processor.process(image_bytes) def procesar_multiples_imagenes(images: list) -> Dict: """ Convenience function to process multiple images. Args: images: List of image bytes Returns: Combined results """ processor = OCRProcessor() return processor.process_multiple(images)