Files
sales-bot-stacks/sales-bot/ocr/amount_detector.py
consultoria-as 9936deaa90 feat: Implementar PWA, Analytics, Reportes PDF y mejoras OCR
FASE 1 - PWA y Frontend:
- Crear templates/base.html, dashboard.html, analytics.html, executive.html
- Crear static/css/main.css con diseño responsivo
- Agregar static/js/app.js, pwa.js, camera.js, charts.js
- Implementar manifest.json y service-worker.js para PWA
- Soporte para captura de tickets desde cámara móvil

FASE 2 - Analytics:
- Crear módulo analytics/ con predictions.py, trends.py, comparisons.py
- Implementar predicción básica con promedio móvil + tendencia lineal
- Agregar endpoints /api/analytics/trends, predictions, comparisons
- Integrar Chart.js para gráficas interactivas

FASE 3 - Reportes PDF:
- Crear módulo reports/ con pdf_generator.py
- Implementar SalesReportPDF con generar_reporte_diario y ejecutivo
- Agregar comando /reporte [diario|semanal|ejecutivo]
- Agregar endpoints /api/reports/generate y /api/reports/download

FASE 4 - Mejoras OCR:
- Crear módulo ocr/ con processor.py, preprocessor.py, patterns.py
- Implementar AmountDetector con patrones múltiples de montos
- Agregar preprocesador adaptativo con pipelines para diferentes condiciones
- Soporte para corrección de rotación (deskew) y threshold Otsu

Dependencias agregadas:
- reportlab, matplotlib (PDF)
- scipy, pandas (analytics)
- imutils, deskew, cachetools (OCR)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-19 03:26:16 +00:00

259 lines
8.3 KiB
Python

"""
Amount detection for Sales Bot OCR
Improved detection of total amounts from ticket text
"""
import re
import logging
from typing import Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)
# Captured amount shared by every keyword pattern: digits/commas, then an
# optional decimal part introduced by a dot or by a blank (space or tab).
# Only horizontal blanks are accepted inside the number so the capture can
# never run onto the following line (a total such as "150" followed by a line
# starting with digits used to be captured as "150\n19" and then discarded).
# NOTE(review): a blank followed by up to two digits is still captured as part
# of the amount ("100 25"); confirm how that should be interpreted.
_AMOUNT_GROUP = r'([\d,]+[ \t\.]?\d{0,2})'

# Optional colon and currency sign between a keyword and its amount. The `\s*`
# runs may span a line break, so the amount may sit on the line after the
# keyword.
_KEYWORD_SEP = r'\s*:?\s*\$?\s*'

# Amount patterns as (regex, type label, priority) tuples; a lower priority
# number means a more trustworthy pattern. Each regex has exactly one capture
# group holding the raw amount text.
PATTERNS = [
    # Explicit total patterns (highest priority)
    (r'total\s*a\s*pagar' + _KEYWORD_SEP + _AMOUNT_GROUP, 'total_a_pagar', 1),
    (r'gran\s*total' + _KEYWORD_SEP + _AMOUNT_GROUP, 'gran_total', 2),
    (r'total\s+final' + _KEYWORD_SEP + _AMOUNT_GROUP, 'total_final', 3),
    # Plain "total" must open a line so that it does not match inside "subtotal"
    (r'(?:^|\n)\s*total' + _KEYWORD_SEP + _AMOUNT_GROUP, 'total', 4),
    # Payment related
    (r'a\s*cobrar' + _KEYWORD_SEP + _AMOUNT_GROUP, 'a_cobrar', 5),
    (r'importe\s*(?:total)?' + _KEYWORD_SEP + _AMOUNT_GROUP, 'importe', 6),
    (r'monto\s*(?:total)?' + _KEYWORD_SEP + _AMOUNT_GROUP, 'monto', 7),
    (r'suma' + _KEYWORD_SEP + _AMOUNT_GROUP, 'suma', 8),
    (r'pago' + _KEYWORD_SEP + _AMOUNT_GROUP, 'pago', 9),
    # Subtotal (lower priority - may need to add tax)
    (r'subtotal' + _KEYWORD_SEP + _AMOUNT_GROUP, 'subtotal', 10),
    # Generic currency pattern (lowest priority): "$" amount with two decimals
    # at the end of a line
    (r'\$\s*([\d,]+\.\d{2})\s*(?:\n|$)', 'monto_linea', 11),
]
# Negative markers: a line containing any of these substrings is treated as
# something other than the ticket total.
EXCLUSION_WORDS = [
    'cambio',      # change given back
    'efectivo',    # cash
    'pago con',    # "paid with ..."
    'tarjeta',     # card
    'recibido',    # amount received
    'iva',         # VAT
    'impuesto',    # tax
    'descuento',   # discount
    'ahorro',      # savings
    'puntos',      # loyalty points
]
class AmountDetector:
    """
    Detects and extracts monetary amounts from ticket text.

    Keyword patterns are tried in priority order; when none of them yields a
    valid amount, the largest number found in the text is returned as a
    low-confidence fallback.
    """

    # Loose "anything that looks like a number" matcher used by the fallback
    # and by the itemized scan.
    _ANY_AMOUNT_RE = re.compile(r'\$?\s*([\d,]+\.?\d{0,2})')
    # Whole word "total", so that "subtotal" lines are not flagged as totals.
    _TOTAL_WORD_RE = re.compile(r'\btotal\b')
    # Blanks that may appear inside a number (space, tab, no-break spaces).
    _INNER_BLANKS_RE = re.compile('[ \t\u00a0\u202f]')

    def __init__(
        self,
        patterns: Optional[List[Tuple[str, str, int]]] = None,
        exclusion_words: Optional[List[str]] = None,
        min_amount: float = 1,
        max_amount: float = 1000000,
    ):
        """
        Args:
            patterns: (regex, type label, priority) tuples; defaults to the
                module-level PATTERNS. Lower priority numbers win.
            exclusion_words: substrings that disqualify the line an amount
                sits on; defaults to the module-level EXCLUSION_WORDS.
            min_amount: Minimum valid amount (inclusive).
            max_amount: Maximum valid amount (inclusive).
        """
        self.patterns = PATTERNS if patterns is None else patterns
        self.exclusion_words = (
            EXCLUSION_WORDS if exclusion_words is None else exclusion_words
        )
        self.min_amount = min_amount
        self.max_amount = max_amount

    def detectar_monto(self, texto: str) -> Optional[Dict]:
        """
        Detect the total amount of the ticket.

        Args:
            texto: Text extracted from the ticket.

        Returns:
            dict with 'monto', 'tipo', 'patron' and 'confianza', or None when
            the text is empty or no amount is found.
        """
        if not texto:
            return None

        texto_lower = texto.lower()
        resultados = []
        for patron, tipo, prioridad in self.patterns:
            # finditer (rather than findall) keeps the position of every match,
            # so the checks below look at the line the amount really sits on
            # instead of the first place the same digits happen to appear.
            for m in re.finditer(patron, texto_lower, re.IGNORECASE | re.MULTILINE):
                grupo = 1 if m.re.groups else 0
                inicio = m.start(grupo)
                # A pattern whose amount part allows `\s` can swallow a line
                # break plus digits of the next line; keep the first line only.
                bruto = re.split(r'[\r\n]', m.group(grupo), maxsplit=1)[0]

                # Skip if the amount is in an exclusion context
                if self._is_excluded(texto_lower, bruto, inicio):
                    continue

                monto = self._normalizar_monto(bruto)
                if not (self.min_amount <= monto <= self.max_amount):
                    continue

                resultados.append({
                    'monto': monto,
                    'tipo': tipo,
                    'patron': patron,
                    'prioridad': prioridad,
                    'confianza': self._calcular_confianza(
                        texto_lower, bruto, tipo, inicio
                    ),
                })

        if not resultados:
            # Try to find the largest amount as fallback
            return self._fallback_detection(texto)

        # Best = lowest priority number, then highest confidence; min() keeps
        # the earliest candidate on ties.
        best = min(resultados, key=lambda r: (r['prioridad'], -r['confianza']))
        return {
            'monto': best['monto'],
            'tipo': best['tipo'],
            'patron': best['patron'],
            'confianza': best['confianza'],
        }

    def _normalizar_monto(self, monto_str: str) -> float:
        """
        Normalize an amount string to float.

        Handles various formats:
        - 1,234.56 (US/Mexico format)
        - 1234.56
        - 1 234.56 (space separator)
        - 1234,56 and 1.234,56 (European format)

        Returns:
            The parsed amount, or 0.0 when the string is empty or unparseable.
        """
        if not monto_str:
            return 0.0

        # Remove currency symbol and blanks used as separators
        monto = self._INNER_BLANKS_RE.sub('', monto_str.strip().replace('$', ''))

        if ',' in monto and '.' in monto:
            # With both separators present, the right-most one is the decimal mark
            if monto.rfind(',') > monto.rfind('.'):
                # European format: 1.234,56
                monto = monto.replace('.', '').replace(',', '.')
            else:
                # US/Mexico format: 1,234.56
                monto = monto.replace(',', '')
        elif ',' in monto:
            # Could be European (1234,56) or thousand separator (1,234)
            parts = monto.split(',')
            if len(parts) == 2 and len(parts[1]) == 2:
                monto = monto.replace(',', '.')
            else:
                monto = monto.replace(',', '')

        try:
            return float(monto)
        except ValueError:
            return 0.0

    def _is_excluded(self, texto: str, match: str, position: Optional[int] = None) -> bool:
        """
        Check whether the match appears in an exclusion context.

        Args:
            texto: Full ticket text.
            match: Captured amount text.
            position: Index of the match in ``texto``. When given, only the
                line containing that index is inspected. When omitted, every
                line containing ``match`` as a substring is inspected.

        Returns:
            True if an inspected line contains one of the exclusion words.
        """
        if position is None:
            lineas = [linea for linea in texto.split('\n') if match in linea]
        else:
            inicio = texto.rfind('\n', 0, position) + 1
            fin = texto.find('\n', position)
            if fin == -1:
                fin = len(texto)
            lineas = [texto[inicio:fin]]

        return any(
            exclusion in linea.lower()
            for linea in lineas
            for exclusion in self.exclusion_words
        )

    def _calcular_confianza(
        self, texto: str, match: str, tipo: str, position: Optional[int] = None
    ) -> float:
        """
        Calculate the confidence score for a match.

        Args:
            texto: Full ticket text.
            match: Captured amount text.
            tipo: Type label of the pattern that produced the match.
            position: Index of the match in ``texto``; when omitted, the first
                occurrence of ``match`` is used.

        Returns:
            Value between 0.0 and 1.0, rounded to two decimals.
        """
        if position is None:
            position = texto.find(match)

        confianza = 0.5  # Base confidence

        # Higher confidence for explicit total patterns
        if tipo in ('total_a_pagar', 'gran_total', 'total_final'):
            confianza += 0.3
        elif tipo == 'total':
            confianza += 0.2

        # Higher confidence if in the last 40% of the text
        if position > len(texto) * 0.6:
            confianza += 0.1

        # Higher confidence if payment info follows within 50 characters
        fin = position + len(match)
        after_match = texto[fin:fin + 50].lower()
        if any(word in after_match for word in ('efectivo', 'tarjeta', 'cambio', 'gracias')):
            confianza += 0.1

        # Rounding removes float noise such as 0.7999999999999999
        return round(min(confianza, 1.0), 2)

    def _fallback_detection(self, texto: str) -> Optional[Dict]:
        """
        Fallback detection when the standard patterns fail.

        Returns the largest number in the text that lies within the valid
        range, tagged 'fallback_max' with confidence 0.3, or None if there is
        no such number.
        """
        candidatos = (
            self._normalizar_monto(bruto)
            for bruto in self._ANY_AMOUNT_RE.findall(texto)
        )
        validos = [c for c in candidatos if self.min_amount <= c <= self.max_amount]
        if not validos:
            return None

        return {
            'monto': max(validos),  # the largest amount is likely the total
            'tipo': 'fallback_max',
            'patron': 'heuristic',
            'confianza': 0.3,
        }

    def detectar_multiples_montos(self, texto: str) -> List[Dict]:
        """
        Detect every amount in the text. Useful for itemized receipts.

        Returns:
            List of dicts with 'monto', 'contexto' (the stripped source line)
            and 'es_total' (True when the line contains the word "total";
            "subtotal" does not count).
        """
        if not texto:
            return []

        resultados = []
        for linea in texto.split('\n'):
            contexto = linea.strip()
            es_total = bool(self._TOTAL_WORD_RE.search(linea.lower()))
            for bruto in self._ANY_AMOUNT_RE.findall(linea):
                monto = self._normalizar_monto(bruto)
                if self.min_amount <= monto <= self.max_amount:
                    resultados.append({
                        'monto': monto,
                        'contexto': contexto,
                        'es_total': es_total,
                    })
        return resultados
def detectar_monto(texto: str) -> Optional[Dict]:
    """
    Module-level shortcut: run a freshly created AmountDetector on the text.

    Args:
        texto: Ticket text

    Returns:
        Whatever AmountDetector.detectar_monto returns for the text: a dict
        with monto, tipo, patron and confianza, or None
    """
    return AmountDetector().detectar_monto(texto)
def normalizar_monto(monto_str: str) -> float:
    """
    Module-level shortcut: normalize an amount string with a freshly created
    AmountDetector.

    Args:
        monto_str: Amount as string

    Returns:
        Amount as float
    """
    return AmountDetector()._normalizar_monto(monto_str)