#!/usr/bin/env python3
"""
Enrich the technical data stored for engines.

1. Parses engine names to extract displacement, cylinder count and fuel type.
2. Estimates power and torque from those values when they are missing.

NOTE(review): the original header also mentioned optionally querying the
NHTSA API. No such step exists in this file, although json, urllib.request
and time are still imported.
"""

import sqlite3
import re
import json
import urllib.request
import time
import sys
from pathlib import Path

DB_PATH = Path(__file__).parent.parent / 'vehicle_database.db'

# Displacement spellings, tried in order: "2.5L" / "2.5 L", then "2.5-L" /
# "2.5-liter", then "2500cc".
_DISPLACEMENT_PATTERNS = tuple(re.compile(p) for p in (
    r'(\d+\.?\d*)\s*L(?:ITER)?(?:\s|$|-)',
    r'(\d+\.?\d*)\s*-?\s*L(?:ITER)?',
    r'(\d{3,4})\s*CC',
))

# A cylinder count: digits that are NOT the integer part of a decimal such as
# "2.0", so "Series I 2.0L" is not read as an inline-2.
_COUNT = r'(\d+)(?!\.?\d)'

# Layout letters carry a leading \b so that letters inside other words
# ("HEMI 5.7L", "TSI 1.8", "OHV 8") are not mistaken for I5 / I1 / V8.
_V_PATTERN = re.compile(r'\bV\s*-?\s*' + _COUNT)
_CYLINDER_PATTERNS = (
    re.compile(r'(\d+)\s*-?\s*CYL(?:INDER)?'),      # 4-Cylinder, 4 Cyl
    _V_PATTERN,                                     # V6, V-8
    re.compile(r'\bI\s*-?\s*' + _COUNT),            # I4, I-4
    re.compile(r'\bINLINE\s*-?\s*' + _COUNT),       # Inline-4
    re.compile(r'\bFLAT\s*-?\s*' + _COUNT),         # Flat-6
    re.compile(r'\bH\s*-?\s*' + _COUNT),            # H4 (boxer)
    re.compile(r'\bW\s*-?\s*' + _COUNT),            # W12
)

# Non-V layouts, checked in this order after the V pattern.
_CONFIG_PATTERNS = (
    ('Inline', re.compile(r'\bINLINE|\bI\s*-?\s*' + _COUNT)),
    ('Flat/Boxer', re.compile(r'FLAT|\bH\s*-?\s*' + _COUNT)),
    ('W', re.compile(r'\bW\s*-?\s*' + _COUNT)),
    ('Rotary', re.compile(r'ROTARY')),
)

_TURBO_MARKERS = ('TURBO', 'T-GDI', 'TFSI', 'TSI')
_SUPERCHARGER_MARKERS = ('SUPERCHARGE', 'KOMPRESSOR')
_DIESEL_MARKERS = ('DIESEL', 'TDI', 'CDI', 'HDI')
_FLEX_MARKERS = ('FLEX', 'E85')
# Abbreviations are matched as whole tokens: a plain substring test for "EV"
# also hits "HEV", "PHEV" and words like "CHEVROLET".
_HYBRID_PATTERN = re.compile(r'HYBRID|\b[PM]?HEV\b')
_ELECTRIC_PATTERN = re.compile(r'ELECTRIC|\b(?:B|FC)?EV\b|BATTERY')


def _contains_any(text, markers):
    """Return True if any of the marker substrings occurs in text."""
    return any(marker in text for marker in markers)


def parse_engine_name(name):
    """
    Extract technical data from an engine name.

    Examples:
        "2.5L 4-Cylinder"      -> displacement_cc=2500, cylinders=4
        "3.5L V6 DOHC"         -> displacement_cc=3500, cylinders=6
        "1.5L Turbo I4"        -> displacement_cc=1500, cylinders=4, turbo
        "5.0L V8 Supercharged" -> displacement_cc=5000, cylinders=8

    Args:
        name: Engine name; matching is case-insensitive. A falsy value
            (None or "") yields the all-empty result.

    Returns:
        A dict with keys displacement_cc, cylinders, fuel_type,
        engine_config (each None when not detected) and the booleans
        is_turbo and is_supercharged.
    """
    result = {
        'displacement_cc': None,
        'cylinders': None,
        'fuel_type': None,
        'is_turbo': False,
        'is_supercharged': False,
        'engine_config': None,
    }

    if not name:
        return result

    name_upper = name.upper()

    # Displacement (e.g. "2.5L", "2500cc", "2.5 L", "2.5-liter").
    for pattern in _DISPLACEMENT_PATTERNS:
        match = pattern.search(name_upper)
        if match:
            value = float(match.group(1))
            if value < 20:
                # Liters. round() rather than int(): the product can land a
                # hair below the intended integer in binary floating point.
                result['displacement_cc'] = round(value * 1000)
            else:
                # Already in cc.
                result['displacement_cc'] = int(value)
            break

    # Cylinder count: first pattern that matches wins.
    for pattern in _CYLINDER_PATTERNS:
        match = pattern.search(name_upper)
        if match:
            result['cylinders'] = int(match.group(1))
            break

    # Engine layout. The V pattern is the same one used for the cylinder
    # count, so "V-8" and "V 6" get a layout as well as a count.
    v_match = _V_PATTERN.search(name_upper)
    if v_match:
        result['engine_config'] = f'V{v_match.group(1)}'
    else:
        for label, pattern in _CONFIG_PATTERNS:
            if pattern.search(name_upper):
                result['engine_config'] = label
                break

    # Forced induction.
    if _contains_any(name_upper, _TURBO_MARKERS):
        result['is_turbo'] = True
    if _contains_any(name_upper, _SUPERCHARGER_MARKERS):
        result['is_supercharged'] = True

    # Fuel type. Hybrid is tested before electric so that "HEV", "PHEV" and
    # "Hybrid Electric" are not classified as pure electric.
    if _contains_any(name_upper, _DIESEL_MARKERS):
        result['fuel_type'] = 'diesel'
    elif _HYBRID_PATTERN.search(name_upper):
        result['fuel_type'] = 'hybrid'
    elif _ELECTRIC_PATTERN.search(name_upper):
        result['fuel_type'] = 'electric'
    elif _contains_any(name_upper, _FLEX_MARKERS):
        result['fuel_type'] = 'other'  # Flex fuel
    elif result['displacement_cc'] or result['cylinders']:
        # Anything that looks like a combustion engine defaults to gasoline.
        result['fuel_type'] = 'gasoline'

    return result


def estimate_power(displacement_cc, cylinders, is_turbo, fuel_type):
    """
    Estimate approximate power from the engine's characteristics.

    These are rough, typical values, not measured data.

    Args:
        displacement_cc: Displacement in cubic centimetres.
        cylinders: Accepted for interface compatibility; not used.
        is_turbo: Whether the engine is turbocharged (+35% per liter).
        fuel_type: 'diesel' lowers the per-liter figure; 'electric'
            cannot be estimated this way.

    Returns:
        Estimated horsepower rounded to the nearest 5, or None when the
        displacement is missing/zero or the fuel type is 'electric'.
    """
    if not displacement_cc:
        return None

    # Base HP per liter (naturally aspirated gasoline).
    hp_per_liter = 70

    if fuel_type == 'diesel':
        hp_per_liter = 50
    elif fuel_type == 'electric':
        return None  # Displacement says nothing about an electric motor.

    if is_turbo:
        hp_per_liter *= 1.35  # Turbo adds ~35% power

    liters = displacement_cc / 1000
    estimated_hp = int(liters * hp_per_liter)

    # Round to nearest 5.
    return round(estimated_hp / 5) * 5


def estimate_torque(displacement_cc, power_hp, is_turbo, fuel_type):
    """
    Estimate approximate torque from power and engine type.

    Args:
        displacement_cc: Accepted for interface compatibility; not used.
        power_hp: Power in horsepower.
        is_turbo: Whether the engine is turbocharged.
        fuel_type: 'diesel' selects the highest torque-per-HP ratio.

    Returns:
        Estimated torque in Nm rounded to the nearest 5, or None when
        power_hp is missing/zero.
    """
    if not power_hp:
        return None

    # Torque/HP ratio varies by engine type.
    if fuel_type == 'diesel':
        ratio = 2.0  # Diesel engines have higher torque per HP
    elif is_turbo:
        ratio = 1.2
    else:
        ratio = 1.0  # NA gasoline

    # The ratio yields lb-ft; convert to Nm.
    torque_lb_ft = power_hp * ratio
    torque_nm = int(torque_lb_ft * 1.3558)

    # Round to nearest 5.
    return round(torque_nm / 5) * 5


def _build_engine_updates(engine):
    """Return {column: new_value} for the fields missing from one engine row."""
    parsed = parse_engine_name(engine['name'])
    updates = {}

    # Only fill columns that are currently empty.
    for column in ('displacement_cc', 'cylinders', 'fuel_type'):
        if parsed[column] and not engine[column]:
            updates[column] = parsed[column]

    # Values already stored in the row win over parsed ones, so the estimate
    # is consistent with what the row will contain after the update.
    displacement = engine['displacement_cc'] or parsed['displacement_cc']
    cylinders = engine['cylinders'] or parsed['cylinders']
    fuel_type = engine['fuel_type'] or parsed['fuel_type']

    if displacement and not engine['power_hp']:
        estimated_hp = estimate_power(
            displacement, cylinders, parsed['is_turbo'], fuel_type)
        if estimated_hp:
            updates['power_hp'] = estimated_hp

            # Torque is only estimated alongside a freshly estimated power.
            if not engine['torque_nm']:
                estimated_torque = estimate_torque(
                    displacement, estimated_hp, parsed['is_turbo'], fuel_type)
                if estimated_torque:
                    updates['torque_nm'] = estimated_torque

    return updates


def update_engines_from_parsing(db_path=None):
    """
    Update the engines table by parsing the engine names.

    Rows lacking displacement_cc, cylinders or fuel_type are parsed; empty
    columns are filled and power/torque are estimated when missing.
    Progress and a sample of the results are printed.

    Args:
        db_path: SQLite database to update; defaults to DB_PATH.

    Returns:
        The number of rows updated.
    """
    conn = sqlite3.connect(DB_PATH if db_path is None else db_path)
    conn.row_factory = sqlite3.Row
    try:
        cursor = conn.cursor()

        # Get all engines that need updating.
        cursor.execute("""
            SELECT id, name, displacement_cc, cylinders, fuel_type,
                   power_hp, torque_nm
            FROM engines
            WHERE (displacement_cc IS NULL
                   OR cylinders IS NULL
                   OR fuel_type IS NULL)
        """)
        engines = cursor.fetchall()
        print(f"Found {len(engines)} engines to process...")

        updated = 0
        for engine in engines:
            updates = _build_engine_updates(engine)
            if not updates:
                continue

            # Column names come from the fixed set in _build_engine_updates;
            # all values are bound as parameters.
            set_clause = ', '.join(f'{column} = ?' for column in updates)
            cursor.execute(
                f"UPDATE engines SET {set_clause} WHERE id = ?",
                [*updates.values(), engine['id']],
            )
            updated += 1

            if updated % 1000 == 0:
                print(f" Updated {updated} engines...")
                conn.commit()

        conn.commit()
        print(f"\nTotal updated: {updated} engines")

        # Show sample results.
        print("\n=== Sample Results ===")
        cursor.execute("""
            SELECT name, displacement_cc, cylinders, fuel_type,
                   power_hp, torque_nm
            FROM engines
            WHERE displacement_cc IS NOT NULL
            LIMIT 10
        """)
        for row in cursor.fetchall():
            print(f" {row['name']}: {row['displacement_cc']}cc, {row['cylinders']} cyl, "
                  f"{row['fuel_type']}, {row['power_hp']}HP, {row['torque_nm']}Nm")
    finally:
        # Close the connection even when a query fails.
        conn.close()

    return updated


def get_stats(db_path=None):
    """
    Print data-coverage statistics for the engines table.

    Args:
        db_path: SQLite database to read; defaults to DB_PATH.
    """
    conn = sqlite3.connect(DB_PATH if db_path is None else db_path)
    try:
        cursor = conn.cursor()

        print("\n=== Data Coverage Statistics ===")

        cursor.execute("SELECT COUNT(*) FROM engines")
        total = cursor.fetchone()[0]
        print(f"Total engines: {total}")

        fields = ['displacement_cc', 'cylinders', 'fuel_type', 'power_hp', 'torque_nm']
        for field in fields:
            # field comes from the fixed list above, never from user input.
            cursor.execute(f"SELECT COUNT(*) FROM engines WHERE {field} IS NOT NULL")
            count = cursor.fetchone()[0]
            pct = (count / total * 100) if total > 0 else 0
            print(f" {field}: {count} ({pct:.1f}%)")
    finally:
        conn.close()


def main(argv=None):
    """
    Run the script: print stats, then either test the parser or update the DB.

    Args:
        argv: Command-line arguments without the program name; defaults to
            sys.argv[1:]. A first argument of '--dry-run' runs the parser on
            sample names instead of updating the database.
    """
    args = sys.argv[1:] if argv is None else list(argv)

    print("=" * 50)
    print("Engine Data Enrichment Script")
    print("=" * 50)

    # Show current stats.
    get_stats()

    if args and args[0] == '--dry-run':
        print("\n=== Dry Run - Testing Parser ===")
        test_names = [
            "2.5L 4-Cylinder",
            "3.5L V6 DOHC",
            "1.5L Turbo I4",
            "5.0L V8 Supercharged",
            "2.0L TDI Diesel",
            "Electric Motor",
            "1.8T TSI",
            "3.0L Twin Turbo V6",
            "6.2L V8 HEMI",
            "2.4L DOHC 16-Valve",
        ]
        for name in test_names:
            result = parse_engine_name(name)
            hp = estimate_power(result['displacement_cc'], result['cylinders'],
                                result['is_turbo'], result['fuel_type'])
            print(f"\n '{name}':")
            print(f" -> {result['displacement_cc']}cc, {result['cylinders']} cyl, "
                  f"{result['fuel_type']}, turbo={result['is_turbo']}, est. {hp}HP")
    else:
        print("\n" + "=" * 50)
        print("Updating engines from name parsing...")
        print("=" * 50)
        update_engines_from_parsing()

        # Show final stats.
        get_stats()


if __name__ == '__main__':
    main()