#!/usr/bin/env python3
"""
Enrich the technical data stored for engines.

1. Parse engine names to extract displacement, cylinder count and fuel type.
2. Optionally use the NHTSA API for additional data (no NHTSA call is
   implemented in the code below).
"""
# Reconstructed from a git patch that added
# vehicle_database/scripts/enrich_engine_data.py; the same patch also updated
# the binary vehicle_database/vehicle_database.db.

import json
import re
import sqlite3
import sys
import time
import urllib.request
from contextlib import closing
from pathlib import Path

DB_PATH = Path(__file__).parent.parent / 'vehicle_database.db'


def _layout_pattern(prefix):
    """Compile a regex for ``<prefix><n>`` layout tokens such as V6 or I-4.

    The prefix must start a word (or directly follow a glued displacement
    such as "3.6LV6") so that the trailing letter of an unrelated word is not
    taken for a layout marker ("HEMI 5.7", "SKYACTIV 2.5"). The number must
    not continue into a decimal, which would make it a displacement.
    """
    return re.compile(
        r'(?:\b|(?<=\dL))' + prefix + r'\s*-?\s*(\d{1,2})(?!\.?\d)'
    )


def _letters_token(*tokens):
    """Compile a regex matching any of *tokens* not embedded in a longer word."""
    return re.compile(r'(?<![A-Z])(?:' + '|'.join(tokens) + r')(?![A-Z])')


# Displacement in liters. The first form requires a word boundary after the
# unit so that "V8 LS3" is not read as "8 L"; the second accepts glued forms
# ("3.6LV6", "2.0 LPG") but only when the number has a decimal part.
_LITER_PATTERNS = (
    re.compile(r'(\d+\.?\d*)\s*-?\s*L(?:ITERS?|ITRES?)?\b'),
    re.compile(r'(\d+\.\d+)\s*-?\s*L'),
)
_CC_PATTERN = re.compile(r'(\d{3,4})\s*CC')          # 2500cc
_TURBO_SUFFIX_PATTERN = re.compile(r'(?<![\d.])(\d\.\d)T\b')  # 1.8T

_V_LAYOUT = _layout_pattern('V')            # V6, V-8
_I_LAYOUT = _layout_pattern('I')            # I4, I-4
_INLINE_LAYOUT = _layout_pattern('INLINE')  # Inline-4
_FLAT_LAYOUT = _layout_pattern('FLAT')      # Flat-6
_H_LAYOUT = _layout_pattern('H')            # H4 (boxer)
_W_LAYOUT = _layout_pattern('W')            # W12

# Tried in order; the first match supplies the cylinder count.
_CYLINDER_PATTERNS = (
    re.compile(r'(\d+)\s*-?\s*CYL'),        # 4-Cylinder, 4 Cyl
    _V_LAYOUT,
    _I_LAYOUT,
    _INLINE_LAYOUT,
    _FLAT_LAYOUT,
    _H_LAYOUT,
    _W_LAYOUT,
)

_HYBRID_TOKEN = _letters_token('PHEV', 'MHEV', 'HEV')
_ELECTRIC_TOKEN = _letters_token('BEV', 'EV')


def _parse_displacement_cc(name_upper):
    """Return the displacement in cc found in *name_upper*, or None."""
    for pattern in _LITER_PATTERNS + (_CC_PATTERN,):
        match = pattern.search(name_upper)
        if match:
            value = float(match.group(1))
            # Values below 20 are liters; anything larger is already in cc.
            # round() instead of int() so float noise cannot drop one cc.
            return round(value * 1000) if value < 20 else round(value)

    # "1.8T" style badges carry the displacement without an "L" unit.
    match = _TURBO_SUFFIX_PATTERN.search(name_upper)
    if match:
        return round(float(match.group(1)) * 1000)
    return None


def _parse_cylinders(name_upper):
    """Return the cylinder count found in *name_upper*, or None."""
    for pattern in _CYLINDER_PATTERNS:
        match = pattern.search(name_upper)
        if match:
            return int(match.group(1))
    return None


def _detect_engine_config(name_upper):
    """Return the engine layout label for *name_upper*, or None."""
    v_match = _V_LAYOUT.search(name_upper)
    if v_match:
        return f'V{v_match.group(1)}'
    if _I_LAYOUT.search(name_upper) or 'INLINE' in name_upper:
        return 'Inline'
    if ('FLAT' in name_upper or 'BOXER' in name_upper
            or _H_LAYOUT.search(name_upper)):
        return 'Flat/Boxer'
    if _W_LAYOUT.search(name_upper):
        return 'W'
    if 'ROTARY' in name_upper:
        return 'Rotary'
    return None


def _detect_fuel_type(name_upper, has_combustion_data):
    """Return the fuel type label for *name_upper*, or None.

    Hybrid markers are checked before electric ones, and the short EV/HEV
    style tokens must stand alone: a plain substring test for "EV" also hits
    "HEV", "PHEV" and "CHEVROLET".
    """
    if any(tag in name_upper for tag in ('DIESEL', 'TDI', 'CDI', 'HDI')):
        return 'diesel'
    if 'HYBRID' in name_upper or _HYBRID_TOKEN.search(name_upper):
        return 'hybrid'
    if ('ELECTRIC' in name_upper or 'BATTERY' in name_upper
            or _ELECTRIC_TOKEN.search(name_upper)):
        return 'electric'
    if 'FLEX' in name_upper or 'E85' in name_upper:
        return 'other'  # Flex fuel
    # Default to gasoline only when the name described a combustion engine.
    return 'gasoline' if has_combustion_data else None


def parse_engine_name(name):
    """
    Extract technical data from an engine name.

    Examples:
        "2.5L 4-Cylinder" -> displacement_cc=2500, cylinders=4
        "3.5L V6 DOHC" -> displacement_cc=3500, cylinders=6
        "1.5L Turbo I4" -> displacement_cc=1500, cylinders=4, is_turbo=True
        "5.0L V8 Supercharged" -> displacement_cc=5000, cylinders=8,
                                  is_supercharged=True

    Args:
        name: Engine name; matching is case-insensitive. Falsy values
            (None, "") produce the empty result.

    Returns:
        dict with the keys displacement_cc, cylinders, fuel_type, is_turbo,
        is_supercharged and engine_config. Fields that could not be
        determined are None (False for the two boolean flags).
    """
    result = {
        'displacement_cc': None,
        'cylinders': None,
        'fuel_type': None,
        'is_turbo': False,
        'is_supercharged': False,
        'engine_config': None,
    }

    if not name:
        return result

    # str() guards against non-text values coming from a loosely typed column.
    name_upper = str(name).upper()

    result['displacement_cc'] = _parse_displacement_cc(name_upper)
    result['cylinders'] = _parse_cylinders(name_upper)
    result['engine_config'] = _detect_engine_config(name_upper)

    # Detect turbo/supercharger
    turbo_tags = ('TURBO', 'T-GDI', 'TFSI', 'TSI')
    if (any(tag in name_upper for tag in turbo_tags)
            or _TURBO_SUFFIX_PATTERN.search(name_upper)):
        result['is_turbo'] = True
    if 'SUPERCHARGE' in name_upper or 'KOMPRESSOR' in name_upper:
        result['is_supercharged'] = True

    result['fuel_type'] = _detect_fuel_type(
        name_upper,
        bool(result['displacement_cc'] or result['cylinders']),
    )

    return result


def estimate_power(displacement_cc, cylinders, is_turbo, fuel_type):
    """
    Estimate approximate power from basic engine characteristics.

    These are rough, typical values, not measured data.

    Args:
        displacement_cc: Displacement in cc; a falsy value yields None.
        cylinders: Accepted for interface compatibility; not used.
        is_turbo: Whether the engine is turbocharged.
        fuel_type: Fuel type label as produced by parse_engine_name.

    Returns:
        Estimated horsepower rounded to the nearest 5, or None when no
        estimate can be made (no displacement, or an electric motor).
    """
    if not displacement_cc:
        return None

    if fuel_type == 'electric':
        return None  # Can't estimate electric motor power this way

    # Base HP per liter: 70 for an average naturally aspirated gasoline
    # engine, 50 for diesel.
    hp_per_liter = 50 if fuel_type == 'diesel' else 70

    if is_turbo:
        hp_per_liter *= 1.35  # Turbo adds ~35% power

    liters = displacement_cc / 1000
    estimated_hp = int(liters * hp_per_liter)

    # Round to nearest 5
    return round(estimated_hp / 5) * 5


def estimate_torque(displacement_cc, power_hp, is_turbo, fuel_type):
    """
    Estimate approximate torque from power and engine type.

    Args:
        displacement_cc: Accepted for interface compatibility; not used.
        power_hp: Power in horsepower; a falsy value yields None.
        is_turbo: Whether the engine is turbocharged.
        fuel_type: Fuel type label as produced by parse_engine_name.

    Returns:
        Estimated torque in Nm rounded to the nearest 5, or None.
    """
    if not power_hp:
        return None

    # Torque/HP ratio varies by engine type
    if fuel_type == 'diesel':
        ratio = 2.0  # Diesel engines have higher torque per HP
    elif is_turbo:
        ratio = 1.2  # Turbo engines have good torque
    else:
        ratio = 1.0  # NA gasoline

    # Estimate torque in lb-ft, then convert to Nm
    torque_lb_ft = power_hp * ratio
    torque_nm = int(torque_lb_ft * 1.3558)

    # Round to nearest 5
    return round(torque_nm / 5) * 5


def _build_engine_update(engine):
    """Return (assignments, params) describing the fields to fill for a row.

    *engine* is any mapping-style row exposing name, displacement_cc,
    cylinders, fuel_type, power_hp and torque_nm. Only fields that are
    currently empty in the row are ever assigned.
    """
    parsed = parse_engine_name(engine['name'])
    assignments = []
    params = []

    for column in ('displacement_cc', 'cylinders', 'fuel_type'):
        if parsed[column] and not engine[column]:
            assignments.append(f'{column} = ?')
            params.append(parsed[column])

    # Estimate from the values the row will hold after the update: data
    # already stored wins over what was parsed from the name.
    displacement = engine['displacement_cc'] or parsed['displacement_cc']
    cylinders = engine['cylinders'] or parsed['cylinders']
    fuel_type = engine['fuel_type'] or parsed['fuel_type']

    if displacement and not engine['power_hp']:
        estimated_hp = estimate_power(
            displacement, cylinders, parsed['is_turbo'], fuel_type)
        if estimated_hp:
            assignments.append('power_hp = ?')
            params.append(estimated_hp)

            # Torque is only estimated alongside a freshly estimated power.
            if not engine['torque_nm']:
                estimated_torque = estimate_torque(
                    displacement, estimated_hp, parsed['is_turbo'], fuel_type)
                if estimated_torque:
                    assignments.append('torque_nm = ?')
                    params.append(estimated_torque)

    return assignments, params


def update_engines_from_parsing(db_path=None):
    """
    Update the engines table by parsing the engine names.

    Only rows missing displacement_cc, cylinders or fuel_type are processed.

    Args:
        db_path: Database file to update; defaults to DB_PATH.

    Returns:
        Number of rows that were updated.
    """
    # closing() guarantees the connection is released even if a query fails.
    with closing(sqlite3.connect(db_path or DB_PATH)) as conn:
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()

        # Get all engines that need updating
        cursor.execute("""
            SELECT id, name, displacement_cc, cylinders, fuel_type,
                   power_hp, torque_nm
            FROM engines
            WHERE (displacement_cc IS NULL
                   OR cylinders IS NULL
                   OR fuel_type IS NULL)
        """)
        engines = cursor.fetchall()
        print(f"Found {len(engines)} engines to process...")

        updated = 0
        for engine in engines:
            assignments, params = _build_engine_update(engine)
            if not assignments:
                continue

            # Column names come from fixed literals; values are bound.
            params.append(engine['id'])
            cursor.execute(
                f"UPDATE engines SET {', '.join(assignments)} WHERE id = ?",
                params,
            )
            updated += 1

            # Commit in batches so progress survives an interruption.
            if updated % 1000 == 0:
                print(f" Updated {updated} engines...")
                conn.commit()

        conn.commit()
        print(f"\nTotal updated: {updated} engines")

        # Show sample results
        print("\n=== Sample Results ===")
        cursor.execute("""
            SELECT name, displacement_cc, cylinders, fuel_type,
                   power_hp, torque_nm
            FROM engines
            WHERE displacement_cc IS NOT NULL
            LIMIT 10
        """)
        for row in cursor.fetchall():
            print(f" {row['name']}: {row['displacement_cc']}cc, "
                  f"{row['cylinders']} cyl, {row['fuel_type']}, "
                  f"{row['power_hp']}HP, {row['torque_nm']}Nm")

    return updated


def get_stats(db_path=None):
    """Print data coverage statistics for the engines table.

    Args:
        db_path: Database file to inspect; defaults to DB_PATH.
    """
    with closing(sqlite3.connect(db_path or DB_PATH)) as conn:
        cursor = conn.cursor()

        print("\n=== Data Coverage Statistics ===")

        cursor.execute("SELECT COUNT(*) FROM engines")
        total = cursor.fetchone()[0]
        print(f"Total engines: {total}")

        # Fixed column list, so interpolating it into the SQL is safe.
        fields = ('displacement_cc', 'cylinders', 'fuel_type',
                  'power_hp', 'torque_nm')
        for field in fields:
            cursor.execute(
                f"SELECT COUNT(*) FROM engines WHERE {field} IS NOT NULL")
            count = cursor.fetchone()[0]
            pct = (count / total * 100) if total > 0 else 0
            print(f" {field}: {count} ({pct:.1f}%)")


def main(argv=None):
    """Run the enrichment, or only exercise the parser with ``--dry-run``.

    Args:
        argv: Command-line arguments without the program name; defaults to
            sys.argv[1:].
    """
    args = sys.argv[1:] if argv is None else list(argv)

    print("=" * 50)
    print("Engine Data Enrichment Script")
    print("=" * 50)

    # Show current stats
    get_stats()

    if args and args[0] == '--dry-run':
        print("\n=== Dry Run - Testing Parser ===")
        test_names = [
            "2.5L 4-Cylinder",
            "3.5L V6 DOHC",
            "1.5L Turbo I4",
            "5.0L V8 Supercharged",
            "2.0L TDI Diesel",
            "Electric Motor",
            "1.8T TSI",
            "3.0L Twin Turbo V6",
            "6.2L V8 HEMI",
            "2.4L DOHC 16-Valve",
        ]
        for name in test_names:
            result = parse_engine_name(name)
            hp = estimate_power(result['displacement_cc'],
                                result['cylinders'],
                                result['is_turbo'], result['fuel_type'])
            print(f"\n '{name}':")
            print(f" -> {result['displacement_cc']}cc, "
                  f"{result['cylinders']} cyl, "
                  f"{result['fuel_type']}, turbo={result['is_turbo']}, "
                  f"est. {hp}HP")
    else:
        print("\n" + "=" * 50)
        print("Updating engines from name parsing...")
        print("=" * 50)

        update_engines_from_parsing()

        # Show final stats
        get_stats()


if __name__ == '__main__':
    main()