New script (enrich_engine_data.py):
- Parses engine names to extract displacement, cylinders, and fuel type
- Estimates HP and torque based on engine characteristics
- Turbo engines get a +35% power estimate
- Handles V6, V8, I4, diesel, electric, and hybrid patterns

Results:
- 13,287 engines updated with technical data
- 99% coverage for displacement_cc
- 59% coverage for cylinders
- 99.9% coverage for fuel_type
- 95.7% coverage for power_hp and torque_nm

Example data:
- Chevrolet avg: 4945cc, 337HP, 7.6 cyl
- Toyota avg: 2767cc, 202HP, 6.5 cyl
- BMW avg: 3117cc, 262HP, 8.7 cyl

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
309 lines
10 KiB
Python
309 lines
10 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Script to enrich the technical data of engines.
1. Parses engine names to extract: displacement, cylinders, fuel type
2. Optionally uses the NHTSA API for additional data
|
|
"""
|
|
|
|
import sqlite3
|
|
import re
|
|
import json
|
|
import urllib.request
|
|
import time
|
|
from pathlib import Path
|
|
|
|
# SQLite database file opened by this script: 'vehicle_database.db' located
# in the parent of the directory that contains this file.
DB_PATH = Path(__file__).parent.parent / 'vehicle_database.db'
|
|
|
|
def parse_engine_name(name):
    """
    Extract technical data from a free-text engine name.

    The name is upper-cased and scanned with regular expressions and keyword
    checks. Unrecognised input never raises; fields that cannot be
    determined keep their default values.

    Args:
        name: Engine description such as "3.5L V6 DOHC". ``None`` or an
            empty string yields the default result.

    Returns:
        dict with the keys:
            displacement_cc: int or None
            cylinders: int or None
            fuel_type: 'diesel', 'hybrid', 'electric', 'other' (flex fuel),
                'gasoline', or None when nothing could be inferred
            is_turbo: bool
            is_supercharged: bool
            engine_config: 'V<n>', 'Inline', 'Flat/Boxer', 'W', 'Rotary'
                or None

    Examples:
        "2.5L 4-Cylinder" -> displacement_cc=2500, cylinders=4
        "3.5L V6 DOHC" -> displacement_cc=3500, cylinders=6
        "1.5L Turbo I4" -> displacement_cc=1500, cylinders=4, turbo=True
        "5.0L V8 Supercharged" -> displacement_cc=5000, cylinders=8
    """
    result = {
        'displacement_cc': None,
        'cylinders': None,
        'fuel_type': None,
        'is_turbo': False,
        'is_supercharged': False,
        'engine_config': None,
    }

    if not name:
        return result

    name_upper = name.upper()

    # Extract displacement (e.g., "2.5L", "2500cc", "2.5 L", "2.5-liter")
    displacement_patterns = [
        r'(\d+\.?\d*)\s*L(?:ITER)?(?:\s|$|-)',  # 2.5L, 2.5 L, 2.5-liter
        r'(\d+\.?\d*)\s*-?\s*L(?:ITER)?',       # 2.5L, 2.5-L
        r'(\d{3,4})\s*CC',                      # 2500cc
    ]

    for pattern in displacement_patterns:
        match = re.search(pattern, name_upper)
        if match:
            value = float(match.group(1))
            if value < 20:  # It's in liters
                # round() rather than int(): float error must not truncate
                # a product such as 2299.9999999 down to 2299.
                result['displacement_cc'] = round(value * 1000)
            else:  # It's in cc
                result['displacement_cc'] = int(value)
            break

    # Largest cylinder count accepted as plausible (W16 / V16 engines).
    # Anything above is a model code such as "V60", not a cylinder count.
    max_cylinders = 16

    # A cylinder count is one or two digits that are NOT the integer part of
    # a longer or decimal number, so "V 2.0L" is not read as two cylinders.
    count = r'(\d{1,2})(?!\.?\d)'

    # Layout prefixes are anchored with \b so that the trailing letter of an
    # unrelated token ("TDI 16V", "HEMI 5.7L", "BMW 3.0L", "KW 110") is not
    # mistaken for an I/H/W/V layout marker.
    layout_patterns = [
        (r'\bV\s*-?\s*' + count, 'V'),                # V6, V-8
        (r'\bI\s*-?\s*' + count, 'Inline'),           # I4, I-4
        (r'\bINLINE\s*-?\s*' + count, 'Inline'),      # Inline-4
        (r'\bFLAT\s*-?\s*' + count, 'Flat/Boxer'),    # Flat-6
        (r'\bH\s*-?\s*' + count, 'Flat/Boxer'),       # H4 (boxer)
        (r'\bW\s*-?\s*' + count, 'W'),                # W12
    ]

    layout_count = None
    for pattern, config in layout_patterns:
        match = re.search(pattern, name_upper)
        if match and 1 <= int(match.group(1)) <= max_cylinders:
            layout_count = int(match.group(1))
            result['engine_config'] = (
                f'V{layout_count}' if config == 'V' else config
            )
            break

    # Layout keywords that carry no cylinder count ("Inline Four", "Rotary").
    if result['engine_config'] is None:
        if 'INLINE' in name_upper:
            result['engine_config'] = 'Inline'
        elif 'FLAT' in name_upper:
            result['engine_config'] = 'Flat/Boxer'
        elif 'ROTARY' in name_upper:
            result['engine_config'] = 'Rotary'

    # An explicit "<n>-Cylinder" wins over the count implied by the layout.
    cyl_match = re.search(r'(\d+)\s*-?\s*CYL(?:INDER)?', name_upper)
    if cyl_match and 1 <= int(cyl_match.group(1)) <= max_cylinders:
        result['cylinders'] = int(cyl_match.group(1))
    elif layout_count is not None:
        result['cylinders'] = layout_count

    # Detect turbo/supercharger
    if any(tag in name_upper for tag in ('TURBO', 'T-GDI', 'TFSI', 'TSI')):
        result['is_turbo'] = True
    if 'SUPERCHARGE' in name_upper or 'KOMPRESSOR' in name_upper:
        result['is_supercharged'] = True

    # Detect fuel type. Hybrid is tested before electric, and the short
    # acronyms are matched as whole words: a plain substring test for "EV"
    # also hits "HEV", "PHEV" and "CHEVROLET".
    if any(tag in name_upper for tag in ('DIESEL', 'TDI', 'CDI', 'HDI')):
        result['fuel_type'] = 'diesel'
    elif 'HYBRID' in name_upper or re.search(r'\b[PM]?HEV\b', name_upper):
        result['fuel_type'] = 'hybrid'
    elif ('ELECTRIC' in name_upper or 'BATTERY' in name_upper
          or re.search(r'\bB?EV\b', name_upper)):
        result['fuel_type'] = 'electric'
    elif 'FLEX' in name_upper or 'E85' in name_upper:
        result['fuel_type'] = 'other'  # Flex fuel
    elif result['displacement_cc'] or result['cylinders']:
        # Default to gasoline for anything that looks like a combustion engine
        result['fuel_type'] = 'gasoline'

    return result
|
|
|
|
|
|
def estimate_power(displacement_cc, cylinders, is_turbo, fuel_type):
    """
    Return a rough horsepower figure derived from engine characteristics.

    The estimate is displacement in liters times a per-liter output:
    70 HP/L by default, 50 HP/L for diesel, multiplied by 1.35 when the
    engine is turbocharged. The product is truncated to an integer and then
    rounded to the nearest multiple of 5.

    Args:
        displacement_cc: Displacement in cubic centimeters.
        cylinders: Accepted for interface compatibility; not used by the
            calculation.
        is_turbo: Truthy when the engine is turbocharged.
        fuel_type: Fuel type string; 'diesel' and 'electric' are special-cased.

    Returns:
        int horsepower estimate, or None when the displacement is missing/zero
        or the fuel type is 'electric'.
    """
    # No displacement, or an electric motor: this model cannot estimate power.
    if not displacement_cc or fuel_type == 'electric':
        return None

    base_output = 50 if fuel_type == 'diesel' else 70
    if is_turbo:
        base_output *= 1.35  # Turbo adds ~35% power

    whole_hp = int((displacement_cc / 1000) * base_output)

    # Snap to the nearest multiple of 5
    return round(whole_hp / 5) * 5
|
|
|
|
|
|
def estimate_torque(displacement_cc, power_hp, is_turbo, fuel_type):
    """
    Return a rough torque figure (Nm) derived from horsepower and engine type.

    Torque in lb-ft is taken as ``power_hp`` times a ratio (2.0 for diesel,
    1.2 for turbocharged engines, 1.0 otherwise), converted to Nm with the
    factor 1.3558, truncated to an integer and rounded to the nearest
    multiple of 5.

    Args:
        displacement_cc: Accepted for interface compatibility; not used by
            the calculation.
        power_hp: Horsepower the estimate is based on.
        is_turbo: Truthy when the engine is turbocharged.
        fuel_type: Fuel type string; only 'diesel' is special-cased.

    Returns:
        int torque estimate in Nm, or None when ``power_hp`` is missing/zero.
    """
    if not power_hp:
        return None

    # lb-ft of torque per HP; the diesel check takes precedence over turbo
    lb_ft_per_hp = 2.0 if fuel_type == 'diesel' else (1.2 if is_turbo else 1.0)

    # lb-ft -> Nm, dropping the fractional part
    newton_metres = int(power_hp * lb_ft_per_hp * 1.3558)

    # Snap to the nearest multiple of 5
    return round(newton_metres / 5) * 5
|
|
|
|
|
|
def update_engines_from_parsing():
    """
    Fill in missing technical columns of the ``engines`` table.

    Every row with a NULL ``displacement_cc``, ``cylinders`` or ``fuel_type``
    is run through ``parse_engine_name``. Values the parser found are written
    only into columns that are currently empty. When a displacement is known
    and ``power_hp`` is empty, horsepower (and, if also empty, torque) is
    estimated and stored as well. Work is committed every 1000 updated rows
    and once more at the end, then ten sample rows are printed.

    Returns:
        int: number of rows for which an UPDATE was issued.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()

        # Get all engines that need updating
        cursor.execute("""
            SELECT id, name, displacement_cc, cylinders, fuel_type, power_hp, torque_nm
            FROM engines
            WHERE (displacement_cc IS NULL OR cylinders IS NULL OR fuel_type IS NULL)
        """)

        engines = cursor.fetchall()
        print(f"Found {len(engines)} engines to process...")

        updated = 0
        for engine in engines:
            parsed = parse_engine_name(engine['name'])

            # column -> new value; insertion order drives the SET clause
            assignments = {}

            # Only fill columns that are empty in the database
            for column in ('displacement_cc', 'cylinders', 'fuel_type'):
                if parsed[column] and not engine[column]:
                    assignments[column] = parsed[column]

            # Estimate from the values the row will actually hold: data
            # already stored wins over the parser's guess (which, for
            # example, defaults fuel_type to 'gasoline').
            displacement = engine['displacement_cc'] or parsed['displacement_cc']
            cylinders = engine['cylinders'] or parsed['cylinders']
            fuel_type = engine['fuel_type'] or parsed['fuel_type']

            if displacement and not engine['power_hp']:
                estimated_hp = estimate_power(
                    displacement, cylinders, parsed['is_turbo'], fuel_type)
                if estimated_hp:
                    assignments['power_hp'] = estimated_hp

                    # Also estimate torque
                    if not engine['torque_nm']:
                        estimated_torque = estimate_torque(
                            displacement, estimated_hp,
                            parsed['is_turbo'], fuel_type)
                        if estimated_torque:
                            assignments['torque_nm'] = estimated_torque

            if assignments:
                # Column names come from the fixed set above, never from
                # data; all values are bound as parameters.
                set_clause = ', '.join(f'{column} = ?' for column in assignments)
                cursor.execute(
                    f"UPDATE engines SET {set_clause} WHERE id = ?",
                    [*assignments.values(), engine['id']],
                )
                updated += 1

                # Checked only right after an increment, so each milestone
                # is reported and committed exactly once (never at 0).
                if updated % 1000 == 0:
                    print(f" Updated {updated} engines...")
                    conn.commit()

        conn.commit()
        print(f"\nTotal updated: {updated} engines")

        # Show sample results
        print("\n=== Sample Results ===")
        cursor.execute("""
            SELECT name, displacement_cc, cylinders, fuel_type, power_hp, torque_nm
            FROM engines
            WHERE displacement_cc IS NOT NULL
            LIMIT 10
        """)
        for row in cursor.fetchall():
            print(f" {row['name']}: {row['displacement_cc']}cc, {row['cylinders']} cyl, "
                  f"{row['fuel_type']}, {row['power_hp']}HP, {row['torque_nm']}Nm")

        return updated
    finally:
        # Release the database handle even when a query fails
        conn.close()
|
|
|
|
|
|
def get_stats():
    """
    Print data-coverage statistics for the ``engines`` table.

    Prints the total number of rows and, for each technical column, how many
    rows hold a non-NULL value together with that count as a percentage of
    the total. Returns nothing.
    """
    fields = ['displacement_cc', 'cylinders', 'fuel_type', 'power_hp', 'torque_nm']

    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()

        print("\n=== Data Coverage Statistics ===")

        # COUNT(column) counts only non-NULL values, so one table scan yields
        # the total and every per-column coverage figure. The column names
        # are the fixed identifiers above, not external input.
        per_field = ', '.join(f'COUNT({field})' for field in fields)
        cursor.execute(f"SELECT COUNT(*), {per_field} FROM engines")
        total, *counts = cursor.fetchone()
        print(f"Total engines: {total}")

        for field, count in zip(fields, counts):
            pct = (count / total * 100) if total > 0 else 0
            print(f" {field}: {count} ({pct:.1f}%)")
    finally:
        # Release the database handle even when the query fails
        conn.close()
|
|
|
|
|
|
if __name__ == '__main__':
    import sys

    banner = "=" * 50

    print(banner)
    print("Engine Data Enrichment Script")
    print(banner)

    # Coverage before anything is changed
    get_stats()

    dry_run = len(sys.argv) > 1 and sys.argv[1] == '--dry-run'

    if not dry_run:
        print("\n" + banner)
        print("Updating engines from name parsing...")
        print(banner)

        update_engines_from_parsing()

        # Coverage after the update
        get_stats()
    else:
        # Exercise the parser on fixed sample names; nothing is written
        print("\n=== Dry Run - Testing Parser ===")
        samples = (
            "2.5L 4-Cylinder",
            "3.5L V6 DOHC",
            "1.5L Turbo I4",
            "5.0L V8 Supercharged",
            "2.0L TDI Diesel",
            "Electric Motor",
            "1.8T TSI",
            "3.0L Twin Turbo V6",
            "6.2L V8 HEMI",
            "2.4L DOHC 16-Valve",
        )
        for sample in samples:
            parsed = parse_engine_name(sample)
            est_hp = estimate_power(
                parsed['displacement_cc'],
                parsed['cylinders'],
                parsed['is_turbo'],
                parsed['fuel_type'],
            )
            print(f"\n '{sample}':")
            print(f" -> {parsed['displacement_cc']}cc, {parsed['cylinders']} cyl, "
                  f"{parsed['fuel_type']}, turbo={parsed['is_turbo']}, est. {est_hp}HP")
|