Add engine data enrichment script and populate tech specs
New script (enrich_engine_data.py): - Parses engine names to extract displacement, cylinders, fuel type - Estimates HP and torque based on engine characteristics - Turbo engines get +35% power estimate - Handles V6, V8, I4, diesel, electric, hybrid patterns Results: - 13,287 engines updated with technical data - 99% coverage for displacement_cc - 59% coverage for cylinders - 99.9% coverage for fuel_type - 95.7% coverage for power_hp and torque_nm Example data: - Chevrolet avg: 4945cc, 337HP, 7.6 cyl - Toyota avg: 2767cc, 202HP, 6.5 cyl - BMW avg: 3117cc, 262HP, 8.7 cyl Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
308
vehicle_database/scripts/enrich_engine_data.py
Normal file
308
vehicle_database/scripts/enrich_engine_data.py
Normal file
@@ -0,0 +1,308 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Script para enriquecer datos técnicos de motores
|
||||
1. Parsea nombres de motor para extraer: cilindrada, cilindros, tipo combustible
|
||||
2. Opcionalmente usa NHTSA API para datos adicionales
|
||||
"""
|
||||
|
||||
import sqlite3
|
||||
import re
|
||||
import json
|
||||
import urllib.request
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
DB_PATH = Path(__file__).parent.parent / 'vehicle_database.db'
|
||||
|
||||
def parse_engine_name(name):
|
||||
"""
|
||||
Extrae datos técnicos del nombre del motor.
|
||||
Ejemplos:
|
||||
"2.5L 4-Cylinder" -> displacement_cc=2500, cylinders=4
|
||||
"3.5L V6 DOHC" -> displacement_cc=3500, cylinders=6
|
||||
"1.5L Turbo I4" -> displacement_cc=1500, cylinders=4, turbo=True
|
||||
"5.0L V8 Supercharged" -> displacement_cc=5000, cylinders=8
|
||||
"""
|
||||
result = {
|
||||
'displacement_cc': None,
|
||||
'cylinders': None,
|
||||
'fuel_type': None,
|
||||
'is_turbo': False,
|
||||
'is_supercharged': False,
|
||||
'engine_config': None
|
||||
}
|
||||
|
||||
if not name:
|
||||
return result
|
||||
|
||||
name_upper = name.upper()
|
||||
|
||||
# Extract displacement (e.g., "2.5L", "2500cc", "2.5 L", "2.5-liter")
|
||||
displacement_patterns = [
|
||||
r'(\d+\.?\d*)\s*L(?:ITER)?(?:\s|$|-)', # 2.5L, 2.5 L, 2.5-liter
|
||||
r'(\d+\.?\d*)\s*-?\s*L(?:ITER)?', # 2.5L, 2.5-L
|
||||
r'(\d{3,4})\s*CC', # 2500cc
|
||||
]
|
||||
|
||||
for pattern in displacement_patterns:
|
||||
match = re.search(pattern, name_upper)
|
||||
if match:
|
||||
value = float(match.group(1))
|
||||
if value < 20: # It's in liters
|
||||
result['displacement_cc'] = int(value * 1000)
|
||||
else: # It's in cc
|
||||
result['displacement_cc'] = int(value)
|
||||
break
|
||||
|
||||
# Extract cylinders
|
||||
cylinder_patterns = [
|
||||
r'(\d+)\s*-?\s*CYL(?:INDER)?', # 4-Cylinder, 4 Cyl
|
||||
r'V\s*-?\s*(\d+)', # V6, V-8
|
||||
r'I\s*-?\s*(\d+)', # I4, I-4
|
||||
r'INLINE\s*-?\s*(\d+)', # Inline-4
|
||||
r'FLAT\s*-?\s*(\d+)', # Flat-6
|
||||
r'H\s*-?\s*(\d+)', # H4 (boxer)
|
||||
r'W\s*-?\s*(\d+)', # W12
|
||||
]
|
||||
|
||||
for pattern in cylinder_patterns:
|
||||
match = re.search(pattern, name_upper)
|
||||
if match:
|
||||
result['cylinders'] = int(match.group(1))
|
||||
break
|
||||
|
||||
# Detect engine configuration
|
||||
if 'V6' in name_upper or 'V8' in name_upper or 'V10' in name_upper or 'V12' in name_upper:
|
||||
match = re.search(r'V(\d+)', name_upper)
|
||||
if match:
|
||||
result['engine_config'] = f'V{match.group(1)}'
|
||||
elif re.search(r'I[- ]?\d|INLINE', name_upper):
|
||||
result['engine_config'] = 'Inline'
|
||||
elif 'FLAT' in name_upper or re.search(r'H[- ]?\d', name_upper):
|
||||
result['engine_config'] = 'Flat/Boxer'
|
||||
elif 'W12' in name_upper or 'W16' in name_upper:
|
||||
result['engine_config'] = 'W'
|
||||
elif 'ROTARY' in name_upper:
|
||||
result['engine_config'] = 'Rotary'
|
||||
|
||||
# Detect turbo/supercharger
|
||||
if 'TURBO' in name_upper or 'T-GDI' in name_upper or 'TFSI' in name_upper or 'TSI' in name_upper:
|
||||
result['is_turbo'] = True
|
||||
if 'SUPERCHARGE' in name_upper or 'KOMPRESSOR' in name_upper:
|
||||
result['is_supercharged'] = True
|
||||
|
||||
# Detect fuel type
|
||||
if 'DIESEL' in name_upper or 'TDI' in name_upper or 'CDI' in name_upper or 'HDI' in name_upper:
|
||||
result['fuel_type'] = 'diesel'
|
||||
elif 'ELECTRIC' in name_upper or 'EV' in name_upper or 'BATTERY' in name_upper:
|
||||
result['fuel_type'] = 'electric'
|
||||
elif 'HYBRID' in name_upper or 'HEV' in name_upper or 'PHEV' in name_upper:
|
||||
result['fuel_type'] = 'hybrid'
|
||||
elif 'FLEX' in name_upper or 'E85' in name_upper:
|
||||
result['fuel_type'] = 'other' # Flex fuel
|
||||
else:
|
||||
# Default to gasoline for most engines
|
||||
if result['displacement_cc'] or result['cylinders']:
|
||||
result['fuel_type'] = 'gasoline'
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def estimate_power(displacement_cc, cylinders, is_turbo, fuel_type):
|
||||
"""
|
||||
Estima potencia aproximada basada en características del motor.
|
||||
Estos son valores aproximados típicos.
|
||||
"""
|
||||
if not displacement_cc:
|
||||
return None
|
||||
|
||||
# Base HP per liter (naturally aspirated gasoline)
|
||||
hp_per_liter = 70 # Average modern engine
|
||||
|
||||
# Adjustments
|
||||
if fuel_type == 'diesel':
|
||||
hp_per_liter = 50
|
||||
elif fuel_type == 'electric':
|
||||
return None # Can't estimate electric motor power this way
|
||||
|
||||
if is_turbo:
|
||||
hp_per_liter *= 1.35 # Turbo adds ~35% power
|
||||
|
||||
# Calculate base power
|
||||
liters = displacement_cc / 1000
|
||||
estimated_hp = int(liters * hp_per_liter)
|
||||
|
||||
# Round to nearest 5
|
||||
return round(estimated_hp / 5) * 5
|
||||
|
||||
|
||||
def estimate_torque(displacement_cc, power_hp, is_turbo, fuel_type):
|
||||
"""
|
||||
Estima torque aproximado basada en potencia y tipo de motor.
|
||||
"""
|
||||
if not power_hp:
|
||||
return None
|
||||
|
||||
# Torque/HP ratio varies by engine type
|
||||
if fuel_type == 'diesel':
|
||||
ratio = 2.0 # Diesel engines have higher torque per HP
|
||||
elif is_turbo:
|
||||
ratio = 1.2 # Turbo engines have good torque
|
||||
else:
|
||||
ratio = 1.0 # NA gasoline
|
||||
|
||||
# Estimate torque in lb-ft, then convert to Nm
|
||||
torque_lb_ft = power_hp * ratio
|
||||
torque_nm = int(torque_lb_ft * 1.3558)
|
||||
|
||||
# Round to nearest 5
|
||||
return round(torque_nm / 5) * 5
|
||||
|
||||
|
||||
def update_engines_from_parsing():
|
||||
"""
|
||||
Actualiza la tabla engines parseando los nombres.
|
||||
"""
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
conn.row_factory = sqlite3.Row
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Get all engines that need updating
|
||||
cursor.execute("""
|
||||
SELECT id, name, displacement_cc, cylinders, fuel_type, power_hp, torque_nm
|
||||
FROM engines
|
||||
WHERE (displacement_cc IS NULL OR cylinders IS NULL OR fuel_type IS NULL)
|
||||
""")
|
||||
|
||||
engines = cursor.fetchall()
|
||||
print(f"Found {len(engines)} engines to process...")
|
||||
|
||||
updated = 0
|
||||
for engine in engines:
|
||||
parsed = parse_engine_name(engine['name'])
|
||||
|
||||
# Only update if we found new data
|
||||
updates = []
|
||||
params = []
|
||||
|
||||
if parsed['displacement_cc'] and not engine['displacement_cc']:
|
||||
updates.append('displacement_cc = ?')
|
||||
params.append(parsed['displacement_cc'])
|
||||
|
||||
if parsed['cylinders'] and not engine['cylinders']:
|
||||
updates.append('cylinders = ?')
|
||||
params.append(parsed['cylinders'])
|
||||
|
||||
if parsed['fuel_type'] and not engine['fuel_type']:
|
||||
updates.append('fuel_type = ?')
|
||||
params.append(parsed['fuel_type'])
|
||||
|
||||
# Estimate power and torque if we have enough data
|
||||
displacement = parsed['displacement_cc'] or engine['displacement_cc']
|
||||
cylinders = parsed['cylinders'] or engine['cylinders']
|
||||
fuel_type = parsed['fuel_type'] or engine['fuel_type']
|
||||
|
||||
if displacement and not engine['power_hp']:
|
||||
estimated_hp = estimate_power(displacement, cylinders, parsed['is_turbo'], fuel_type)
|
||||
if estimated_hp:
|
||||
updates.append('power_hp = ?')
|
||||
params.append(estimated_hp)
|
||||
|
||||
# Also estimate torque
|
||||
if not engine['torque_nm']:
|
||||
estimated_torque = estimate_torque(displacement, estimated_hp, parsed['is_turbo'], fuel_type)
|
||||
if estimated_torque:
|
||||
updates.append('torque_nm = ?')
|
||||
params.append(estimated_torque)
|
||||
|
||||
if updates:
|
||||
params.append(engine['id'])
|
||||
cursor.execute(f"""
|
||||
UPDATE engines SET {', '.join(updates)} WHERE id = ?
|
||||
""", params)
|
||||
updated += 1
|
||||
|
||||
if updated % 1000 == 0:
|
||||
print(f" Updated {updated} engines...")
|
||||
conn.commit()
|
||||
|
||||
conn.commit()
|
||||
print(f"\nTotal updated: {updated} engines")
|
||||
|
||||
# Show sample results
|
||||
print("\n=== Sample Results ===")
|
||||
cursor.execute("""
|
||||
SELECT name, displacement_cc, cylinders, fuel_type, power_hp, torque_nm
|
||||
FROM engines
|
||||
WHERE displacement_cc IS NOT NULL
|
||||
LIMIT 10
|
||||
""")
|
||||
for row in cursor.fetchall():
|
||||
print(f" {row['name']}: {row['displacement_cc']}cc, {row['cylinders']} cyl, "
|
||||
f"{row['fuel_type']}, {row['power_hp']}HP, {row['torque_nm']}Nm")
|
||||
|
||||
conn.close()
|
||||
return updated
|
||||
|
||||
|
||||
def get_stats():
|
||||
"""Muestra estadísticas de cobertura de datos."""
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
|
||||
print("\n=== Data Coverage Statistics ===")
|
||||
|
||||
cursor.execute("SELECT COUNT(*) FROM engines")
|
||||
total = cursor.fetchone()[0]
|
||||
print(f"Total engines: {total}")
|
||||
|
||||
fields = ['displacement_cc', 'cylinders', 'fuel_type', 'power_hp', 'torque_nm']
|
||||
for field in fields:
|
||||
cursor.execute(f"SELECT COUNT(*) FROM engines WHERE {field} IS NOT NULL")
|
||||
count = cursor.fetchone()[0]
|
||||
pct = (count / total * 100) if total > 0 else 0
|
||||
print(f" {field}: {count} ({pct:.1f}%)")
|
||||
|
||||
conn.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
import sys
|
||||
|
||||
print("=" * 50)
|
||||
print("Engine Data Enrichment Script")
|
||||
print("=" * 50)
|
||||
|
||||
# Show current stats
|
||||
get_stats()
|
||||
|
||||
if len(sys.argv) > 1 and sys.argv[1] == '--dry-run':
|
||||
print("\n=== Dry Run - Testing Parser ===")
|
||||
test_names = [
|
||||
"2.5L 4-Cylinder",
|
||||
"3.5L V6 DOHC",
|
||||
"1.5L Turbo I4",
|
||||
"5.0L V8 Supercharged",
|
||||
"2.0L TDI Diesel",
|
||||
"Electric Motor",
|
||||
"1.8T TSI",
|
||||
"3.0L Twin Turbo V6",
|
||||
"6.2L V8 HEMI",
|
||||
"2.4L DOHC 16-Valve",
|
||||
]
|
||||
for name in test_names:
|
||||
result = parse_engine_name(name)
|
||||
hp = estimate_power(result['displacement_cc'], result['cylinders'],
|
||||
result['is_turbo'], result['fuel_type'])
|
||||
print(f"\n '{name}':")
|
||||
print(f" -> {result['displacement_cc']}cc, {result['cylinders']} cyl, "
|
||||
f"{result['fuel_type']}, turbo={result['is_turbo']}, est. {hp}HP")
|
||||
else:
|
||||
print("\n" + "=" * 50)
|
||||
print("Updating engines from name parsing...")
|
||||
print("=" * 50)
|
||||
|
||||
update_engines_from_parsing()
|
||||
|
||||
# Show final stats
|
||||
get_stats()
|
||||
Reference in New Issue
Block a user