Initial commit: Sistema Autoparts DB

- Base de datos SQLite con información de vehículos
- Dashboard web con Flask y Bootstrap
- Scripts de web scraping para RockAuto
- Interfaz CLI para consultas
- Documentación completa del proyecto

Incluye:
- 12 marcas de vehículos
- 10,923 modelos
- 10,919 especificaciones de motores
- 12,075 combinaciones modelo-año-motor
This commit is contained in:
2026-01-19 08:45:03 +00:00
commit f395d67136
59 changed files with 10881 additions and 0 deletions

View File

@@ -0,0 +1,163 @@
#!/usr/bin/env python3
"""
Script optimizado para extraer todos los vehículos Toyota de RockAuto
Guarda datos incrementalmente para no perder progreso
"""
import requests
from bs4 import BeautifulSoup
import sqlite3
import time
import re
import sys
from urllib.parse import unquote
# Path of the SQLite database file this script connects to.
DB_PATH = "/home/Autopartes/vehicle_database/vehicle_database.db"
# Catalog root; brand, year and model segments are appended to it.
BASE_URL = "https://www.rockauto.com/en/catalog"

# Browser-like headers attached to every request made through `session`.
_BROWSER_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml',
    'Accept-Language': 'en-US,en;q=0.9',
}

# Single shared HTTP session used by all fetches in this module.
session = requests.Session()
session.headers.update(_BROWSER_HEADERS)
def clean_name(name):
    """Turn a URL path segment into a normalized, upper-case name.

    Plus signs are treated as spaces, percent-escapes are decoded, every
    run of whitespace is collapsed to a single space, and the result is
    stripped and upper-cased.
    """
    # '+' is replaced before decoding so an escaped '%2B' survives as '+'.
    decoded = unquote(name.replace('+', ' '))
    collapsed = re.sub(r'\s+', ' ', decoded)
    return collapsed.strip().upper()
def get_soup(url, retries=3):
    """Fetch *url* and return it parsed as HTML, or ``None`` on failure.

    Up to *retries* attempts are made through the module-level ``session``.
    An attempt fails when the request raises or when the server answers
    with a status other than 200. After a failed attempt the function
    waits two seconds before trying again; after the last failed attempt
    it prints the reason and returns ``None``.

    Args:
        url: Absolute URL of the page to download.
        retries: Maximum number of attempts.

    Returns:
        A ``BeautifulSoup`` document built with ``html.parser``, or
        ``None`` when every attempt failed (or ``retries`` is 0).
    """
    for attempt in range(retries):
        try:
            time.sleep(0.3)  # short pause before every request
            response = session.get(url, timeout=10)
            if response.status_code == 200:
                return BeautifulSoup(response.content, 'html.parser')
            # A non-200 answer is a failure too: record it so it is
            # retried with backoff and reported instead of vanishing.
            failure = f"HTTP {response.status_code} ({url})"
        except Exception as e:
            # Deliberately broad: one bad page must not abort a long
            # scraping run; the caller simply receives None.
            failure = e
        if attempt < retries - 1:
            time.sleep(2)
        else:
            print(f" Error: {failure}")
    return None
def get_years(brand):
    """Return the model years linked from the catalog page of *brand*.

    The brand page is fetched with ``get_soup`` and every link whose href
    contains ``/catalog/<brand>,<4 digits>`` contributes a year. Only
    years greater than 1950 and at most 2030 are kept.

    Returns:
        The distinct years sorted from newest to oldest, or an empty list
        when the page could not be fetched.
    """
    slug = brand.lower().replace(' ', '+')
    page = get_soup(f"{BASE_URL}/{slug}")
    if not page:
        return []

    year_link = re.compile(rf'/catalog/{re.escape(slug)},(\d{{4}})', re.I)
    hits = (year_link.search(anchor['href'])
            for anchor in page.find_all('a', href=True))
    candidates = {int(hit.group(1)) for hit in hits if hit}
    return sorted(
        (value for value in candidates if 1950 < value <= 2030),
        reverse=True,
    )
def get_models(brand, year):
    """Return the model names linked from the page of *brand* and *year*.

    Each link whose href contains ``/catalog/<brand>,<year>,<segment>``
    yields a candidate; the segment is normalized with ``clean_name``.
    Empty, all-digit and single-character names are discarded.

    Returns:
        The distinct model names in sorted order, or an empty list when
        the page could not be fetched.
    """
    slug = brand.lower().replace(' ', '+')
    page = get_soup(f"{BASE_URL}/{slug},{year}")
    if not page:
        return []

    # The pattern does not depend on the link, so build it once.
    model_link = re.compile(
        rf'/catalog/{re.escape(slug)},{year},([^,/]+)', re.I
    )
    names = set()
    for anchor in page.find_all('a', href=True):
        hit = model_link.search(anchor['href'])
        if not hit:
            continue
        name = clean_name(hit.group(1))
        if not name or name.isdigit() or len(name) <= 1:
            continue
        names.add(name)
    return sorted(names)
def get_engines(brand, year, model):
    """Return the engine names linked from a brand/year/model page.

    Each link whose href contains
    ``/catalog/<brand>,<year>,<model>,<segment>`` yields a candidate that
    is normalized with ``clean_name``. A candidate is kept only when it
    looks like an engine description: a displacement such as ``2.5L``, a
    cylinder layout such as ``V6``/``I4``/``H4``, or one of the words
    HYBRID, ELECTRIC or DIESEL.

    Returns:
        The distinct engine names in sorted order; ``['Standard']`` when
        the page was fetched but no engine was recognized; an empty list
        when the page could not be fetched.
    """
    brand_slug = brand.lower().replace(' ', '+')
    model_slug = model.lower().replace(' ', '+')
    page = get_soup(f"{BASE_URL}/{brand_slug},{year},{model_slug}")
    if not page:
        return []

    # Both patterns are independent of the link, so build them once.
    engine_link = re.compile(
        rf'/catalog/{re.escape(brand_slug)},{year},{re.escape(model_slug)},([^,/]+)',
        re.I,
    )
    engine_like = re.compile(
        r'\d+\.?\d*L|V\d|I\d|H\d|HYBRID|ELECTRIC|DIESEL', re.I
    )

    found = set()
    for anchor in page.find_all('a', href=True):
        hit = engine_link.search(anchor['href'])
        if not hit:
            continue
        name = clean_name(hit.group(1))
        if name and engine_like.search(name):
            found.add(name)

    if not found:
        return ['Standard']
    return sorted(found)
def save_to_db(conn, brand, year, model, engine):
    """Record one brand/year/model/engine combination in the database.

    The brand, year, engine and model rows are inserted with
    ``INSERT OR IGNORE`` and their ids read back, then the combination is
    inserted into ``model_year_engine`` the same way. Nothing is
    committed here; that is left to the caller.

    Args:
        conn: Open ``sqlite3`` connection.
        brand: Brand name.
        year: Model year.
        model: Model name.
        engine: Engine name.

    Returns:
        ``True`` when the final insert added a row, ``False`` when it was
        ignored or when any statement raised (the error is printed).
    """
    cur = conn.cursor()

    def ensure_id(insert_sql, select_sql, params):
        # Insert the row if the table accepts it, then read back its id.
        cur.execute(insert_sql, params)
        cur.execute(select_sql, params)
        return cur.fetchone()[0]

    try:
        brand_id = ensure_id(
            "INSERT OR IGNORE INTO brands (name) VALUES (?)",
            "SELECT id FROM brands WHERE name = ?",
            (brand,),
        )
        year_id = ensure_id(
            "INSERT OR IGNORE INTO years (year) VALUES (?)",
            "SELECT id FROM years WHERE year = ?",
            (year,),
        )
        engine_id = ensure_id(
            "INSERT OR IGNORE INTO engines (name) VALUES (?)",
            "SELECT id FROM engines WHERE name = ?",
            (engine,),
        )
        model_id = ensure_id(
            "INSERT OR IGNORE INTO models (brand_id, name) VALUES (?, ?)",
            "SELECT id FROM models WHERE brand_id = ? AND name = ?",
            (brand_id, model),
        )
        cur.execute(
            "INSERT OR IGNORE INTO model_year_engine (model_id, year_id, engine_id) VALUES (?, ?, ?)",
            (model_id, year_id, engine_id),
        )
    except Exception as err:
        print(f" DB Error: {err}")
        return False
    # rowcount reflects the last statement: 1 if inserted, 0 if ignored.
    return cur.rowcount > 0
def main():
    """Scrape Toyota year/model/engine combinations into the database.

    The available years are read with ``get_years`` and restricted to
    1975-2026. For each year, every model and each of its engines is
    passed to ``save_to_db``. The connection is committed after each year
    so that work already done survives an interruption, and it is always
    closed, even when the loop raises. A summary is printed at the end.

    Returns early, without opening the database, when no years could be
    obtained.
    """
    brand = "TOYOTA"
    print(f"Obteniendo años disponibles para {brand}...")
    years = get_years(brand)
    if not years:
        # get_years returns [] when the brand page could not be fetched;
        # indexing years[0] below would raise IndexError.
        print(f"No se encontraron años para {brand}; nada que procesar.")
        return
    print(f"Encontrados {len(years)} años: {years[0]} - {years[-1]}")

    # Keep only 1975-2026
    years = [y for y in years if 1975 <= y <= 2026]
    print(f"Procesando años 1975-2026: {len(years)} años")
    print("=" * 60)

    conn = sqlite3.connect(DB_PATH)
    total_saved = 0
    total_vehicles = 0
    try:
        for i, year in enumerate(years, 1):
            print(f"\n[{i}/{len(years)}] Año {year}: ", end="", flush=True)
            models = get_models(brand, year)
            print(f"{len(models)} modelos")
            year_count = 0
            for model in models:
                for engine in get_engines(brand, year, model):
                    total_vehicles += 1
                    if save_to_db(conn, brand, year, model, engine):
                        total_saved += 1
                        year_count += 1
                        print(f" {model}: {engine}")
            # Commit once per year so finished years are never lost.
            conn.commit()
            print(f" -> Guardados: {year_count} nuevos")
    finally:
        conn.close()

    print("\n" + "=" * 60)
    print(f"RESUMEN {brand}")
    print(f" Años procesados: {len(years)}")
    print(f" Total vehículos encontrados: {total_vehicles}")
    print(f" Nuevos guardados: {total_saved}")
# Entry point: run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()