#!/usr/bin/env python3
"""Scrape every Toyota vehicle (year / model / engine) from RockAuto.

Results are written to the SQLite database incrementally (one commit per
model year) so that an interrupted run does not lose earlier progress.
"""

import re
import sqlite3
import sys
import time
from urllib.parse import unquote

import requests
from bs4 import BeautifulSoup

DB_PATH = "/home/Autopartes/vehicle_database/vehicle_database.db"
BASE_URL = "https://www.rockauto.com/en/catalog"

# A link segment is treated as an engine only if it contains one of these
# markers (displacement such as "2.0L", a cylinder layout, or a fuel type).
_ENGINE_HINT_RE = re.compile(
    r'\d+\.?\d*L|V\d|I\d|H\d|HYBRID|ELECTRIC|DIESEL', re.I
)

session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml',
    'Accept-Language': 'en-US,en;q=0.9',
})


def clean_name(name):
    """Decode a URL path segment into a normalized, upper-case display name.

    '+' is turned into a space, percent-escapes are decoded, runs of
    whitespace are collapsed to a single space and the result is stripped
    and upper-cased.
    """
    name = unquote(name.replace('+', ' '))
    return re.sub(r'\s+', ' ', name).strip().upper()


def _url_token(name):
    """Return *name* as used in catalog URLs: lower-case, '+' for spaces."""
    return name.lower().replace(' ', '+')


def _href_captures(soup, pattern):
    """Yield capture group 1 for every anchor href in *soup* matching *pattern*."""
    for link in soup.find_all('a', href=True):
        match = pattern.search(link['href'])
        if match:
            yield match.group(1)


def get_soup(url, retries=3):
    """Fetch *url* and return it parsed as BeautifulSoup, or None on failure.

    Up to *retries* attempts are made, each preceded by a short delay.
    After a request exception on a non-final attempt the function waits two
    more seconds before retrying. A non-200 response is retried without the
    extra wait. If no attempt succeeds, the last failure (exception or HTTP
    status) is printed and None is returned.
    """
    last_failure = None
    for attempt in range(retries):
        try:
            time.sleep(0.3)  # short delay between requests
            response = session.get(url, timeout=10)
        except requests.RequestException as e:
            last_failure = e
            if attempt < retries - 1:
                time.sleep(2)
            continue
        if response.status_code == 200:
            return BeautifulSoup(response.content, 'html.parser')
        last_failure = f"HTTP {response.status_code} for {url}"
    if last_failure is not None:
        print(f" Error: {last_failure}")
    return None


def get_years(brand):
    """Return the model years linked from the brand's catalog page, newest first.

    Only four-digit years with 1950 < year <= 2030 are kept. An empty list
    is returned when the page could not be fetched.
    """
    brand_url = _url_token(brand)
    soup = get_soup(f"{BASE_URL}/{brand_url}")
    if not soup:
        return []

    pattern = re.compile(rf'/catalog/{re.escape(brand_url)},(\d{{4}})', re.I)
    years = set()
    for text in _href_captures(soup, pattern):
        year = int(text)
        if 1950 < year <= 2030:
            years.add(year)
    return sorted(years, reverse=True)


def get_models(brand, year):
    """Return the sorted model names linked from the brand/year catalog page.

    Names are normalized with clean_name(); purely numeric names and names
    of a single character are discarded. An empty list is returned when the
    page could not be fetched.
    """
    brand_url = _url_token(brand)
    soup = get_soup(f"{BASE_URL}/{brand_url},{year}")
    if not soup:
        return []

    pattern = re.compile(
        rf'/catalog/{re.escape(brand_url)},{year},([^,/]+)', re.I
    )
    models = set()
    for text in _href_captures(soup, pattern):
        model = clean_name(text)
        if model and not model.isdigit() and len(model) > 1:
            models.add(model)
    return sorted(models)


def get_engines(brand, year, model):
    """Return the sorted engine names linked from the brand/year/model page.

    A link segment counts as an engine only when it matches the engine
    marker pattern. If the page was fetched but no engine was recognized,
    ['Standard'] is returned; if the page could not be fetched, [] is
    returned.
    """
    brand_url = _url_token(brand)
    model_url = _url_token(model)
    soup = get_soup(f"{BASE_URL}/{brand_url},{year},{model_url}")
    if not soup:
        return []

    pattern = re.compile(
        rf'/catalog/{re.escape(brand_url)},{year},{re.escape(model_url)},([^,/]+)',
        re.I,
    )
    engines = set()
    for text in _href_captures(soup, pattern):
        engine = clean_name(text)
        if engine and _ENGINE_HINT_RE.search(engine):
            engines.add(engine)
    return sorted(engines) if engines else ['Standard']


def _get_or_create_id(cursor, insert_sql, select_sql, params):
    """Run an INSERT OR IGNORE followed by a SELECT and return the row's id.

    Raises sqlite3.DatabaseError if the SELECT finds no row, so the caller
    gets a clear message instead of a TypeError from indexing None.
    """
    cursor.execute(insert_sql, params)
    cursor.execute(select_sql, params)
    row = cursor.fetchone()
    if row is None:
        raise sqlite3.DatabaseError(f"no row found after insert: {select_sql}")
    return row[0]


def save_to_db(conn, brand, year, model, engine):
    """Store one brand/year/model/engine combination.

    Lookup rows (brand, year, engine, model) are created when missing, then
    the combination is inserted into model_year_engine. The caller is
    responsible for committing.

    Returns True when a new model_year_engine row was inserted, False when
    it already existed or a database error occurred (the error is printed).
    """
    cursor = conn.cursor()
    try:
        brand_id = _get_or_create_id(
            cursor,
            "INSERT OR IGNORE INTO brands (name) VALUES (?)",
            "SELECT id FROM brands WHERE name = ?",
            (brand,),
        )
        year_id = _get_or_create_id(
            cursor,
            "INSERT OR IGNORE INTO years (year) VALUES (?)",
            "SELECT id FROM years WHERE year = ?",
            (year,),
        )
        engine_id = _get_or_create_id(
            cursor,
            "INSERT OR IGNORE INTO engines (name) VALUES (?)",
            "SELECT id FROM engines WHERE name = ?",
            (engine,),
        )
        model_id = _get_or_create_id(
            cursor,
            "INSERT OR IGNORE INTO models (brand_id, name) VALUES (?, ?)",
            "SELECT id FROM models WHERE brand_id = ? AND name = ?",
            (brand_id, model),
        )
        cursor.execute(
            "INSERT OR IGNORE INTO model_year_engine (model_id, year_id, engine_id) VALUES (?, ?, ?)",
            (model_id, year_id, engine_id)
        )
        # rowcount is 0 when the OR IGNORE clause skipped an existing row.
        return cursor.rowcount > 0
    except sqlite3.Error as e:
        print(f" DB Error: {e}")
        return False
    finally:
        cursor.close()


def main():
    """Crawl all Toyota years, models and engines and store them in the database.

    Returns 0 on completion, or 1 when no model years could be retrieved
    (for example because the brand page could not be fetched).
    """
    brand = "TOYOTA"

    print(f"Obteniendo años disponibles para {brand}...")
    years = get_years(brand)
    if not years:
        # Without this guard years[0] below would raise IndexError.
        print(f"No se encontraron años para {brand}; abortando.")
        return 1
    print(f"Encontrados {len(years)} años: {years[0]} - {years[-1]}")

    # Restrict the crawl to 1975-2026.
    years = [y for y in years if 1975 <= y <= 2026]
    print(f"Procesando años 1975-2026: {len(years)} años")
    print("=" * 60)

    total_saved = 0
    total_vehicles = 0

    conn = sqlite3.connect(DB_PATH)
    try:
        for i, year in enumerate(years, 1):
            print(f"\n[{i}/{len(years)}] Año {year}: ", end="", flush=True)

            models = get_models(brand, year)
            print(f"{len(models)} modelos")

            year_count = 0
            for model in models:
                for engine in get_engines(brand, year, model):
                    total_vehicles += 1
                    if save_to_db(conn, brand, year, model, engine):
                        total_saved += 1
                        year_count += 1
                        print(f" {model}: {engine}")

            # Commit once per year so finished years survive an interruption.
            conn.commit()
            print(f" -> Guardados: {year_count} nuevos")
    finally:
        conn.close()

    print("\n" + "=" * 60)
    print(f"RESUMEN {brand}")
    print(f" Años procesados: {len(years)}")
    print(f" Total vehículos encontrados: {total_vehicles}")
    print(f" Nuevos guardados: {total_saved}")
    return 0


if __name__ == "__main__":
    sys.exit(main())