#!/usr/bin/env python3 """ Scraper de Toyota para Windows - Procesa de 3 en 3 años - Espera 60 segundos entre lotes para activar VPN - Años faltantes: 1975-2003 """ import requests from bs4 import BeautifulSoup import sqlite3 import time import re import os import sys from urllib.parse import unquote # Detectar ruta base del proyecto SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) # Si estamos en vehicle_scraper, subir un nivel if os.path.basename(SCRIPT_DIR) == "vehicle_scraper": BASE_DIR = os.path.dirname(SCRIPT_DIR) else: BASE_DIR = SCRIPT_DIR DB_PATH = os.path.join(BASE_DIR, "vehicle_database", "vehicle_database.db") BASE_URL = "https://www.rockauto.com/en/catalog" # Años que faltan por scrapear MISSING_YEARS = [ 2003, 2002, 2001, 2000, 1999, 1998, 1997, 1996, 1995, 1994, 1993, 1992, 1991, 1990, 1989, 1988, 1987, 1986, 1985, 1984, 1983, 1982, 1981, 1980, 1979, 1978, 1977, 1976, 1975 ] session = requests.Session() session.headers.update({ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml', 'Accept-Language': 'en-US,en;q=0.9', }) def clean_name(name): name = unquote(name.replace('+', ' ')) return re.sub(r'\s+', ' ', name).strip().upper() def get_soup(url, retries=3): for attempt in range(retries): try: time.sleep(0.5) response = session.get(url, timeout=15) if response.status_code == 200: return BeautifulSoup(response.content, 'html.parser') elif response.status_code == 403: print(f"\n [!] Bloqueado (403) - Cambia el VPN") return None except Exception as e: if attempt < retries - 1: time.sleep(3) else: print(f"\n Error: {e}") return None def get_models(brand, year): brand_url = brand.lower().replace(' ', '+') soup = get_soup(f"{BASE_URL}/{brand_url},{year}") if not soup: return [] models = set() for link in soup.find_all('a', href=True): pattern = rf'/catalog/{re.escape(brand_url)},{year},([^,/]+)' match = re.search(pattern, link['href'], re.I) if match: model = clean_name(match.group(1)) if model and not model.isdigit() and len(model) > 1: models.add(model) return sorted(models) def get_engines(brand, year, model): brand_url = brand.lower().replace(' ', '+') model_url = model.lower().replace(' ', '+') soup = get_soup(f"{BASE_URL}/{brand_url},{year},{model_url}") if not soup: return ['STANDARD'] engines = set() for link in soup.find_all('a', href=True): pattern = rf'/catalog/{re.escape(brand_url)},{year},{re.escape(model_url)},([^,/]+)' match = re.search(pattern, link['href'], re.I) if match: engine = clean_name(match.group(1)) if engine and re.search(r'\d+\.?\d*L|V\d|I\d|H\d|HYBRID|ELECTRIC|DIESEL', engine, re.I): engines.add(engine) return sorted(engines) if engines else ['STANDARD'] def save_to_db(conn, brand, year, model, engine): cursor = conn.cursor() try: cursor.execute("INSERT OR IGNORE INTO brands (name) VALUES (?)", (brand,)) cursor.execute("SELECT id FROM brands WHERE name = ?", (brand,)) brand_id = cursor.fetchone()[0] cursor.execute("INSERT OR IGNORE INTO years (year) VALUES (?)", (year,)) cursor.execute("SELECT id FROM years WHERE year = ?", (year,)) year_id = cursor.fetchone()[0] cursor.execute("INSERT OR IGNORE INTO engines (name) VALUES (?)", (engine,)) cursor.execute("SELECT id FROM engines WHERE name = ?", (engine,)) engine_id = cursor.fetchone()[0] cursor.execute("INSERT OR IGNORE INTO models (brand_id, name) VALUES (?, ?)", (brand_id, model)) cursor.execute("SELECT id FROM models WHERE brand_id = ? AND name = ?", (brand_id, model)) model_id = cursor.fetchone()[0] cursor.execute( "INSERT OR IGNORE INTO model_year_engine (model_id, year_id, engine_id) VALUES (?, ?, ?)", (model_id, year_id, engine_id) ) return cursor.rowcount > 0 except Exception as e: print(f" DB Error: {e}") return False def get_existing_years(conn, brand): """Obtiene los años que ya existen para esta marca""" cursor = conn.cursor() cursor.execute(""" SELECT DISTINCT y.year FROM years y JOIN model_year_engine mye ON y.id = mye.year_id JOIN models m ON mye.model_id = m.id JOIN brands b ON m.brand_id = b.id WHERE b.name = ? """, (brand,)) return set(row[0] for row in cursor.fetchall()) def process_batch(conn, brand, years_batch, batch_num, total_batches): """Procesa un lote de 3 años""" print(f"\n{'='*60}") print(f"LOTE {batch_num}/{total_batches}: Años {years_batch}") print('='*60) batch_saved = 0 batch_total = 0 for year in years_batch: print(f"\n[Año {year}] Obteniendo modelos... ", end="", flush=True) models = get_models(brand, year) print(f"{len(models)} modelos encontrados") if not models: print(f" No se encontraron modelos para {year}") continue for model in models: engines = get_engines(brand, year, model) for engine in engines: batch_total += 1 if save_to_db(conn, brand, year, model, engine): batch_saved += 1 print(f" {model} - {engine}") # Guardar cambios del lote conn.commit() print(f"\n>> Lote {batch_num} completado: {batch_saved} nuevos de {batch_total} encontrados") return batch_saved, batch_total def main(): brand = "TOYOTA" print("="*60) print(" SCRAPER TOYOTA - WINDOWS") print(" Procesa 3 años, guarda, espera 60s para VPN") print("="*60) # Verificar base de datos if not os.path.exists(DB_PATH): print(f"\n[ERROR] Base de datos no encontrada: {DB_PATH}") print("Verifica que la ruta sea correcta.") sys.exit(1) print(f"\nBase de datos: {DB_PATH}") conn = sqlite3.connect(DB_PATH) # Verificar qué años ya existen existing = get_existing_years(conn, brand) print(f"Años existentes de {brand}: {sorted(existing)}") # Filtrar solo los que faltan years_to_process = [y for y in MISSING_YEARS if y not in existing] if not years_to_process: print("\n[OK] Todos los años ya están en la base de datos!") conn.close() return print(f"\nAños por procesar: {years_to_process}") print(f"Total: {len(years_to_process)} años") # Dividir en lotes de 3 batches = [years_to_process[i:i+3] for i in range(0, len(years_to_process), 3)] total_batches = len(batches) print(f"Lotes de 3 años: {total_batches} lotes") input("\nPresiona ENTER para comenzar...") total_saved = 0 total_found = 0 for i, batch in enumerate(batches, 1): saved, found = process_batch(conn, brand, batch, i, total_batches) total_saved += saved total_found += found # Si no es el último lote, esperar para cambiar VPN if i < total_batches: print(f"\n{'*'*60}") print(f" PAUSA DE 60 SEGUNDOS - ACTIVA/CAMBIA EL VPN AHORA") print(f" Lotes restantes: {total_batches - i}") print(f"{'*'*60}") for sec in range(60, 0, -1): print(f"\r Continuando en {sec} segundos... ", end="", flush=True) time.sleep(1) print() conn.close() print("\n" + "="*60) print(" RESUMEN FINAL - TOYOTA") print("="*60) print(f" Años procesados: {len(years_to_process)}") print(f" Vehículos encontrados: {total_found}") print(f" Nuevos guardados: {total_saved}") print("="*60) if __name__ == "__main__": main()