#!/usr/bin/env python3
"""
Multi-brand scraper v2.

- Brands: Dodge, Honda, Mitsubishi, Jeep, BMW, Fiat, Hyundai, Infiniti, Kia,
  Land Rover, Lexus
- Processes years in batches of 5
- Waits 3 minutes between batches (the wait can be skipped from the keyboard)
- Interactive menu to pick a brand and the batch to start from
- Years: 1975-2026
"""

import os
import re
import select
import sqlite3
import sys
import time
from urllib.parse import unquote

import requests
from bs4 import BeautifulSoup

# Detect the project base directory: when this script lives in a directory
# named "vehicle_scraper", the project root is that directory's parent.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
if os.path.basename(SCRIPT_DIR) == "vehicle_scraper":
    BASE_DIR = os.path.dirname(SCRIPT_DIR)
else:
    BASE_DIR = SCRIPT_DIR
DB_PATH = os.path.join(BASE_DIR, "vehicle_database", "vehicle_database.db")

BASE_URL = "https://www.rockauto.com/en/catalog"

# Brands to scrape
BRANDS = [
    "DODGE",
    "HONDA",
    "MITSUBISHI",
    "JEEP",
    "BMW",
    "FIAT",
    "HYUNDAI",
    "INFINITI",
    "KIA",
    "LAND ROVER",
    "LEXUS"
]

# Years 1975 through 2026, newest first
ALL_YEARS = list(range(2026, 1974, -1))

# Batch configuration
BATCH_SIZE = 5    # years per batch
WAIT_TIME = 180   # seconds to pause between batches (3 minutes)

# A link segment is only accepted as an engine when it matches this pattern
# (displacement such as "2.0L", a cylinder layout such as "V6", or a
# powertrain keyword).
_ENGINE_HINT_RE = re.compile(
    r'\d+\.?\d*L|V\d|I\d|H\d|HYBRID|ELECTRIC|DIESEL', re.I
)

session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml',
    'Accept-Language': 'en-US,en;q=0.9',
})


def check_key_press():
    """Return True if keyboard input is pending, without blocking.

    On Windows any key press counts and one character is consumed through
    ``msvcrt``. On other platforms stdin is polled with a zero-timeout
    ``select`` and one full line is consumed, which is why the prompts ask
    for ENTER rather than "any key".
    """
    if sys.platform == 'win32':
        import msvcrt
        if msvcrt.kbhit():
            msvcrt.getch()
            return True
        return False

    # Linux/Mac
    readable, _, _ = select.select([sys.stdin], [], [], 0)
    if readable:
        sys.stdin.readline()
        return True
    return False


def wait_with_skip(seconds, message=""):
    """Count down ``seconds`` seconds, returning early on keyboard input.

    Args:
        seconds: Length of the pause in whole seconds.
        message: Text shown in the banner above the countdown.
    """
    print(f"\n{'*'*60}")
    print(f" {message}")
    print(" ACTIVA/CAMBIA EL VPN AHORA")
    print(" >>> Presiona ENTER para saltar la espera <<<")
    print(f"{'*'*60}")

    # Discard anything typed before the pause started so that stale input
    # does not skip the wait immediately.
    if sys.platform != 'win32':
        import termios
        try:
            termios.tcflush(sys.stdin, termios.TCIFLUSH)
        except (termios.error, OSError, ValueError):
            # Best effort: stdin may be closed, redirected or not a
            # terminal. Ctrl+C is deliberately NOT swallowed here.
            pass

    for remaining in range(seconds, 0, -1):
        if check_key_press():
            print("\n >>> ESPERA SALTADA <<<")
            return
        mins, secs = divmod(remaining, 60)
        print(f"\r Continuando en {mins}:{secs:02d}... (ENTER para saltar) ", end="", flush=True)
        time.sleep(1)
    print()


def clean_name(name):
    """Decode a URL path segment into a normalized upper-case name.

    ``+`` becomes a space, percent-escapes are decoded, runs of whitespace
    collapse to a single space, and the result is stripped and upper-cased.
    """
    name = unquote(name.replace('+', ' '))
    return re.sub(r'\s+', ' ', name).strip().upper()


def get_soup(url, retries=3):
    """Fetch ``url`` and return the parsed page, or None on failure.

    Args:
        url: Page to download with the shared module-level session.
        retries: Maximum number of attempts.

    Returns:
        A ``BeautifulSoup`` document for a 200 response. None when the
        server answers 403 (no further attempts are made) or when every
        attempt fails or returns another status code.
    """
    for attempt in range(retries):
        try:
            time.sleep(0.5)  # small delay before every request
            response = session.get(url, timeout=15)
        except requests.RequestException as exc:
            if attempt < retries - 1:
                time.sleep(3)  # back off before the next attempt
            else:
                print(f"\n Error: {exc}")
            continue

        if response.status_code == 200:
            return BeautifulSoup(response.content, 'html.parser')
        if response.status_code == 403:
            print("\n [!] Bloqueado (403) - Cambia el VPN")
            return None
    return None


def get_models(brand, year):
    """Return the sorted model names listed for ``brand`` in ``year``.

    Model names are taken from catalog links of the form
    ``/catalog/<brand>,<year>,<model>``. Purely numeric names and names of
    one character are discarded. Returns an empty list when the page could
    not be fetched.
    """
    brand_url = brand.lower().replace(' ', '+')
    soup = get_soup(f"{BASE_URL}/{brand_url},{year}")
    if soup is None:
        return []

    # Compiled once per page instead of once per link.
    link_re = re.compile(
        rf'/catalog/{re.escape(brand_url)},{year},([^,/]+)', re.I
    )
    models = set()
    for link in soup.find_all('a', href=True):
        match = link_re.search(link['href'])
        if match:
            model = clean_name(match.group(1))
            if model and not model.isdigit() and len(model) > 1:
                models.add(model)
    return sorted(models)


def get_engines(brand, year, model):
    """Return the sorted engine names listed for a brand/year/model.

    Engine names are taken from catalog links of the form
    ``/catalog/<brand>,<year>,<model>,<engine>`` and kept only when they
    look like an engine description (see ``_ENGINE_HINT_RE``).

    Returns ``['STANDARD']`` when no engine is recognised.

    NOTE(review): ``['STANDARD']`` is also returned when the page could not
    be fetched (for example after a 403), so a placeholder engine is stored
    for that model. Confirm that this fallback is intended.
    """
    brand_url = brand.lower().replace(' ', '+')
    model_url = model.lower().replace(' ', '+')
    soup = get_soup(f"{BASE_URL}/{brand_url},{year},{model_url}")
    if soup is None:
        return ['STANDARD']

    # Compiled once per page instead of once per link.
    link_re = re.compile(
        rf'/catalog/{re.escape(brand_url)},{year},{re.escape(model_url)},([^,/]+)',
        re.I,
    )
    engines = set()
    for link in soup.find_all('a', href=True):
        match = link_re.search(link['href'])
        if match:
            engine = clean_name(match.group(1))
            if engine and _ENGINE_HINT_RE.search(engine):
                engines.add(engine)
    return sorted(engines) if engines else ['STANDARD']


def _get_or_create_id(cursor, insert_sql, select_sql, params):
    """Run an INSERT OR IGNORE, then return the id selected with ``params``."""
    cursor.execute(insert_sql, params)
    cursor.execute(select_sql, params)
    return cursor.fetchone()[0]


def save_to_db(conn, brand, year, model, engine):
    """Store one brand/year/model/engine combination.

    Lookup rows (brand, year, engine, model) are created when missing. The
    transaction is not committed here; the caller commits.

    Returns:
        True when a new ``model_year_engine`` row was inserted, False when
        it already existed or a database error occurred (the error is
        printed).
    """
    cursor = conn.cursor()
    try:
        brand_id = _get_or_create_id(
            cursor,
            "INSERT OR IGNORE INTO brands (name) VALUES (?)",
            "SELECT id FROM brands WHERE name = ?",
            (brand,),
        )
        year_id = _get_or_create_id(
            cursor,
            "INSERT OR IGNORE INTO years (year) VALUES (?)",
            "SELECT id FROM years WHERE year = ?",
            (year,),
        )
        engine_id = _get_or_create_id(
            cursor,
            "INSERT OR IGNORE INTO engines (name) VALUES (?)",
            "SELECT id FROM engines WHERE name = ?",
            (engine,),
        )
        model_id = _get_or_create_id(
            cursor,
            "INSERT OR IGNORE INTO models (brand_id, name) VALUES (?, ?)",
            "SELECT id FROM models WHERE brand_id = ? AND name = ?",
            (brand_id, model),
        )

        cursor.execute(
            "INSERT OR IGNORE INTO model_year_engine (model_id, year_id, engine_id) VALUES (?, ?, ?)",
            (model_id, year_id, engine_id)
        )
        # rowcount is 0 when the row already existed and the insert was ignored.
        return cursor.rowcount > 0
    except (sqlite3.Error, TypeError) as exc:
        # TypeError covers a lookup SELECT that unexpectedly returns no row.
        print(f" DB Error: {exc}")
        return False


def get_existing_years(conn, brand):
    """Return the set of years that already have at least one row for ``brand``."""
    cursor = conn.cursor()
    cursor.execute("""
        SELECT DISTINCT y.year
        FROM years y
        JOIN model_year_engine mye ON y.id = mye.year_id
        JOIN models m ON mye.model_id = m.id
        JOIN brands b ON m.brand_id = b.id
        WHERE b.name = ?
    """, (brand,))
    return {row[0] for row in cursor.fetchall()}


def process_batch(conn, brand, years_batch, batch_num, total_batches):
    """Scrape and store every model/engine for one batch of years.

    The batch is committed once, after all of its years were processed.

    Returns:
        ``(saved, total)``: rows newly inserted and combinations found.
    """
    print(f"\n{'='*60}")
    print(f"[{brand}] LOTE {batch_num}/{total_batches}: Años {years_batch}")
    print('='*60)

    batch_saved = 0
    batch_total = 0

    for year in years_batch:
        print(f"\n[{brand} - Año {year}] Obteniendo modelos... ", end="", flush=True)
        models = get_models(brand, year)
        print(f"{len(models)} modelos encontrados")

        if not models:
            print(f" No se encontraron modelos para {year}")
            continue

        for model in models:
            for engine in get_engines(brand, year, model):
                batch_total += 1
                if save_to_db(conn, brand, year, model, engine):
                    batch_saved += 1
                    print(f" {model} - {engine}")

    # Persist the whole batch at once
    conn.commit()
    print(f"\n>> Lote {batch_num} completado: {batch_saved} nuevos de {batch_total} encontrados")
    return batch_saved, batch_total


def get_brand_batches(conn, brand):
    """Return ``(batches, existing)`` for ``brand``.

    ``existing`` is the set of years already in the database. ``batches``
    is the list of still-missing years (newest first) split into chunks of
    ``BATCH_SIZE``; it is empty when nothing is pending.
    """
    existing = get_existing_years(conn, brand)
    pending = [year for year in ALL_YEARS if year not in existing]
    batches = [
        pending[start:start + BATCH_SIZE]
        for start in range(0, len(pending), BATCH_SIZE)
    ]
    return batches, existing


def process_brand(conn, brand, start_batch=1):
    """Process all pending batches of ``brand`` starting at ``start_batch``.

    Args:
        conn: Open SQLite connection.
        brand: Brand name as listed in ``BRANDS``.
        start_batch: 1-based number of the first batch to process; earlier
            batches are skipped.

    Returns:
        ``(total_saved, total_found)`` summed over the processed batches,
        or ``(0, 0)`` when the brand has no pending years.
    """
    print(f"\n{'#'*60}")
    print(f" PROCESANDO MARCA: {brand}")
    print(f"{'#'*60}")

    # Single source of truth for "which years are pending" and how they
    # are batched, shared with the menus.
    batches, existing = get_brand_batches(conn, brand)
    print(f"Años existentes de {brand}: {len(existing)} años")
    if existing:
        print(f" Rango existente: {min(existing)}-{max(existing)}")

    if not batches:
        print(f"\n[OK] {brand}: Todos los años ya están en la base de datos!")
        return 0, 0

    years_to_process = [year for batch in batches for year in batch]
    print(f"\nAños por procesar para {brand}: {len(years_to_process)}")
    print(f" De {max(years_to_process)} a {min(years_to_process)}")

    total_batches = len(batches)
    print(f"Lotes de {BATCH_SIZE} años: {total_batches} lotes")

    if start_batch > 1:
        print(f"\n>>> Comenzando desde el lote {start_batch} <<<")

    total_saved = 0
    total_found = 0

    for batch_num, batch in enumerate(batches, 1):
        # Skip batches before the requested starting point
        if batch_num < start_batch:
            continue

        saved, found = process_batch(conn, brand, batch, batch_num, total_batches)
        total_saved += saved
        total_found += found

        # Unless this was the last batch, pause so the VPN can be changed
        if batch_num < total_batches:
            wait_with_skip(WAIT_TIME, f"PAUSA DE {WAIT_TIME//60} MINUTOS - [{brand}] Lotes restantes: {total_batches - batch_num}")

    return total_saved, total_found


def show_main_menu(conn):
    """Run the main menu loop until the user picks an action.

    Returns:
        ``(None, None)`` to quit, ``('ALL', 1)`` to process every brand, or
        ``(brand, start_batch)`` for a single brand.
    """
    while True:
        print("\n" + "="*60)
        print(" SCRAPER MULTIMARCA - MENU PRINCIPAL")
        print("="*60)
        print("\n Opciones:")
        print(" 1. Ver estado de todas las marcas")
        print(" 2. Seleccionar marca y lote específico")
        print(" 3. Procesar todas las marcas pendientes")
        print(" 0. Salir")
        print("="*60)

        choice = input("\nSelecciona opción: ").strip()

        if choice == '0':
            return None, None
        elif choice == '1':
            show_all_brands_status(conn)
        elif choice == '2':
            result = show_batch_menu(conn)
            # (None, None) means "back": stay in this menu
            if result[0] is not None or result[1] is not None:
                return result
        elif choice == '3':
            return 'ALL', 1
        else:
            print("Opción inválida")


def show_all_brands_status(conn):
    """Print, for every brand, the existing year count and pending work."""
    print("\n" + "-"*60)
    print(" ESTADO DE TODAS LAS MARCAS")
    print("-"*60)

    for brand in BRANDS:
        batches, existing = get_brand_batches(conn, brand)
        years_pending = sum(len(batch) for batch in batches)
        status = "COMPLETO" if not batches else f"{years_pending} años pendientes ({len(batches)} lotes)"
        print(f" {brand:15} | Existentes: {len(existing):3} | {status}")

    print("-"*60)
    input("\nPresiona ENTER para continuar...")


def show_batch_menu(conn):
    """Let the user pick a brand with pending work and a starting batch.

    Returns:
        ``(brand, batch_number)`` with a 1-based batch number, or
        ``(None, None)`` when the user goes back or nothing is pending.
    """
    print("\n" + "="*60)
    print(" SELECCIÓN DE MARCA Y LOTE")
    print("="*60)

    pending_batches = {}   # brand -> list of pending batches
    available_brands = []  # brands offered for selection, in menu order

    for brand in BRANDS:
        batches, existing = get_brand_batches(conn, brand)
        pending_batches[brand] = batches

        if batches:
            available_brands.append(brand)
            print(f"\n {len(available_brands)}. {brand}")
            print(f" Años existentes: {len(existing)}")
            print(f" Lotes pendientes: {len(batches)}")
            # Batch summary: list them all when few, else first and last
            if len(batches) <= 5:
                for j, batch in enumerate(batches, 1):
                    print(f" Lote {j}: años {batch[0]}-{batch[-1]}")
            else:
                print(f" Lote 1: años {batches[0][0]}-{batches[0][-1]}")
                print(" ...")
                print(f" Lote {len(batches)}: años {batches[-1][0]}-{batches[-1][-1]}")
        else:
            print(f"\n -. {brand} - [COMPLETO]")

    if not available_brands:
        print("\n [!] Todas las marcas están completas!")
        input("\nPresiona ENTER para continuar...")
        return None, None

    print("\n 0. Volver al menú principal")
    print("="*60)

    # Brand selection
    while True:
        try:
            choice = input("\nSelecciona marca (número): ").strip()
            if choice == '0' or choice == '':
                return None, None

            brand_idx = int(choice) - 1
            if 0 <= brand_idx < len(available_brands):
                selected_brand = available_brands[brand_idx]
                break
            print(f"Opción inválida. Ingresa un número entre 1 y {len(available_brands)}")
        except ValueError:
            print("Ingresa un número válido")

    batches = pending_batches[selected_brand]

    # Show the batches available for the chosen brand
    print(f"\n{'='*60}")
    print(f" LOTES DISPONIBLES PARA {selected_brand}")
    print(f"{'='*60}")

    for j, batch in enumerate(batches, 1):
        print(f" {j}. Lote {j}: años {batch[0]} - {batch[-1]} ({len(batch)} años)")

    print("\n 0. Volver")
    print("="*60)

    # Batch selection (empty input means "start from batch 1")
    while True:
        try:
            batch_choice = input(f"\nComenzar desde lote (1-{len(batches)}): ").strip()
            if batch_choice == '0':
                return None, None
            if batch_choice == '':
                return selected_brand, 1

            batch_num = int(batch_choice)
            if 1 <= batch_num <= len(batches):
                return selected_brand, batch_num
            print(f"Ingresa un número entre 1 y {len(batches)}")
        except ValueError:
            print("Ingresa un número válido")


def main():
    """Entry point: show the menu, scrape the chosen brands, print a summary.

    Exits with status 1 when the database file does not exist.
    """
    print("="*60)
    print(" SCRAPER MULTIMARCA v2")
    print("="*60)
    print(f" Marcas: {len(BRANDS)}")
    for brand in BRANDS:
        print(f" - {brand}")
    print(f" Años: {min(ALL_YEARS)}-{max(ALL_YEARS)} | Lotes de {BATCH_SIZE} años")
    print(f" Pausa entre lotes: {WAIT_TIME//60} minutos")
    print(" >>> Presiona ENTER para saltar esperas <<<")
    print("="*60)

    # The database must already exist; this script does not create it
    if not os.path.exists(DB_PATH):
        print(f"\n[ERROR] Base de datos no encontrada: {DB_PATH}")
        print("Verifica que la ruta sea correcta.")
        sys.exit(1)

    print(f"\nBase de datos: {DB_PATH}")

    conn = sqlite3.connect(DB_PATH)
    grand_total_saved = 0
    grand_total_found = 0
    brand_stats = {}

    # try/finally guarantees the connection is closed even when scraping
    # is interrupted (Ctrl+C) or fails.
    try:
        selected_brand, start_batch = show_main_menu(conn)

        if not selected_brand:
            if start_batch is None:
                print("\nSaliendo...")
            return

        # Decide which brands to process and from which batch
        if selected_brand == 'ALL':
            brands_to_process = BRANDS
            start_batches = {brand: 1 for brand in BRANDS}
        else:
            brands_to_process = [selected_brand]
            start_batches = {selected_brand: start_batch}

        for idx, brand in enumerate(brands_to_process):
            # Remember whether this brand causes any requests at all, so a
            # brand that is already complete does not trigger a pause.
            had_pending = bool(get_brand_batches(conn, brand)[0])

            saved, found = process_brand(conn, brand, start_batches.get(brand, 1))
            brand_stats[brand] = {'saved': saved, 'found': found}
            grand_total_saved += saved
            grand_total_found += found

            # Pause between brands only when requests were just made and a
            # later brand still has pending work.
            next_pending = next(
                (later for later in brands_to_process[idx + 1:]
                 if get_brand_batches(conn, later)[0]),
                None,
            )
            if had_pending and next_pending is not None:
                wait_with_skip(WAIT_TIME, f"PAUSA ENTRE MARCAS - Siguiente: {next_pending}")
    finally:
        conn.close()

    print("\n" + "="*60)
    print(" RESUMEN FINAL")
    print("="*60)
    for brand, stats in brand_stats.items():
        if stats['found'] > 0 or stats['saved'] > 0:
            print(f" {brand}:")
            print(f" Encontrados: {stats['found']}")
            print(f" Nuevos guardados: {stats['saved']}")
    print("-"*60)
    print(" TOTAL:")
    print(f" Vehículos encontrados: {grand_total_found}")
    print(f" Nuevos guardados: {grand_total_saved}")
    print("="*60)


if __name__ == "__main__":
    main()