From 61474f7abe9eb06338ce4c1ea6a4f42efc8bc5d9 Mon Sep 17 00:00:00 2001 From: consultoria-as Date: Mon, 19 Jan 2026 09:17:54 +0000 Subject: [PATCH] Update multibrand scraper v3 with all RockAuto brands Added 137 brands total: - 119 main brands (international) - 18 regional brands (Mexico, China, etc.) Features: - Interactive menu with multiple options - Search brand by name - Select by initial letter - Process range of brands (e.g., 1-10) - View all brands status with pagination - Skip wait time with ENTER - Years: 1975-2026, 5 years per batch --- vehicle_scraper/scrape_multibrand.py | 542 ++++++++++++++++++++------- 1 file changed, 416 insertions(+), 126 deletions(-) diff --git a/vehicle_scraper/scrape_multibrand.py b/vehicle_scraper/scrape_multibrand.py index 1804696..0e77cb2 100755 --- a/vehicle_scraper/scrape_multibrand.py +++ b/vehicle_scraper/scrape_multibrand.py @@ -1,10 +1,10 @@ #!/usr/bin/env python3 """ -Scraper Multimarca v2 -- Marcas: Dodge, Honda, Mitsubishi, Jeep, BMW, Fiat, Hyundai, Infiniti, Kia, Land Rover, Lexus +Scraper Multimarca v3 - TODAS LAS MARCAS DE ROCKAUTO +- Incluye todas las marcas con vehiculos en rango 1975-2026 - Procesa de 5 en 5 años -- Espera 3 minutos entre lotes (saltable con cualquier tecla) -- Menú interactivo para seleccionar marca y lote +- Espera 3 minutos entre lotes (saltable con ENTER) +- Menu interactivo para seleccionar marca y lote - Años: 1975-2026 """ @@ -28,25 +28,180 @@ DB_PATH = os.path.join(BASE_DIR, "vehicle_database", "vehicle_database.db") BASE_URL = "https://www.rockauto.com/en/catalog" -# Marcas a scrapear +# TODAS LAS MARCAS DE ROCKAUTO (con vehiculos 1975-2026) +# Organizadas alfabeticamente BRANDS = [ - "DODGE", - "HONDA", - "MITSUBISHI", - "JEEP", + # A + "ABARTH", + "AC", + "ACURA", + "ALFA ROMEO", + "ALPINE", + "AM GENERAL", + "AMERICAN MOTORS", + "ASTON MARTIN", + "AUDI", + "AUSTIN", + "AUSTIN-HEALEY", + "AVANTI", + # B + "BENTLEY", + "BERTONE", "BMW", + "BRICKLIN", + "BRISTOL", + "BUGATTI", + "BUICK", 
+ "BYD", + # C + "CADILLAC", + "CHECKER", + "CHEVROLET", + "CHRYSLER", + "CITROEN", + "CUPRA", + # D + "DAEWOO", + "DAIHATSU", + "DATSUN", + "DELOREAN", + "DODGE", + # E + "EAGLE", + "EDSEL", + "EXCALIBUR", + # F + "FACEL VEGA", + "FERRARI", "FIAT", + "FISKER", + "FORD", + "FREIGHTLINER", + # G + "GENESIS", + "GEO", + "GMC", + # H + "HILLMAN", + "HONDA", + "HUMMER", "HYUNDAI", + # I + "INEOS", "INFINITI", + "INTERNATIONAL", + "ISUZU", + # J + "JAGUAR", + "JEEP", + "JENSEN", + # K + "KARMA", + "KENWORTH", "KIA", + # L + "LAFORZA", + "LAMBORGHINI", + "LANCIA", "LAND ROVER", - "LEXUS" + "LEXUS", + "LINCOLN", + "LOTUS", + "LUCID", + # M + "MACK", + "MASERATI", + "MAYBACH", + "MAZDA", + "MCLAREN", + "MERCEDES-BENZ", + "MERCURY", + "MERKUR", + "MG", + "MINI", + "MITSUBISHI", + "MITSUBISHI FUSO", + "MOBILITY VENTURES", + "MORGAN", + # N + "NISSAN", + # O + "OLDSMOBILE", + "OPEL", + # P + "PANOZ", + "PEUGEOT", + "PLYMOUTH", + "POLESTAR", + "PONTIAC", + "PORSCHE", + # Q + "QVALE", + # R + "RAM", + "RENAULT", + "RIVIAN", + "ROLLS-ROYCE", + "ROVER", + # S + "SAAB", + "SALEEN", + "SATURN", + "SCION", + "SEAT", + "SHELBY", + "SMART", + "SPYKER", + "SRT", + "SSANGYONG", + "STERLING", + "STUDEBAKER", + "SUBARU", + "SUNBEAM", + "SUZUKI", + # T + "TESLA", + "TOYOTA", + "TRIUMPH", + "TVR", + # U + "UD", + # V + "VOLKSWAGEN", + "VOLVO", + "VPG", + # W + "WORKHORSE", + # Y + "YUGO", +] + +# Marcas adicionales de mercados especificos (Mexico, China, etc.) 
+BRANDS_REGIONAL = [ + "BAIC", + "BESTUNE", + "CHANGAN", + "CHIREY", + "DFSK", + "FAW", + "FOTON", + "GAC", + "GEELY", + "GIANT MOTORS", + "JAC", + "JAECOO", + "JETOUR", + "JMC", + "OMODA", + "SERES", + "VAM", + "VINFAST", ] # Años de 1975 a 2026 (orden descendente) ALL_YEARS = list(range(2026, 1974, -1)) -# Configuración de lotes +# Configuracion de lotes BATCH_SIZE = 5 # años por lote WAIT_TIME = 180 # 3 minutos entre lotes @@ -59,7 +214,7 @@ session.headers.update({ def check_key_press(): - """Verifica si se presionó alguna tecla (non-blocking)""" + """Verifica si se presiono alguna tecla (non-blocking)""" if sys.platform == 'win32': import msvcrt if msvcrt.kbhit(): @@ -246,12 +401,12 @@ def get_brand_batches(conn, brand): def process_brand(conn, brand, start_batch=1): - """Procesa una marca completa desde un lote específico""" + """Procesa una marca completa desde un lote especifico""" print(f"\n{'#'*60}") print(f" PROCESANDO MARCA: {brand}") print(f"{'#'*60}") - # Verificar qué años ya existen + # Verificar que años ya existen existing = get_existing_years(conn, brand) print(f"Años existentes de {brand}: {len(existing)} años") if existing: @@ -261,7 +416,7 @@ def process_brand(conn, brand, start_batch=1): years_to_process = [y for y in ALL_YEARS if y not in existing] if not years_to_process: - print(f"\n[OK] {brand}: Todos los años ya están en la base de datos!") + print(f"\n[OK] {brand}: Todos los años ya estan en la base de datos!") return 0, 0 print(f"\nAños por procesar para {brand}: {len(years_to_process)}") @@ -288,7 +443,7 @@ def process_brand(conn, brand, start_batch=1): total_saved += saved total_found += found - # Si no es el último lote, esperar para cambiar VPN + # Si no es el ultimo lote, esperar para cambiar VPN if i < total_batches: wait_with_skip(WAIT_TIME, f"PAUSA DE {WAIT_TIME//60} MINUTOS - [{brand}] Lotes restantes: {total_batches - i}") @@ -296,139 +451,259 @@ def process_brand(conn, brand, start_batch=1): def show_main_menu(conn): - 
"""Muestra menú principal con opciones""" + """Muestra menu principal con opciones""" + all_brands = BRANDS + BRANDS_REGIONAL + while True: print("\n" + "="*60) - print(" SCRAPER MULTIMARCA - MENU PRINCIPAL") + print(" SCRAPER MULTIMARCA v3 - MENU PRINCIPAL") print("="*60) + print(f"\n Total de marcas disponibles: {len(all_brands)}") print("\n Opciones:") print(" 1. Ver estado de todas las marcas") - print(" 2. Seleccionar marca y lote específico") - print(" 3. Procesar todas las marcas pendientes") + print(" 2. Buscar marca por nombre") + print(" 3. Seleccionar marca por letra inicial") + print(" 4. Procesar multiples marcas (rango)") + print(" 5. Procesar TODAS las marcas pendientes") print(" 0. Salir") print("="*60) - choice = input("\nSelecciona opción: ").strip() + choice = input("\nSelecciona opcion: ").strip() if choice == '0': - return None, None + return None, None, None elif choice == '1': - show_all_brands_status(conn) + show_all_brands_status(conn, all_brands) elif choice == '2': - result = show_batch_menu(conn) - if result[0] is not None or result[1] is not None: + result = search_brand_menu(conn, all_brands) + if result[0] is not None: return result elif choice == '3': - return 'ALL', 1 + result = select_by_letter_menu(conn, all_brands) + if result[0] is not None: + return result + elif choice == '4': + result = select_range_menu(conn, all_brands) + if result[0] is not None: + return result + elif choice == '5': + return 'ALL', 1, all_brands else: - print("Opción inválida") + print("Opcion invalida") -def show_all_brands_status(conn): - """Muestra estado de todas las marcas""" - print("\n" + "-"*60) +def show_all_brands_status(conn, all_brands): + """Muestra estado de todas las marcas con paginacion""" + print("\n" + "-"*70) print(" ESTADO DE TODAS LAS MARCAS") - print("-"*60) + print("-"*70) - for brand in BRANDS: - batches, existing = get_brand_batches(conn, brand) - years_pending = sum(len(b) for b in batches) - status = "COMPLETO" if not batches 
else f"{years_pending} años pendientes ({len(batches)} lotes)" - print(f" {brand:15} | Existentes: {len(existing):3} | {status}") + page_size = 20 + total_pages = (len(all_brands) + page_size - 1) // page_size + current_page = 0 - print("-"*60) - input("\nPresiona ENTER para continuar...") - - -def show_batch_menu(conn): - """Muestra menú para seleccionar marca y lote inicial""" - print("\n" + "="*60) - print(" SELECCIÓN DE MARCA Y LOTE") - print("="*60) - - brand_info = {} - available_brands = [] - - for i, brand in enumerate(BRANDS, 1): - batches, existing = get_brand_batches(conn, brand) - brand_info[brand] = {'batches': batches, 'existing': existing} - - if batches: - available_brands.append(brand) - print(f"\n {len(available_brands)}. {brand}") - print(f" Años existentes: {len(existing)}") - print(f" Lotes pendientes: {len(batches)}") - # Mostrar resumen de lotes - if len(batches) <= 5: - for j, batch in enumerate(batches, 1): - print(f" Lote {j}: años {batch[0]}-{batch[-1]}") - else: - print(f" Lote 1: años {batches[0][0]}-{batches[0][-1]}") - print(f" ...") - print(f" Lote {len(batches)}: años {batches[-1][0]}-{batches[-1][-1]}") - else: - print(f"\n -. {brand} - [COMPLETO]") - - if not available_brands: - print("\n [!] Todas las marcas están completas!") - input("\nPresiona ENTER para continuar...") - return None, None - - print(f"\n 0. Volver al menú principal") - print("="*60) - - # Seleccionar marca while True: - try: - choice = input("\nSelecciona marca (número): ").strip() - if choice == '0' or choice == '': - return None, None + start_idx = current_page * page_size + end_idx = min(start_idx + page_size, len(all_brands)) - brand_idx = int(choice) - 1 - if 0 <= brand_idx < len(available_brands): - selected_brand = available_brands[brand_idx] - break - print(f"Opción inválida. 
Ingresa un número entre 1 y {len(available_brands)}") - except ValueError: - print("Ingresa un número válido") + print(f"\n Pagina {current_page + 1}/{total_pages}") + print("-"*70) + print(f" {'#':<4} {'MARCA':<20} {'EXISTENTES':<12} {'ESTADO':<30}") + print("-"*70) - batches = brand_info[selected_brand]['batches'] + for i, brand in enumerate(all_brands[start_idx:end_idx], start_idx + 1): + batches, existing = get_brand_batches(conn, brand) + years_pending = sum(len(b) for b in batches) + if not batches: + status = "COMPLETO" + else: + status = f"{years_pending} años pend. ({len(batches)} lotes)" + print(f" {i:<4} {brand:<20} {len(existing):<12} {status:<30}") - # Mostrar lotes disponibles - print(f"\n{'='*60}") - print(f" LOTES DISPONIBLES PARA {selected_brand}") - print(f"{'='*60}") + print("-"*70) + print(f" [N] Siguiente | [P] Anterior | [Q] Volver") - for j, batch in enumerate(batches, 1): - print(f" {j}. Lote {j}: años {batch[0]} - {batch[-1]} ({len(batch)} años)") + nav = input("\nNavegacion: ").strip().upper() + if nav == 'N' and current_page < total_pages - 1: + current_page += 1 + elif nav == 'P' and current_page > 0: + current_page -= 1 + elif nav == 'Q' or nav == '': + break + + +def search_brand_menu(conn, all_brands): + """Busca una marca por nombre""" + print("\n" + "="*60) + print(" BUSCAR MARCA") + print("="*60) + + search = input("\nIngresa nombre de marca (o parte): ").strip().upper() + if not search: + return None, None, None + + matches = [b for b in all_brands if search in b] + + if not matches: + print(f"\n No se encontraron marcas con '{search}'") + input("\nPresiona ENTER para continuar...") + return None, None, None + + print(f"\n Marcas encontradas ({len(matches)}):") + for i, brand in enumerate(matches, 1): + batches, existing = get_brand_batches(conn, brand) + status = "COMPLETO" if not batches else f"{len(batches)} lotes pendientes" + print(f" {i}. {brand} - {status}") print(f"\n 0. 
Volver") + + while True: + try: + choice = input("\nSelecciona marca: ").strip() + if choice == '0' or choice == '': + return None, None, None + + idx = int(choice) - 1 + if 0 <= idx < len(matches): + return select_batch_for_brand(conn, matches[idx]) + print("Opcion invalida") + except ValueError: + print("Ingresa un numero valido") + + +def select_by_letter_menu(conn, all_brands): + """Selecciona marcas por letra inicial""" + print("\n" + "="*60) + print(" SELECCIONAR POR LETRA") print("="*60) - # Seleccionar lote + # Obtener letras disponibles + letters = sorted(set(b[0] for b in all_brands)) + print("\n Letras disponibles:") + print(f" {' '.join(letters)}") + + letter = input("\nIngresa letra: ").strip().upper() + if not letter or letter not in letters: + return None, None, None + + matches = [b for b in all_brands if b.startswith(letter)] + + print(f"\n Marcas con '{letter}' ({len(matches)}):") + for i, brand in enumerate(matches, 1): + batches, existing = get_brand_batches(conn, brand) + status = "COMPLETO" if not batches else f"{len(batches)} lotes pendientes" + print(f" {i}. {brand} - {status}") + + print(f"\n 0. Volver") + print(f" A. Procesar TODAS las marcas con '{letter}'") + + while True: + choice = input("\nSelecciona: ").strip().upper() + if choice == '0' or choice == '': + return None, None, None + + if choice == 'A': + return 'MULTIPLE', 1, matches + + try: + idx = int(choice) - 1 + if 0 <= idx < len(matches): + return select_batch_for_brand(conn, matches[idx]) + print("Opcion invalida") + except ValueError: + print("Ingresa un numero valido o 'A'") + + +def select_range_menu(conn, all_brands): + """Selecciona un rango de marcas para procesar""" + print("\n" + "="*60) + print(" SELECCIONAR RANGO DE MARCAS") + print("="*60) + + # Mostrar todas las marcas numeradas + print("\n Marcas disponibles:") + for i, brand in enumerate(all_brands, 1): + if i % 4 == 0: + print(f" {i:3}. {brand}") + else: + print(f" {i:3}. 
{brand:<18}", end="") + print() + + print(f"\n Total: {len(all_brands)} marcas") + print("\n Ingresa rango (ej: 1-10, 5-20, etc.)") + + range_input = input("\nRango: ").strip() + if not range_input: + return None, None, None + + try: + if '-' in range_input: + start, end = map(int, range_input.split('-')) + else: + start = end = int(range_input) + + if 1 <= start <= end <= len(all_brands): + selected = all_brands[start-1:end] + print(f"\n Marcas seleccionadas ({len(selected)}):") + for b in selected: + print(f" - {b}") + + confirm = input("\nProcesar estas marcas? (S/N): ").strip().upper() + if confirm == 'S': + return 'MULTIPLE', 1, selected + else: + print("Rango invalido") + except ValueError: + print("Formato invalido. Usa: inicio-fin") + + return None, None, None + + +def select_batch_for_brand(conn, brand): + """Selecciona el lote inicial para una marca""" + batches, existing = get_brand_batches(conn, brand) + + if not batches: + print(f"\n {brand} ya esta completo!") + input("\nPresiona ENTER para continuar...") + return None, None, None + + print(f"\n{'='*60}") + print(f" LOTES PARA {brand}") + print(f"{'='*60}") + print(f"\n Años existentes: {len(existing)}") + print(f" Lotes pendientes: {len(batches)}") + + # Mostrar lotes + print("\n Lotes disponibles:") + for j, batch in enumerate(batches, 1): + print(f" {j}. Años {batch[0]} - {batch[-1]} ({len(batch)} años)") + + print(f"\n 0. 
Volver") + while True: try: batch_choice = input(f"\nComenzar desde lote (1-{len(batches)}): ").strip() - if batch_choice == '0': - return None, None - if batch_choice == '': - return selected_brand, 1 + if batch_choice == '0' or batch_choice == '': + return None, None, None batch_num = int(batch_choice) if 1 <= batch_num <= len(batches): - return selected_brand, batch_num - print(f"Ingresa un número entre 1 y {len(batches)}") + return brand, batch_num, None + print(f"Ingresa un numero entre 1 y {len(batches)}") except ValueError: - print("Ingresa un número válido") + print("Ingresa un numero valido") def main(): + all_brands = BRANDS + BRANDS_REGIONAL + print("="*60) - print(" SCRAPER MULTIMARCA v2") + print(" SCRAPER MULTIMARCA v3 - ROCKAUTO") print("="*60) - print(f" Marcas: {len(BRANDS)}") - for brand in BRANDS: - print(f" - {brand}") + print(f" Total marcas: {len(all_brands)}") + print(f" Marcas principales: {len(BRANDS)}") + print(f" Marcas regionales: {len(BRANDS_REGIONAL)}") print(f" Años: 1975-2026 | Lotes de {BATCH_SIZE} años") print(f" Pausa entre lotes: {WAIT_TIME//60} minutos") print(" >>> Presiona ENTER para saltar esperas <<<") @@ -444,10 +719,10 @@ def main(): conn = sqlite3.connect(DB_PATH) - # Menú principal - selected_brand, start_batch = show_main_menu(conn) + # Menu principal + selected, start_batch, brands_list = show_main_menu(conn) - if selected_brand is None and start_batch is None: + if selected is None: print("\nSaliendo...") conn.close() return @@ -456,28 +731,35 @@ def main(): grand_total_found = 0 brand_stats = {} - # Determinar qué marcas procesar - if selected_brand == 'ALL': - # Procesar todas las marcas pendientes - brands_to_process = BRANDS - start_batches = {brand: 1 for brand in BRANDS} - elif selected_brand: - # Solo procesar la marca seleccionada desde el lote indicado - brands_to_process = [selected_brand] - start_batches = {selected_brand: start_batch} + # Determinar que marcas procesar + if selected == 'ALL': + 
brands_to_process = brands_list or all_brands + start_batches = {brand: 1 for brand in brands_to_process} + elif selected == 'MULTIPLE': + brands_to_process = brands_list + start_batches = {brand: 1 for brand in brands_to_process} + elif selected: + brands_to_process = [selected] + start_batches = {selected: start_batch} else: conn.close() return - for brand in brands_to_process: + print(f"\n{'='*60}") + print(f" INICIANDO PROCESAMIENTO") + print(f" Marcas a procesar: {len(brands_to_process)}") + print(f"{'='*60}") + + for idx, brand in enumerate(brands_to_process, 1): + print(f"\n[{idx}/{len(brands_to_process)}] ", end="") saved, found = process_brand(conn, brand, start_batches.get(brand, 1)) brand_stats[brand] = {'saved': saved, 'found': found} grand_total_saved += saved grand_total_found += found # Pausa entre marcas (si hay otra marca por procesar) - if brand != brands_to_process[-1]: - next_brand = brands_to_process[brands_to_process.index(brand)+1] + if idx < len(brands_to_process): + next_brand = brands_to_process[idx] wait_with_skip(WAIT_TIME, f"PAUSA ENTRE MARCAS - Siguiente: {next_brand}") conn.close() @@ -485,14 +767,22 @@ def main(): print("\n" + "="*60) print(" RESUMEN FINAL") print("="*60) - for brand, stats in brand_stats.items(): - if stats['found'] > 0 or stats['saved'] > 0: + + # Solo mostrar marcas con datos + brands_with_data = {k: v for k, v in brand_stats.items() if v['found'] > 0 or v['saved'] > 0} + + if brands_with_data: + for brand, stats in brands_with_data.items(): print(f" {brand}:") print(f" Encontrados: {stats['found']}") print(f" Nuevos guardados: {stats['saved']}") + else: + print(" No se encontraron nuevos datos") + print("-"*60) print(f" TOTAL:") - print(f" Vehículos encontrados: {grand_total_found}") + print(f" Marcas procesadas: {len(brands_to_process)}") + print(f" Vehiculos encontrados: {grand_total_found}") print(f" Nuevos guardados: {grand_total_saved}") print("="*60)