Initial commit: Sistema Autoparts DB
- Base de datos SQLite con información de vehículos
- Dashboard web con Flask y Bootstrap
- Scripts de web scraping para RockAuto
- Interfaz CLI para consultas
- Documentación completa del proyecto

Incluye:
- 12 marcas de vehículos
- 10,923 modelos
- 10,919 especificaciones de motores
- 12,075 combinaciones modelo-año-motor
This commit is contained in:
163
vehicle_scraper/scrape_toyota.py
Normal file
163
vehicle_scraper/scrape_toyota.py
Normal file
@@ -0,0 +1,163 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Script optimizado para extraer todos los vehículos Toyota de RockAuto
|
||||
Guarda datos incrementalmente para no perder progreso
|
||||
"""
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import sqlite3
|
||||
import time
|
||||
import re
|
||||
import sys
|
||||
from urllib.parse import unquote
|
||||
|
||||
# Absolute path of the SQLite database that the scraper writes into.
DB_PATH = "/home/Autopartes/vehicle_database/vehicle_database.db"
# Root of the catalog; brand/year/model segments are appended to it,
# separated by commas.
BASE_URL = "https://www.rockauto.com/en/catalog"

# A single module-level session so that every request carries the same
# browser-like headers.
session = requests.Session()
session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
session.headers['Accept'] = 'text/html,application/xhtml+xml'
session.headers['Accept-Language'] = 'en-US,en;q=0.9'
|
||||
|
||||
def clean_name(name):
    """Turn a URL path segment into a normalized, upper-case name.

    ``+`` is treated as a space, percent-escapes are decoded, every run of
    whitespace collapses to a single space, leading/trailing whitespace is
    dropped, and the result is upper-cased.
    """
    # '+' must become a space *before* decoding so that an escaped plus
    # sign (%2B) survives as a literal '+'.
    decoded = unquote(name.replace('+', ' '))
    # split() with no argument discards leading/trailing whitespace and
    # treats any whitespace run as one separator.
    words = decoded.split()
    return ' '.join(words).upper()
|
||||
|
||||
def get_soup(url, retries=3):
    """Fetch *url* and return it parsed, or ``None`` if every attempt fails.

    Args:
        url: Absolute URL to download with the module-level ``session``.
        retries: Maximum number of attempts.

    Returns:
        A ``BeautifulSoup`` document built with ``html.parser`` for the
        first response whose status code is 200, otherwise ``None``.

    A request error or a non-200 status both count as a failed attempt.
    Failed attempts are followed by a 2 second pause before the next one,
    and the reason for the last failure is printed once all attempts are
    exhausted.
    """
    last_problem = None
    for attempt in range(retries):
        time.sleep(0.3)  # short delay before every request
        try:
            response = session.get(url, timeout=10)
        except requests.RequestException as e:
            last_problem = e
        else:
            if response.status_code == 200:
                return BeautifulSoup(response.content, 'html.parser')
            # Remember the status so a persistent HTTP failure is reported
            # instead of silently turning into None.
            last_problem = f"HTTP {response.status_code}"
        if attempt < retries - 1:
            time.sleep(2)  # back off before the next attempt
    if last_problem is not None:
        print(f" Error: {last_problem}")
    return None
|
||||
|
||||
def get_years(brand):
    """Return the catalog years listed for *brand*, newest first.

    The brand page is fetched through ``get_soup``; every link whose href
    contains ``/catalog/<brand>,<4 digits>`` contributes a year. Only
    years greater than 1950 and at most 2030 are kept. An empty list is
    returned when the page could not be fetched.
    """
    slug = brand.lower().replace(' ', '+')
    page = get_soup(f"{BASE_URL}/{slug}")
    if not page:
        return []

    year_re = re.compile(rf'/catalog/{re.escape(slug)},(\d{{4}})', re.I)
    hits = (year_re.search(anchor['href'])
            for anchor in page.find_all('a', href=True))
    found = {int(hit.group(1)) for hit in hits if hit}
    return sorted((y for y in found if 1950 < y <= 2030), reverse=True)
|
||||
|
||||
def get_models(brand, year):
    """Return the sorted model names listed for *brand* in *year*.

    Model names are taken from the third comma-separated segment of every
    ``/catalog/<brand>,<year>,<model>`` link on the year page and passed
    through ``clean_name``. Purely numeric names and names of a single
    character are discarded. An empty list is returned when the page
    could not be fetched.
    """
    slug = brand.lower().replace(' ', '+')
    page = get_soup(f"{BASE_URL}/{slug},{year}")
    if not page:
        return []

    model_re = re.compile(rf'/catalog/{re.escape(slug)},{year},([^,/]+)', re.I)
    names = set()
    for anchor in page.find_all('a', href=True):
        hit = model_re.search(anchor['href'])
        if hit is None:
            continue
        candidate = clean_name(hit.group(1))
        if len(candidate) > 1 and not candidate.isdigit():
            names.add(candidate)
    return sorted(names)
|
||||
|
||||
def get_engines(brand, year, model):
    """Return the sorted engine names listed for one brand/year/model.

    The model page URL is built from the lower-cased brand and model with
    spaces replaced by ``+`` (no other escaping is applied). Engine names
    come from the fourth comma-separated segment of matching links,
    normalized with ``clean_name``, and are kept only when they look like
    an engine description (a displacement such as ``2.4L``, a cylinder
    layout such as ``V6``/``I4``/``H4``, or the words HYBRID, ELECTRIC or
    DIESEL).

    Returns ``[]`` when the page could not be fetched and ``['Standard']``
    when the page was fetched but no engine was recognized.
    """
    brand_slug = brand.lower().replace(' ', '+')
    model_slug = model.lower().replace(' ', '+')
    page = get_soup(f"{BASE_URL}/{brand_slug},{year},{model_slug}")
    if not page:
        return []

    link_re = re.compile(
        rf'/catalog/{re.escape(brand_slug)},{year},{re.escape(model_slug)},([^,/]+)',
        re.I,
    )
    looks_like_engine = re.compile(
        r'\d+\.?\d*L|V\d|I\d|H\d|HYBRID|ELECTRIC|DIESEL', re.I
    )

    found = set()
    for anchor in page.find_all('a', href=True):
        hit = link_re.search(anchor['href'])
        if hit is None:
            continue
        candidate = clean_name(hit.group(1))
        if candidate and looks_like_engine.search(candidate):
            found.add(candidate)

    if not found:
        return ['Standard']
    return sorted(found)
|
||||
|
||||
def _get_or_create_id(cursor, insert_sql, select_sql, params):
    """Insert a row if it is missing and return the ``id`` of that row.

    Both statements receive the same *params*. Raises ``LookupError`` when
    the row is still absent after the insert, i.e. ``INSERT OR IGNORE``
    skipped it for a reason other than the row already existing.
    """
    cursor.execute(insert_sql, params)
    cursor.execute(select_sql, params)
    row = cursor.fetchone()
    if row is None:
        raise LookupError(f"no row found after: {insert_sql}")
    return row[0]


def save_to_db(conn, brand, year, model, engine):
    """Store one brand/year/model/engine combination.

    Missing rows in ``brands``, ``years``, ``engines`` and ``models`` are
    created on demand, then the combination is linked in
    ``model_year_engine``. Nothing is committed here; that is left to the
    caller.

    Args:
        conn: Open ``sqlite3`` connection.
        brand: Brand name.
        year: Model year.
        model: Model name (looked up per brand).
        engine: Engine name.

    Returns:
        ``True`` when a new ``model_year_engine`` row was inserted,
        ``False`` when the combination already existed or a database
        error occurred (the error is printed, not raised).
    """
    cursor = conn.cursor()
    try:
        brand_id = _get_or_create_id(
            cursor,
            "INSERT OR IGNORE INTO brands (name) VALUES (?)",
            "SELECT id FROM brands WHERE name = ?",
            (brand,),
        )
        year_id = _get_or_create_id(
            cursor,
            "INSERT OR IGNORE INTO years (year) VALUES (?)",
            "SELECT id FROM years WHERE year = ?",
            (year,),
        )
        engine_id = _get_or_create_id(
            cursor,
            "INSERT OR IGNORE INTO engines (name) VALUES (?)",
            "SELECT id FROM engines WHERE name = ?",
            (engine,),
        )
        model_id = _get_or_create_id(
            cursor,
            "INSERT OR IGNORE INTO models (brand_id, name) VALUES (?, ?)",
            "SELECT id FROM models WHERE brand_id = ? AND name = ?",
            (brand_id, model),
        )

        cursor.execute(
            "INSERT OR IGNORE INTO model_year_engine (model_id, year_id, engine_id) VALUES (?, ?, ?)",
            (model_id, year_id, engine_id)
        )
        # rowcount is 0 when the insert was ignored as a duplicate.
        return cursor.rowcount > 0
    except (sqlite3.Error, LookupError) as e:
        print(f" DB Error: {e}")
        return False
    finally:
        cursor.close()
|
||||
|
||||
def main():
    """Scrape Toyota year/model/engine combinations into the database.

    Years are discovered with ``get_years`` and limited to 1975-2026. For
    each year every model and engine is fetched and stored with
    ``save_to_db``; the connection is committed after each year so that
    progress already made is kept if a later year fails. A summary is
    printed at the end.
    """
    brand = "TOYOTA"

    print(f"Obteniendo años disponibles para {brand}...")
    years = get_years(brand)
    if not years:
        # get_years returns [] when the brand page cannot be fetched or
        # lists no years; indexing years[0] below would raise IndexError.
        print(f"No se encontraron años para {brand}")
        return
    print(f"Encontrados {len(years)} años: {years[0]} - {years[-1]}")

    # Keep only 1975-2026
    years = [y for y in years if 1975 <= y <= 2026]
    print(f"Procesando años 1975-2026: {len(years)} años")
    print("=" * 60)

    conn = sqlite3.connect(DB_PATH)
    total_saved = 0
    total_vehicles = 0

    try:
        for i, year in enumerate(years, 1):
            print(f"\n[{i}/{len(years)}] Año {year}: ", end="", flush=True)

            models = get_models(brand, year)
            print(f"{len(models)} modelos")

            year_count = 0
            for model in models:
                engines = get_engines(brand, year, model)
                for engine in engines:
                    total_vehicles += 1
                    if save_to_db(conn, brand, year, model, engine):
                        total_saved += 1
                        year_count += 1
                        print(f" {model}: {engine}")

            # Commit once per year so finished years survive a later crash.
            conn.commit()
            print(f" -> Guardados: {year_count} nuevos")
    finally:
        # Release the database handle even if scraping aborts midway.
        conn.close()

    print("\n" + "=" * 60)
    print(f"RESUMEN {brand}")
    print(f" Años procesados: {len(years)}")
    print(f" Total vehículos encontrados: {total_vehicles}")
    print(f" Nuevos guardados: {total_saved}")
|
||||
|
||||
# Start the scrape only when this file is executed as a script, so that
# importing the module has no scraping side effect.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user