- Base de datos SQLite con información de vehículos - Dashboard web con Flask y Bootstrap - Scripts de web scraping para RockAuto - Interfaz CLI para consultas - Documentación completa del proyecto. Incluye: - 12 marcas de vehículos - 10,923 modelos - 10,919 especificaciones de motores - 12,075 combinaciones modelo-año-motor
292 lines
10 KiB
Python
292 lines
10 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
RockAuto Vehicle Data Scraper
|
|
Extracts vehicle information (brands, models, years, engines) from RockAuto.com
|
|
"""
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
import time
|
|
import random
|
|
from urllib.parse import urljoin, urlparse
|
|
import json
|
|
import sqlite3
|
|
from typing import List, Dict, Optional
|
|
|
|
|
|
class RockAutoScraper:
    """Scrape vehicle data (makes, models, years) from RockAuto.com.

    Fetches catalog pages with a browser-like User-Agent, throttles every
    request with a random delay, and persists results into a normalized
    SQLite schema (brands / models / years / engines / model_year_engine).
    """

    # Upper bound (seconds) for each HTTP request so an unresponsive
    # server cannot hang the scraper forever (requests has no default
    # timeout).
    REQUEST_TIMEOUT = 30

    def __init__(self, db_path: str = "../vehicle_database/vehicle_database.db"):
        """Set up the HTTP session and remember the SQLite database path.

        Args:
            db_path: Path to the SQLite database file the scraped data
                will be written into.
        """
        self.base_url = "https://www.rockauto.com"
        self.session = requests.Session()
        # Browser-like User-Agent; plain python-requests UAs are commonly
        # rejected by retail sites.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        self.db_path = db_path

        # Mapping of RockAuto URL slugs to standardized brand names.
        self.brand_mapping = {
            'acura': 'Acura',
            'alfa-romeo': 'Alfa Romeo',
            'audi': 'Audi',
            'bmw': 'BMW',
            'buick': 'Buick',
            'cadillac': 'Cadillac',
            'chevrolet': 'Chevrolet',
            'chrysler': 'Chrysler',
            'dodge': 'Dodge',
            'fiat': 'Fiat',
            'ford': 'Ford',
            'gmc': 'GMC',
            'honda': 'Honda',
            'hyundai': 'Hyundai',
            'infiniti': 'Infiniti',
            'isuzu': 'Isuzu',
            'jaguar': 'Jaguar',
            'jeep': 'Jeep',
            'kia': 'Kia',
            'land-rover': 'Land Rover',
            'lexus': 'Lexus',
            'lincoln': 'Lincoln',
            'mazda': 'Mazda',
            'mercedes-benz': 'Mercedes-Benz',
            'mercury': 'Mercury',
            'mitsubishi': 'Mitsubishi',
            'nissan': 'Nissan',
            'oldsmobile': 'Oldsmobile',
            'plymouth': 'Plymouth',
            'pontiac': 'Pontiac',
            'porsche': 'Porsche',
            'ram': 'Ram',
            'saab': 'Saab',
            'saturn': 'Saturn',
            'scion': 'Scion',
            'subaru': 'Subaru',
            'suzuki': 'Suzuki',
            'tesla': 'Tesla',
            'toyota': 'Toyota',
            'volkswagen': 'Volkswagen',
            'volvo': 'Volvo'
        }

    def get_page(self, url: str) -> Optional["BeautifulSoup"]:
        """Fetch *url* and return a parsed BeautifulSoup tree, or None on error.

        A random 1-3 second delay is inserted before every request to be
        respectful to the server.
        """
        try:
            # Add random delay to be respectful to the server.
            time.sleep(random.uniform(1, 3))
            # timeout= fixes the original's unbounded blocking request.
            response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            return BeautifulSoup(response.content, 'html.parser')
        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}")
            return None

    def get_makes(self) -> List[str]:
        """Return the list of standardized make names found on the catalog page.

        Tries catalog links whose URLs contain a known brand slug first;
        if nothing matches, falls back to scanning every link's href for
        a slug substring. Returns an empty list when the page cannot be
        fetched.
        """
        print("Fetching list of makes...")
        soup = self.get_page(f"{self.base_url}/catalog/catalog.php")

        if not soup:
            return []

        makes = []
        # Primary strategy: catalog links with at least three path segments.
        make_elements = soup.find_all('a', href=lambda x: x and '/catalog/' in x and x.count('/') >= 3)

        for elem in make_elements:
            href = elem.get('href', '')
            # Extract the brand slug from the URL path segments.
            parts = href.split('/')
            for part in parts:
                if part in self.brand_mapping:
                    make = self.brand_mapping[part]
                    if make not in makes:
                        makes.append(make)

        # Fallback strategy: substring-match every link against the slugs.
        if not makes:
            links = soup.find_all('a', href=True)
            for link in links:
                href = link['href'].lower()
                for key, value in self.brand_mapping.items():
                    if key in href and value not in makes:
                        makes.append(value)

        print(f"Found {len(makes)} makes: {makes[:10]}{'...' if len(makes) > 10 else ''}")
        return makes

    def get_models_for_make(self, make: str) -> List[Dict]:
        """Return model/year records scraped from the catalog page for *make*.

        Each record is a dict with keys 'make', 'model', 'year', 'engine'
        (always 'Unknown' at this stage) and 'href'. Returns an empty
        list when the make is unknown or the page cannot be fetched.
        """
        print(f"Fetching models for {make}...")

        # Convert the display name back to its RockAuto URL slug.
        make_key = None
        for key, value in self.brand_mapping.items():
            if value.lower() == make.lower():
                make_key = key
                break

        if not make_key:
            print(f"Make {make} not found in mapping")
            return []

        models = []
        # O(1) dedup on the fields that vary within this call
        # (make and engine are constant here), replacing the original
        # O(n^2) `record not in models` scan.
        seen = set()
        soup = self.get_page(f"{self.base_url}/catalog/catalog.php?c={make_key}")

        if not soup:
            return models

        # RockAuto typically exposes year/model info as link text.
        links = soup.find_all('a', href=True)

        for link in links:
            href = link['href']
            text = link.get_text().strip()

            # Only consider link text that plausibly contains a year.
            if any(char.isdigit() for char in text) and len(text) > 2:
                parts = text.split()

                # A year is a 4-digit token in a plausible range; every
                # other token is treated as part of the model name.
                year = None
                model_parts = []

                for part in parts:
                    if part.isdigit() and len(part) == 4 and 1900 < int(part) < 2030:
                        year = int(part)
                    else:
                        model_parts.append(part)

                if model_parts and year:
                    model = ' '.join(model_parts)

                    dedup_key = (model, year, href)
                    if dedup_key not in seen:
                        seen.add(dedup_key)
                        models.append({
                            'make': make,
                            'model': model,
                            'year': year,
                            'engine': 'Unknown',  # Will need to extract from deeper pages
                            'href': href
                        })

        print(f"Found {len(models)} models for {make}")
        return models

    def scrape_vehicle_data(self) -> List[Dict]:
        """Scrape vehicle records from RockAuto (testing limits applied).

        Only the first 5 makes are visited and scraping stops once more
        than 20 records are collected — deliberate limits for testing.
        """
        print("Starting RockAuto scraping...")

        all_vehicles = []

        makes = self.get_makes()

        # Limit to first 5 makes for testing.
        makes = makes[:5] if len(makes) > 5 else makes

        for make in makes:
            models = self.get_models_for_make(make)
            all_vehicles.extend(models)

            # Limit total records for testing.
            if len(all_vehicles) > 20:
                break

        print(f"Total vehicles found: {len(all_vehicles)}")
        return all_vehicles

    @staticmethod
    def _get_or_create_id(cursor, table: str, columns: tuple, values: tuple) -> int:
        """Insert a row if absent and return its id.

        Implements the INSERT OR IGNORE + SELECT id pattern shared by the
        brands/years/engines/models tables. *table* and *columns* are
        internal constants (never user input), so interpolating them into
        the SQL is safe; the values are always bound as parameters.
        """
        col_list = ", ".join(columns)
        placeholders = ", ".join("?" for _ in values)
        cursor.execute(
            f"INSERT OR IGNORE INTO {table} ({col_list}) VALUES ({placeholders})",
            values,
        )
        where = " AND ".join(f"{col} = ?" for col in columns)
        cursor.execute(f"SELECT id FROM {table} WHERE {where}", values)
        return cursor.fetchone()[0]

    def save_to_database(self, vehicles: List[Dict]):
        """Persist scraped vehicle records into the SQLite database.

        Per-record failures are reported and skipped (best-effort), then
        all successful inserts are committed in one transaction.
        """
        print(f"Saving {len(vehicles)} vehicles to database...")

        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()

            for vehicle in vehicles:
                try:
                    brand_id = self._get_or_create_id(
                        cursor, "brands", ("name",), (vehicle['make'],))
                    year_id = self._get_or_create_id(
                        cursor, "years", ("year",), (vehicle['year'],))
                    # Engine specs are unknown at this stage; a placeholder
                    # row keeps the link table consistent.
                    engine_id = self._get_or_create_id(
                        cursor, "engines", ("name",), (vehicle['engine'],))
                    model_id = self._get_or_create_id(
                        cursor, "models", ("brand_id", "name"),
                        (brand_id, vehicle['model']))

                    # Link model, year, and engine.
                    cursor.execute(
                        """INSERT OR IGNORE INTO model_year_engine
                           (model_id, year_id, engine_id) VALUES (?, ?, ?)""",
                        (model_id, year_id, engine_id)
                    )
                except Exception as e:
                    # Best-effort: report and continue with the next record.
                    print(f"Error saving vehicle {vehicle}: {e}")

            conn.commit()
        finally:
            # Always release the connection, fixing the leak the original
            # had when commit (or anything else) raised.
            conn.close()
        print("Data saved to database successfully!")
|
|
|
|
|
def main():
    """Command-line entry point: scrape RockAuto and store the results."""
    scraper = RockAutoScraper()

    print("Starting RockAuto data extraction...")
    print("Note: This may take several minutes due to rate limiting.")

    try:
        vehicles = scraper.scrape_vehicle_data()

        # Guard clause: nothing scraped — explain the likely causes and stop.
        if not vehicles:
            print("No vehicles found. This could be due to:")
            print("1. RockAuto blocking automated requests")
            print("2. Changes in website structure")
            print("3. Network connectivity issues")
            return

        print(f"\nFound {len(vehicles)} vehicles:")
        # Preview at most the first 10 records.
        for number, vehicle in enumerate(vehicles[:10], start=1):
            print(f" {number}. {vehicle['make']} {vehicle['model']} {vehicle['year']}")

        remaining = len(vehicles) - 10
        if remaining > 0:
            print(f" ... and {remaining} more")

        scraper.save_to_database(vehicles)

        print("\nScraping completed successfully!")

    except Exception as e:
        # Top-level boundary: report the failure instead of a traceback.
        print(f"An error occurred during scraping: {e}")


if __name__ == "__main__":
    main()