#!/usr/bin/env python3
"""
RockAuto Vehicle Data Scraper
Extracts vehicle information (brands, models, years, engines) from RockAuto.com
"""

import requests
from bs4 import BeautifulSoup
import time
import random
from urllib.parse import urljoin, urlparse
import json
import sqlite3
from typing import List, Dict, Optional


class RockAutoScraper:
    """Scrape vehicle make/model/year data from RockAuto and persist it to SQLite.

    Requests are throttled with a random 1-3 s delay per page, and RockAuto's
    URL brand slugs are normalized to display names via ``brand_mapping``.
    """

    # Seconds before a request is abandoned; without this a stalled
    # connection would hang the scraper forever.
    REQUEST_TIMEOUT = 30

    def __init__(self, db_path: str = "../vehicle_database/vehicle_database.db"):
        self.base_url = "https://www.rockauto.com"
        self.session = requests.Session()
        # Browser-like User-Agent: the site may block default client UAs.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        self.db_path = db_path
        # Mapping of RockAuto URL slugs to standardized brand names.
        self.brand_mapping = {
            'acura': 'Acura', 'alfa-romeo': 'Alfa Romeo', 'audi': 'Audi',
            'bmw': 'BMW', 'buick': 'Buick', 'cadillac': 'Cadillac',
            'chevrolet': 'Chevrolet', 'chrysler': 'Chrysler', 'dodge': 'Dodge',
            'fiat': 'Fiat', 'ford': 'Ford', 'gmc': 'GMC', 'honda': 'Honda',
            'hyundai': 'Hyundai', 'infiniti': 'Infiniti', 'isuzu': 'Isuzu',
            'jaguar': 'Jaguar', 'jeep': 'Jeep', 'kia': 'Kia',
            'land-rover': 'Land Rover', 'lexus': 'Lexus', 'lincoln': 'Lincoln',
            'mazda': 'Mazda', 'mercedes-benz': 'Mercedes-Benz',
            'mercury': 'Mercury', 'mitsubishi': 'Mitsubishi', 'nissan': 'Nissan',
            'oldsmobile': 'Oldsmobile', 'plymouth': 'Plymouth',
            'pontiac': 'Pontiac', 'porsche': 'Porsche', 'ram': 'Ram',
            'saab': 'Saab', 'saturn': 'Saturn', 'scion': 'Scion',
            'subaru': 'Subaru', 'suzuki': 'Suzuki', 'tesla': 'Tesla',
            'toyota': 'Toyota', 'volkswagen': 'Volkswagen', 'volvo': 'Volvo'
        }

    def get_page(self, url: str) -> Optional[BeautifulSoup]:
        """Fetch *url* and return a parsed BeautifulSoup tree, or None on failure."""
        try:
            # Random delay to be respectful to the server.
            time.sleep(random.uniform(1, 3))
            # FIX: pass a timeout; requests otherwise waits indefinitely.
            response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            return BeautifulSoup(response.content, 'html.parser')
        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}")
            return None

    def get_makes(self) -> List[str]:
        """Return the list of standardized make names found on the catalog page.

        Scans catalog links for known brand slugs; falls back to scanning all
        links if the primary selector yields nothing. Returns [] if the page
        could not be fetched.
        """
        print("Fetching list of makes...")
        soup = self.get_page(f"{self.base_url}/catalog/catalog.php")
        if not soup:
            return []

        makes: List[str] = []
        # Primary approach: catalog links whose path segments contain a brand slug.
        make_elements = soup.find_all(
            'a', href=lambda x: x and '/catalog/' in x and x.count('/') >= 3
        )
        for elem in make_elements:
            href = elem.get('href', '')
            for part in href.split('/'):
                if part in self.brand_mapping:
                    make = self.brand_mapping[part]
                    if make not in makes:
                        makes.append(make)

        # Fallback: look for brand slugs anywhere in any link's href.
        if not makes:
            for link in soup.find_all('a', href=True):
                href = link['href'].lower()
                for key, value in self.brand_mapping.items():
                    if key in href and value not in makes:
                        makes.append(value)

        print(f"Found {len(makes)} makes: {makes[:10]}{'...' if len(makes) > 10 else ''}")
        return makes

    def get_models_for_make(self, make: str) -> List[Dict]:
        """Return vehicle records (make/model/year/engine/href dicts) for *make*.

        The engine field is always 'Unknown' at this stage; extracting it
        would require following the deeper catalog pages.
        """
        print(f"Fetching models for {make}...")

        # Reverse-map the display name back to RockAuto's URL slug.
        make_key = None
        for key, value in self.brand_mapping.items():
            if value.lower() == make.lower():
                make_key = key
                break
        if not make_key:
            print(f"Make {make} not found in mapping")
            return []

        models: List[Dict] = []
        soup = self.get_page(f"{self.base_url}/catalog/catalog.php?c={make_key}")
        if not soup:
            return models

        # Heuristic: link text containing a plausible 4-digit year is treated
        # as "<model words> <year>"; everything that isn't the year becomes
        # part of the model name.
        for link in soup.find_all('a', href=True):
            href = link['href']
            text = link.get_text().strip()
            if any(char.isdigit() for char in text) and len(text) > 2:
                year = None
                model_parts = []
                for part in text.split():
                    if part.isdigit() and len(part) == 4 and 1900 < int(part) < 2030:
                        year = int(part)
                    else:
                        model_parts.append(part)
                if model_parts and year:
                    record = {
                        'make': make,
                        'model': ' '.join(model_parts),
                        'year': year,
                        'engine': 'Unknown',  # Will need to extract from deeper pages
                        'href': href
                    }
                    if record not in models:
                        models.append(record)

        print(f"Found {len(models)} models for {make}")
        return models

    def scrape_vehicle_data(self) -> List[Dict]:
        """Scrape vehicle records for a limited sample of makes (testing caps)."""
        print("Starting RockAuto scraping...")
        all_vehicles: List[Dict] = []

        # Limit to first 5 makes for testing (slicing is safe on short lists).
        makes = self.get_makes()[:5]
        for make in makes:
            all_vehicles.extend(self.get_models_for_make(make))
            # Limit total records for testing.
            if len(all_vehicles) > 20:
                break

        print(f"Total vehicles found: {len(all_vehicles)}")
        return all_vehicles

    @staticmethod
    def _get_or_create_id(cursor, table: str, column: str, value) -> int:
        """Insert *value* into *table* if absent and return its row id.

        Raises LookupError if the row cannot be found after the insert
        (previously an unguarded ``fetchone()[0]`` raised TypeError).
        Table/column names are trusted literals from this module, never
        user input, so f-string interpolation into SQL is safe here;
        values still go through parameter binding.
        """
        cursor.execute(f"INSERT OR IGNORE INTO {table} ({column}) VALUES (?)", (value,))
        cursor.execute(f"SELECT id FROM {table} WHERE {column} = ?", (value,))
        row = cursor.fetchone()
        if row is None:
            raise LookupError(f"could not create {table}.{column} = {value!r}")
        return row[0]

    def save_to_database(self, vehicles: List[Dict]):
        """Save scraped records into the normalized schema (brands/models/years/engines).

        Per-vehicle errors are logged and skipped; the whole batch is
        committed once at the end.
        """
        print(f"Saving {len(vehicles)} vehicles to database...")
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            for vehicle in vehicles:
                try:
                    brand_id = self._get_or_create_id(cursor, 'brands', 'name', vehicle['make'])
                    year_id = self._get_or_create_id(cursor, 'years', 'year', vehicle['year'])
                    # Engine specs are unknown at this stage of scraping.
                    engine_id = self._get_or_create_id(cursor, 'engines', 'name', vehicle['engine'])

                    # Models are keyed by (brand_id, name), so the generic helper
                    # doesn't apply here.
                    cursor.execute(
                        "INSERT OR IGNORE INTO models (brand_id, name) VALUES (?, ?)",
                        (brand_id, vehicle['model'])
                    )
                    cursor.execute(
                        "SELECT id FROM models WHERE brand_id = ? AND name = ?",
                        (brand_id, vehicle['model'])
                    )
                    model_id = cursor.fetchone()[0]

                    # Link model, year, and engine.
                    cursor.execute(
                        """INSERT OR IGNORE INTO model_year_engine (model_id, year_id, engine_id) VALUES (?, ?, ?)""",
                        (model_id, year_id, engine_id)
                    )
                except Exception as e:
                    # Best-effort: log and continue with the next vehicle.
                    print(f"Error saving vehicle {vehicle}: {e}")
            conn.commit()
        finally:
            # FIX: the connection is now closed even if an error escapes the loop.
            conn.close()
        print("Data saved to database successfully!")


def main():
    """Entry point: scrape a sample of vehicles and store them in the database."""
    scraper = RockAutoScraper()

    print("Starting RockAuto data extraction...")
    print("Note: This may take several minutes due to rate limiting.")

    try:
        vehicles = scraper.scrape_vehicle_data()
        if vehicles:
            print(f"\nFound {len(vehicles)} vehicles:")
            for i, v in enumerate(vehicles[:10]):  # Show first 10
                print(f" {i+1}. {v['make']} {v['model']} {v['year']}")
            if len(vehicles) > 10:
                print(f" ... and {len(vehicles)-10} more")

            scraper.save_to_database(vehicles)
            print("\nScraping completed successfully!")
        else:
            print("No vehicles found. This could be due to:")
            print("1. RockAuto blocking automated requests")
            print("2. Changes in website structure")
            print("3. Network connectivity issues")
    except Exception as e:
        print(f"An error occurred during scraping: {e}")


if __name__ == "__main__":
    main()