#!/usr/bin/env python3
"""
RockAuto Vehicle Data Scraper - Enhanced Version
Extracts vehicle information (brands, models, years, engines) from RockAuto.com
"""

import json
import random
import re
import sqlite3
import time
from typing import Dict, List, Optional
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# Matches a plausible 4-digit model year (1900-2099) on word boundaries.
# Module-level so every method can use it; the original only imported `re`
# inside one loop, which raised NameError on the fallback code path.
_YEAR_RE = re.compile(r'\b(19|20)\d{2}\b')


class RockAutoScraper:
    """Scrape vehicle make/model/year data from RockAuto.com into a SQLite DB.

    The scraper is deliberately conservative: it rate-limits every request
    (2-4 s random delay) and caps the number of makes/records it collects
    so that test runs stay small.
    """

    def __init__(self, db_path: str = "../vehicle_database/vehicle_database.db"):
        """Create a session with browser-like headers and the brand-slug map.

        Args:
            db_path: Path to the SQLite database the results are saved into.
        """
        self.base_url = "https://www.rockauto.com"
        self.session = requests.Session()
        # Browser-like headers reduce the chance of being served a bot page.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })
        self.db_path = db_path
        # Create a mapping of RockAuto brand names (URL slugs) to standardized names
        self.brand_mapping = {
            'acura': 'Acura', 'alfa-romeo': 'Alfa Romeo', 'audi': 'Audi',
            'bmw': 'BMW', 'buick': 'Buick', 'cadillac': 'Cadillac',
            'chevrolet': 'Chevrolet', 'chrysler': 'Chrysler', 'dodge': 'Dodge',
            'fiat': 'Fiat', 'ford': 'Ford', 'gmc': 'GMC', 'honda': 'Honda',
            'hyundai': 'Hyundai', 'infiniti': 'Infiniti', 'isuzu': 'Isuzu',
            'jaguar': 'Jaguar', 'jeep': 'Jeep', 'kia': 'Kia',
            'land-rover': 'Land Rover', 'lexus': 'Lexus', 'lincoln': 'Lincoln',
            'mazda': 'Mazda', 'mercedes-benz': 'Mercedes-Benz',
            'mercury': 'Mercury', 'mitsubishi': 'Mitsubishi', 'nissan': 'Nissan',
            'oldsmobile': 'Oldsmobile', 'plymouth': 'Plymouth',
            'pontiac': 'Pontiac', 'porsche': 'Porsche', 'ram': 'Ram',
            'saab': 'Saab', 'saturn': 'Saturn', 'scion': 'Scion',
            'subaru': 'Subaru', 'suzuki': 'Suzuki', 'tesla': 'Tesla',
            'toyota': 'Toyota', 'volkswagen': 'Volkswagen', 'volvo': 'Volvo'
        }

    def _make_key(self, make: str) -> Optional[str]:
        """Return the RockAuto URL slug for a standardized make name, or None.

        Case-insensitive reverse lookup of ``brand_mapping``; previously this
        loop was duplicated in two scraping methods.
        """
        for key, value in self.brand_mapping.items():
            if value.lower() == make.lower():
                return key
        return None

    def get_page(self, url: str) -> Optional[BeautifulSoup]:
        """Get a page and return BeautifulSoup object (None on request failure)."""
        try:
            # Add random delay to be respectful to the server
            time.sleep(random.uniform(2, 4))
            response = self.session.get(url)
            response.raise_for_status()
            return BeautifulSoup(response.content, 'html.parser')
        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}")
            return None

    def get_makes_enhanced(self) -> List[str]:
        """Enhanced method to get makes from RockAuto.

        Returns:
            Standardized make names, in discovery order, de-duplicated.
        """
        print("Fetching list of makes (enhanced)...")
        makes: List[str] = []

        # Approach 1: Visit the main catalog page and scan link URLs for
        # known brand slugs.
        soup = self.get_page(f"{self.base_url}/catalog/catalog.php")
        if not soup:
            return makes
        for link in soup.find_all('a', href=True):
            href = link.get('href', '').lower()
            for key, value in self.brand_mapping.items():
                if f"/{key}/" in href and value not in makes:
                    makes.append(value)

        # Approach 2: Look for quoted slug-like tokens in inline JavaScript
        # that match a known brand slug.
        for script in soup.find_all('script'):
            if script.string:
                for match in re.findall(r'["\']([a-z-]+)["\']', script.string):
                    if match in self.brand_mapping and self.brand_mapping[match] not in makes:
                        makes.append(self.brand_mapping[match])

        print(f"Found {len(makes)} makes: {makes[:10]}{'...' if len(makes) > 10 else ''}")
        return makes

    def get_detailed_models_for_make(self, make: str) -> List[Dict]:
        """Get detailed models for a specific make by exploring deeper pages.

        Returns:
            Records of the form {'make', 'model', 'year', 'engine', 'href'};
            engine is always 'Unknown' at this level.
        """
        print(f"Fetching detailed models for {make}...")
        make_key = self._make_key(make)
        if not make_key:
            print(f"Make {make} not found in mapping")
            return []

        models: List[Dict] = []
        seen = set()  # (make, model, year, href) keys — O(1) dedup instead of list scans

        # Visit the make-specific page
        url = f"{self.base_url}/catalog/catalog.php?c={make_key}"
        soup = self.get_page(url)
        if not soup:
            return models

        # First pass: links whose href contains the make slug and a 4-digit year.
        year_links = soup.find_all(
            'a',
            href=lambda x: x and f'/catalog/{make_key}/' in x and _YEAR_RE.search(x))
        for link in year_links:
            href = link.get('href', '')
            text = link.get_text().strip()
            # Year may live in the link text or in the URL.
            year_match = _YEAR_RE.search(text) or _YEAR_RE.search(href)
            if not year_match:
                continue
            year = int(year_match.group())
            # Strip the year out of the text; what remains is the model name.
            model_text = _YEAR_RE.sub('', text).strip()
            if model_text:
                dedup_key = (make, model_text, year, href)
                if dedup_key not in seen:
                    seen.add(dedup_key)
                    models.append({
                        'make': make,
                        'model': model_text,
                        'year': year,
                        'engine': 'Unknown',  # Will need to extract from deeper pages
                        'href': href,
                    })

        # Fallback: if no year-specific links found, scan every link that
        # mentions the make slug for a year in its text or href.
        if not models:
            for link in soup.find_all('a', href=True):
                href = link.get('href', '').lower()
                text = link.get_text().strip()
                if f"/{make_key}/" not in href:
                    continue
                year_match = _YEAR_RE.search(text) or _YEAR_RE.search(href)
                if not year_match:
                    continue
                year = int(year_match.group())
                # Model = all whitespace-separated tokens that are not a year.
                model_parts = [part for part in text.split() if not _YEAR_RE.match(part)]
                model = ' '.join(model_parts)
                if model:
                    dedup_key = (make, model, year, link.get('href'))
                    if dedup_key not in seen:
                        seen.add(dedup_key)
                        models.append({
                            'make': make,
                            'model': model,
                            'year': year,
                            'engine': 'Unknown',
                            'href': link.get('href'),
                        })

        print(f"Found {len(models)} models for {make}")
        return models

    def explore_categories(self, make: str) -> List[Dict]:
        """Explore categories for a specific make to find models and years.

        Secondary strategy used when ``get_detailed_models_for_make`` finds
        nothing; same record shape as that method.
        """
        print(f"Exploring categories for {make}...")
        make_key = self._make_key(make)
        if not make_key:
            print(f"Make {make} not found in mapping")
            return []

        models: List[Dict] = []
        seen = set()

        # Visit the make-specific page
        url = f"{self.base_url}/catalog/catalog.php?c={make_key}"
        soup = self.get_page(url)
        if not soup:
            return models

        # Look for elements that represent vehicle categories.
        # RockAuto typically organizes by year/model.
        category_elements = soup.find_all(
            ['div', 'section', 'ul'],
            class_=lambda x: x and any(
                keyword in x.lower()
                for keyword in ['year', 'model', 'catalog', 'vehicle']))

        # NOTE(review): when category elements ARE found, nothing is extracted
        # from them yet (same as the original) — only the link-scan fallback
        # below produces records.
        if not category_elements:
            for link in soup.find_all('a', href=True):
                href = link.get('href', '').lower()
                text = link.get_text().strip()
                if f"/{make_key}/" in href:
                    year_match = _YEAR_RE.search(href)
                    if year_match:
                        year = int(year_match.group())
                        # Clean up text to extract model
                        clean_text = _YEAR_RE.sub('', text).strip(' -_')
                        if clean_text and len(clean_text) > 1:
                            dedup_key = (make, clean_text, year, link.get('href'))
                            if dedup_key not in seen:
                                seen.add(dedup_key)
                                models.append({
                                    'make': make,
                                    'model': clean_text,
                                    'year': year,
                                    'engine': 'Unknown',
                                    'href': link.get('href'),
                                })

        print(f"Found {len(models)} entries for {make} through category exploration")
        return models

    def scrape_vehicle_data(self) -> List[Dict]:
        """Main method to scrape vehicle data from RockAuto.

        Returns:
            All collected vehicle records (capped for testing).
        """
        print("Starting enhanced RockAuto scraping...")
        all_vehicles: List[Dict] = []

        # Get all makes using enhanced method
        makes = self.get_makes_enhanced()

        # Limit to first 3 makes for testing (slicing is a no-op on short lists)
        makes = makes[:3]

        for make in makes:
            # Try multiple approaches to get models
            models = self.get_detailed_models_for_make(make)
            # If still no models, try category exploration
            if not models:
                models = self.explore_categories(make)
            all_vehicles.extend(models)
            # Limit total records for testing
            if len(all_vehicles) > 15:
                break

        print(f"Total vehicles found: {len(all_vehicles)}")
        return all_vehicles

    @staticmethod
    def _get_or_create(cursor: sqlite3.Cursor, table: str, column: str, value) -> int:
        """Insert *value* into *table* if absent and return its row id.

        Table/column names are internal constants (never user input), so the
        f-string SQL here is safe; the value itself is parameterized.
        """
        cursor.execute(f"INSERT OR IGNORE INTO {table} ({column}) VALUES (?)", (value,))
        cursor.execute(f"SELECT id FROM {table} WHERE {column} = ?", (value,))
        return cursor.fetchone()[0]

    def save_to_database(self, vehicles: List[Dict]):
        """Save scraped data to the vehicle database.

        Failures are reported per-vehicle and do not abort the batch; the
        connection is always closed, even if commit fails.
        """
        print(f"Saving {len(vehicles)} vehicles to database...")
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            for vehicle in vehicles:
                try:
                    brand_id = self._get_or_create(cursor, "brands", "name", vehicle['make'])
                    year_id = self._get_or_create(cursor, "years", "year", vehicle['year'])

                    # Synthesize a placeholder engine name when unknown so the
                    # engines row is still unique per year/model prefix.
                    engine_name = (vehicle['engine'] if vehicle['engine'] != 'Unknown'
                                   else f"Engine_{vehicle['year']}_{vehicle['model'][:10]}")
                    engine_id = self._get_or_create(cursor, "engines", "name", engine_name)

                    # Insert model (keyed by brand + name, body type unknown here)
                    cursor.execute(
                        "INSERT OR IGNORE INTO models (brand_id, name, body_type) VALUES (?, ?, ?)",
                        (brand_id, vehicle['model'], 'Unknown'))
                    cursor.execute(
                        "SELECT id FROM models WHERE brand_id = ? AND name = ?",
                        (brand_id, vehicle['model']))
                    model_id = cursor.fetchone()[0]

                    # Link model, year, and engine
                    cursor.execute(
                        """INSERT OR IGNORE INTO model_year_engine (model_id, year_id, engine_id)
                           VALUES (?, ?, ?)""",
                        (model_id, year_id, engine_id))
                except Exception as e:
                    # Best-effort: report and continue with the next vehicle.
                    print(f"Error saving vehicle {vehicle}: {e}")
            conn.commit()
        finally:
            conn.close()
        print("Data saved to database successfully!")


def main():
    """Entry point: scrape, preview, and persist vehicle data."""
    scraper = RockAutoScraper()
    print("Starting enhanced RockAuto data extraction...")
    print("Note: This may take several minutes due to rate limiting.")
    try:
        # Scrape vehicle data
        vehicles = scraper.scrape_vehicle_data()
        if vehicles:
            print(f"\nFound {len(vehicles)} vehicles:")
            for i, v in enumerate(vehicles[:10]):  # Show first 10
                print(f" {i+1}. {v['make']} {v['model']} {v['year']}")
            if len(vehicles) > 10:
                print(f" ... and {len(vehicles)-10} more")
            # Save to database
            scraper.save_to_database(vehicles)
            print("\nScraping completed successfully!")
        else:
            print("No vehicles found. This could be due to:")
            print("1. RockAuto blocking automated requests")
            print("2. Changes in website structure")
            print("3. Network connectivity issues")
            print("4. Anti-bot measures implemented by RockAuto")
    except Exception as e:
        print(f"An error occurred during scraping: {e}")


if __name__ == "__main__":
    main()