Files
Autoparts-DB/vehicle_scraper/rockauto_scraper_enhanced.py
consultoria-as f395d67136 Initial commit: Sistema Autoparts DB
- Base de datos SQLite con información de vehículos
- Dashboard web con Flask y Bootstrap
- Scripts de web scraping para RockAuto
- Interfaz CLI para consultas
- Documentación completa del proyecto

Incluye:
- 12 marcas de vehículos
- 10,923 modelos
- 10,919 especificaciones de motores
- 12,075 combinaciones modelo-año-motor
2026-01-19 08:45:03 +00:00

400 lines
15 KiB
Python

#!/usr/bin/env python3
"""
RockAuto Vehicle Data Scraper - Enhanced Version
Extracts vehicle information (brands, models, years, engines) from RockAuto.com
"""
import json
import random
import re
import sqlite3
import time
from typing import Dict, List, Optional
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
class RockAutoScraper:
    """Scrapes vehicle data (makes, models, years, engines) from RockAuto.com."""

    def __init__(self, db_path: str = "../vehicle_database/vehicle_database.db"):
        """Prepare the HTTP session, browser-like headers, and brand slug mapping.

        Args:
            db_path: Path to the SQLite vehicle database file.
        """
        self.base_url = "https://www.rockauto.com"
        # Browser-like request headers make automated traffic less conspicuous.
        browser_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }
        self.session = requests.Session()
        self.session.headers.update(browser_headers)
        self.db_path = db_path
        # Maps lowercase RockAuto URL slugs to standardized display names.
        self.brand_mapping = {
            'acura': 'Acura',
            'alfa-romeo': 'Alfa Romeo',
            'audi': 'Audi',
            'bmw': 'BMW',
            'buick': 'Buick',
            'cadillac': 'Cadillac',
            'chevrolet': 'Chevrolet',
            'chrysler': 'Chrysler',
            'dodge': 'Dodge',
            'fiat': 'Fiat',
            'ford': 'Ford',
            'gmc': 'GMC',
            'honda': 'Honda',
            'hyundai': 'Hyundai',
            'infiniti': 'Infiniti',
            'isuzu': 'Isuzu',
            'jaguar': 'Jaguar',
            'jeep': 'Jeep',
            'kia': 'Kia',
            'land-rover': 'Land Rover',
            'lexus': 'Lexus',
            'lincoln': 'Lincoln',
            'mazda': 'Mazda',
            'mercedes-benz': 'Mercedes-Benz',
            'mercury': 'Mercury',
            'mitsubishi': 'Mitsubishi',
            'nissan': 'Nissan',
            'oldsmobile': 'Oldsmobile',
            'plymouth': 'Plymouth',
            'pontiac': 'Pontiac',
            'porsche': 'Porsche',
            'ram': 'Ram',
            'saab': 'Saab',
            'saturn': 'Saturn',
            'scion': 'Scion',
            'subaru': 'Subaru',
            'suzuki': 'Suzuki',
            'tesla': 'Tesla',
            'toyota': 'Toyota',
            'volkswagen': 'Volkswagen',
            'volvo': 'Volvo'
        }
def get_page(self, url: str) -> Optional[BeautifulSoup]:
"""Get a page and return BeautifulSoup object"""
try:
# Add random delay to be respectful to the server
time.sleep(random.uniform(2, 4))
response = self.session.get(url)
response.raise_for_status()
return BeautifulSoup(response.content, 'html.parser')
except requests.RequestException as e:
print(f"Error fetching {url}: {e}")
return None
def get_makes_enhanced(self) -> List[str]:
"""Enhanced method to get makes from RockAuto"""
print("Fetching list of makes (enhanced)...")
# Try multiple approaches to get makes
makes = []
# Approach 1: Visit the main catalog page
soup = self.get_page(f"{self.base_url}/catalog/catalog.php")
if not soup:
return makes
# Look for links that contain make information in the URL
links = soup.find_all('a', href=True)
for link in links:
href = link.get('href', '').lower()
# Check if the href contains a known make
for key, value in self.brand_mapping.items():
if f"/{key}/" in href and value not in makes:
makes.append(value)
# Approach 2: Look for JavaScript variables or data attributes that might contain makes
scripts = soup.find_all('script')
for script in scripts:
if script.string:
# Look for common patterns in JavaScript
import re
# Look for patterns like make names in quotes
matches = re.findall(r'["\']([a-z-]+)["\']', script.string)
for match in matches:
if match in self.brand_mapping and self.brand_mapping[match] not in makes:
makes.append(self.brand_mapping[match])
print(f"Found {len(makes)} makes: {makes[:10]}{'...' if len(makes) > 10 else ''}")
return makes
def get_detailed_models_for_make(self, make: str) -> List[Dict]:
"""Get detailed models for a specific make by exploring deeper pages"""
print(f"Fetching detailed models for {make}...")
# Convert make to RockAuto format
make_key = None
for key, value in self.brand_mapping.items():
if value.lower() == make.lower():
make_key = key
break
if not make_key:
print(f"Make {make} not found in mapping")
return []
models = []
# Visit the make-specific page
url = f"{self.base_url}/catalog/catalog.php?c={make_key}"
soup = self.get_page(url)
if not soup:
return models
# Look for year links first
year_links = soup.find_all('a', href=lambda x: x and f'/catalog/{make_key}/' in x and any(str(y) in x for y in range(1900, 2030)))
for link in year_links:
href = link.get('href', '')
text = link.get_text().strip()
# Extract year from URL or text
import re
year_match = re.search(r'\b(19|20)\d{2}\b', text)
if not year_match:
year_match = re.search(r'\b(19|20)\d{2}\b', href)
if year_match:
year = int(year_match.group())
# Extract model from text or URL
# Remove year from text to get model
model_text = re.sub(r'\b(19|20)\d{2}\b', '', text).strip()
if model_text:
# Create a record
record = {
'make': make,
'model': model_text,
'year': year,
'engine': 'Unknown', # Will need to extract from deeper pages
'href': href
}
if record not in models:
models.append(record)
# If no year-specific links found, try alternative approach
if not models:
# Look for links that might contain both make and year
all_links = soup.find_all('a', href=True)
for link in all_links:
href = link.get('href', '').lower()
text = link.get_text().strip()
if f"/{make_key}/" in href:
# Look for year in the text or href
year_match = re.search(r'\b(19|20)\d{2}\b', text)
if not year_match:
year_match = re.search(r'\b(19|20)\d{2}\b', href)
if year_match:
year = int(year_match.group())
# Extract model info
model_parts = [part for part in text.split() if not re.match(r'\b(19|20)\d{2}\b', part)]
model = ' '.join(model_parts)
if model:
record = {
'make': make,
'model': model,
'year': year,
'engine': 'Unknown',
'href': link.get('href')
}
if record not in models:
models.append(record)
print(f"Found {len(models)} models for {make}")
return models
def explore_categories(self, make: str) -> List[Dict]:
"""Explore categories for a specific make to find models and years"""
print(f"Exploring categories for {make}...")
# Convert make to RockAuto format
make_key = None
for key, value in self.brand_mapping.items():
if value.lower() == make.lower():
make_key = key
break
if not make_key:
print(f"Make {make} not found in mapping")
return []
models = []
# Visit the make-specific page
url = f"{self.base_url}/catalog/catalog.php?c={make_key}"
soup = self.get_page(url)
if not soup:
return models
# Look for elements that represent vehicle categories
# RockAuto typically organizes by year/model
category_elements = soup.find_all(['div', 'section', 'ul'], class_=lambda x: x and any(keyword in x.lower() for keyword in ['year', 'model', 'catalog', 'vehicle']))
if not category_elements:
# If no categorized elements found, try looking for all links with year info
all_links = soup.find_all('a', href=True)
for link in all_links:
href = link.get('href', '').lower()
text = link.get_text().strip()
if f"/{make_key}/" in href and any(str(year) in href for year in range(1900, 2030)):
# Extract year and model
import re
year_match = re.search(r'\b(19|20)\d{2}\b', href)
if year_match:
year = int(year_match.group())
# Clean up text to extract model
clean_text = re.sub(r'\b(19|20)\d{2}\b', '', text).strip(' -_')
if clean_text and len(clean_text) > 1:
record = {
'make': make,
'model': clean_text,
'year': year,
'engine': 'Unknown',
'href': link.get('href')
}
if record not in models:
models.append(record)
print(f"Found {len(models)} entries for {make} through category exploration")
return models
def scrape_vehicle_data(self) -> List[Dict]:
"""Main method to scrape vehicle data from RockAuto"""
print("Starting enhanced RockAuto scraping...")
all_vehicles = []
# Get all makes using enhanced method
makes = self.get_makes_enhanced()
# Limit to first 3 makes for testing
makes = makes[:3] if len(makes) > 3 else makes
for make in makes:
# Try multiple approaches to get models
models = self.get_detailed_models_for_make(make)
# If still no models, try category exploration
if not models:
models = self.explore_categories(make)
all_vehicles.extend(models)
# Limit total records for testing
if len(all_vehicles) > 15:
break
print(f"Total vehicles found: {len(all_vehicles)}")
return all_vehicles
def save_to_database(self, vehicles: List[Dict]):
"""Save scraped data to the vehicle database"""
print(f"Saving {len(vehicles)} vehicles to database...")
conn = sqlite3.connect(self.db_path)
cursor = conn.cursor()
for vehicle in vehicles:
try:
# Insert brand
cursor.execute(
"INSERT OR IGNORE INTO brands (name) VALUES (?)",
(vehicle['make'],)
)
cursor.execute("SELECT id FROM brands WHERE name = ?", (vehicle['make'],))
brand_id = cursor.fetchone()[0]
# Insert year
cursor.execute(
"INSERT OR IGNORE INTO years (year) VALUES (?)",
(vehicle['year'],)
)
cursor.execute("SELECT id FROM years WHERE year = ?", (vehicle['year'],))
year_id = cursor.fetchone()[0]
# Insert engine (with unknown specs for now)
engine_name = vehicle['engine'] if vehicle['engine'] != 'Unknown' else f"Engine_{vehicle['year']}_{vehicle['model'][:10]}"
cursor.execute(
"INSERT OR IGNORE INTO engines (name) VALUES (?)",
(engine_name,)
)
cursor.execute("SELECT id FROM engines WHERE name = ?", (engine_name,))
engine_id = cursor.fetchone()[0]
# Insert model
cursor.execute(
"INSERT OR IGNORE INTO models (brand_id, name, body_type) VALUES (?, ?, ?)",
(brand_id, vehicle['model'], 'Unknown')
)
cursor.execute("SELECT id FROM models WHERE brand_id = ? AND name = ?", (brand_id, vehicle['model']))
model_id = cursor.fetchone()[0]
# Link model, year, and engine
cursor.execute(
"""INSERT OR IGNORE INTO model_year_engine
(model_id, year_id, engine_id) VALUES (?, ?, ?)""",
(model_id, year_id, engine_id)
)
except Exception as e:
print(f"Error saving vehicle {vehicle}: {e}")
conn.commit()
conn.close()
print("Data saved to database successfully!")
def main():
    """CLI entry point: scrape RockAuto and persist the results."""
    scraper = RockAutoScraper()
    print("Starting enhanced RockAuto data extraction...")
    print("Note: This may take several minutes due to rate limiting.")
    try:
        vehicles = scraper.scrape_vehicle_data()
        if vehicles:
            # Preview the first ten records before saving.
            print(f"\nFound {len(vehicles)} vehicles:")
            for idx, item in enumerate(vehicles[:10], start=1):
                print(f" {idx}. {item['make']} {item['model']} {item['year']}")
            if len(vehicles) > 10:
                print(f" ... and {len(vehicles)-10} more")
            scraper.save_to_database(vehicles)
            print("\nScraping completed successfully!")
        else:
            # Empty result: list the likely causes for the operator.
            for line in (
                "No vehicles found. This could be due to:",
                "1. RockAuto blocking automated requests",
                "2. Changes in website structure",
                "3. Network connectivity issues",
                "4. Anti-bot measures implemented by RockAuto",
            ):
                print(line)
    except Exception as e:
        print(f"An error occurred during scraping: {e}")
# Run the scraper only when executed directly, not when imported as a module.
if __name__ == "__main__":
    main()