Files
Autoparts-DB/vehicle_scraper/rockauto_scraper_enhanced.py
consultoria-as f395d67136 Initial commit: Sistema Autoparts DB
- Base de datos SQLite con información de vehículos
- Dashboard web con Flask y Bootstrap
- Scripts de web scraping para RockAuto
- Interfaz CLI para consultas
- Documentación completa del proyecto

Incluye:
- 12 marcas de vehículos
- 10,923 modelos
- 10,919 especificaciones de motores
- 12,075 combinaciones modelo-año-motor
2026-01-19 08:45:03 +00:00

400 lines
15 KiB
Python

#!/usr/bin/env python3
"""
RockAuto Vehicle Data Scraper - Enhanced Version
Extracts vehicle information (brands, models, years, engines) from RockAuto.com
"""
import json
import random
import re
import sqlite3
import time
from typing import Dict, List, Optional
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
class RockAutoScraper:
    """Scrapes vehicle data (makes, models, years, engines) from RockAuto.com."""

    def __init__(self, db_path: str = "../vehicle_database/vehicle_database.db"):
        """Prepare the HTTP session, browser-like headers, and brand slug mapping.

        Args:
            db_path: Path to the SQLite vehicle database file.
        """
        self.base_url = "https://www.rockauto.com"
        # Browser-like request headers make automated traffic less conspicuous.
        browser_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }
        self.session = requests.Session()
        self.session.headers.update(browser_headers)
        self.db_path = db_path
        # Maps lowercase RockAuto URL slugs to standardized display names.
        self.brand_mapping = {
            'acura': 'Acura',
            'alfa-romeo': 'Alfa Romeo',
            'audi': 'Audi',
            'bmw': 'BMW',
            'buick': 'Buick',
            'cadillac': 'Cadillac',
            'chevrolet': 'Chevrolet',
            'chrysler': 'Chrysler',
            'dodge': 'Dodge',
            'fiat': 'Fiat',
            'ford': 'Ford',
            'gmc': 'GMC',
            'honda': 'Honda',
            'hyundai': 'Hyundai',
            'infiniti': 'Infiniti',
            'isuzu': 'Isuzu',
            'jaguar': 'Jaguar',
            'jeep': 'Jeep',
            'kia': 'Kia',
            'land-rover': 'Land Rover',
            'lexus': 'Lexus',
            'lincoln': 'Lincoln',
            'mazda': 'Mazda',
            'mercedes-benz': 'Mercedes-Benz',
            'mercury': 'Mercury',
            'mitsubishi': 'Mitsubishi',
            'nissan': 'Nissan',
            'oldsmobile': 'Oldsmobile',
            'plymouth': 'Plymouth',
            'pontiac': 'Pontiac',
            'porsche': 'Porsche',
            'ram': 'Ram',
            'saab': 'Saab',
            'saturn': 'Saturn',
            'scion': 'Scion',
            'subaru': 'Subaru',
            'suzuki': 'Suzuki',
            'tesla': 'Tesla',
            'toyota': 'Toyota',
            'volkswagen': 'Volkswagen',
            'volvo': 'Volvo'
        }
def get_page(self, url: str) -> Optional[BeautifulSoup]:
"""Get a page and return BeautifulSoup object"""
try:
# Add random delay to be respectful to the server
time.sleep(random.uniform(2, 4))
response = self.session.get(url)
response.raise_for_status()
return BeautifulSoup(response.content, 'html.parser')
except requests.RequestException as e:
print(f"Error fetching {url}: {e}")
return None
def get_makes_enhanced(self) -> List[str]:
"""Enhanced method to get makes from RockAuto"""
print("Fetching list of makes (enhanced)...")
# Try multiple approaches to get makes
makes = []
# Approach 1: Visit the main catalog page
soup = self.get_page(f"{self.base_url}/catalog/catalog.php")
if not soup:
return makes
# Look for links that contain make information in the URL
links = soup.find_all('a', href=True)
for link in links:
href = link.get('href', '').lower()
# Check if the href contains a known make
for key, value in self.brand_mapping.items():
if f"/{key}/" in href and value not in makes:
makes.append(value)
# Approach 2: Look for JavaScript variables or data attributes that might contain makes
scripts = soup.find_all('script')
for script in scripts:
if script.string:
# Look for common patterns in JavaScript
import re
# Look for patterns like make names in quotes
matches = re.findall(r'["\']([a-z-]+)["\']', script.string)
for match in matches:
if match in self.brand_mapping and self.brand_mapping[match] not in makes:
makes.append(self.brand_mapping[match])
print(f"Found {len(makes)} makes: {makes[:10]}{'...' if len(makes) > 10 else ''}")
return makes
def get_detailed_models_for_make(self, make: str) -> List[Dict]:
"""Get detailed models for a specific make by exploring deeper pages"""
print(f"Fetching detailed models for {make}...")
# Convert make to RockAuto format
make_key = None
for key, value in self.brand_mapping.items():
if value.lower() == make.lower():
make_key = key
break
if not make_key:
print(f"Make {make} not found in mapping")
return []
models = []
# Visit the make-specific page
url = f"{self.base_url}/catalog/catalog.php?c={make_key}"
soup = self.get_page(url)
if not soup:
return models
# Look for year links first
year_links = soup.find_all('a', href=lambda x: x and f'/catalog/{make_key}/' in x and any(str(y) in x for y in range(1900, 2030)))
for link in year_links:
href = link.get('href', '')
text = link.get_text().strip()
# Extract year from URL or text
import re
year_match = re.search(r'\b(19|20)\d{2}\b', text)
if not year_match:
year_match = re.search(r'\b(19|20)\d{2}\b', href)
if year_match:
year = int(year_match.group())
# Extract model from text or URL
# Remove year from text to get model
model_text = re.sub(r'\b(19|20)\d{2}\b', '', text).strip()
if model_text:
# Create a record
record = {
'make': make,
'model': model_text,
'year': year,
'engine': 'Unknown', # Will need to extract from deeper pages
'href': href
}
if record not in models:
models.append(record)
# If no year-specific links found, try alternative approach
if not models:
# Look for links that might contain both make and year
all_links = soup.find_all('a', href=True)
for link in all_links:
href = link.get('href', '').lower()
text = link.get_text().strip()
if f"/{make_key}/" in href:
# Look for year in the text or href
year_match = re.search(r'\b(19|20)\d{2}\b', text)
if not year_match:
year_match = re.search(r'\b(19|20)\d{2}\b', href)
if year_match:
year = int(year_match.group())
# Extract model info
model_parts = [part for part in text.split() if not re.match(r'\b(19|20)\d{2}\b', part)]
model = ' '.join(model_parts)
if model:
record = {
'make': make,
'model': model,
'year': year,
'engine': 'Unknown',
'href': link.get('href')
}
if record not in models:
models.append(record)
print(f"Found {len(models)} models for {make}")
return models
def explore_categories(self, make: str) -> List[Dict]:
"""Explore categories for a specific make to find models and years"""
print(f"Exploring categories for {make}...")
# Convert make to RockAuto format
make_key = None
for key, value in self.brand_mapping.items():
if value.lower() == make.lower():
make_key = key
break
if not make_key:
print(f"Make {make} not found in mapping")
return []
models = []
# Visit the make-specific page
url = f"{self.base_url}/catalog/catalog.php?c={make_key}"
soup = self.get_page(url)
if not soup:
return models
# Look for elements that represent vehicle categories
# RockAuto typically organizes by year/model
category_elements = soup.find_all(['div', 'section', 'ul'], class_=lambda x: x and any(keyword in x.lower() for keyword in ['year', 'model', 'catalog', 'vehicle']))
if not category_elements:
# If no categorized elements found, try looking for all links with year info
all_links = soup.find_all('a', href=True)
for link in all_links:
href = link.get('href', '').lower()
text = link.get_text().strip()
if f"/{make_key}/" in href and any(str(year) in href for year in range(1900, 2030)):
# Extract year and model
import re
year_match = re.search(r'\b(19|20)\d{2}\b', href)
if year_match:
year = int(year_match.group())
# Clean up text to extract model
clean_text = re.sub(r'\b(19|20)\d{2}\b', '', text).strip(' -_')
if clean_text and len(clean_text) > 1:
record = {
'make': make,
'model': clean_text,
'year': year,
'engine': 'Unknown',
'href': link.get('href')
}
if record not in models:
models.append(record)
print(f"Found {len(models)} entries for {make} through category exploration")
return models
def scrape_vehicle_data(self) -> List[Dict]:
"""Main method to scrape vehicle data from RockAuto"""
print("Starting enhanced RockAuto scraping...")
all_vehicles = []
# Get all makes using enhanced method
makes = self.get_makes_enhanced()
# Limit to first 3 makes for testing
makes = makes[:3] if len(makes) > 3 else makes
for make in makes:
# Try multiple approaches to get models
models = self.get_detailed_models_for_make(make)
# If still no models, try category exploration
if not models:
models = self.explore_categories(make)
all_vehicles.extend(models)
# Limit total records for testing
if len(all_vehicles) > 15:
break
print(f"Total vehicles found: {len(all_vehicles)}")
return all_vehicles
def save_to_database(self, vehicles: List[Dict]):
"""Save scraped data to the vehicle database"""
print(f"Saving {len(vehicles)} vehicles to database...")
conn = sqlite3.connect(self.db_path)
cursor = conn.cursor()
for vehicle in vehicles:
try:
# Insert brand
cursor.execute(
"INSERT OR IGNORE INTO brands (name) VALUES (?)",
(vehicle['make'],)
)
cursor.execute("SELECT id FROM brands WHERE name = ?", (vehicle['make'],))
brand_id = cursor.fetchone()[0]
# Insert year
cursor.execute(
"INSERT OR IGNORE INTO years (year) VALUES (?)",
(vehicle['year'],)
)
cursor.execute("SELECT id FROM years WHERE year = ?", (vehicle['year'],))
year_id = cursor.fetchone()[0]
# Insert engine (with unknown specs for now)
engine_name = vehicle['engine'] if vehicle['engine'] != 'Unknown' else f"Engine_{vehicle['year']}_{vehicle['model'][:10]}"
cursor.execute(
"INSERT OR IGNORE INTO engines (name) VALUES (?)",
(engine_name,)
)
cursor.execute("SELECT id FROM engines WHERE name = ?", (engine_name,))
engine_id = cursor.fetchone()[0]
# Insert model
cursor.execute(
"INSERT OR IGNORE INTO models (brand_id, name, body_type) VALUES (?, ?, ?)",
(brand_id, vehicle['model'], 'Unknown')
)
cursor.execute("SELECT id FROM models WHERE brand_id = ? AND name = ?", (brand_id, vehicle['model']))
model_id = cursor.fetchone()[0]
# Link model, year, and engine
cursor.execute(
"""INSERT OR IGNORE INTO model_year_engine
(model_id, year_id, engine_id) VALUES (?, ?, ?)""",
(model_id, year_id, engine_id)
)
except Exception as e:
print(f"Error saving vehicle {vehicle}: {e}")
conn.commit()
conn.close()
print("Data saved to database successfully!")
def main():
    """CLI entry point: scrape RockAuto and persist the results."""
    scraper = RockAutoScraper()
    print("Starting enhanced RockAuto data extraction...")
    print("Note: This may take several minutes due to rate limiting.")
    try:
        vehicles = scraper.scrape_vehicle_data()
        if vehicles:
            # Preview the first ten records before saving.
            print(f"\nFound {len(vehicles)} vehicles:")
            for idx, item in enumerate(vehicles[:10], start=1):
                print(f" {idx}. {item['make']} {item['model']} {item['year']}")
            if len(vehicles) > 10:
                print(f" ... and {len(vehicles)-10} more")
            scraper.save_to_database(vehicles)
            print("\nScraping completed successfully!")
        else:
            # Empty result: list the likely causes for the operator.
            for line in (
                "No vehicles found. This could be due to:",
                "1. RockAuto blocking automated requests",
                "2. Changes in website structure",
                "3. Network connectivity issues",
                "4. Anti-bot measures implemented by RockAuto",
            ):
                print(line)
    except Exception as e:
        print(f"An error occurred during scraping: {e}")
# Run the scraper only when executed directly, not when imported as a module.
if __name__ == "__main__":
    main()