Initial commit: Sistema Autoparts DB
- Base de datos SQLite con información de vehículos
- Dashboard web con Flask y Bootstrap
- Scripts de web scraping para RockAuto
- Interfaz CLI para consultas
- Documentación completa del proyecto

Incluye:
- 12 marcas de vehículos
- 10,923 modelos
- 10,919 especificaciones de motores
- 12,075 combinaciones modelo-año-motor
This commit is contained in:
400
vehicle_scraper/rockauto_scraper_enhanced.py
Normal file
400
vehicle_scraper/rockauto_scraper_enhanced.py
Normal file
@@ -0,0 +1,400 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
RockAuto Vehicle Data Scraper - Enhanced Version
|
||||
Extracts vehicle information (brands, models, years, engines) from RockAuto.com
|
||||
"""
|
||||
|
||||
import json
import random
import re
import sqlite3
import time
from typing import Dict, List, Optional
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
class RockAutoScraper:
    """Scrapes vehicle data (makes, models, years, engines) from RockAuto.com."""

    def __init__(self, db_path: str = "../vehicle_database/vehicle_database.db"):
        """Set up the HTTP session, database path, and brand-name mapping.

        Args:
            db_path: Path to the SQLite vehicle database file
                (default is relative to this script's directory).
        """
        self.base_url = "https://www.rockauto.com"
        self.session = requests.Session()
        # Browser-like headers reduce the chance of requests being rejected
        # as coming from an automated client.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })
        self.db_path = db_path

        # Create a mapping of RockAuto brand names (URL slugs) to standardized names
        self.brand_mapping = {
            'acura': 'Acura',
            'alfa-romeo': 'Alfa Romeo',
            'audi': 'Audi',
            'bmw': 'BMW',
            'buick': 'Buick',
            'cadillac': 'Cadillac',
            'chevrolet': 'Chevrolet',
            'chrysler': 'Chrysler',
            'dodge': 'Dodge',
            'fiat': 'Fiat',
            'ford': 'Ford',
            'gmc': 'GMC',
            'honda': 'Honda',
            'hyundai': 'Hyundai',
            'infiniti': 'Infiniti',
            'isuzu': 'Isuzu',
            'jaguar': 'Jaguar',
            'jeep': 'Jeep',
            'kia': 'Kia',
            'land-rover': 'Land Rover',
            'lexus': 'Lexus',
            'lincoln': 'Lincoln',
            'mazda': 'Mazda',
            'mercedes-benz': 'Mercedes-Benz',
            'mercury': 'Mercury',
            'mitsubishi': 'Mitsubishi',
            'nissan': 'Nissan',
            'oldsmobile': 'Oldsmobile',
            'plymouth': 'Plymouth',
            'pontiac': 'Pontiac',
            'porsche': 'Porsche',
            'ram': 'Ram',
            'saab': 'Saab',
            'saturn': 'Saturn',
            'scion': 'Scion',
            'subaru': 'Subaru',
            'suzuki': 'Suzuki',
            'tesla': 'Tesla',
            'toyota': 'Toyota',
            'volkswagen': 'Volkswagen',
            'volvo': 'Volvo'
        }
|
||||
|
||||
def get_page(self, url: str) -> Optional[BeautifulSoup]:
|
||||
"""Get a page and return BeautifulSoup object"""
|
||||
try:
|
||||
# Add random delay to be respectful to the server
|
||||
time.sleep(random.uniform(2, 4))
|
||||
response = self.session.get(url)
|
||||
response.raise_for_status()
|
||||
return BeautifulSoup(response.content, 'html.parser')
|
||||
except requests.RequestException as e:
|
||||
print(f"Error fetching {url}: {e}")
|
||||
return None
|
||||
|
||||
def get_makes_enhanced(self) -> List[str]:
|
||||
"""Enhanced method to get makes from RockAuto"""
|
||||
print("Fetching list of makes (enhanced)...")
|
||||
|
||||
# Try multiple approaches to get makes
|
||||
makes = []
|
||||
|
||||
# Approach 1: Visit the main catalog page
|
||||
soup = self.get_page(f"{self.base_url}/catalog/catalog.php")
|
||||
|
||||
if not soup:
|
||||
return makes
|
||||
|
||||
# Look for links that contain make information in the URL
|
||||
links = soup.find_all('a', href=True)
|
||||
|
||||
for link in links:
|
||||
href = link.get('href', '').lower()
|
||||
|
||||
# Check if the href contains a known make
|
||||
for key, value in self.brand_mapping.items():
|
||||
if f"/{key}/" in href and value not in makes:
|
||||
makes.append(value)
|
||||
|
||||
# Approach 2: Look for JavaScript variables or data attributes that might contain makes
|
||||
scripts = soup.find_all('script')
|
||||
for script in scripts:
|
||||
if script.string:
|
||||
# Look for common patterns in JavaScript
|
||||
import re
|
||||
# Look for patterns like make names in quotes
|
||||
matches = re.findall(r'["\']([a-z-]+)["\']', script.string)
|
||||
for match in matches:
|
||||
if match in self.brand_mapping and self.brand_mapping[match] not in makes:
|
||||
makes.append(self.brand_mapping[match])
|
||||
|
||||
print(f"Found {len(makes)} makes: {makes[:10]}{'...' if len(makes) > 10 else ''}")
|
||||
return makes
|
||||
|
||||
def get_detailed_models_for_make(self, make: str) -> List[Dict]:
|
||||
"""Get detailed models for a specific make by exploring deeper pages"""
|
||||
print(f"Fetching detailed models for {make}...")
|
||||
|
||||
# Convert make to RockAuto format
|
||||
make_key = None
|
||||
for key, value in self.brand_mapping.items():
|
||||
if value.lower() == make.lower():
|
||||
make_key = key
|
||||
break
|
||||
|
||||
if not make_key:
|
||||
print(f"Make {make} not found in mapping")
|
||||
return []
|
||||
|
||||
models = []
|
||||
|
||||
# Visit the make-specific page
|
||||
url = f"{self.base_url}/catalog/catalog.php?c={make_key}"
|
||||
soup = self.get_page(url)
|
||||
|
||||
if not soup:
|
||||
return models
|
||||
|
||||
# Look for year links first
|
||||
year_links = soup.find_all('a', href=lambda x: x and f'/catalog/{make_key}/' in x and any(str(y) in x for y in range(1900, 2030)))
|
||||
|
||||
for link in year_links:
|
||||
href = link.get('href', '')
|
||||
text = link.get_text().strip()
|
||||
|
||||
# Extract year from URL or text
|
||||
import re
|
||||
year_match = re.search(r'\b(19|20)\d{2}\b', text)
|
||||
if not year_match:
|
||||
year_match = re.search(r'\b(19|20)\d{2}\b', href)
|
||||
|
||||
if year_match:
|
||||
year = int(year_match.group())
|
||||
|
||||
# Extract model from text or URL
|
||||
# Remove year from text to get model
|
||||
model_text = re.sub(r'\b(19|20)\d{2}\b', '', text).strip()
|
||||
|
||||
if model_text:
|
||||
# Create a record
|
||||
record = {
|
||||
'make': make,
|
||||
'model': model_text,
|
||||
'year': year,
|
||||
'engine': 'Unknown', # Will need to extract from deeper pages
|
||||
'href': href
|
||||
}
|
||||
|
||||
if record not in models:
|
||||
models.append(record)
|
||||
|
||||
# If no year-specific links found, try alternative approach
|
||||
if not models:
|
||||
# Look for links that might contain both make and year
|
||||
all_links = soup.find_all('a', href=True)
|
||||
for link in all_links:
|
||||
href = link.get('href', '').lower()
|
||||
text = link.get_text().strip()
|
||||
|
||||
if f"/{make_key}/" in href:
|
||||
# Look for year in the text or href
|
||||
year_match = re.search(r'\b(19|20)\d{2}\b', text)
|
||||
if not year_match:
|
||||
year_match = re.search(r'\b(19|20)\d{2}\b', href)
|
||||
|
||||
if year_match:
|
||||
year = int(year_match.group())
|
||||
|
||||
# Extract model info
|
||||
model_parts = [part for part in text.split() if not re.match(r'\b(19|20)\d{2}\b', part)]
|
||||
model = ' '.join(model_parts)
|
||||
|
||||
if model:
|
||||
record = {
|
||||
'make': make,
|
||||
'model': model,
|
||||
'year': year,
|
||||
'engine': 'Unknown',
|
||||
'href': link.get('href')
|
||||
}
|
||||
|
||||
if record not in models:
|
||||
models.append(record)
|
||||
|
||||
print(f"Found {len(models)} models for {make}")
|
||||
return models
|
||||
|
||||
def explore_categories(self, make: str) -> List[Dict]:
|
||||
"""Explore categories for a specific make to find models and years"""
|
||||
print(f"Exploring categories for {make}...")
|
||||
|
||||
# Convert make to RockAuto format
|
||||
make_key = None
|
||||
for key, value in self.brand_mapping.items():
|
||||
if value.lower() == make.lower():
|
||||
make_key = key
|
||||
break
|
||||
|
||||
if not make_key:
|
||||
print(f"Make {make} not found in mapping")
|
||||
return []
|
||||
|
||||
models = []
|
||||
|
||||
# Visit the make-specific page
|
||||
url = f"{self.base_url}/catalog/catalog.php?c={make_key}"
|
||||
soup = self.get_page(url)
|
||||
|
||||
if not soup:
|
||||
return models
|
||||
|
||||
# Look for elements that represent vehicle categories
|
||||
# RockAuto typically organizes by year/model
|
||||
category_elements = soup.find_all(['div', 'section', 'ul'], class_=lambda x: x and any(keyword in x.lower() for keyword in ['year', 'model', 'catalog', 'vehicle']))
|
||||
|
||||
if not category_elements:
|
||||
# If no categorized elements found, try looking for all links with year info
|
||||
all_links = soup.find_all('a', href=True)
|
||||
for link in all_links:
|
||||
href = link.get('href', '').lower()
|
||||
text = link.get_text().strip()
|
||||
|
||||
if f"/{make_key}/" in href and any(str(year) in href for year in range(1900, 2030)):
|
||||
# Extract year and model
|
||||
import re
|
||||
year_match = re.search(r'\b(19|20)\d{2}\b', href)
|
||||
if year_match:
|
||||
year = int(year_match.group())
|
||||
|
||||
# Clean up text to extract model
|
||||
clean_text = re.sub(r'\b(19|20)\d{2}\b', '', text).strip(' -_')
|
||||
|
||||
if clean_text and len(clean_text) > 1:
|
||||
record = {
|
||||
'make': make,
|
||||
'model': clean_text,
|
||||
'year': year,
|
||||
'engine': 'Unknown',
|
||||
'href': link.get('href')
|
||||
}
|
||||
|
||||
if record not in models:
|
||||
models.append(record)
|
||||
|
||||
print(f"Found {len(models)} entries for {make} through category exploration")
|
||||
return models
|
||||
|
||||
def scrape_vehicle_data(self) -> List[Dict]:
|
||||
"""Main method to scrape vehicle data from RockAuto"""
|
||||
print("Starting enhanced RockAuto scraping...")
|
||||
|
||||
all_vehicles = []
|
||||
|
||||
# Get all makes using enhanced method
|
||||
makes = self.get_makes_enhanced()
|
||||
|
||||
# Limit to first 3 makes for testing
|
||||
makes = makes[:3] if len(makes) > 3 else makes
|
||||
|
||||
for make in makes:
|
||||
# Try multiple approaches to get models
|
||||
models = self.get_detailed_models_for_make(make)
|
||||
|
||||
# If still no models, try category exploration
|
||||
if not models:
|
||||
models = self.explore_categories(make)
|
||||
|
||||
all_vehicles.extend(models)
|
||||
|
||||
# Limit total records for testing
|
||||
if len(all_vehicles) > 15:
|
||||
break
|
||||
|
||||
print(f"Total vehicles found: {len(all_vehicles)}")
|
||||
return all_vehicles
|
||||
|
||||
def save_to_database(self, vehicles: List[Dict]):
|
||||
"""Save scraped data to the vehicle database"""
|
||||
print(f"Saving {len(vehicles)} vehicles to database...")
|
||||
|
||||
conn = sqlite3.connect(self.db_path)
|
||||
cursor = conn.cursor()
|
||||
|
||||
for vehicle in vehicles:
|
||||
try:
|
||||
# Insert brand
|
||||
cursor.execute(
|
||||
"INSERT OR IGNORE INTO brands (name) VALUES (?)",
|
||||
(vehicle['make'],)
|
||||
)
|
||||
cursor.execute("SELECT id FROM brands WHERE name = ?", (vehicle['make'],))
|
||||
brand_id = cursor.fetchone()[0]
|
||||
|
||||
# Insert year
|
||||
cursor.execute(
|
||||
"INSERT OR IGNORE INTO years (year) VALUES (?)",
|
||||
(vehicle['year'],)
|
||||
)
|
||||
cursor.execute("SELECT id FROM years WHERE year = ?", (vehicle['year'],))
|
||||
year_id = cursor.fetchone()[0]
|
||||
|
||||
# Insert engine (with unknown specs for now)
|
||||
engine_name = vehicle['engine'] if vehicle['engine'] != 'Unknown' else f"Engine_{vehicle['year']}_{vehicle['model'][:10]}"
|
||||
cursor.execute(
|
||||
"INSERT OR IGNORE INTO engines (name) VALUES (?)",
|
||||
(engine_name,)
|
||||
)
|
||||
cursor.execute("SELECT id FROM engines WHERE name = ?", (engine_name,))
|
||||
engine_id = cursor.fetchone()[0]
|
||||
|
||||
# Insert model
|
||||
cursor.execute(
|
||||
"INSERT OR IGNORE INTO models (brand_id, name, body_type) VALUES (?, ?, ?)",
|
||||
(brand_id, vehicle['model'], 'Unknown')
|
||||
)
|
||||
cursor.execute("SELECT id FROM models WHERE brand_id = ? AND name = ?", (brand_id, vehicle['model']))
|
||||
model_id = cursor.fetchone()[0]
|
||||
|
||||
# Link model, year, and engine
|
||||
cursor.execute(
|
||||
"""INSERT OR IGNORE INTO model_year_engine
|
||||
(model_id, year_id, engine_id) VALUES (?, ?, ?)""",
|
||||
(model_id, year_id, engine_id)
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error saving vehicle {vehicle}: {e}")
|
||||
|
||||
conn.commit()
|
||||
conn.close()
|
||||
print("Data saved to database successfully!")
|
||||
|
||||
|
||||
def main():
    """Entry point: run the scraper and persist whatever it finds."""
    scraper = RockAutoScraper()

    print("Starting enhanced RockAuto data extraction...")
    print("Note: This may take several minutes due to rate limiting.")

    try:
        vehicles = scraper.scrape_vehicle_data()

        if not vehicles:
            # Nothing came back — list the likely causes for the operator.
            for message in (
                "No vehicles found. This could be due to:",
                "1. RockAuto blocking automated requests",
                "2. Changes in website structure",
                "3. Network connectivity issues",
                "4. Anti-bot measures implemented by RockAuto",
            ):
                print(message)
            return

        print(f"\nFound {len(vehicles)} vehicles:")
        # Preview at most the first ten records.
        for position, entry in enumerate(vehicles[:10], start=1):
            print(f" {position}. {entry['make']} {entry['model']} {entry['year']}")
        if len(vehicles) > 10:
            print(f" ... and {len(vehicles)-10} more")

        # Save to database
        scraper.save_to_database(vehicles)
        print("\nScraping completed successfully!")

    except Exception as e:
        print(f"An error occurred during scraping: {e}")
|
||||
|
||||
|
||||
# Run the scraper only when executed as a script (not when imported).
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user