Initial commit: Sistema Autoparts DB

- Base de datos SQLite con información de vehículos
- Dashboard web con Flask y Bootstrap
- Scripts de web scraping para RockAuto
- Interfaz CLI para consultas
- Documentación completa del proyecto

Incluye:
- 12 marcas de vehículos
- 10,923 modelos
- 10,919 especificaciones de motores
- 12,075 combinaciones modelo-año-motor
This commit is contained in:
2026-01-19 08:45:03 +00:00
commit f395d67136
59 changed files with 10881 additions and 0 deletions

View File

@@ -0,0 +1,292 @@
#!/usr/bin/env python3
"""
RockAuto Vehicle Data Scraper
Extracts vehicle information (brands, models, years, engines) from RockAuto.com
"""
import requests
from bs4 import BeautifulSoup
import time
import random
from urllib.parse import urljoin, urlparse
import json
import sqlite3
from typing import List, Dict, Optional
class RockAutoScraper:
    """Scrape vehicle data (makes, models, years) from RockAuto.com.

    Discovered records are persisted into a normalized SQLite database with
    ``brands``, ``models``, ``years``, ``engines`` tables and a
    ``model_year_engine`` link table.
    """

    def __init__(self, db_path: str = "../vehicle_database/vehicle_database.db",
                 request_timeout: float = 15.0):
        """Initialize the scraper.

        Args:
            db_path: Path to the SQLite database file records are saved to.
            request_timeout: Per-request timeout in seconds. New keyword with a
                default, so existing callers are unaffected. Fixes the original
                timeout-less ``session.get`` that could hang indefinitely.
        """
        self.base_url = "https://www.rockauto.com"
        self.session = requests.Session()
        # Present as a regular desktop browser; the site may reject obvious bots.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        self.db_path = db_path
        self.request_timeout = request_timeout
        # Map RockAuto URL slugs to standardized display brand names.
        self.brand_mapping = {
            'acura': 'Acura',
            'alfa-romeo': 'Alfa Romeo',
            'audi': 'Audi',
            'bmw': 'BMW',
            'buick': 'Buick',
            'cadillac': 'Cadillac',
            'chevrolet': 'Chevrolet',
            'chrysler': 'Chrysler',
            'dodge': 'Dodge',
            'fiat': 'Fiat',
            'ford': 'Ford',
            'gmc': 'GMC',
            'honda': 'Honda',
            'hyundai': 'Hyundai',
            'infiniti': 'Infiniti',
            'isuzu': 'Isuzu',
            'jaguar': 'Jaguar',
            'jeep': 'Jeep',
            'kia': 'Kia',
            'land-rover': 'Land Rover',
            'lexus': 'Lexus',
            'lincoln': 'Lincoln',
            'mazda': 'Mazda',
            'mercedes-benz': 'Mercedes-Benz',
            'mercury': 'Mercury',
            'mitsubishi': 'Mitsubishi',
            'nissan': 'Nissan',
            'oldsmobile': 'Oldsmobile',
            'plymouth': 'Plymouth',
            'pontiac': 'Pontiac',
            'porsche': 'Porsche',
            'ram': 'Ram',
            'saab': 'Saab',
            'saturn': 'Saturn',
            'scion': 'Scion',
            'subaru': 'Subaru',
            'suzuki': 'Suzuki',
            'tesla': 'Tesla',
            'toyota': 'Toyota',
            'volkswagen': 'Volkswagen',
            'volvo': 'Volvo'
        }

    def get_page(self, url: str) -> Optional[BeautifulSoup]:
        """Fetch *url* and return a parsed BeautifulSoup tree, or None on failure.

        Any ``requests`` error (network, HTTP status, timeout) is reported to
        stdout and swallowed so callers can treat None as "page unavailable".
        """
        try:
            # Random delay to be respectful to the server (simple rate limiting).
            time.sleep(random.uniform(1, 3))
            # Bug fix: a timeout is now supplied; the original call could block forever.
            response = self.session.get(url, timeout=self.request_timeout)
            response.raise_for_status()
            return BeautifulSoup(response.content, 'html.parser')
        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}")
            return None

    def get_makes(self) -> List[str]:
        """Return the list of vehicle make names found on the catalog page.

        Returns an empty list when the catalog page cannot be fetched.
        """
        print("Fetching list of makes...")
        soup = self.get_page(f"{self.base_url}/catalog/catalog.php")
        if not soup:
            return []
        makes = []
        # Primary strategy: catalog links whose URL path contains a known brand slug.
        make_elements = soup.find_all('a', href=lambda x: x and '/catalog/' in x and x.count('/') >= 3)
        for elem in make_elements:
            href = elem.get('href', '')
            for part in href.split('/'):
                if part in self.brand_mapping:
                    make = self.brand_mapping[part]
                    if make not in makes:
                        makes.append(make)
        # Fallback strategy: scan every link for a brand slug anywhere in the URL.
        if not makes:
            for link in soup.find_all('a', href=True):
                href = link['href'].lower()
                for key, value in self.brand_mapping.items():
                    if key in href and value not in makes:
                        makes.append(value)
        print(f"Found {len(makes)} makes: {makes[:10]}{'...' if len(makes) > 10 else ''}")
        return makes

    def get_models_for_make(self, make: str) -> List[Dict]:
        """Return model records ({make, model, year, engine, href}) for one make.

        Returns an empty list when the make is unknown or its page cannot be
        fetched. Engine details live on deeper pages and are left 'Unknown'.
        """
        print(f"Fetching models for {make}...")
        # Translate the display name back to RockAuto's URL slug.
        make_key = next(
            (key for key, value in self.brand_mapping.items()
             if value.lower() == make.lower()),
            None,
        )
        if not make_key:
            print(f"Make {make} not found in mapping")
            return []
        models: List[Dict] = []
        soup = self.get_page(f"{self.base_url}/catalog/catalog.php?c={make_key}")
        if not soup:
            return models
        # Perf fix: the original deduplicated with `record not in models` (O(n^2));
        # a set of (model, year, href) keys gives the same result in O(1) per link.
        seen = set()
        for link in soup.find_all('a', href=True):
            href = link['href']
            text = link.get_text().strip()
            # Heuristic: link text containing digits likely encodes "<year> <model>".
            if not (any(char.isdigit() for char in text) and len(text) > 2):
                continue
            year = None
            model_parts = []
            for part in text.split():
                # A 4-digit token in a plausible range is taken as the year.
                if part.isdigit() and len(part) == 4 and 1900 < int(part) < 2030:
                    year = int(part)
                else:
                    model_parts.append(part)
            if model_parts and year:
                model = ' '.join(model_parts)
                key = (model, year, href)
                if key not in seen:
                    seen.add(key)
                    models.append({
                        'make': make,
                        'model': model,
                        'year': year,
                        'engine': 'Unknown',  # Will need to extract from deeper pages
                        'href': href,
                    })
        print(f"Found {len(models)} models for {make}")
        return models

    def scrape_vehicle_data(self) -> List[Dict]:
        """Scrape all makes and their models; return a flat list of records.

        NOTE: deliberately capped (first 5 makes, ~20 records) while the
        scraper is being validated.
        """
        print("Starting RockAuto scraping...")
        all_vehicles: List[Dict] = []
        makes = self.get_makes()
        # Limit to first 5 makes for testing.
        makes = makes[:5]
        for make in makes:
            all_vehicles.extend(self.get_models_for_make(make))
            # Limit total records for testing.
            if len(all_vehicles) > 20:
                break
        print(f"Total vehicles found: {len(all_vehicles)}")
        return all_vehicles

    @staticmethod
    def _upsert_id(cursor, table: str, columns, values) -> int:
        """Insert-or-ignore one row and return its ``id``.

        ``table``/``columns`` come only from internal call sites (never user
        input), so f-string SQL identifiers are safe; all *values* are bound
        with placeholders.
        """
        col_list = ", ".join(columns)
        placeholders = ", ".join("?" for _ in values)
        cursor.execute(
            f"INSERT OR IGNORE INTO {table} ({col_list}) VALUES ({placeholders})",
            values,
        )
        where = " AND ".join(f"{c} = ?" for c in columns)
        cursor.execute(f"SELECT id FROM {table} WHERE {where}", values)
        return cursor.fetchone()[0]

    def save_to_database(self, vehicles: List[Dict]):
        """Save scraped vehicle records into the SQLite database.

        Each record upserts its brand, year, engine and model rows, then links
        them in ``model_year_engine``. Per-record failures are reported and
        skipped; the connection is always closed (bug fix: the original leaked
        the connection if an error escaped the loop or commit failed).
        """
        print(f"Saving {len(vehicles)} vehicles to database...")
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            for vehicle in vehicles:
                try:
                    brand_id = self._upsert_id(cursor, "brands", ("name",), (vehicle['make'],))
                    year_id = self._upsert_id(cursor, "years", ("year",), (vehicle['year'],))
                    # Engine specs are unknown at this stage; store the placeholder name.
                    engine_id = self._upsert_id(cursor, "engines", ("name",), (vehicle['engine'],))
                    model_id = self._upsert_id(cursor, "models", ("brand_id", "name"), (brand_id, vehicle['model']))
                    # Link model, year, and engine.
                    cursor.execute(
                        """INSERT OR IGNORE INTO model_year_engine
                           (model_id, year_id, engine_id) VALUES (?, ?, ?)""",
                        (model_id, year_id, engine_id)
                    )
                except Exception as e:
                    # Best-effort: report the bad record and keep saving the rest.
                    print(f"Error saving vehicle {vehicle}: {e}")
            conn.commit()
        finally:
            conn.close()
        print("Data saved to database successfully!")
def main():
    """Entry point: run the RockAuto scraper and store results in the database."""
    scraper = RockAutoScraper()
    print("Starting RockAuto data extraction...")
    print("Note: This may take several minutes due to rate limiting.")
    try:
        records = scraper.scrape_vehicle_data()
        if not records:
            # Nothing came back -- list the likely causes for the operator.
            print("No vehicles found. This could be due to:")
            print("1. RockAuto blocking automated requests")
            print("2. Changes in website structure")
            print("3. Network connectivity issues")
            return
        print(f"\nFound {len(records)} vehicles:")
        # Preview at most the first ten records.
        for idx, rec in enumerate(records[:10], start=1):
            print(f" {idx}. {rec['make']} {rec['model']} {rec['year']}")
        remaining = len(records) - 10
        if remaining > 0:
            print(f" ... and {remaining} more")
        # Persist everything that was scraped.
        scraper.save_to_database(records)
        print("\nScraping completed successfully!")
    except Exception as e:
        print(f"An error occurred during scraping: {e}")
# Script entry point: run the scraper only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()