Initial commit: Sistema Autoparts DB
- Base de datos SQLite con información de vehículos
- Dashboard web con Flask y Bootstrap
- Scripts de web scraping para RockAuto
- Interfaz CLI para consultas
- Documentación completa del proyecto

Incluye:
- 12 marcas de vehículos
- 10,923 modelos
- 10,919 especificaciones de motores
- 12,075 combinaciones modelo-año-motor
This commit is contained in:
400
vehicle_scraper/rockauto_scraper_enhanced.py
Normal file
400
vehicle_scraper/rockauto_scraper_enhanced.py
Normal file
@@ -0,0 +1,400 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
RockAuto Vehicle Data Scraper - Enhanced Version
|
||||
Extracts vehicle information (brands, models, years, engines) from RockAuto.com
|
||||
"""
|
||||
|
||||
import json
import random
import re
import sqlite3
import time
from typing import Dict, List, Optional
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
class RockAutoScraper:
    """Scrapes vehicle data (makes, models, years, engines) from RockAuto.com."""

    def __init__(self, db_path: str = "../vehicle_database/vehicle_database.db"):
        """Set up the HTTP session, database path, and brand-name mapping.

        Args:
            db_path: Path to the SQLite vehicle database file
                (default is relative to this script's directory).
        """
        self.base_url = "https://www.rockauto.com"
        self.session = requests.Session()
        # Browser-like headers reduce the chance of requests being rejected
        # as coming from an automated client.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })
        self.db_path = db_path

        # Create a mapping of RockAuto brand names (URL slugs) to standardized names
        self.brand_mapping = {
            'acura': 'Acura',
            'alfa-romeo': 'Alfa Romeo',
            'audi': 'Audi',
            'bmw': 'BMW',
            'buick': 'Buick',
            'cadillac': 'Cadillac',
            'chevrolet': 'Chevrolet',
            'chrysler': 'Chrysler',
            'dodge': 'Dodge',
            'fiat': 'Fiat',
            'ford': 'Ford',
            'gmc': 'GMC',
            'honda': 'Honda',
            'hyundai': 'Hyundai',
            'infiniti': 'Infiniti',
            'isuzu': 'Isuzu',
            'jaguar': 'Jaguar',
            'jeep': 'Jeep',
            'kia': 'Kia',
            'land-rover': 'Land Rover',
            'lexus': 'Lexus',
            'lincoln': 'Lincoln',
            'mazda': 'Mazda',
            'mercedes-benz': 'Mercedes-Benz',
            'mercury': 'Mercury',
            'mitsubishi': 'Mitsubishi',
            'nissan': 'Nissan',
            'oldsmobile': 'Oldsmobile',
            'plymouth': 'Plymouth',
            'pontiac': 'Pontiac',
            'porsche': 'Porsche',
            'ram': 'Ram',
            'saab': 'Saab',
            'saturn': 'Saturn',
            'scion': 'Scion',
            'subaru': 'Subaru',
            'suzuki': 'Suzuki',
            'tesla': 'Tesla',
            'toyota': 'Toyota',
            'volkswagen': 'Volkswagen',
            'volvo': 'Volvo'
        }
|
||||
|
||||
def get_page(self, url: str) -> Optional[BeautifulSoup]:
|
||||
"""Get a page and return BeautifulSoup object"""
|
||||
try:
|
||||
# Add random delay to be respectful to the server
|
||||
time.sleep(random.uniform(2, 4))
|
||||
response = self.session.get(url)
|
||||
response.raise_for_status()
|
||||
return BeautifulSoup(response.content, 'html.parser')
|
||||
except requests.RequestException as e:
|
||||
print(f"Error fetching {url}: {e}")
|
||||
return None
|
||||
|
||||
def get_makes_enhanced(self) -> List[str]:
|
||||
"""Enhanced method to get makes from RockAuto"""
|
||||
print("Fetching list of makes (enhanced)...")
|
||||
|
||||
# Try multiple approaches to get makes
|
||||
makes = []
|
||||
|
||||
# Approach 1: Visit the main catalog page
|
||||
soup = self.get_page(f"{self.base_url}/catalog/catalog.php")
|
||||
|
||||
if not soup:
|
||||
return makes
|
||||
|
||||
# Look for links that contain make information in the URL
|
||||
links = soup.find_all('a', href=True)
|
||||
|
||||
for link in links:
|
||||
href = link.get('href', '').lower()
|
||||
|
||||
# Check if the href contains a known make
|
||||
for key, value in self.brand_mapping.items():
|
||||
if f"/{key}/" in href and value not in makes:
|
||||
makes.append(value)
|
||||
|
||||
# Approach 2: Look for JavaScript variables or data attributes that might contain makes
|
||||
scripts = soup.find_all('script')
|
||||
for script in scripts:
|
||||
if script.string:
|
||||
# Look for common patterns in JavaScript
|
||||
import re
|
||||
# Look for patterns like make names in quotes
|
||||
matches = re.findall(r'["\']([a-z-]+)["\']', script.string)
|
||||
for match in matches:
|
||||
if match in self.brand_mapping and self.brand_mapping[match] not in makes:
|
||||
makes.append(self.brand_mapping[match])
|
||||
|
||||
print(f"Found {len(makes)} makes: {makes[:10]}{'...' if len(makes) > 10 else ''}")
|
||||
return makes
|
||||
|
||||
def get_detailed_models_for_make(self, make: str) -> List[Dict]:
|
||||
"""Get detailed models for a specific make by exploring deeper pages"""
|
||||
print(f"Fetching detailed models for {make}...")
|
||||
|
||||
# Convert make to RockAuto format
|
||||
make_key = None
|
||||
for key, value in self.brand_mapping.items():
|
||||
if value.lower() == make.lower():
|
||||
make_key = key
|
||||
break
|
||||
|
||||
if not make_key:
|
||||
print(f"Make {make} not found in mapping")
|
||||
return []
|
||||
|
||||
models = []
|
||||
|
||||
# Visit the make-specific page
|
||||
url = f"{self.base_url}/catalog/catalog.php?c={make_key}"
|
||||
soup = self.get_page(url)
|
||||
|
||||
if not soup:
|
||||
return models
|
||||
|
||||
# Look for year links first
|
||||
year_links = soup.find_all('a', href=lambda x: x and f'/catalog/{make_key}/' in x and any(str(y) in x for y in range(1900, 2030)))
|
||||
|
||||
for link in year_links:
|
||||
href = link.get('href', '')
|
||||
text = link.get_text().strip()
|
||||
|
||||
# Extract year from URL or text
|
||||
import re
|
||||
year_match = re.search(r'\b(19|20)\d{2}\b', text)
|
||||
if not year_match:
|
||||
year_match = re.search(r'\b(19|20)\d{2}\b', href)
|
||||
|
||||
if year_match:
|
||||
year = int(year_match.group())
|
||||
|
||||
# Extract model from text or URL
|
||||
# Remove year from text to get model
|
||||
model_text = re.sub(r'\b(19|20)\d{2}\b', '', text).strip()
|
||||
|
||||
if model_text:
|
||||
# Create a record
|
||||
record = {
|
||||
'make': make,
|
||||
'model': model_text,
|
||||
'year': year,
|
||||
'engine': 'Unknown', # Will need to extract from deeper pages
|
||||
'href': href
|
||||
}
|
||||
|
||||
if record not in models:
|
||||
models.append(record)
|
||||
|
||||
# If no year-specific links found, try alternative approach
|
||||
if not models:
|
||||
# Look for links that might contain both make and year
|
||||
all_links = soup.find_all('a', href=True)
|
||||
for link in all_links:
|
||||
href = link.get('href', '').lower()
|
||||
text = link.get_text().strip()
|
||||
|
||||
if f"/{make_key}/" in href:
|
||||
# Look for year in the text or href
|
||||
year_match = re.search(r'\b(19|20)\d{2}\b', text)
|
||||
if not year_match:
|
||||
year_match = re.search(r'\b(19|20)\d{2}\b', href)
|
||||
|
||||
if year_match:
|
||||
year = int(year_match.group())
|
||||
|
||||
# Extract model info
|
||||
model_parts = [part for part in text.split() if not re.match(r'\b(19|20)\d{2}\b', part)]
|
||||
model = ' '.join(model_parts)
|
||||
|
||||
if model:
|
||||
record = {
|
||||
'make': make,
|
||||
'model': model,
|
||||
'year': year,
|
||||
'engine': 'Unknown',
|
||||
'href': link.get('href')
|
||||
}
|
||||
|
||||
if record not in models:
|
||||
models.append(record)
|
||||
|
||||
print(f"Found {len(models)} models for {make}")
|
||||
return models
|
||||
|
||||
def explore_categories(self, make: str) -> List[Dict]:
|
||||
"""Explore categories for a specific make to find models and years"""
|
||||
print(f"Exploring categories for {make}...")
|
||||
|
||||
# Convert make to RockAuto format
|
||||
make_key = None
|
||||
for key, value in self.brand_mapping.items():
|
||||
if value.lower() == make.lower():
|
||||
make_key = key
|
||||
break
|
||||
|
||||
if not make_key:
|
||||
print(f"Make {make} not found in mapping")
|
||||
return []
|
||||
|
||||
models = []
|
||||
|
||||
# Visit the make-specific page
|
||||
url = f"{self.base_url}/catalog/catalog.php?c={make_key}"
|
||||
soup = self.get_page(url)
|
||||
|
||||
if not soup:
|
||||
return models
|
||||
|
||||
# Look for elements that represent vehicle categories
|
||||
# RockAuto typically organizes by year/model
|
||||
category_elements = soup.find_all(['div', 'section', 'ul'], class_=lambda x: x and any(keyword in x.lower() for keyword in ['year', 'model', 'catalog', 'vehicle']))
|
||||
|
||||
if not category_elements:
|
||||
# If no categorized elements found, try looking for all links with year info
|
||||
all_links = soup.find_all('a', href=True)
|
||||
for link in all_links:
|
||||
href = link.get('href', '').lower()
|
||||
text = link.get_text().strip()
|
||||
|
||||
if f"/{make_key}/" in href and any(str(year) in href for year in range(1900, 2030)):
|
||||
# Extract year and model
|
||||
import re
|
||||
year_match = re.search(r'\b(19|20)\d{2}\b', href)
|
||||
if year_match:
|
||||
year = int(year_match.group())
|
||||
|
||||
# Clean up text to extract model
|
||||
clean_text = re.sub(r'\b(19|20)\d{2}\b', '', text).strip(' -_')
|
||||
|
||||
if clean_text and len(clean_text) > 1:
|
||||
record = {
|
||||
'make': make,
|
||||
'model': clean_text,
|
||||
'year': year,
|
||||
'engine': 'Unknown',
|
||||
'href': link.get('href')
|
||||
}
|
||||
|
||||
if record not in models:
|
||||
models.append(record)
|
||||
|
||||
print(f"Found {len(models)} entries for {make} through category exploration")
|
||||
return models
|
||||
|
||||
def scrape_vehicle_data(self) -> List[Dict]:
|
||||
"""Main method to scrape vehicle data from RockAuto"""
|
||||
print("Starting enhanced RockAuto scraping...")
|
||||
|
||||
all_vehicles = []
|
||||
|
||||
# Get all makes using enhanced method
|
||||
makes = self.get_makes_enhanced()
|
||||
|
||||
# Limit to first 3 makes for testing
|
||||
makes = makes[:3] if len(makes) > 3 else makes
|
||||
|
||||
for make in makes:
|
||||
# Try multiple approaches to get models
|
||||
models = self.get_detailed_models_for_make(make)
|
||||
|
||||
# If still no models, try category exploration
|
||||
if not models:
|
||||
models = self.explore_categories(make)
|
||||
|
||||
all_vehicles.extend(models)
|
||||
|
||||
# Limit total records for testing
|
||||
if len(all_vehicles) > 15:
|
||||
break
|
||||
|
||||
print(f"Total vehicles found: {len(all_vehicles)}")
|
||||
return all_vehicles
|
||||
|
||||
def save_to_database(self, vehicles: List[Dict]):
|
||||
"""Save scraped data to the vehicle database"""
|
||||
print(f"Saving {len(vehicles)} vehicles to database...")
|
||||
|
||||
conn = sqlite3.connect(self.db_path)
|
||||
cursor = conn.cursor()
|
||||
|
||||
for vehicle in vehicles:
|
||||
try:
|
||||
# Insert brand
|
||||
cursor.execute(
|
||||
"INSERT OR IGNORE INTO brands (name) VALUES (?)",
|
||||
(vehicle['make'],)
|
||||
)
|
||||
cursor.execute("SELECT id FROM brands WHERE name = ?", (vehicle['make'],))
|
||||
brand_id = cursor.fetchone()[0]
|
||||
|
||||
# Insert year
|
||||
cursor.execute(
|
||||
"INSERT OR IGNORE INTO years (year) VALUES (?)",
|
||||
(vehicle['year'],)
|
||||
)
|
||||
cursor.execute("SELECT id FROM years WHERE year = ?", (vehicle['year'],))
|
||||
year_id = cursor.fetchone()[0]
|
||||
|
||||
# Insert engine (with unknown specs for now)
|
||||
engine_name = vehicle['engine'] if vehicle['engine'] != 'Unknown' else f"Engine_{vehicle['year']}_{vehicle['model'][:10]}"
|
||||
cursor.execute(
|
||||
"INSERT OR IGNORE INTO engines (name) VALUES (?)",
|
||||
(engine_name,)
|
||||
)
|
||||
cursor.execute("SELECT id FROM engines WHERE name = ?", (engine_name,))
|
||||
engine_id = cursor.fetchone()[0]
|
||||
|
||||
# Insert model
|
||||
cursor.execute(
|
||||
"INSERT OR IGNORE INTO models (brand_id, name, body_type) VALUES (?, ?, ?)",
|
||||
(brand_id, vehicle['model'], 'Unknown')
|
||||
)
|
||||
cursor.execute("SELECT id FROM models WHERE brand_id = ? AND name = ?", (brand_id, vehicle['model']))
|
||||
model_id = cursor.fetchone()[0]
|
||||
|
||||
# Link model, year, and engine
|
||||
cursor.execute(
|
||||
"""INSERT OR IGNORE INTO model_year_engine
|
||||
(model_id, year_id, engine_id) VALUES (?, ?, ?)""",
|
||||
(model_id, year_id, engine_id)
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error saving vehicle {vehicle}: {e}")
|
||||
|
||||
conn.commit()
|
||||
conn.close()
|
||||
print("Data saved to database successfully!")
|
||||
|
||||
|
||||
def main():
    """Entry point: run the scraper and persist whatever it finds."""
    scraper = RockAutoScraper()

    print("Starting enhanced RockAuto data extraction...")
    print("Note: This may take several minutes due to rate limiting.")

    try:
        vehicles = scraper.scrape_vehicle_data()

        if not vehicles:
            # Nothing came back — list the likely causes for the operator.
            for message in (
                "No vehicles found. This could be due to:",
                "1. RockAuto blocking automated requests",
                "2. Changes in website structure",
                "3. Network connectivity issues",
                "4. Anti-bot measures implemented by RockAuto",
            ):
                print(message)
            return

        print(f"\nFound {len(vehicles)} vehicles:")
        # Preview at most the first ten records.
        for position, entry in enumerate(vehicles[:10], start=1):
            print(f" {position}. {entry['make']} {entry['model']} {entry['year']}")
        if len(vehicles) > 10:
            print(f" ... and {len(vehicles)-10} more")

        # Save to database
        scraper.save_to_database(vehicles)
        print("\nScraping completed successfully!")

    except Exception as e:
        print(f"An error occurred during scraping: {e}")
|
||||
|
||||
|
||||
# Run the scraper only when executed as a script (not when imported).
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user