Autoparts-DB/vehicle_database/scripts/create_cross_references.py

#!/usr/bin/env python3
"""
GENERADOR DE REFERENCIAS CRUZADAS ENTRE MARCAS
Encuentra partes de diferentes fabricantes que cubren los mismos vehículos
y crea referencias cruzadas bidireccionales entre ellas.
"""

import sqlite3
from collections import defaultdict
from itertools import combinations
from pathlib import Path

# Location of the SQLite database file: 'vehicle_database.db' in the directory
# one level above the folder that contains this script.
DB_PATH = Path(__file__).parent.parent / 'vehicle_database.db'


def get_db():
    """Open the database at DB_PATH and return the connection.

    The connection's row factory is set to ``sqlite3.Row`` so that result
    columns can be read by name (``row['col']``) as well as by index.
    """
    connection = sqlite3.connect(DB_PATH)
    connection.row_factory = sqlite3.Row
    return connection


# Number of leading part-number characters compared to decide whether two
# parts belong to the "same brand".
# NOTE(review): this prefix comparison is a heuristic carried over unchanged;
# confirm it matches how manufacturers' part numbers are actually structured.
_BRAND_PREFIX_LEN = 3

# Rows handed to SQLite per executemany() call; also the progress interval.
_INSERT_BATCH_SIZE = 5000


def _load_group_vehicle_parts(cursor):
    """Return a map of (group_id, model_year_engine_id) -> {(part_id, part_number)}."""
    cursor.execute("""
        SELECT vp.model_year_engine_id, vp.part_id, p.oem_part_number, p.group_id
        FROM vehicle_parts vp
        JOIN parts p ON vp.part_id = p.id
        WHERE p.group_id IS NOT NULL
        ORDER BY p.group_id, vp.model_year_engine_id
    """)

    group_mye_parts = defaultdict(set)
    for row in cursor:
        key = (row['group_id'], row['model_year_engine_id'])
        group_mye_parts[key].add((row['part_id'], row['oem_part_number']))
    return group_mye_parts


def _load_existing_xrefs(cursor):
    """Return the set of (part_id, cross_reference_number) pairs already stored."""
    cursor.execute("SELECT part_id, cross_reference_number FROM part_cross_references")
    return {(row['part_id'], row['cross_reference_number']) for row in cursor}


def _collect_new_xrefs(group_mye_parts, existing):
    """Build the list of (part_id, cross_reference_number) rows to insert.

    For every pair of parts sharing a (group, vehicle) key whose part numbers
    differ in their first ``_BRAND_PREFIX_LEN`` characters, both directions
    (A->B and B->A) are produced unless already present in ``existing``.
    ``existing`` is updated in place so a pair is never emitted twice.
    """
    new_xrefs = []
    for parts_set in group_mye_parts.values():
        # Parts without a part number are skipped: a NULL value cannot be
        # prefix-compared and is useless as a cross-reference target.
        # Sorting removes the run-to-run variation of set iteration order,
        # so rows are inserted in a reproducible order.
        parts = sorted((pid, pn) for pid, pn in parts_set if pn)

        for (pid_a, pn_a), (pid_b, pn_b) in combinations(parts, 2):
            # Skip if same part number prefix (treated as same brand)
            if pn_a[:_BRAND_PREFIX_LEN] == pn_b[:_BRAND_PREFIX_LEN]:
                continue

            for xref in ((pid_a, pn_b), (pid_b, pn_a)):
                if xref not in existing:
                    existing.add(xref)
                    new_xrefs.append(xref)
    return new_xrefs


def _insert_xrefs(conn, new_xrefs):
    """Insert ``new_xrefs`` in batches within one transaction; return the row count."""
    total = len(new_xrefs)
    # The connection context manager commits on success and rolls back if
    # any batch raises, so a failed run leaves no partial set of rows.
    with conn:
        for start in range(0, total, _INSERT_BATCH_SIZE):
            if start > 0:
                print(f"      Insertando {start}/{total}...")
            conn.executemany(
                "INSERT INTO part_cross_references (part_id, cross_reference_number, reference_type, source) VALUES (?, ?, 'interchange', 'Vehicle Fitment Match')",
                new_xrefs[start:start + _INSERT_BATCH_SIZE])
    return total


def main():
    """Generate bidirectional cross-references between parts fitting the same vehicle.

    Steps:
      1. Group parts by (group_id, model_year_engine_id).
      2. Within each group, pair parts whose part-number prefixes differ and
         keep the pairs not already present in ``part_cross_references``.
      3. Insert the new rows with reference_type 'interchange' and source
         'Vehicle Fitment Match', then print a before/after summary.

    Progress and the summary are written to stdout. Any ``sqlite3.Error``
    raised by a query propagates to the caller; the connection is closed
    in every case.
    """
    print("=" * 70)
    print("GENERADOR DE REFERENCIAS CRUZADAS ENTRE MARCAS")
    print("=" * 70)

    conn = get_db()
    try:
        cursor = conn.cursor()

        # Get existing cross-ref count
        cursor.execute("SELECT COUNT(*) FROM part_cross_references")
        existing_xrefs = cursor.fetchone()[0]
        print(f"\nCross-refs existentes: {existing_xrefs:,}")

        # Step 1: map every (part group, vehicle) to the parts that fit it
        print("\n[1/3] Buscando partes que cubren los mismos vehículos...")
        group_mye_parts = _load_group_vehicle_parts(cursor)
        print(f"      Combinaciones grupo+vehículo: {len(group_mye_parts):,}")

        # Step 2: derive the cross-reference pairs that are not stored yet
        print("\n[2/3] Generando pares de cross-reference...")
        existing = _load_existing_xrefs(cursor)
        print(f"      Cross-refs existentes en set: {len(existing):,}")

        new_xrefs = _collect_new_xrefs(group_mye_parts, existing)
        print(f"      Nuevas cross-refs a crear: {len(new_xrefs):,}")

        # Step 3: Insert
        print("\n[3/3] Insertando cross-references...")
        inserted = _insert_xrefs(conn, new_xrefs)

        # Final stats
        cursor.execute("SELECT COUNT(*) FROM part_cross_references")
        total_xrefs = cursor.fetchone()[0]
    finally:
        # Close the connection even when a query or insert fails.
        conn.close()

    print("\n" + "=" * 70)
    print("CROSS-REFERENCES COMPLETADAS")
    print("=" * 70)
    print(f"""
RESUMEN:
  - Cross-refs antes:      {existing_xrefs:,}
  - Nuevas cross-refs:     {inserted:,}
  - Total cross-refs:      {total_xrefs:,}
""")


# Run the generator only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()