Autoparts-DB/scripts/link_vehicle_parts.py
consultoria-as eff04a5e60 fix: stop creating AFT- placeholder parts in import pipeline
- import_phase1.py: skip AFT- part creation when no OEM data
- link_vehicle_parts.py: remove AFT- fallback lookup in part cache
- import_tecdoc_parts.py: add VW to TOP_BRANDS list

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-18 22:25:21 +00:00

#!/usr/bin/env python3
"""
Link parts to vehicles using TecDoc article files.
Maps: article file (vehicleId_categoryId.json) → parts → vehicle_parts (MYE ids)
Optimized v3: year+engine filtering + batch inserts.
"""
import json
import re
from pathlib import Path

import psycopg2
from psycopg2.extras import execute_values

DB_URL = "postgresql://nexus:nexus_autoparts_2026@localhost/nexus_autoparts"
DATA_DIR = Path("/home/Autopartes/data/tecdoc")
ARTICLES_DIR = DATA_DIR / "parts" / "articles"
DETAILS_DIR = DATA_DIR / "parts" / "details"
BATCH_SIZE = 50000
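
# Expected on-disk layout under DATA_DIR (inferred from the reads below; the
# exact file contents are assumptions based only on the fields this script uses):
#   manufacturers.json           -> [{"manufacturerId": ..., "manufacturerName": ...}, ...]
#   models/{manufacturerId}.json -> [{"modelId": ..., "modelName": ...}, ...]
#   vehicles/{modelId}.json      -> list of vehicle dicts (see Step 1 in run())
#   parts/articles/{vehicleId}_{categoryId}.json -> articles fitting one vehicle
#   parts/details/{articleId}.json               -> per-article detail incl. OEM numbers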


def parse_capacity_liters(cap):
    """Convert TecDoc capacityLt (e.g. '1998.0000' cc) to a liters float (2.0 after 1-dp rounding)."""
    try:
        cc = float(cap)
        return round(cc / 1000, 1)
    except (TypeError, ValueError):
        return None


def extract_engine_liters(engine_name):
    """Extract liters from an engine name like '2.0L 4cyl 127hp'."""
    m = re.match(r'(\d+\.\d+)L', engine_name)
    if m:
        return round(float(m.group(1)), 1)
    return None
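
# Expected helper behavior (illustrative calls; input formats are inferred from
# the docstrings above, not from a TecDoc spec):
#   parse_capacity_liters("1998.0000")        -> 2.0   (1998 cc rounded to 1 dp)
#   parse_capacity_liters(None)               -> None
#   extract_engine_liters("2.0L 4cyl 127hp")  -> 2.0
#   extract_engine_liters("V6 Biturbo")       -> None  (no leading "<d>.<d>L")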


def run():
    conn = psycopg2.connect(DB_URL)
    cur = conn.cursor()

    # Step 1: Build vehicleId → vehicle info from TecDoc files
    print("Building vehicleId → vehicle info mapping...", flush=True)
    mfrs = json.loads((DATA_DIR / "manufacturers.json").read_text())
    vid_info = {}  # vehicleId → {brand, model, year_start, year_end, liters}
    for mfr in mfrs:
        brand = mfr['manufacturerName']
        if '(' in brand:  # skip parenthesized brand variants
            continue
        mfr_id = mfr['manufacturerId']
        model_file = DATA_DIR / "models" / f"{mfr_id}.json"
        if not model_file.exists():
            continue
        models = json.loads(model_file.read_text())
        for model in models:
            model_name = model.get('modelName', '')
            if not model_name:
                continue
            vehicle_file = DATA_DIR / "vehicles" / f"{model['modelId']}.json"
            if not vehicle_file.exists():
                continue
            vehicles = json.loads(vehicle_file.read_text())
            if not vehicles:
                continue
            for v in vehicles:
                vid = v.get('vehicleId')
                if not vid:
                    continue
                # Parse year range (first 4 chars of the interval string = year)
                year_start = None
                year_end = None
                try:
                    cs = v.get('constructionIntervalStart', '')
                    if cs:
                        year_start = int(cs[:4])
                    ce = v.get('constructionIntervalEnd', '')
                    if ce:
                        year_end = int(ce[:4])
                except (TypeError, ValueError):
                    pass
                # Parse engine capacity
                liters = parse_capacity_liters(v.get('capacityLt') or v.get('capacityTax'))
                vid_info[vid] = {
                    'brand': brand,
                    'model': model_name,
                    'year_start': year_start,
                    'year_end': year_end,
                    'liters': liters,
                }
    print(f" {len(vid_info):,} vehicleIds mapped", flush=True)

    # Step 2: Build (brand, modelName) → list of (mye_id, year, liters) from our DB
    print("Building brand/model → MYE details mapping...", flush=True)
    cur.execute("""
        SELECT b.name_brand, m.name_model, mye.id_mye, y.year_car, e.name_engine
        FROM model_year_engine mye
        JOIN models m ON mye.model_id = m.id_model
        JOIN brands b ON m.brand_id = b.id_brand
        JOIN years y ON mye.year_id = y.id_year
        JOIN engines e ON mye.engine_id = e.id_engine
    """)
    brand_model_to_myes = {}
    for brand, model, mye_id, year, engine_name in cur.fetchall():
        liters = extract_engine_liters(engine_name)
        brand_model_to_myes.setdefault((brand, model), []).append((mye_id, year, liters))
    print(f" {len(brand_model_to_myes):,} brand/model combos with "
          f"{sum(len(v) for v in brand_model_to_myes.values()):,} MYEs", flush=True)

    # Step 3: Build OEM number → part_id from DB
    print("Loading parts cache...", flush=True)
    cur.execute("SELECT oem_part_number, id_part FROM parts WHERE oem_part_number IS NOT NULL")
    part_cache = {r[0]: r[1] for r in cur.fetchall()}
    print(f" {len(part_cache):,} parts cached", flush=True)

    # Step 4: Load detail files to get articleId → OEM numbers
    print("Loading article detail OEM mappings...", flush=True)
    article_to_oems = {}
    for f in DETAILS_DIR.glob("*.json"):
        try:
            data = json.loads(f.read_text())
            oem_list = data.get('articleOemNo', [])
            if oem_list:
                oem_nos = [o.get('oemDisplayNo') for o in oem_list if o.get('oemDisplayNo')]
                if oem_nos:
                    article_to_oems[int(f.stem)] = oem_nos
        except (ValueError, OSError):  # bad JSON, non-numeric filename, or unreadable file
            continue
    print(f" {len(article_to_oems):,} articles with OEM data", flush=True)

    # Step 5: Process article files and create vehicle_parts
    print("\nCreating vehicle_parts links (filtered + batch mode)...", flush=True)
    stats = {'links': 0, 'skipped_no_mye': 0, 'skipped_no_part': 0, 'files': 0, 'filtered_out': 0}
    pending = []

    def flush_batch():
        if not pending:
            return
        execute_values(cur, """
            INSERT INTO vehicle_parts (model_year_engine_id, part_id, quantity_required)
            VALUES %s ON CONFLICT DO NOTHING
        """, pending, page_size=10000)
        conn.commit()
        pending.clear()
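
    # Note: ON CONFLICT DO NOTHING only skips duplicates if vehicle_parts has a
    # unique constraint covering (model_year_engine_id, part_id); that such a
    # constraint exists is an assumption about the schema, not checked here.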

    article_files = sorted(ARTICLES_DIR.glob("*.json"))
    for f in article_files:
        parts_split = f.stem.split("_")
        if len(parts_split) != 2:
            continue
        vid = int(parts_split[0])
        info = vid_info.get(vid)
        if not info:
            stats['skipped_no_mye'] += 1
            continue
        bm = (info['brand'], info['model'])
        all_myes = brand_model_to_myes.get(bm, [])
        if not all_myes:
            stats['skipped_no_mye'] += 1
            continue

        # Filter MYEs by year range and engine capacity
        td_ys = info['year_start']
        td_ye = info['year_end']
        td_lit = info['liters']
        filtered_myes = []
        for mye_id, mye_year, mye_liters in all_myes:
            # Year filter: MYE year must fall within the TecDoc construction interval
            if td_ys and td_ye:
                if mye_year < td_ys or mye_year > td_ye:
                    stats['filtered_out'] += 1
                    continue
            elif td_ys:
                if mye_year < td_ys:
                    stats['filtered_out'] += 1
                    continue
            # Engine capacity filter: must match within 0.2 L tolerance
            if td_lit and mye_liters:
                if abs(td_lit - mye_liters) > 0.2:
                    stats['filtered_out'] += 1
                    continue
            filtered_myes.append(mye_id)
        if not filtered_myes:
            # Filtering removed every candidate MYE; skip this article file
            stats['skipped_no_mye'] += 1
            continue
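
        # Worked example (hypothetical numbers): a TecDoc vehicle built
        # 2008-2014 with capacityLt 1998 cc (-> 2.0 L) keeps an MYE with
        # (year=2010, liters=2.0), drops (year=2016, ...) as outside the
        # interval, and drops (..., liters=1.4) since |2.0 - 1.4| > 0.2.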
        try:
            articles = json.loads(f.read_text())
        except (OSError, json.JSONDecodeError):
            continue
        for a in articles:
            aid = a.get('articleId')
            article_no = a.get('articleNo', '')    # unused; likely left over from
            supplier = a.get('supplierName', '')   # the removed AFT- fallback
            if not aid:
                continue
            part_ids = set()
            oem_nos = article_to_oems.get(aid, [])
            for oem_no in oem_nos:
                pid = part_cache.get(oem_no)
                if pid:
                    part_ids.add(pid)
            if not part_ids:
                stats['skipped_no_part'] += 1
                continue
            for mye_id in filtered_myes:
                for part_id in part_ids:
                    pending.append((mye_id, part_id, 1))
                    stats['links'] += 1
                    if len(pending) >= BATCH_SIZE:
                        flush_batch()
        stats['files'] += 1
        if stats['files'] % 500 == 0:
            flush_batch()
            print(f" {stats['files']:,}/{len(article_files):,} files | "
                  f"{stats['links']:,} links | {stats['filtered_out']:,} filtered out", flush=True)

    flush_batch()
    cur.close()
    conn.close()

    print(f"\n{'='*50}", flush=True)
    print("LINKING COMPLETE", flush=True)
    print(f" Files processed: {stats['files']:,}", flush=True)
    print(f" Links created: {stats['links']:,}", flush=True)
    print(f" Filtered out: {stats['filtered_out']:,}", flush=True)
    print(f" Skipped (no MYE): {stats['skipped_no_mye']:,}", flush=True)
    print(f" Skipped (no part): {stats['skipped_no_part']:,}", flush=True)
    print(f"{'='*50}", flush=True)


if __name__ == "__main__":
    run()
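
# Usage sketch (assumes the Postgres database in DB_URL and the TecDoc JSON
# dumps under DATA_DIR are already in place; neither is provisioned here):
#   python3 link_vehicle_parts.py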