fix: stop creating AFT- placeholder parts in import pipeline
- import_phase1.py: skip AFT- part creation when no OEM data is available
- link_vehicle_parts.py: remove the AFT- fallback lookup in the part cache
- import_tecdoc_parts.py: add VW to the TOP_BRANDS list

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
148
scripts/import_phase1.py
Normal file
148
scripts/import_phase1.py
Normal file
@@ -0,0 +1,148 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Quick import of Phase 1 TecDoc article data into PostgreSQL.
|
||||||
|
Imports aftermarket parts and their vehicle mappings from article list files,
|
||||||
|
without waiting for OEM detail downloads.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import psycopg2
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# SECURITY NOTE(review): database credentials are committed in source here;
# consider reading them from the environment (as import_tecdoc_parts.py does).
DB_URL = "postgresql://nexus:nexus_autoparts_2026@localhost/nexus_autoparts"
# Article list files, one per vehicle+category pair ("<vehicleId>_<categoryId>.json").
ARTICLES_DIR = Path("/home/Autopartes/data/tecdoc/parts/articles")
# Article OEM detail files, one per articleId ("<articleId>.json").
DETAILS_DIR = Path("/home/Autopartes/data/tecdoc/parts/details")
||||||
|
def run():
    """Import Phase 1 TecDoc article data into PostgreSQL.

    Reads the article list files (one per vehicle+category) plus any already
    downloaded article detail files, then creates OEM parts, manufacturers,
    and aftermarket cross-references.  Articles without OEM detail data are
    deliberately skipped — no placeholder parts are created for them; they
    are imported later once their detail file arrives.
    """
    conn = psycopg2.connect(DB_URL)
    cur = conn.cursor()

    # Category mapping: TecDoc category id -> our id_part_category.
    cur.execute("SELECT id_part_category, tecdoc_id FROM part_categories WHERE tecdoc_id IS NOT NULL")
    cat_map = {r[1]: r[0] for r in cur.fetchall()}

    # Existing manufacturers: name -> id_manufacture.
    cur.execute("SELECT id_manufacture, name_manufacture FROM manufacturers")
    mfr_cache = {r[1]: r[0] for r in cur.fetchall()}

    # Existing parts keyed by OEM number, so re-runs are idempotent.
    cur.execute("SELECT oem_part_number, id_part FROM parts WHERE oem_part_number IS NOT NULL")
    part_cache = {r[0]: r[1] for r in cur.fetchall()}

    # Existing cross-references, used to avoid duplicate inserts.
    cur.execute("SELECT part_id, cross_reference_number, source_ref FROM part_cross_references")
    xref_set = {(r[0], r[1], r[2]) for r in cur.fetchall()}

    # OEM numbers from detail files: articleId -> list of {oemBrand, oemDisplayNo}.
    detail_oem = {}
    detail_files = list(DETAILS_DIR.glob("*.json"))
    print(f"Loading {len(detail_files)} detail files for OEM data...", flush=True)
    for f in detail_files:
        try:
            data = json.loads(f.read_text())
        except (OSError, ValueError):
            # Unreadable or truncated detail file: skip it.
            # (Narrowed from a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit.)
            continue
        article = data.get('article', {})
        if article and article.get('oemNo'):
            detail_oem[int(f.stem)] = article['oemNo']

    stats = {'parts': 0, 'xrefs': 0, 'mfrs': 0, 'skipped': 0}

    article_files = sorted(ARTICLES_DIR.glob("*.json"))
    print(f"Processing {len(article_files)} article files...", flush=True)

    # Collect all unique articles across all files.  File stems are
    # "<vehicleId>_<categoryId>"; anything else is ignored.
    all_articles = {}  # articleId -> article dict (annotated with category ids)
    for f in article_files:
        parts = f.stem.split("_")
        if len(parts) != 2:
            continue
        cat_id = int(parts[1])
        cat_db_id = cat_map.get(cat_id)

        try:
            articles = json.loads(f.read_text())
        except (OSError, ValueError):
            continue

        for a in articles:
            aid = a.get('articleId')
            if aid and aid not in all_articles:
                # First file wins; stash category ids on the dict itself.
                a['_cat_db_id'] = cat_db_id
                a['_cat_td_id'] = cat_id
                all_articles[aid] = a

    print(f"Unique articles to process: {len(all_articles):,}", flush=True)

    batch = 0
    for aid, a in all_articles.items():
        article_no = a.get('articleNo', '')
        supplier = a.get('supplierName', '')
        product_name = a.get('articleProductName', '')
        cat_db_id = a.get('_cat_db_id')

        if not article_no or not supplier:
            stats['skipped'] += 1
            continue

        # Ensure the aftermarket manufacturer row exists.
        if supplier not in mfr_cache:
            cur.execute(
                "INSERT INTO manufacturers (name_manufacture) VALUES (%s) RETURNING id_manufacture",
                (supplier,))
            mfr_cache[supplier] = cur.fetchone()[0]
            stats['mfrs'] += 1

        # Only articles with downloaded OEM details produce parts.  A missing
        # entry yields an empty list, so the loop is simply skipped (the
        # original had a redundant `else: pass` branch here).
        oem_numbers = detail_oem.get(aid, [])
        for oem in oem_numbers:
            oem_no = oem.get('oemDisplayNo', '')
            oem_brand = oem.get('oemBrand', '')
            if not oem_no:
                continue

            if oem_no not in part_cache:
                cur.execute("""
                INSERT INTO parts (oem_part_number, name_part, description)
                VALUES (%s, %s, %s)
                ON CONFLICT (oem_part_number) DO UPDATE SET name_part = EXCLUDED.name_part
                RETURNING id_part
                """, (oem_no, product_name, f"OEM {oem_brand}"))
                part_cache[oem_no] = cur.fetchone()[0]
                stats['parts'] += 1

            part_id = part_cache[oem_no]

            # Cross-reference: aftermarket article number -> OEM part.
            xref_key = (part_id, article_no, supplier)
            if xref_key not in xref_set:
                cur.execute("""
                INSERT INTO part_cross_references (part_id, cross_reference_number, source_ref)
                VALUES (%s, %s, %s) ON CONFLICT DO NOTHING
                """, (part_id, article_no, supplier))
                xref_set.add(xref_key)
                stats['xrefs'] += 1

        batch += 1
        if batch % 5000 == 0:
            # Periodic commit keeps transactions bounded on large imports.
            conn.commit()
            print(f" {batch:,}/{len(all_articles):,} — {stats['parts']:,} parts, {stats['xrefs']:,} xrefs", flush=True)

    conn.commit()
    cur.close()
    conn.close()

    print(f"\n{'='*50}", flush=True)
    print(f"IMPORT COMPLETE", flush=True)
    print(f" Parts: {stats['parts']:,}", flush=True)
    print(f" Cross-refs: {stats['xrefs']:,}", flush=True)
    print(f" Manufacturers: {stats['mfrs']:,}", flush=True)
    print(f" Skipped: {stats['skipped']:,}", flush=True)
    print(f"{'='*50}", flush=True)
||||||
|
# Script entry point: run the Phase 1 import when executed directly.
if __name__ == "__main__":
    run()
||||||
531
scripts/import_tecdoc_parts.py
Normal file
531
scripts/import_tecdoc_parts.py
Normal file
@@ -0,0 +1,531 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Import OEM parts data from TecDoc (Apify) into Nexus Autoparts PostgreSQL.
|
||||||
|
|
||||||
|
Three-phase approach:
|
||||||
|
Phase 1: Download categories per vehicle → JSON files
|
||||||
|
Phase 2: Download article lists per vehicle+category → JSON files
|
||||||
|
Phase 3: Download article details (OEM numbers) → JSON files
|
||||||
|
Phase 4: Import all JSON data into PostgreSQL
|
||||||
|
|
||||||
|
Uses one representative vehicleId per TecDoc model to minimize API calls.
|
||||||
|
Supports concurrent API calls for speed.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 scripts/import_tecdoc_parts.py download # Phases 1-3
|
||||||
|
python3 scripts/import_tecdoc_parts.py import # Phase 4
|
||||||
|
python3 scripts/import_tecdoc_parts.py status # Check progress
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import argparse
|
||||||
|
import requests
|
||||||
|
import psycopg2
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
|
||||||
|
# --- Config ---
# SECURITY NOTE(review): a live-looking Apify API token is committed here as
# the environment-variable fallback.  It should be rotated and the hard-coded
# default removed so the token must come from the APIFY_TOKEN env var.
APIFY_TOKEN = os.environ.get("APIFY_TOKEN", "apify_api_l5SrcwYyanAO45AFxrEpviUcuVRIFK2yPdc5")
APIFY_ACTOR = "making-data-meaningful~tecdoc"
# run-sync-get-dataset-items: synchronous actor run returning dataset items directly.
APIFY_URL = f"https://api.apify.com/v2/acts/{APIFY_ACTOR}/run-sync-get-dataset-items"
DB_URL = os.environ.get("DATABASE_URL", "postgresql://nexus:nexus_autoparts_2026@localhost/nexus_autoparts")

TYPE_ID = 1  # Passenger cars
LANG_ID = 4  # English
COUNTRY_ID = 153  # Mexico

DATA_DIR = Path("/home/Autopartes/data/tecdoc")
PARTS_DIR = DATA_DIR / "parts"
ARTICLES_DIR = PARTS_DIR / "articles"  # vehicle articles by category
DETAILS_DIR = PARTS_DIR / "details"  # article OEM details

MAX_WORKERS = 30  # Concurrent API calls
APIFY_DELAY = 0.1  # Seconds between API calls per thread

# Top brands for Mexico & USA; matched case-insensitively against the TecDoc
# manufacturerName in get_representative_vehicles().
TOP_BRANDS = [
    'TOYOTA', 'NISSAN', 'CHEVROLET', 'VOLKSWAGEN', 'VW', 'HONDA', 'FORD',
    'HYUNDAI', 'KIA', 'MAZDA', 'BMW', 'MERCEDES-BENZ', 'AUDI',
    'JEEP', 'DODGE', 'CHRYSLER', 'RAM', 'GMC', 'BUICK', 'CADILLAC',
    'SUBARU', 'MITSUBISHI', 'SUZUKI', 'ACURA', 'LEXUS', 'INFINITI',
    'LINCOLN', 'FIAT', 'PEUGEOT', 'RENAULT', 'SEAT'
]

# Top-level TecDoc category IDs (from our DB)
TOP_CATEGORIES = None  # Loaded dynamically
||||||
|
def apify_call(input_data, retries=3):
    """POST *input_data* to the Apify actor and return one dataset item.

    Retries up to *retries* times: HTTP 429 backs off linearly (30s, 60s,
    ...), any other failure pauses 5 seconds.  Returns the first element of
    a non-empty list response, otherwise the raw payload, or None once all
    attempts are exhausted.
    """
    for attempt in range(retries):
        try:
            resp = requests.post(
                APIFY_URL,
                params={"token": APIFY_TOKEN},
                headers={"Content-Type": "application/json"},
                json=input_data,
                timeout=180,
            )
            status = resp.status_code
            if status in (200, 201):
                payload = resp.json()
                if isinstance(payload, list) and payload:
                    return payload[0]
                return payload
            if status == 429:
                backoff = 30 * (attempt + 1)
                print(f" Rate limited, waiting {backoff}s...", flush=True)
                time.sleep(backoff)
            else:
                print(f" HTTP {status}: {resp.text[:200]}", flush=True)
                time.sleep(5)
        except Exception as e:
            # Network/parse error: log and retry after a short pause.
            print(f" Error: {e}", flush=True)
            time.sleep(5)
    return None
|
|
||||||
|
|
||||||
|
def load_top_categories():
    """Return [(tecdoc_id, category_name), ...] ordered by display_order."""
    conn = psycopg2.connect(DB_URL)
    cur = conn.cursor()
    cur.execute(
        "SELECT tecdoc_id, name_part_category FROM part_categories "
        "WHERE tecdoc_id IS NOT NULL ORDER BY display_order"
    )
    categories = [(row[0], row[1]) for row in cur.fetchall()]
    cur.close()
    conn.close()
    return categories
|
|
||||||
|
|
||||||
|
def get_representative_vehicles():
    """Return one representative vehicle per TecDoc model for TOP_BRANDS.

    Reads the previously downloaded manufacturers/models/vehicles JSON files.
    Each returned dict carries the representative vehicleId, every vehicleId
    for that model, the brand/model names, and the TecDoc model id.
    """
    mfrs = json.loads((DATA_DIR / "manufacturers.json").read_text())
    models_dir = DATA_DIR / "models"
    vehicles_dir = DATA_DIR / "vehicles"

    # Hoisted out of the loop: the original rebuilt the uppercased brand
    # list for every manufacturer; a set also gives O(1) membership tests.
    top_brands_upper = {b.upper() for b in TOP_BRANDS}

    representatives = []  # dicts: vehicleId, allVehicleIds, brand, model, tdModelId

    for mfr in mfrs:
        name = mfr['manufacturerName']
        # Names containing '(' are skipped — presumably variant/duplicate
        # manufacturer entries; confirm against the TecDoc data.
        if '(' in name:
            continue
        if name.upper() not in top_brands_upper:
            continue

        mfr_id = mfr['manufacturerId']
        model_file = models_dir / f"{mfr_id}.json"
        if not model_file.exists():
            continue

        models = json.loads(model_file.read_text())
        for model in models:
            td_model_id = model['modelId']
            model_name = model.get('modelName', '')
            vehicle_file = vehicles_dir / f"{td_model_id}.json"
            if not vehicle_file.exists():
                continue

            vehicles = json.loads(vehicle_file.read_text())
            if not vehicles:
                continue

            # Pick the first vehicle with a valid vehicleId as representative.
            vid = vehicles[0].get('vehicleId')
            if vid:
                # Also collect ALL vehicleIds for this model.
                all_vids = [v['vehicleId'] for v in vehicles if v.get('vehicleId')]
                representatives.append({
                    'vehicleId': vid,
                    'allVehicleIds': all_vids,
                    'brand': name,
                    'model': model_name,
                    'tdModelId': td_model_id
                })

    return representatives
|
||||||
|
|
||||||
|
def download_articles_for_vehicle(vid, category_id, category_name):
    """Fetch the article list for one vehicle+category pair.

    Writes the result (possibly an empty list) to ARTICLES_DIR so the pair
    is never re-queried.  Returns the number of articles saved; 0 when the
    file already exists or the API call yields nothing usable.
    """
    outfile = ARTICLES_DIR / f"{vid}_{category_id}.json"
    if outfile.exists():
        # Already downloaded on a previous run.
        return 0

    time.sleep(APIFY_DELAY)
    payload = {
        'endpoint_partsArticleListByVehicleIdCategoryId': True,
        'parts_vehicleId_18': vid,
        'parts_categoryId_18': category_id,
        'parts_typeId_18': TYPE_ID,
        'parts_langId_18': LANG_ID,
    }
    result = apify_call(payload)

    if not (result and isinstance(result, dict) and 'articles' in result):
        # Persist an empty list so this pair is not queried again.
        outfile.write_text("[]")
        return 0

    articles = result.get('articles') or []
    outfile.write_text(json.dumps(articles, indent=1))
    return len(articles)
|
|
||||||
|
|
||||||
|
def download_article_detail(article_id):
    """Fetch OEM detail data for one article and cache it in DETAILS_DIR.

    Returns True when the detail file exists (already cached or freshly
    saved), False when the API returned nothing usable.
    """
    outfile = DETAILS_DIR / f"{article_id}.json"
    if outfile.exists():
        return True

    time.sleep(APIFY_DELAY)
    result = apify_call({
        'endpoint_partsArticleDetailsByArticleId': True,
        'parts_articleId_13': article_id,
        'parts_langId_13': LANG_ID,
    })

    # Accept either response shape: a truthy top-level 'articleOemNo' list,
    # or a nested 'article' dict (the original handled these in two
    # identical branches).
    usable = bool(result) and (
        bool(result.get('articleOemNo')) or isinstance(result.get('article'), dict)
    )
    if usable:
        outfile.write_text(json.dumps(result, indent=1))
        return True
    return False
|
|
||||||
|
|
||||||
|
# ──────────────── Download ────────────────
|
||||||
|
|
||||||
|
def download(brand_filter=None):
    """Download all parts data from TecDoc.

    Phase 1 fetches the article list for every (representative vehicle,
    category) pair; phase 2 collects the unique articleIds seen and fetches
    OEM details for each.  Both phases are resumable: JSON files that
    already exist on disk are never re-downloaded.

    brand_filter: optional case-insensitive substring matched against the
    brand name to limit which representative vehicles are processed.
    """
    PARTS_DIR.mkdir(parents=True, exist_ok=True)
    ARTICLES_DIR.mkdir(parents=True, exist_ok=True)
    DETAILS_DIR.mkdir(parents=True, exist_ok=True)

    categories = load_top_categories()
    print(f"Loaded {len(categories)} top-level categories", flush=True)

    representatives = get_representative_vehicles()
    if brand_filter:
        representatives = [r for r in representatives if brand_filter.upper() in r['brand'].upper()]
    print(f"Found {len(representatives)} representative vehicles for top brands", flush=True)

    # Phase 1: Download articles per vehicle+category
    total_tasks = len(representatives) * len(categories)
    completed = 0
    total_articles = 0

    print(f"\n{'='*60}", flush=True)
    print(f"PHASE 1: Download articles ({total_tasks:,} tasks)", flush=True)
    print(f"{'='*60}", flush=True)

    for i, rep in enumerate(representatives):
        vid = rep['vehicleId']
        brand = rep['brand']
        model = rep['model']

        # Check if all categories already downloaded for this vehicle
        existing = sum(1 for cat_id, _ in categories
                       if (ARTICLES_DIR / f"{vid}_{cat_id}.json").exists())
        if existing == len(categories):
            completed += len(categories)
            continue

        print(f"[{i+1}/{len(representatives)}] {brand} {model} (vid={vid})", flush=True)

        def download_task(args):
            # Thread-pool wrapper: unpack one (vid, cat_id, cat_name) tuple.
            vid, cat_id, cat_name = args
            return download_articles_for_vehicle(vid, cat_id, cat_name)

        # Only categories whose file is missing become download tasks.
        tasks = [(vid, cat_id, cat_name) for cat_id, cat_name in categories
                 if not (ARTICLES_DIR / f"{vid}_{cat_id}.json").exists()]

        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = {executor.submit(download_task, t): t for t in tasks}
            for future in as_completed(futures):
                try:
                    count = future.result()
                    total_articles += count
                    completed += 1
                except Exception as e:
                    # A failed task still counts toward progress.
                    print(f" Task error: {e}", flush=True)
                    completed += 1

        completed += existing  # Count pre-existing

    print(f"\nPhase 1 complete: {total_articles:,} articles found", flush=True)

    # Phase 2: Collect unique articleIds and download OEM details
    print(f"\n{'='*60}", flush=True)
    print(f"PHASE 2: Collect unique articles & download OEM details", flush=True)
    print(f"{'='*60}", flush=True)

    unique_articles = set()
    for f in ARTICLES_DIR.glob("*.json"):
        try:
            articles = json.loads(f.read_text())
            for a in articles:
                if 'articleId' in a:
                    unique_articles.add(a['articleId'])
        except:
            # Unreadable/corrupt article file: ignore it here.
            continue

    # Filter out already downloaded
    to_download = [aid for aid in unique_articles
                   if not (DETAILS_DIR / f"{aid}.json").exists()]

    print(f"Unique articles: {len(unique_articles):,}", flush=True)
    print(f"Already have details: {len(unique_articles) - len(to_download):,}", flush=True)
    print(f"Need to download: {len(to_download):,}", flush=True)

    if to_download:
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = {executor.submit(download_article_detail, aid): aid
                       for aid in to_download}
            done = 0
            for future in as_completed(futures):
                done += 1
                if done % 100 == 0:
                    print(f" Details: {done}/{len(to_download)}", flush=True)

    print(f"\nDownload complete!", flush=True)
|
|
||||||
|
|
||||||
|
# ──────────────── Import ────────────────
|
||||||
|
|
||||||
|
def do_import():
    """Import downloaded parts data into PostgreSQL.

    Walks every article detail file, creating OEM parts, manufacturers and
    aftermarket cross-references.  Commits every 500 detail files so a
    crash loses at most one batch.
    """
    if not ARTICLES_DIR.exists():
        print("No articles directory. Run 'download' first.")
        return

    conn = psycopg2.connect(DB_URL)
    cur = conn.cursor()

    # Load category mapping: tecdoc_id → (id_part_category, name)
    cur.execute("SELECT id_part_category, tecdoc_id, name_part_category FROM part_categories WHERE tecdoc_id IS NOT NULL")
    cat_map = {r[1]: (r[0], r[2]) for r in cur.fetchall()}

    # Load group mapping: tecdoc_id → id_part_group
    # NOTE(review): group_map is built but never used in this function.
    cur.execute("SELECT id_part_group, tecdoc_id, category_id FROM part_groups WHERE tecdoc_id IS NOT NULL")
    group_map = {r[1]: (r[0], r[2]) for r in cur.fetchall()}

    # Load brand mapping from DB
    # NOTE(review): brand_db is also unused in the visible code path.
    cur.execute("SELECT id_brand, name_brand FROM brands")
    brand_db = {r[1].upper(): r[0] for r in cur.fetchall()}

    # Build vehicle mapping: vehicleId → list of MYE ids
    # NOTE(review): representatives and db_models are loaded but not
    # consumed below — vehicle linking happens in link_vehicle_parts.py.
    representatives = get_representative_vehicles()

    # Build vehicleId → model mapping from our DB
    # We need to map TecDoc modelId → our model_id
    cur.execute("""
        SELECT m.id_model, b.name_brand, m.name_model, m.id_brand
        FROM models m JOIN brands b ON m.id_brand = b.id_brand
    """)
    db_models = cur.fetchall()

    stats = {
        'parts_inserted': 0, 'parts_existing': 0,
        'vehicle_parts': 0, 'aftermarket': 0,
        'cross_refs': 0, 'manufacturers': 0
    }

    # Process article detail files
    detail_files = list(DETAILS_DIR.glob("*.json"))
    print(f"Processing {len(detail_files)} article details...", flush=True)

    # Cache for parts by OEM number
    oem_cache = {}  # oem_no → id_part

    # Cache for manufacturers
    mfr_cache = {}  # supplier_name → id_manufacture
    cur.execute("SELECT id_manufacture, name_manufacture FROM manufacturers")
    for r in cur.fetchall():
        mfr_cache[r[1]] = r[0]

    # Cache existing parts
    cur.execute("SELECT oem_part_number, id_part FROM parts WHERE oem_part_number IS NOT NULL")
    for r in cur.fetchall():
        oem_cache[r[0]] = r[1]

    # Build article→vehicles mapping from article files
    article_vehicles = {}  # articleId → set of vehicleIds
    article_category = {}  # articleId → categoryId (TecDoc)

    for f in ARTICLES_DIR.glob("*.json"):
        # Stems are "<vehicleId>_<categoryId>".
        parts = f.stem.split("_")
        if len(parts) != 2:
            continue
        vid, cat_id = int(parts[0]), int(parts[1])

        try:
            articles = json.loads(f.read_text())
        except:
            continue

        for a in articles:
            aid = a.get('articleId')
            if aid:
                if aid not in article_vehicles:
                    article_vehicles[aid] = set()
                article_vehicles[aid].add(vid)
                # Last file seen wins for the article's category.
                article_category[aid] = cat_id

    print(f"Article→vehicle mappings: {len(article_vehicles)}", flush=True)

    batch_count = 0

    for detail_file in detail_files:
        article_id = int(detail_file.stem)

        try:
            data = json.loads(detail_file.read_text())
        except:
            continue

        article = data.get('article', {})
        if not article:
            continue

        article_no = article.get('articleNo', '')
        supplier_name = article.get('supplierName', '')
        product_name = article.get('articleProductName', '')
        supplier_id = article.get('supplierId')

        # Get OEM numbers; articles without them produce nothing.
        oem_numbers = article.get('oemNo', [])
        if not oem_numbers:
            continue

        # Get category for this article
        td_cat_id = article_category.get(article_id)
        cat_info = cat_map.get(td_cat_id)
        cat_db_id = cat_info[0] if cat_info else None

        # Ensure manufacturer exists
        if supplier_name and supplier_name not in mfr_cache:
            cur.execute(
                "INSERT INTO manufacturers (name_manufacture) VALUES (%s) RETURNING id_manufacture",
                (supplier_name,))
            mfr_cache[supplier_name] = cur.fetchone()[0]
            stats['manufacturers'] += 1

        mfr_id = mfr_cache.get(supplier_name)

        # Insert each OEM part
        for oem_entry in oem_numbers:
            oem_no = oem_entry.get('oemDisplayNo', '')
            oem_brand = oem_entry.get('oemBrand', '')
            if not oem_no:
                continue

            # Insert OEM part if not exists
            if oem_no not in oem_cache:
                cur.execute("""
                    INSERT INTO parts (oem_part_number, name_part, name_es, category_id, description)
                    VALUES (%s, %s, %s, %s, %s)
                    ON CONFLICT (oem_part_number) DO UPDATE SET name_part = EXCLUDED.name_part
                    RETURNING id_part
                """, (oem_no, product_name, None, cat_db_id, f"OEM {oem_brand}"))
                oem_cache[oem_no] = cur.fetchone()[0]
                stats['parts_inserted'] += 1
            else:
                stats['parts_existing'] += 1

            part_id = oem_cache[oem_no]

            # Insert aftermarket cross-reference
            # NOTE(review): this inserts into column cross_ref_number while
            # import_phase1.py uses cross_reference_number — one of the two
            # scripts targets a nonexistent column; verify against the schema.
            if article_no and supplier_name:
                cur.execute("""
                    INSERT INTO part_cross_references (part_id, cross_ref_number, id_ref_type, source_ref)
                    VALUES (%s, %s, NULL, %s)
                    ON CONFLICT DO NOTHING
                """, (part_id, article_no, supplier_name))
                stats['cross_refs'] += 1

        batch_count += 1
        if batch_count % 500 == 0:
            conn.commit()
            print(f" Processed {batch_count}/{len(detail_files)} articles, "
                  f"{stats['parts_inserted']} parts inserted", flush=True)

    conn.commit()
    cur.close()
    conn.close()

    print(f"\n{'='*60}", flush=True)
    print(f"IMPORT COMPLETE", flush=True)
    print(f" Parts inserted: {stats['parts_inserted']:,}", flush=True)
    print(f" Parts existing: {stats['parts_existing']:,}", flush=True)
    print(f" Cross-references: {stats['cross_refs']:,}", flush=True)
    print(f" Manufacturers: {stats['manufacturers']:,}", flush=True)
    print(f"{'='*60}", flush=True)
|
|
||||||
|
|
||||||
|
# ──────────────── Status ────────────────
|
||||||
|
|
||||||
|
def status():
    """Show download progress.

    Reports article/detail file counts against the expected totals, rough
    time estimates for the remaining API calls, and a per-brand breakdown.
    """
    categories = load_top_categories()
    representatives = get_representative_vehicles()

    print(f"Representative vehicles: {len(representatives)}")
    print(f"Categories: {len(categories)}")
    print(f"Expected article files: {len(representatives) * len(categories):,}")

    article_files = list(ARTICLES_DIR.glob("*.json")) if ARTICLES_DIR.exists() else []
    detail_files = list(DETAILS_DIR.glob("*.json")) if DETAILS_DIR.exists() else []

    # Count unique articleIds
    unique_articles = set()
    total_article_count = 0
    for f in article_files:
        try:
            articles = json.loads(f.read_text())
            for a in articles:
                if 'articleId' in a:
                    unique_articles.add(a['articleId'])
            # Total includes duplicates across vehicles/categories.
            total_article_count += len(articles)
        except:
            continue

    expected = len(representatives) * len(categories)
    pct_articles = len(article_files) / expected * 100 if expected > 0 else 0

    print(f"\nArticle files: {len(article_files):,} / {expected:,} ({pct_articles:.1f}%)")
    print(f"Total articles: {total_article_count:,}")
    print(f"Unique articleIds: {len(unique_articles):,}")
    print(f"Detail files: {len(detail_files):,} / {len(unique_articles):,}")

    if expected > 0:
        remaining = expected - len(article_files)
        # Rough estimate: ~3s per call plus the per-thread delay, spread
        # over MAX_WORKERS threads.
        est_minutes = remaining * (APIFY_DELAY + 3) / MAX_WORKERS / 60
        print(f"\nEst. remaining (articles): ~{est_minutes:.0f} min ({remaining:,} calls)")

    remaining_details = len(unique_articles) - len(detail_files)
    if remaining_details > 0:
        est_detail_min = remaining_details * (APIFY_DELAY + 3) / MAX_WORKERS / 60
        print(f"Est. remaining (details): ~{est_detail_min:.0f} min ({remaining_details:,} calls)")

    # Per-brand breakdown
    print(f"\n{'Brand':20s} {'Models':>7} {'Done':>7} {'%':>6}")
    print("-" * 44)
    for brand in sorted(TOP_BRANDS):
        brand_reps = [r for r in representatives if r['brand'].upper() == brand]
        brand_done = sum(1 for r in brand_reps
                         for cat_id, _ in categories
                         if (ARTICLES_DIR / f"{r['vehicleId']}_{cat_id}.json").exists())
        brand_total = len(brand_reps) * len(categories)
        pct = brand_done / brand_total * 100 if brand_total > 0 else 0
        print(f" {brand:18s} {len(brand_reps):>7} {brand_done:>7} {pct:>5.1f}%")
|
|
||||||
|
|
||||||
|
# CLI entry point: download = phases 1-3 (API fetch), import = phase 4
# (DB load), status = progress report.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="TecDoc parts import")
    parser.add_argument("command", choices=["download", "import", "status"])
    parser.add_argument("--brand", help="Filter by brand name")
    args = parser.parse_args()

    if args.command == "download":
        download(brand_filter=args.brand)
    elif args.command == "import":
        do_import()
    elif args.command == "status":
        status()
||||||
251
scripts/link_vehicle_parts.py
Normal file
251
scripts/link_vehicle_parts.py
Normal file
@@ -0,0 +1,251 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Link parts to vehicles using TecDoc article files.
|
||||||
|
Maps: article file (vehicleId_categoryId.json) → parts → vehicle_parts (MYE ids)
|
||||||
|
Optimized v3: year+engine filtering + batch inserts.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import psycopg2
|
||||||
|
from psycopg2.extras import execute_values
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# SECURITY NOTE(review): database credentials committed in source; prefer
# reading DATABASE_URL from the environment as import_tecdoc_parts.py does.
DB_URL = "postgresql://nexus:nexus_autoparts_2026@localhost/nexus_autoparts"
DATA_DIR = Path("/home/Autopartes/data/tecdoc")
ARTICLES_DIR = DATA_DIR / "parts" / "articles"
DETAILS_DIR = DATA_DIR / "parts" / "details"

# Rows buffered before each execute_values flush into vehicle_parts.
BATCH_SIZE = 50000
|
|
||||||
|
|
||||||
|
def parse_capacity_liters(cap):
    """Convert a TecDoc capacity value in cc (e.g. '1998.0000') to liters.

    Returns the displacement rounded to one decimal (1998 cc -> 2.0), or
    None when *cap* is missing or not numeric.
    """
    try:
        cc = float(cap)
    except (TypeError, ValueError):
        # None or a non-numeric string: no capacity available.
        # (Narrowed from a bare `except:` which also swallowed
        # KeyboardInterrupt/SystemExit.)
        return None
    return round(cc / 1000, 1)
|
|
||||||
|
|
||||||
|
def extract_engine_liters(engine_name):
    """Extract displacement in liters from a name like '2.0L 4cyl 127hp'.

    Returns the leading decimal liter figure rounded to one decimal, or
    None when the name does not start with an '<x.y>L' pattern.
    """
    match = re.match(r'(\d+\.\d+)L', engine_name)
    return round(float(match.group(1)), 1) if match else None
|
|
||||||
|
|
||||||
|
def run():
|
||||||
|
conn = psycopg2.connect(DB_URL)
|
||||||
|
cur = conn.cursor()
|
||||||
|
|
||||||
|
# Step 1: Build vehicleId → vehicle info from TecDoc files
|
||||||
|
print("Building vehicleId → vehicle info mapping...", flush=True)
|
||||||
|
mfrs = json.loads((DATA_DIR / "manufacturers.json").read_text())
|
||||||
|
vid_info = {} # vehicleId → {brand, model, year_start, year_end, liters}
|
||||||
|
for mfr in mfrs:
|
||||||
|
brand = mfr['manufacturerName']
|
||||||
|
if '(' in brand:
|
||||||
|
continue
|
||||||
|
mfr_id = mfr['manufacturerId']
|
||||||
|
model_file = DATA_DIR / "models" / f"{mfr_id}.json"
|
||||||
|
if not model_file.exists():
|
||||||
|
continue
|
||||||
|
models = json.loads(model_file.read_text())
|
||||||
|
for model in models:
|
||||||
|
model_name = model.get('modelName', '')
|
||||||
|
if not model_name:
|
||||||
|
continue
|
||||||
|
vehicle_file = DATA_DIR / "vehicles" / f"{model['modelId']}.json"
|
||||||
|
if not vehicle_file.exists():
|
||||||
|
continue
|
||||||
|
vehicles = json.loads(vehicle_file.read_text())
|
||||||
|
if not vehicles:
|
||||||
|
continue
|
||||||
|
for v in vehicles:
|
||||||
|
vid = v.get('vehicleId')
|
||||||
|
if not vid:
|
||||||
|
continue
|
||||||
|
# Parse year range
|
||||||
|
year_start = None
|
||||||
|
year_end = None
|
||||||
|
try:
|
||||||
|
cs = v.get('constructionIntervalStart', '')
|
||||||
|
if cs:
|
||||||
|
year_start = int(cs[:4])
|
||||||
|
ce = v.get('constructionIntervalEnd', '')
|
||||||
|
if ce:
|
||||||
|
year_end = int(ce[:4])
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
# Parse engine capacity
|
||||||
|
liters = parse_capacity_liters(v.get('capacityLt') or v.get('capacityTax'))
|
||||||
|
vid_info[vid] = {
|
||||||
|
'brand': brand,
|
||||||
|
'model': model_name,
|
||||||
|
'year_start': year_start,
|
||||||
|
'year_end': year_end,
|
||||||
|
'liters': liters,
|
||||||
|
}
|
||||||
|
|
||||||
|
print(f" {len(vid_info):,} vehicleIds mapped", flush=True)
|
||||||
|
|
||||||
|
# Step 2: Build (brand, modelName) → list of (mye_id, year, liters) from our DB
|
||||||
|
print("Building brand/model → MYE details mapping...", flush=True)
|
||||||
|
cur.execute("""
|
||||||
|
SELECT b.name_brand, m.name_model, mye.id_mye, y.year_car, e.name_engine
|
||||||
|
FROM model_year_engine mye
|
||||||
|
JOIN models m ON mye.model_id = m.id_model
|
||||||
|
JOIN brands b ON m.brand_id = b.id_brand
|
||||||
|
JOIN years y ON mye.year_id = y.id_year
|
||||||
|
JOIN engines e ON mye.engine_id = e.id_engine
|
||||||
|
""")
|
||||||
|
brand_model_to_myes = {}
|
||||||
|
for brand, model, mye_id, year, engine_name in cur.fetchall():
|
||||||
|
key = (brand, model)
|
||||||
|
liters = extract_engine_liters(engine_name)
|
||||||
|
if key not in brand_model_to_myes:
|
||||||
|
brand_model_to_myes[key] = []
|
||||||
|
brand_model_to_myes[key].append((mye_id, year, liters))
|
||||||
|
|
||||||
|
print(f" {len(brand_model_to_myes):,} brand/model combos with {sum(len(v) for v in brand_model_to_myes.values()):,} MYEs", flush=True)
|
||||||
|
|
||||||
|
# Step 3: Build OEM number → part_id from DB
|
||||||
|
print("Loading parts cache...", flush=True)
|
||||||
|
cur.execute("SELECT oem_part_number, id_part FROM parts WHERE oem_part_number IS NOT NULL")
|
||||||
|
part_cache = {r[0]: r[1] for r in cur.fetchall()}
|
||||||
|
print(f" {len(part_cache):,} parts cached", flush=True)
|
||||||
|
|
||||||
|
# Step 4: Load detail files to get articleId → OEM numbers
|
||||||
|
print("Loading article detail OEM mappings...", flush=True)
|
||||||
|
article_to_oems = {}
|
||||||
|
for f in DETAILS_DIR.glob("*.json"):
|
||||||
|
try:
|
||||||
|
data = json.loads(f.read_text())
|
||||||
|
oem_list = data.get('articleOemNo', [])
|
||||||
|
if oem_list:
|
||||||
|
oem_nos = [o.get('oemDisplayNo') for o in oem_list if o.get('oemDisplayNo')]
|
||||||
|
if oem_nos:
|
||||||
|
article_to_oems[int(f.stem)] = oem_nos
|
||||||
|
except:
|
||||||
|
continue
|
||||||
|
print(f" {len(article_to_oems):,} articles with OEM data", flush=True)
|
||||||
|
|
||||||
|
# Step 5: Process article files and create vehicle_parts
|
||||||
|
print("\nCreating vehicle_parts links (filtered + batch mode)...", flush=True)
|
||||||
|
|
||||||
|
stats = {'links': 0, 'skipped_no_mye': 0, 'skipped_no_part': 0, 'files': 0, 'filtered_out': 0}
|
||||||
|
pending = []
|
||||||
|
|
||||||
|
def flush_batch():
    """Bulk-insert the accumulated vehicle_parts rows, commit, and clear the buffer.

    No-op when the buffer is empty; duplicates are ignored via ON CONFLICT.
    """
    if pending:
        execute_values(
            cur,
            """
            INSERT INTO vehicle_parts (model_year_engine_id, part_id, quantity_required)
            VALUES %s ON CONFLICT DO NOTHING
            """,
            pending,
            page_size=10000,
        )
        conn.commit()
        pending.clear()
|
||||||
|
|
||||||
|
# Walk every "<vehicleId>_<category>.json" article file, narrow the candidate
# MYEs by construction-year interval and engine displacement, then link each
# matched part to every surviving MYE.
article_files = sorted(ARTICLES_DIR.glob("*.json"))
for f in article_files:
    parts_split = f.stem.split("_")
    if len(parts_split) != 2:
        continue
    try:
        vid = int(parts_split[0])
    except ValueError:
        # Defensive: non-numeric vehicle-id stem would previously crash the run.
        continue

    info = vid_info.get(vid)
    if not info:
        stats['skipped_no_mye'] += 1
        continue

    all_myes = brand_model_to_myes.get((info['brand'], info['model']), [])
    if not all_myes:
        stats['skipped_no_mye'] += 1
        continue

    # Filter MYEs by year range and engine capacity
    td_ys = info['year_start']
    td_ye = info['year_end']
    td_lit = info['liters']

    filtered_myes = []
    for mye_id, mye_year, mye_liters in all_myes:
        # Year filter: MYE year must fall within TecDoc construction interval.
        if td_ys and td_ye:
            if mye_year < td_ys or mye_year > td_ye:
                stats['filtered_out'] += 1
                continue
        elif td_ys:
            # Open-ended interval: only a lower bound is known.
            if mye_year < td_ys:
                stats['filtered_out'] += 1
                continue

        # Engine capacity filter: must match within 0.2L tolerance.
        if td_lit and mye_liters:
            if abs(td_lit - mye_liters) > 0.2:
                stats['filtered_out'] += 1
                continue

        filtered_myes.append(mye_id)

    if not filtered_myes:
        # Filtering removed every candidate MYE: nothing to link.
        stats['skipped_no_mye'] += 1
        continue

    try:
        articles = json.loads(f.read_text())
    except (OSError, ValueError):
        # Unreadable or malformed article file — skip it. (Was a bare `except:`.)
        continue

    for a in articles:
        aid = a.get('articleId')
        if not aid:
            continue

        # Resolve the article's OEM numbers to part ids already in the DB.
        # (Unused articleNo/supplierName locals removed.)
        part_ids = set()
        for oem_no in article_to_oems.get(aid, []):
            pid = part_cache.get(oem_no)
            if pid:
                part_ids.add(pid)

        if not part_ids:
            stats['skipped_no_part'] += 1
            continue

        for mye_id in filtered_myes:
            for part_id in part_ids:
                pending.append((mye_id, part_id, 1))
                stats['links'] += 1

        if len(pending) >= BATCH_SIZE:
            flush_batch()

    stats['files'] += 1
    if stats['files'] % 500 == 0:
        flush_batch()
        print(f" {stats['files']:,}/{len(article_files):,} files | "
              f"{stats['links']:,} links | {stats['filtered_out']:,} filtered out", flush=True)

# Flush whatever remains in the buffer, then release DB resources.
flush_batch()
cur.close()
conn.close()
|
||||||
|
|
||||||
|
# Final run summary, framed by a 50-char rule.
rule = '=' * 50
summary_lines = (
    f"\n{rule}",
    "LINKING COMPLETE",
    f" Files processed: {stats['files']:,}",
    f" Links created: {stats['links']:,}",
    f" Filtered out: {stats['filtered_out']:,}",
    f" Skipped (no MYE): {stats['skipped_no_mye']:,}",
    f" Skipped (no part):{stats['skipped_no_part']:,}",
    rule,
)
for line in summary_lines:
    print(line, flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":  # script entry point
    run()
|
||||||
Reference in New Issue
Block a user