fix: stop creating AFT- placeholder parts in import pipeline
- import_phase1.py: skip AFT- part creation when no OEM data
- link_vehicle_parts.py: remove AFT- fallback lookup in part cache
- import_tecdoc_parts.py: add VW to TOP_BRANDS list

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
251
scripts/link_vehicle_parts.py
Normal file
251
scripts/link_vehicle_parts.py
Normal file
@@ -0,0 +1,251 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Link parts to vehicles using TecDoc article files.
|
||||
Maps: article file (vehicleId_categoryId.json) → parts → vehicle_parts (MYE ids)
|
||||
Optimized v3: year+engine filtering + batch inserts.
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_values
|
||||
from pathlib import Path
|
||||
|
||||
DB_URL = "postgresql://nexus:nexus_autoparts_2026@localhost/nexus_autoparts"
|
||||
DATA_DIR = Path("/home/Autopartes/data/tecdoc")
|
||||
ARTICLES_DIR = DATA_DIR / "parts" / "articles"
|
||||
DETAILS_DIR = DATA_DIR / "parts" / "details"
|
||||
|
||||
BATCH_SIZE = 50000
|
||||
|
||||
|
||||
def parse_capacity_liters(cap):
    """Convert TecDoc capacityLt (e.g. '1998.0000' cc) to liters, rounded to
    one decimal (1998 cc -> 2.0).

    Returns None when *cap* is missing (None) or not numeric.
    """
    try:
        cc = float(cap)
    except (TypeError, ValueError):
        # cap may be None or a non-numeric string in TecDoc data — treat as unknown.
        return None
    return round(cc / 1000, 1)
|
||||
|
||||
|
||||
def extract_engine_liters(engine_name):
    """Extract displacement in liters from an engine name like '2.0L 4cyl 127hp'.

    Uses re.search (not the anchored re.match) so names where the displacement
    token is not leading (e.g. 'V6 3.5L') are also handled; names that start
    with the token behave exactly as before.

    Returns the liters as a float rounded to one decimal, or None when no
    '<digits>.<digits>L' token is present.
    """
    m = re.search(r'(\d+\.\d+)L', engine_name)
    if m:
        return round(float(m.group(1)), 1)
    return None
|
||||
|
||||
|
||||
def run():
    """Link TecDoc articles to vehicles, writing rows into vehicle_parts.

    Pipeline:
      1. Map TecDoc vehicleId -> {brand, model, year range, engine liters}
         from the manufacturers/models/vehicles JSON files.
      2. Map (brand, model) -> list of (mye_id, year, liters) from our DB.
      3. Cache OEM part number -> part_id from our DB.
      4. Cache articleId -> OEM numbers from the detail JSON files.
      5. Walk article files (named '<vehicleId>_<categoryId>.json'), filter
         candidate MYEs by year range and engine displacement, and batch-insert
         (mye_id, part_id, 1) links with ON CONFLICT DO NOTHING.
    """
    conn = psycopg2.connect(DB_URL)
    cur = conn.cursor()

    # Step 1: Build vehicleId → vehicle info from TecDoc files
    print("Building vehicleId → vehicle info mapping...", flush=True)
    mfrs = json.loads((DATA_DIR / "manufacturers.json").read_text())
    vid_info = {}  # vehicleId → {brand, model, year_start, year_end, liters}
    for mfr in mfrs:
        brand = mfr['manufacturerName']
        # Names containing '(' look like regional/variant duplicates — skipped
        # by the original logic; keep that behavior.
        if '(' in brand:
            continue
        mfr_id = mfr['manufacturerId']
        model_file = DATA_DIR / "models" / f"{mfr_id}.json"
        if not model_file.exists():
            continue
        models = json.loads(model_file.read_text())
        for model in models:
            model_name = model.get('modelName', '')
            if not model_name:
                continue
            vehicle_file = DATA_DIR / "vehicles" / f"{model['modelId']}.json"
            if not vehicle_file.exists():
                continue
            vehicles = json.loads(vehicle_file.read_text())
            if not vehicles:
                continue
            for v in vehicles:
                vid = v.get('vehicleId')
                if not vid:
                    continue
                # Parse year range from 'YYYY...' interval strings (best-effort).
                year_start = None
                year_end = None
                try:
                    cs = v.get('constructionIntervalStart', '')
                    if cs:
                        year_start = int(cs[:4])
                    ce = v.get('constructionIntervalEnd', '')
                    if ce:
                        year_end = int(ce[:4])
                except (TypeError, ValueError):
                    # Malformed interval — leave whichever year is unset as None.
                    pass
                # Parse engine capacity (capacityLt preferred, capacityTax fallback).
                liters = parse_capacity_liters(v.get('capacityLt') or v.get('capacityTax'))
                vid_info[vid] = {
                    'brand': brand,
                    'model': model_name,
                    'year_start': year_start,
                    'year_end': year_end,
                    'liters': liters,
                }

    print(f" {len(vid_info):,} vehicleIds mapped", flush=True)

    # Step 2: Build (brand, modelName) → list of (mye_id, year, liters) from our DB
    print("Building brand/model → MYE details mapping...", flush=True)
    cur.execute("""
        SELECT b.name_brand, m.name_model, mye.id_mye, y.year_car, e.name_engine
        FROM model_year_engine mye
        JOIN models m ON mye.model_id = m.id_model
        JOIN brands b ON m.brand_id = b.id_brand
        JOIN years y ON mye.year_id = y.id_year
        JOIN engines e ON mye.engine_id = e.id_engine
    """)
    brand_model_to_myes = {}
    for brand, model, mye_id, year, engine_name in cur.fetchall():
        key = (brand, model)
        liters = extract_engine_liters(engine_name)
        if key not in brand_model_to_myes:
            brand_model_to_myes[key] = []
        brand_model_to_myes[key].append((mye_id, year, liters))

    print(f" {len(brand_model_to_myes):,} brand/model combos with {sum(len(v) for v in brand_model_to_myes.values()):,} MYEs", flush=True)

    # Step 3: Build OEM number → part_id from DB
    print("Loading parts cache...", flush=True)
    cur.execute("SELECT oem_part_number, id_part FROM parts WHERE oem_part_number IS NOT NULL")
    part_cache = {r[0]: r[1] for r in cur.fetchall()}
    print(f" {len(part_cache):,} parts cached", flush=True)

    # Step 4: Load detail files to get articleId → OEM numbers
    print("Loading article detail OEM mappings...", flush=True)
    article_to_oems = {}
    for f in DETAILS_DIR.glob("*.json"):
        try:
            data = json.loads(f.read_text())
            oem_list = data.get('articleOemNo', [])
            if oem_list:
                oem_nos = [o.get('oemDisplayNo') for o in oem_list if o.get('oemDisplayNo')]
                if oem_nos:
                    # Filenames are '<articleId>.json'; int() also guards bad stems.
                    article_to_oems[int(f.stem)] = oem_nos
        except (OSError, ValueError):
            # Unreadable file, invalid JSON (JSONDecodeError ⊂ ValueError),
            # or a non-numeric stem — skip this detail file.
            continue
    print(f" {len(article_to_oems):,} articles with OEM data", flush=True)

    # Step 5: Process article files and create vehicle_parts
    print("\nCreating vehicle_parts links (filtered + batch mode)...", flush=True)

    stats = {'links': 0, 'skipped_no_mye': 0, 'skipped_no_part': 0, 'files': 0, 'filtered_out': 0}
    pending = []

    def flush_batch():
        # Insert all buffered links in one round trip and commit.
        if not pending:
            return
        execute_values(cur, """
            INSERT INTO vehicle_parts (model_year_engine_id, part_id, quantity_required)
            VALUES %s ON CONFLICT DO NOTHING
        """, pending, page_size=10000)
        conn.commit()
        pending.clear()

    article_files = sorted(ARTICLES_DIR.glob("*.json"))
    for f in article_files:
        # Article files are named '<vehicleId>_<categoryId>.json'.
        parts_split = f.stem.split("_")
        if len(parts_split) != 2:
            continue
        try:
            vid = int(parts_split[0])
        except ValueError:
            # Unexpected filename — stem prefix is not a numeric vehicleId.
            continue

        info = vid_info.get(vid)
        if not info:
            stats['skipped_no_mye'] += 1
            continue

        bm = (info['brand'], info['model'])
        all_myes = brand_model_to_myes.get(bm, [])
        if not all_myes:
            stats['skipped_no_mye'] += 1
            continue

        # Filter MYEs by year range and engine capacity
        td_ys = info['year_start']
        td_ye = info['year_end']
        td_lit = info['liters']

        filtered_myes = []
        for mye_id, mye_year, mye_liters in all_myes:
            # Year filter: MYE year must fall within TecDoc construction interval
            # (open-ended when only the start year is known).
            if td_ys and td_ye:
                if mye_year < td_ys or mye_year > td_ye:
                    stats['filtered_out'] += 1
                    continue
            elif td_ys:
                if mye_year < td_ys:
                    stats['filtered_out'] += 1
                    continue

            # Engine capacity filter: must match within 0.2L tolerance
            # (skipped when either side is unknown/None).
            if td_lit and mye_liters:
                if abs(td_lit - mye_liters) > 0.2:
                    stats['filtered_out'] += 1
                    continue

            filtered_myes.append(mye_id)

        if not filtered_myes:
            # Fallback: if filtering removed everything, skip
            stats['skipped_no_mye'] += 1
            continue

        try:
            articles = json.loads(f.read_text())
        except (OSError, ValueError):
            # Unreadable or invalid JSON article file — skip it.
            continue

        for a in articles:
            aid = a.get('articleId')
            if not aid:
                continue

            # Resolve this article's OEM numbers to our part ids.
            part_ids = set()
            oem_nos = article_to_oems.get(aid, [])
            for oem_no in oem_nos:
                pid = part_cache.get(oem_no)
                if pid:
                    part_ids.add(pid)

            if not part_ids:
                stats['skipped_no_part'] += 1
                continue

            # Cross product: every surviving MYE gets every matched part.
            for mye_id in filtered_myes:
                for part_id in part_ids:
                    pending.append((mye_id, part_id, 1))
                    stats['links'] += 1

            if len(pending) >= BATCH_SIZE:
                flush_batch()

        stats['files'] += 1
        if stats['files'] % 500 == 0:
            flush_batch()
            print(f" {stats['files']:,}/{len(article_files):,} files | "
                  f"{stats['links']:,} links | {stats['filtered_out']:,} filtered out", flush=True)

    flush_batch()
    cur.close()
    conn.close()

    print(f"\n{'='*50}", flush=True)
    print(f"LINKING COMPLETE", flush=True)
    print(f" Files processed: {stats['files']:,}", flush=True)
    print(f" Links created: {stats['links']:,}", flush=True)
    print(f" Filtered out: {stats['filtered_out']:,}", flush=True)
    print(f" Skipped (no MYE): {stats['skipped_no_mye']:,}", flush=True)
    print(f" Skipped (no part):{stats['skipped_no_part']:,}", flush=True)
    print(f"{'='*50}", flush=True)
|
||||
|
||||
|
||||
# Script entry point: run the full linking pipeline when executed directly.
if __name__ == "__main__":
    run()
|
||||
Reference in New Issue
Block a user