#!/usr/bin/env python3
"""
Link parts to vehicles using TecDoc article files.

Maps: article file (vehicleId_categoryId.json) → parts → vehicle_parts (MYE ids)
Optimized v3: year+engine filtering + batch inserts.
"""
import json
import re
from pathlib import Path

import psycopg2
from psycopg2.extras import execute_values

DB_URL = "postgresql://nexus:nexus_autoparts_2026@localhost/nexus_autoparts"
DATA_DIR = Path("/home/Autopartes/data/tecdoc")
ARTICLES_DIR = DATA_DIR / "parts" / "articles"
DETAILS_DIR = DATA_DIR / "parts" / "details"
BATCH_SIZE = 50000

# Compiled once: extract_engine_liters runs once per model_year_engine row.
_LITERS_RE = re.compile(r'(\d+\.\d+)L')


def parse_capacity_liters(cap):
    """Convert TecDoc capacityLt (e.g. '1998.0000' cc) to liters, rounded to
    one decimal (1998 cc → 2.0). Returns None when *cap* is missing or not
    numeric."""
    try:
        cc = float(cap)
    except (TypeError, ValueError):
        return None
    return round(cc / 1000, 1)


def extract_engine_liters(engine_name):
    """Extract displacement in liters from an engine name like
    '2.0L 4cyl 127hp'. Returns None when the name does not start with a
    '<digits>.<digits>L' token."""
    m = _LITERS_RE.match(engine_name)
    if m:
        return round(float(m.group(1)), 1)
    return None


def _build_vehicle_info():
    """Step 1: map TecDoc vehicleId → {brand, model, year_start, year_end, liters}
    by walking manufacturers.json → models/<mfrId>.json → vehicles/<modelId>.json."""
    mfrs = json.loads((DATA_DIR / "manufacturers.json").read_text())
    vid_info = {}
    for mfr in mfrs:
        brand = mfr['manufacturerName']
        if '(' in brand:
            # Parenthesized manufacturer names are excluded (same rule as the
            # rest of the pipeline — presumably regional duplicates; confirm).
            continue
        model_file = DATA_DIR / "models" / f"{mfr['manufacturerId']}.json"
        if not model_file.exists():
            continue
        for model in json.loads(model_file.read_text()):
            model_name = model.get('modelName', '')
            if not model_name:
                continue
            vehicle_file = DATA_DIR / "vehicles" / f"{model['modelId']}.json"
            if not vehicle_file.exists():
                continue
            vehicles = json.loads(vehicle_file.read_text())
            if not vehicles:
                continue
            for v in vehicles:
                vid = v.get('vehicleId')
                if not vid:
                    continue
                # Construction interval strings start with a 4-digit year.
                year_start = None
                year_end = None
                try:
                    cs = v.get('constructionIntervalStart', '')
                    if cs:
                        year_start = int(cs[:4])
                    ce = v.get('constructionIntervalEnd', '')
                    if ce:
                        year_end = int(ce[:4])
                except (TypeError, ValueError):
                    pass  # malformed interval: keep whatever half parsed
                liters = parse_capacity_liters(v.get('capacityLt') or v.get('capacityTax'))
                vid_info[vid] = {
                    'brand': brand,
                    'model': model_name,
                    'year_start': year_start,
                    'year_end': year_end,
                    'liters': liters,
                }
    return vid_info


def _load_mye_map(cur):
    """Step 2: map (brand_name, model_name) → list of (mye_id, year, liters)
    from our own model_year_engine table."""
    cur.execute("""
        SELECT b.name_brand, m.name_model, mye.id_mye, y.year_car, e.name_engine
        FROM model_year_engine mye
        JOIN models m ON mye.model_id = m.id_model
        JOIN brands b ON m.brand_id = b.id_brand
        JOIN years y ON mye.year_id = y.id_year
        JOIN engines e ON mye.engine_id = e.id_engine
    """)
    brand_model_to_myes = {}
    for brand, model, mye_id, year, engine_name in cur.fetchall():
        liters = extract_engine_liters(engine_name)
        brand_model_to_myes.setdefault((brand, model), []).append((mye_id, year, liters))
    return brand_model_to_myes


def _load_article_oems():
    """Step 4: map TecDoc articleId → list of OEM display numbers from the
    per-article detail files (file stem is the articleId)."""
    article_to_oems = {}
    for f in DETAILS_DIR.glob("*.json"):
        try:
            data = json.loads(f.read_text())
            oem_list = data.get('articleOemNo', [])
            if oem_list:
                oem_nos = [o.get('oemDisplayNo') for o in oem_list if o.get('oemDisplayNo')]
                if oem_nos:
                    article_to_oems[int(f.stem)] = oem_nos
        except (OSError, ValueError):
            # ValueError covers both json.JSONDecodeError and a non-numeric stem.
            continue
    return article_to_oems


def _filter_myes(all_myes, info, stats):
    """Return the MYE ids from *all_myes* compatible with the TecDoc vehicle
    described by *info*: year within the construction interval, displacement
    within 0.2 L. Increments stats['filtered_out'] for each rejection."""
    td_ys = info['year_start']
    td_ye = info['year_end']
    td_lit = info['liters']
    filtered = []
    for mye_id, mye_year, mye_liters in all_myes:
        # Year filter: MYE year must fall within TecDoc construction interval.
        if td_ys and td_ye:
            if mye_year < td_ys or mye_year > td_ye:
                stats['filtered_out'] += 1
                continue
        elif td_ys:
            if mye_year < td_ys:
                stats['filtered_out'] += 1
                continue
        # Engine capacity filter: must match within 0.2 L tolerance
        # (skipped when either side is unknown).
        if td_lit and mye_liters:
            if abs(td_lit - mye_liters) > 0.2:
                stats['filtered_out'] += 1
                continue
        filtered.append(mye_id)
    return filtered


def run():
    """Entry point: build the lookup maps, then walk every article file and
    batch-insert (model_year_engine_id, part_id) links into vehicle_parts."""
    conn = psycopg2.connect(DB_URL)
    cur = conn.cursor()
    try:
        # Step 1: vehicleId → vehicle info from TecDoc files
        print("Building vehicleId → vehicle info mapping...", flush=True)
        vid_info = _build_vehicle_info()
        print(f" {len(vid_info):,} vehicleIds mapped", flush=True)

        # Step 2: (brand, model) → [(mye_id, year, liters), ...] from our DB
        print("Building brand/model → MYE details mapping...", flush=True)
        brand_model_to_myes = _load_mye_map(cur)
        print(f" {len(brand_model_to_myes):,} brand/model combos with "
              f"{sum(len(v) for v in brand_model_to_myes.values()):,} MYEs", flush=True)

        # Step 3: OEM number → part_id from DB
        print("Loading parts cache...", flush=True)
        cur.execute("SELECT oem_part_number, id_part FROM parts WHERE oem_part_number IS NOT NULL")
        part_cache = {r[0]: r[1] for r in cur.fetchall()}
        print(f" {len(part_cache):,} parts cached", flush=True)

        # Step 4: articleId → OEM numbers from detail files
        print("Loading article detail OEM mappings...", flush=True)
        article_to_oems = _load_article_oems()
        print(f" {len(article_to_oems):,} articles with OEM data", flush=True)

        # Step 5: process article files and create vehicle_parts
        print("\nCreating vehicle_parts links (filtered + batch mode)...", flush=True)
        stats = {'links': 0, 'skipped_no_mye': 0, 'skipped_no_part': 0,
                 'files': 0, 'filtered_out': 0}
        pending = []

        def flush_batch():
            # Bulk-insert accumulated links and commit; no-op when empty.
            if not pending:
                return
            execute_values(cur, """
                INSERT INTO vehicle_parts (model_year_engine_id, part_id, quantity_required)
                VALUES %s
                ON CONFLICT DO NOTHING
            """, pending, page_size=10000)
            conn.commit()
            pending.clear()

        article_files = sorted(ARTICLES_DIR.glob("*.json"))
        for f in article_files:
            # File name is '<vehicleId>_<categoryId>.json'.
            parts_split = f.stem.split("_")
            if len(parts_split) != 2:
                continue
            vid = int(parts_split[0])
            info = vid_info.get(vid)
            if not info:
                stats['skipped_no_mye'] += 1
                continue
            all_myes = brand_model_to_myes.get((info['brand'], info['model']), [])
            if not all_myes:
                stats['skipped_no_mye'] += 1
                continue
            filtered_myes = _filter_myes(all_myes, info, stats)
            if not filtered_myes:
                # Filtering removed everything for this vehicle: skip the file.
                stats['skipped_no_mye'] += 1
                continue
            try:
                articles = json.loads(f.read_text())
            except (OSError, ValueError):
                continue
            for a in articles:
                aid = a.get('articleId')
                if not aid:
                    continue
                part_ids = set()
                for oem_no in article_to_oems.get(aid, []):
                    pid = part_cache.get(oem_no)
                    if pid:
                        part_ids.add(pid)
                if not part_ids:
                    stats['skipped_no_part'] += 1
                    continue
                for mye_id in filtered_myes:
                    for part_id in part_ids:
                        pending.append((mye_id, part_id, 1))
                        stats['links'] += 1
                        if len(pending) >= BATCH_SIZE:
                            flush_batch()
            stats['files'] += 1
            if stats['files'] % 500 == 0:
                flush_batch()
                print(f" {stats['files']:,}/{len(article_files):,} files | "
                      f"{stats['links']:,} links | {stats['filtered_out']:,} filtered out",
                      flush=True)
        flush_batch()
    finally:
        # Release DB resources even when a step above raises.
        cur.close()
        conn.close()

    print(f"\n{'='*50}", flush=True)
    print("LINKING COMPLETE", flush=True)
    print(f" Files processed: {stats['files']:,}", flush=True)
    print(f" Links created: {stats['links']:,}", flush=True)
    print(f" Filtered out: {stats['filtered_out']:,}", flush=True)
    print(f" Skipped (no MYE): {stats['skipped_no_mye']:,}", flush=True)
    print(f" Skipped (no part):{stats['skipped_no_part']:,}", flush=True)
    print(f"{'='*50}", flush=True)


if __name__ == "__main__":
    run()