Autoparts-DB/scripts/link_vehicle_parts.py
consultoria-as eff04a5e60 fix: stop creating AFT- placeholder parts in import pipeline
- import_phase1.py: skip AFT- part creation when no OEM data
- link_vehicle_parts.py: remove AFT- fallback lookup in part cache
- import_tecdoc_parts.py: add VW to TOP_BRANDS list

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-18 22:25:21 +00:00

#!/usr/bin/env python3
"""
Link parts to vehicles using TecDoc article files.
Maps: article file (vehicleId_categoryId.json) → parts → vehicle_parts (MYE ids)
Optimized v3: year+engine filtering + batch inserts.
"""
import json
import re
from pathlib import Path

import psycopg2
from psycopg2.extras import execute_values

DB_URL = "postgresql://nexus:nexus_autoparts_2026@localhost/nexus_autoparts"
DATA_DIR = Path("/home/Autopartes/data/tecdoc")
ARTICLES_DIR = DATA_DIR / "parts" / "articles"
DETAILS_DIR = DATA_DIR / "parts" / "details"
BATCH_SIZE = 50000
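
# Expected on-disk layout under DATA_DIR (inferred from the reads below; the
# exact file contents are assumptions based only on the fields this script uses):
#   manufacturers.json           -> [{"manufacturerId": ..., "manufacturerName": ...}, ...]
#   models/{manufacturerId}.json -> [{"modelId": ..., "modelName": ...}, ...]
#   vehicles/{modelId}.json      -> list of vehicle dicts (see Step 1 in run())
#   parts/articles/{vehicleId}_{categoryId}.json -> articles fitting one vehicle
#   parts/details/{articleId}.json               -> per-article detail incl. OEM numbers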


def parse_capacity_liters(cap):
    """Convert TecDoc capacityLt (e.g. '1998.0000' cc) to a liters float (2.0 after 1-dp rounding)."""
    try:
        cc = float(cap)
        return round(cc / 1000, 1)
    except (TypeError, ValueError):
        return None


def extract_engine_liters(engine_name):
    """Extract liters from an engine name like '2.0L 4cyl 127hp'."""
    m = re.match(r'(\d+\.\d+)L', engine_name)
    if m:
        return round(float(m.group(1)), 1)
    return None
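
# Expected helper behavior (illustrative calls; input formats are inferred from
# the docstrings above, not from a TecDoc spec):
#   parse_capacity_liters("1998.0000")        -> 2.0   (1998 cc rounded to 1 dp)
#   parse_capacity_liters(None)               -> None
#   extract_engine_liters("2.0L 4cyl 127hp")  -> 2.0
#   extract_engine_liters("V6 Biturbo")       -> None  (no leading "<d>.<d>L")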


def run():
    conn = psycopg2.connect(DB_URL)
    cur = conn.cursor()

    # Step 1: Build vehicleId → vehicle info from TecDoc files
    print("Building vehicleId → vehicle info mapping...", flush=True)
    mfrs = json.loads((DATA_DIR / "manufacturers.json").read_text())
    vid_info = {}  # vehicleId → {brand, model, year_start, year_end, liters}
    for mfr in mfrs:
        brand = mfr['manufacturerName']
        if '(' in brand:  # skip parenthesized brand variants
            continue
        mfr_id = mfr['manufacturerId']
        model_file = DATA_DIR / "models" / f"{mfr_id}.json"
        if not model_file.exists():
            continue
        models = json.loads(model_file.read_text())
        for model in models:
            model_name = model.get('modelName', '')
            if not model_name:
                continue
            vehicle_file = DATA_DIR / "vehicles" / f"{model['modelId']}.json"
            if not vehicle_file.exists():
                continue
            vehicles = json.loads(vehicle_file.read_text())
            if not vehicles:
                continue
            for v in vehicles:
                vid = v.get('vehicleId')
                if not vid:
                    continue
                # Parse year range (first 4 chars of the interval string = year)
                year_start = None
                year_end = None
                try:
                    cs = v.get('constructionIntervalStart', '')
                    if cs:
                        year_start = int(cs[:4])
                    ce = v.get('constructionIntervalEnd', '')
                    if ce:
                        year_end = int(ce[:4])
                except (TypeError, ValueError):
                    pass
                # Parse engine capacity
                liters = parse_capacity_liters(v.get('capacityLt') or v.get('capacityTax'))
                vid_info[vid] = {
                    'brand': brand,
                    'model': model_name,
                    'year_start': year_start,
                    'year_end': year_end,
                    'liters': liters,
                }
    print(f" {len(vid_info):,} vehicleIds mapped", flush=True)

    # Step 2: Build (brand, modelName) → list of (mye_id, year, liters) from our DB
    print("Building brand/model → MYE details mapping...", flush=True)
    cur.execute("""
        SELECT b.name_brand, m.name_model, mye.id_mye, y.year_car, e.name_engine
        FROM model_year_engine mye
        JOIN models m ON mye.model_id = m.id_model
        JOIN brands b ON m.brand_id = b.id_brand
        JOIN years y ON mye.year_id = y.id_year
        JOIN engines e ON mye.engine_id = e.id_engine
    """)
    brand_model_to_myes = {}
    for brand, model, mye_id, year, engine_name in cur.fetchall():
        liters = extract_engine_liters(engine_name)
        brand_model_to_myes.setdefault((brand, model), []).append((mye_id, year, liters))
    print(f" {len(brand_model_to_myes):,} brand/model combos with "
          f"{sum(len(v) for v in brand_model_to_myes.values()):,} MYEs", flush=True)

    # Step 3: Build OEM number → part_id from DB
    print("Loading parts cache...", flush=True)
    cur.execute("SELECT oem_part_number, id_part FROM parts WHERE oem_part_number IS NOT NULL")
    part_cache = {r[0]: r[1] for r in cur.fetchall()}
    print(f" {len(part_cache):,} parts cached", flush=True)

    # Step 4: Load detail files to get articleId → OEM numbers
    print("Loading article detail OEM mappings...", flush=True)
    article_to_oems = {}
    for f in DETAILS_DIR.glob("*.json"):
        try:
            data = json.loads(f.read_text())
            oem_list = data.get('articleOemNo', [])
            if oem_list:
                oem_nos = [o.get('oemDisplayNo') for o in oem_list if o.get('oemDisplayNo')]
                if oem_nos:
                    article_to_oems[int(f.stem)] = oem_nos
        except (ValueError, OSError):  # bad JSON, non-numeric filename, or unreadable file
            continue
    print(f" {len(article_to_oems):,} articles with OEM data", flush=True)

    # Step 5: Process article files and create vehicle_parts
    print("\nCreating vehicle_parts links (filtered + batch mode)...", flush=True)
    stats = {'links': 0, 'skipped_no_mye': 0, 'skipped_no_part': 0, 'files': 0, 'filtered_out': 0}
    pending = []

    def flush_batch():
        if not pending:
            return
        execute_values(cur, """
            INSERT INTO vehicle_parts (model_year_engine_id, part_id, quantity_required)
            VALUES %s ON CONFLICT DO NOTHING
        """, pending, page_size=10000)
        conn.commit()
        pending.clear()
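
    # Note: ON CONFLICT DO NOTHING only skips duplicates if vehicle_parts has a
    # unique constraint covering (model_year_engine_id, part_id); that such a
    # constraint exists is an assumption about the schema, not checked here.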

    article_files = sorted(ARTICLES_DIR.glob("*.json"))
    for f in article_files:
        parts_split = f.stem.split("_")
        if len(parts_split) != 2:
            continue
        vid = int(parts_split[0])
        info = vid_info.get(vid)
        if not info:
            stats['skipped_no_mye'] += 1
            continue
        bm = (info['brand'], info['model'])
        all_myes = brand_model_to_myes.get(bm, [])
        if not all_myes:
            stats['skipped_no_mye'] += 1
            continue

        # Filter MYEs by year range and engine capacity
        td_ys = info['year_start']
        td_ye = info['year_end']
        td_lit = info['liters']
        filtered_myes = []
        for mye_id, mye_year, mye_liters in all_myes:
            # Year filter: MYE year must fall within the TecDoc construction interval
            if td_ys and td_ye:
                if mye_year < td_ys or mye_year > td_ye:
                    stats['filtered_out'] += 1
                    continue
            elif td_ys:
                if mye_year < td_ys:
                    stats['filtered_out'] += 1
                    continue
            # Engine capacity filter: must match within 0.2 L tolerance
            if td_lit and mye_liters:
                if abs(td_lit - mye_liters) > 0.2:
                    stats['filtered_out'] += 1
                    continue
            filtered_myes.append(mye_id)
        if not filtered_myes:
            # Filtering removed every candidate MYE; skip this article file
            stats['skipped_no_mye'] += 1
            continue
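
        # Worked example (hypothetical numbers): a TecDoc vehicle built
        # 2008-2014 with capacityLt 1998 cc (-> 2.0 L) keeps an MYE with
        # (year=2010, liters=2.0), drops (year=2016, ...) as outside the
        # interval, and drops (..., liters=1.4) since |2.0 - 1.4| > 0.2.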
        try:
            articles = json.loads(f.read_text())
        except (OSError, json.JSONDecodeError):
            continue
        for a in articles:
            aid = a.get('articleId')
            article_no = a.get('articleNo', '')    # unused; likely left over from
            supplier = a.get('supplierName', '')   # the removed AFT- fallback
            if not aid:
                continue
            part_ids = set()
            oem_nos = article_to_oems.get(aid, [])
            for oem_no in oem_nos:
                pid = part_cache.get(oem_no)
                if pid:
                    part_ids.add(pid)
            if not part_ids:
                stats['skipped_no_part'] += 1
                continue
            for mye_id in filtered_myes:
                for part_id in part_ids:
                    pending.append((mye_id, part_id, 1))
                    stats['links'] += 1
                    if len(pending) >= BATCH_SIZE:
                        flush_batch()
        stats['files'] += 1
        if stats['files'] % 500 == 0:
            flush_batch()
            print(f" {stats['files']:,}/{len(article_files):,} files | "
                  f"{stats['links']:,} links | {stats['filtered_out']:,} filtered out", flush=True)

    flush_batch()
    cur.close()
    conn.close()

    print(f"\n{'='*50}", flush=True)
    print("LINKING COMPLETE", flush=True)
    print(f" Files processed: {stats['files']:,}", flush=True)
    print(f" Links created: {stats['links']:,}", flush=True)
    print(f" Filtered out: {stats['filtered_out']:,}", flush=True)
    print(f" Skipped (no MYE): {stats['skipped_no_mye']:,}", flush=True)
    print(f" Skipped (no part): {stats['skipped_no_part']:,}", flush=True)
    print(f"{'='*50}", flush=True)


if __name__ == "__main__":
    run()
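
# Usage sketch (assumes the Postgres database in DB_URL and the TecDoc JSON
# dumps under DATA_DIR are already in place; neither is provisioned here):
#   python3 link_vehicle_parts.py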