- import_phase1.py: skip AFT- part creation when no OEM data - link_vehicle_parts.py: remove AFT- fallback lookup in part cache - import_tecdoc_parts.py: add VW to TOP_BRANDS list Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
252 lines
8.7 KiB
Python
252 lines
8.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Link parts to vehicles using TecDoc article files.
|
|
Maps: article file (vehicleId_categoryId.json) → parts → vehicle_parts (MYE ids)
|
|
Optimized v3: year+engine filtering + batch inserts.
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
import psycopg2
|
|
from psycopg2.extras import execute_values
|
|
from pathlib import Path
|
|
|
|
# Postgres connection string for the autoparts database.
# NOTE(review): credentials are hardcoded here — consider loading them from an
# environment variable or a config file instead.
DB_URL = "postgresql://nexus:nexus_autoparts_2026@localhost/nexus_autoparts"

# Root of the on-disk TecDoc data dump.
DATA_DIR = Path("/home/Autopartes/data/tecdoc")

# Per-vehicle article files, named "<vehicleId>_<categoryId>.json".
ARTICLES_DIR = DATA_DIR / "parts" / "articles"

# Per-article detail files, named "<articleId>.json" (carry the OEM numbers).
DETAILS_DIR = DATA_DIR / "parts" / "details"

# How many pending vehicle_parts rows to accumulate before a batch INSERT.
BATCH_SIZE = 50000
|
|
|
|
|
|
def parse_capacity_liters(cap):
    """Convert a TecDoc capacity value in cc (e.g. '1998.0000') to liters.

    The result is rounded to one decimal place (1998 cc -> 2.0), matching
    the precision used by extract_engine_liters() so the two can be
    compared with a small tolerance.

    Returns None when *cap* is missing or not numeric.
    """
    try:
        cc = float(cap)
    except (TypeError, ValueError):
        # cap may be None (field absent) or a non-numeric string in the feed.
        return None
    return round(cc / 1000, 1)
|
|
|
|
|
|
def extract_engine_liters(engine_name):
    """Pull the displacement in liters out of an engine label.

    Expects labels that start with a '<digits>.<digits>L' token, e.g.
    '2.0L 4cyl 127hp'. Returns the value rounded to one decimal, or None
    when the label does not begin with such a token.
    """
    match = re.match(r'(\d+\.\d+)L', engine_name)
    if match is None:
        return None
    return round(float(match.group(1)), 1)
|
|
|
|
|
|
def run():
    """Link parts to vehicles by walking the TecDoc article files.

    Pipeline:
      1. Map TecDoc vehicleId -> {brand, model, year range, engine liters}
         from the manufacturers/models/vehicles JSON dumps.
      2. Map (brand, model) -> [(mye_id, year, liters), ...] from the
         model_year_engine tables in our database.
      3. Cache OEM part number -> part_id from the parts table.
      4. Map articleId -> list of OEM numbers from the detail files.
      5. For each article file, resolve the vehicle's candidate MYE rows
         (filtered by construction-year range and engine capacity), resolve
         each article's parts via its OEM numbers, and batch-insert
         (mye_id, part_id, 1) rows into vehicle_parts.

    Progress and summary stats are printed to stdout. Commits happen on
    every batch flush, so a crash loses at most one batch of links.
    """
    conn = psycopg2.connect(DB_URL)
    cur = conn.cursor()

    # Step 1: Build vehicleId → vehicle info from TecDoc files
    print("Building vehicleId → vehicle info mapping...", flush=True)
    mfrs = json.loads((DATA_DIR / "manufacturers.json").read_text())
    vid_info = {}  # vehicleId → {brand, model, year_start, year_end, liters}
    for mfr in mfrs:
        brand = mfr['manufacturerName']
        if '(' in brand:
            # Skip brand names with parenthesized qualifiers.
            continue
        mfr_id = mfr['manufacturerId']
        model_file = DATA_DIR / "models" / f"{mfr_id}.json"
        if not model_file.exists():
            continue
        models = json.loads(model_file.read_text())
        for model in models:
            model_name = model.get('modelName', '')
            if not model_name:
                continue
            vehicle_file = DATA_DIR / "vehicles" / f"{model['modelId']}.json"
            if not vehicle_file.exists():
                continue
            vehicles = json.loads(vehicle_file.read_text())
            if not vehicles:
                continue
            for v in vehicles:
                vid = v.get('vehicleId')
                if not vid:
                    continue
                # Parse year range: the first 4 characters of the
                # construction-interval strings are the year.
                year_start = None
                year_end = None
                try:
                    cs = v.get('constructionIntervalStart', '')
                    if cs:
                        year_start = int(cs[:4])
                    ce = v.get('constructionIntervalEnd', '')
                    if ce:
                        year_end = int(ce[:4])
                except (ValueError, TypeError):
                    # Malformed interval value: keep whatever bounds parsed.
                    pass
                # Parse engine capacity (cc → liters); capacityTax is the
                # fallback field when capacityLt is absent.
                liters = parse_capacity_liters(v.get('capacityLt') or v.get('capacityTax'))
                vid_info[vid] = {
                    'brand': brand,
                    'model': model_name,
                    'year_start': year_start,
                    'year_end': year_end,
                    'liters': liters,
                }

    print(f" {len(vid_info):,} vehicleIds mapped", flush=True)

    # Step 2: Build (brand, modelName) → list of (mye_id, year, liters) from our DB
    print("Building brand/model → MYE details mapping...", flush=True)
    cur.execute("""
        SELECT b.name_brand, m.name_model, mye.id_mye, y.year_car, e.name_engine
        FROM model_year_engine mye
        JOIN models m ON mye.model_id = m.id_model
        JOIN brands b ON m.brand_id = b.id_brand
        JOIN years y ON mye.year_id = y.id_year
        JOIN engines e ON mye.engine_id = e.id_engine
    """)
    brand_model_to_myes = {}
    for brand, model, mye_id, year, engine_name in cur.fetchall():
        key = (brand, model)
        liters = extract_engine_liters(engine_name)
        brand_model_to_myes.setdefault(key, []).append((mye_id, year, liters))

    print(f" {len(brand_model_to_myes):,} brand/model combos with {sum(len(v) for v in brand_model_to_myes.values()):,} MYEs", flush=True)

    # Step 3: Build OEM number → part_id from DB
    print("Loading parts cache...", flush=True)
    cur.execute("SELECT oem_part_number, id_part FROM parts WHERE oem_part_number IS NOT NULL")
    part_cache = {r[0]: r[1] for r in cur.fetchall()}
    print(f" {len(part_cache):,} parts cached", flush=True)

    # Step 4: Load detail files to get articleId → OEM numbers
    print("Loading article detail OEM mappings...", flush=True)
    article_to_oems = {}
    for f in DETAILS_DIR.glob("*.json"):
        try:
            data = json.loads(f.read_text())
            oem_list = data.get('articleOemNo', [])
            if oem_list:
                oem_nos = [o.get('oemDisplayNo') for o in oem_list if o.get('oemDisplayNo')]
                if oem_nos:
                    article_to_oems[int(f.stem)] = oem_nos
        except (ValueError, OSError, AttributeError):
            # Malformed JSON, unexpected top-level type, non-numeric
            # filename, or read error — skip this detail file.
            continue
    print(f" {len(article_to_oems):,} articles with OEM data", flush=True)

    # Step 5: Process article files and create vehicle_parts
    print("\nCreating vehicle_parts links (filtered + batch mode)...", flush=True)

    stats = {'links': 0, 'skipped_no_mye': 0, 'skipped_no_part': 0, 'files': 0, 'filtered_out': 0}
    pending = []

    def flush_batch():
        # Insert all accumulated rows and commit; no-op when nothing pending.
        if not pending:
            return
        execute_values(cur, """
            INSERT INTO vehicle_parts (model_year_engine_id, part_id, quantity_required)
            VALUES %s ON CONFLICT DO NOTHING
        """, pending, page_size=10000)
        conn.commit()
        pending.clear()

    article_files = sorted(ARTICLES_DIR.glob("*.json"))
    for f in article_files:
        # Filename is "<vehicleId>_<categoryId>.json".
        parts_split = f.stem.split("_")
        if len(parts_split) != 2:
            continue
        vid = int(parts_split[0])

        info = vid_info.get(vid)
        if not info:
            stats['skipped_no_mye'] += 1
            continue

        bm = (info['brand'], info['model'])
        all_myes = brand_model_to_myes.get(bm, [])
        if not all_myes:
            stats['skipped_no_mye'] += 1
            continue

        # Filter MYEs by year range and engine capacity
        td_ys = info['year_start']
        td_ye = info['year_end']
        td_lit = info['liters']

        filtered_myes = []
        for mye_id, mye_year, mye_liters in all_myes:
            # Year filter: MYE year must fall within TecDoc construction interval
            if td_ys and td_ye:
                if mye_year < td_ys or mye_year > td_ye:
                    stats['filtered_out'] += 1
                    continue
            elif td_ys:
                # Open-ended interval: only the start bound is known.
                if mye_year < td_ys:
                    stats['filtered_out'] += 1
                    continue

            # Engine capacity filter: must match within 0.2L tolerance
            if td_lit and mye_liters:
                if abs(td_lit - mye_liters) > 0.2:
                    stats['filtered_out'] += 1
                    continue

            filtered_myes.append(mye_id)

        if not filtered_myes:
            # Filtering removed every candidate MYE — nothing to link.
            stats['skipped_no_mye'] += 1
            continue

        try:
            articles = json.loads(f.read_text())
        except (ValueError, OSError):
            # Malformed or unreadable article file — skip it.
            continue

        for a in articles:
            aid = a.get('articleId')
            if not aid:
                continue

            # Resolve this article's parts via its OEM numbers.
            part_ids = set()
            oem_nos = article_to_oems.get(aid, [])
            for oem_no in oem_nos:
                pid = part_cache.get(oem_no)
                if pid:
                    part_ids.add(pid)

            if not part_ids:
                stats['skipped_no_part'] += 1
                continue

            # Cross product: every surviving MYE gets every matched part.
            for mye_id in filtered_myes:
                for part_id in part_ids:
                    pending.append((mye_id, part_id, 1))
                    stats['links'] += 1

            if len(pending) >= BATCH_SIZE:
                flush_batch()

        stats['files'] += 1
        if stats['files'] % 500 == 0:
            # Periodic flush + progress line.
            flush_batch()
            print(f" {stats['files']:,}/{len(article_files):,} files | "
                  f"{stats['links']:,} links | {stats['filtered_out']:,} filtered out", flush=True)

    flush_batch()
    cur.close()
    conn.close()

    print(f"\n{'='*50}", flush=True)
    print("LINKING COMPLETE", flush=True)
    print(f" Files processed: {stats['files']:,}", flush=True)
    print(f" Links created: {stats['links']:,}", flush=True)
    print(f" Filtered out: {stats['filtered_out']:,}", flush=True)
    print(f" Skipped (no MYE): {stats['skipped_no_mye']:,}", flush=True)
    print(f" Skipped (no part):{stats['skipped_no_part']:,}", flush=True)
    print(f"{'='*50}", flush=True)
|
|
|
|
|
|
# Script entry point: run the full linking pipeline when executed directly.
if __name__ == "__main__":
    run()
|