fix: stop creating AFT- placeholder parts in import pipeline
- import_phase1.py: skip AFT- part creation when no OEM data
- link_vehicle_parts.py: remove AFT- fallback lookup in part cache
- import_tecdoc_parts.py: add VW to TOP_BRANDS list

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
251
scripts/link_vehicle_parts.py
Normal file
251
scripts/link_vehicle_parts.py
Normal file
@@ -0,0 +1,251 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Link parts to vehicles using TecDoc article files.
|
||||
Maps: article file (vehicleId_categoryId.json) → parts → vehicle_parts (MYE ids)
|
||||
Optimized v3: year+engine filtering + batch inserts.
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_values
|
||||
from pathlib import Path
|
||||
|
||||
DB_URL = "postgresql://nexus:nexus_autoparts_2026@localhost/nexus_autoparts"
|
||||
DATA_DIR = Path("/home/Autopartes/data/tecdoc")
|
||||
ARTICLES_DIR = DATA_DIR / "parts" / "articles"
|
||||
DETAILS_DIR = DATA_DIR / "parts" / "details"
|
||||
|
||||
BATCH_SIZE = 50000
|
||||
|
||||
|
||||
def parse_capacity_liters(cap):
    """Convert TecDoc capacityLt (e.g. '1998.0000' cc) to liters, rounded to
    one decimal (1998 cc -> 2.0).

    Returns None when *cap* is missing (None) or not numeric.
    """
    try:
        cc = float(cap)
    except (TypeError, ValueError):
        # cap may be None or a non-numeric string in TecDoc data — treat as unknown.
        return None
    return round(cc / 1000, 1)
|
||||
|
||||
|
||||
def extract_engine_liters(engine_name):
    """Extract displacement in liters from an engine name like '2.0L 4cyl 127hp'.

    Uses re.search (not the anchored re.match) so names where the displacement
    token is not leading (e.g. 'V6 3.5L') are also handled; names that start
    with the token behave exactly as before.

    Returns the liters as a float rounded to one decimal, or None when no
    '<digits>.<digits>L' token is present.
    """
    m = re.search(r'(\d+\.\d+)L', engine_name)
    if m:
        return round(float(m.group(1)), 1)
    return None
|
||||
|
||||
|
||||
def run():
    """Link TecDoc articles to vehicles, writing rows into vehicle_parts.

    Pipeline:
      1. Map TecDoc vehicleId -> {brand, model, year range, engine liters}
         from the manufacturers/models/vehicles JSON files.
      2. Map (brand, model) -> list of (mye_id, year, liters) from our DB.
      3. Cache OEM part number -> part_id from our DB.
      4. Cache articleId -> OEM numbers from the detail JSON files.
      5. Walk article files (named '<vehicleId>_<categoryId>.json'), filter
         candidate MYEs by year range and engine displacement, and batch-insert
         (mye_id, part_id, 1) links with ON CONFLICT DO NOTHING.
    """
    conn = psycopg2.connect(DB_URL)
    cur = conn.cursor()

    # Step 1: Build vehicleId → vehicle info from TecDoc files
    print("Building vehicleId → vehicle info mapping...", flush=True)
    mfrs = json.loads((DATA_DIR / "manufacturers.json").read_text())
    vid_info = {}  # vehicleId → {brand, model, year_start, year_end, liters}
    for mfr in mfrs:
        brand = mfr['manufacturerName']
        # Names containing '(' look like regional/variant duplicates — skipped
        # by the original logic; keep that behavior.
        if '(' in brand:
            continue
        mfr_id = mfr['manufacturerId']
        model_file = DATA_DIR / "models" / f"{mfr_id}.json"
        if not model_file.exists():
            continue
        models = json.loads(model_file.read_text())
        for model in models:
            model_name = model.get('modelName', '')
            if not model_name:
                continue
            vehicle_file = DATA_DIR / "vehicles" / f"{model['modelId']}.json"
            if not vehicle_file.exists():
                continue
            vehicles = json.loads(vehicle_file.read_text())
            if not vehicles:
                continue
            for v in vehicles:
                vid = v.get('vehicleId')
                if not vid:
                    continue
                # Parse year range from 'YYYY...' interval strings (best-effort).
                year_start = None
                year_end = None
                try:
                    cs = v.get('constructionIntervalStart', '')
                    if cs:
                        year_start = int(cs[:4])
                    ce = v.get('constructionIntervalEnd', '')
                    if ce:
                        year_end = int(ce[:4])
                except (TypeError, ValueError):
                    # Malformed interval — leave whichever year is unset as None.
                    pass
                # Parse engine capacity (capacityLt preferred, capacityTax fallback).
                liters = parse_capacity_liters(v.get('capacityLt') or v.get('capacityTax'))
                vid_info[vid] = {
                    'brand': brand,
                    'model': model_name,
                    'year_start': year_start,
                    'year_end': year_end,
                    'liters': liters,
                }

    print(f" {len(vid_info):,} vehicleIds mapped", flush=True)

    # Step 2: Build (brand, modelName) → list of (mye_id, year, liters) from our DB
    print("Building brand/model → MYE details mapping...", flush=True)
    cur.execute("""
        SELECT b.name_brand, m.name_model, mye.id_mye, y.year_car, e.name_engine
        FROM model_year_engine mye
        JOIN models m ON mye.model_id = m.id_model
        JOIN brands b ON m.brand_id = b.id_brand
        JOIN years y ON mye.year_id = y.id_year
        JOIN engines e ON mye.engine_id = e.id_engine
    """)
    brand_model_to_myes = {}
    for brand, model, mye_id, year, engine_name in cur.fetchall():
        key = (brand, model)
        liters = extract_engine_liters(engine_name)
        if key not in brand_model_to_myes:
            brand_model_to_myes[key] = []
        brand_model_to_myes[key].append((mye_id, year, liters))

    print(f" {len(brand_model_to_myes):,} brand/model combos with {sum(len(v) for v in brand_model_to_myes.values()):,} MYEs", flush=True)

    # Step 3: Build OEM number → part_id from DB
    print("Loading parts cache...", flush=True)
    cur.execute("SELECT oem_part_number, id_part FROM parts WHERE oem_part_number IS NOT NULL")
    part_cache = {r[0]: r[1] for r in cur.fetchall()}
    print(f" {len(part_cache):,} parts cached", flush=True)

    # Step 4: Load detail files to get articleId → OEM numbers
    print("Loading article detail OEM mappings...", flush=True)
    article_to_oems = {}
    for f in DETAILS_DIR.glob("*.json"):
        try:
            data = json.loads(f.read_text())
            oem_list = data.get('articleOemNo', [])
            if oem_list:
                oem_nos = [o.get('oemDisplayNo') for o in oem_list if o.get('oemDisplayNo')]
                if oem_nos:
                    # Filenames are '<articleId>.json'; int() also guards bad stems.
                    article_to_oems[int(f.stem)] = oem_nos
        except (OSError, ValueError):
            # Unreadable file, invalid JSON (JSONDecodeError ⊂ ValueError),
            # or a non-numeric stem — skip this detail file.
            continue
    print(f" {len(article_to_oems):,} articles with OEM data", flush=True)

    # Step 5: Process article files and create vehicle_parts
    print("\nCreating vehicle_parts links (filtered + batch mode)...", flush=True)

    stats = {'links': 0, 'skipped_no_mye': 0, 'skipped_no_part': 0, 'files': 0, 'filtered_out': 0}
    pending = []

    def flush_batch():
        # Insert all buffered links in one round trip and commit.
        if not pending:
            return
        execute_values(cur, """
            INSERT INTO vehicle_parts (model_year_engine_id, part_id, quantity_required)
            VALUES %s ON CONFLICT DO NOTHING
        """, pending, page_size=10000)
        conn.commit()
        pending.clear()

    article_files = sorted(ARTICLES_DIR.glob("*.json"))
    for f in article_files:
        # Article files are named '<vehicleId>_<categoryId>.json'.
        parts_split = f.stem.split("_")
        if len(parts_split) != 2:
            continue
        try:
            vid = int(parts_split[0])
        except ValueError:
            # Unexpected filename — stem prefix is not a numeric vehicleId.
            continue

        info = vid_info.get(vid)
        if not info:
            stats['skipped_no_mye'] += 1
            continue

        bm = (info['brand'], info['model'])
        all_myes = brand_model_to_myes.get(bm, [])
        if not all_myes:
            stats['skipped_no_mye'] += 1
            continue

        # Filter MYEs by year range and engine capacity
        td_ys = info['year_start']
        td_ye = info['year_end']
        td_lit = info['liters']

        filtered_myes = []
        for mye_id, mye_year, mye_liters in all_myes:
            # Year filter: MYE year must fall within TecDoc construction interval
            # (open-ended when only the start year is known).
            if td_ys and td_ye:
                if mye_year < td_ys or mye_year > td_ye:
                    stats['filtered_out'] += 1
                    continue
            elif td_ys:
                if mye_year < td_ys:
                    stats['filtered_out'] += 1
                    continue

            # Engine capacity filter: must match within 0.2L tolerance
            # (skipped when either side is unknown/None).
            if td_lit and mye_liters:
                if abs(td_lit - mye_liters) > 0.2:
                    stats['filtered_out'] += 1
                    continue

            filtered_myes.append(mye_id)

        if not filtered_myes:
            # Fallback: if filtering removed everything, skip
            stats['skipped_no_mye'] += 1
            continue

        try:
            articles = json.loads(f.read_text())
        except (OSError, ValueError):
            # Unreadable or invalid JSON article file — skip it.
            continue

        for a in articles:
            aid = a.get('articleId')
            if not aid:
                continue

            # Resolve this article's OEM numbers to our part ids.
            part_ids = set()
            oem_nos = article_to_oems.get(aid, [])
            for oem_no in oem_nos:
                pid = part_cache.get(oem_no)
                if pid:
                    part_ids.add(pid)

            if not part_ids:
                stats['skipped_no_part'] += 1
                continue

            # Cross product: every surviving MYE gets every matched part.
            for mye_id in filtered_myes:
                for part_id in part_ids:
                    pending.append((mye_id, part_id, 1))
                    stats['links'] += 1

            if len(pending) >= BATCH_SIZE:
                flush_batch()

        stats['files'] += 1
        if stats['files'] % 500 == 0:
            flush_batch()
            print(f" {stats['files']:,}/{len(article_files):,} files | "
                  f"{stats['links']:,} links | {stats['filtered_out']:,} filtered out", flush=True)

    flush_batch()
    cur.close()
    conn.close()

    print(f"\n{'='*50}", flush=True)
    print(f"LINKING COMPLETE", flush=True)
    print(f" Files processed: {stats['files']:,}", flush=True)
    print(f" Links created: {stats['links']:,}", flush=True)
    print(f" Filtered out: {stats['filtered_out']:,}", flush=True)
    print(f" Skipped (no MYE): {stats['skipped_no_mye']:,}", flush=True)
    print(f" Skipped (no part):{stats['skipped_no_part']:,}", flush=True)
    print(f"{'='*50}", flush=True)
|
||||
|
||||
|
||||
# Script entry point: run the full linking pipeline when executed directly.
if __name__ == "__main__":
    run()
|
||||
Reference in New Issue
Block a user