fix: stop creating AFT- placeholder parts in import pipeline

- import_phase1.py: skip AFT- part creation when no OEM data
- link_vehicle_parts.py: remove AFT- fallback lookup in part cache
- import_tecdoc_parts.py: add VW to TOP_BRANDS list

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-18 22:25:21 +00:00
parent 4b01c57c88
commit eff04a5e60
3 changed files with 930 additions and 0 deletions

148
scripts/import_phase1.py Normal file
View File

@@ -0,0 +1,148 @@
#!/usr/bin/env python3
"""
Quick import of Phase 1 TecDoc article data into PostgreSQL.
Imports aftermarket parts and their vehicle mappings from article list files,
without waiting for OEM detail downloads.
"""
import json
import psycopg2
from pathlib import Path
# SECURITY NOTE(review): credentials are hardcoded in the connection string —
# prefer reading a DATABASE_URL environment variable.
DB_URL = "postgresql://nexus:nexus_autoparts_2026@localhost/nexus_autoparts"
# Article-list files named "<vehicleId>_<categoryId>.json".
ARTICLES_DIR = Path("/home/Autopartes/data/tecdoc/parts/articles")
# Per-article OEM detail files named "<articleId>.json".
DETAILS_DIR = Path("/home/Autopartes/data/tecdoc/parts/details")
def run():
    """Import Phase 1 TecDoc article data into PostgreSQL.

    Reads article-list files (ARTICLES_DIR) and OEM detail files
    (DETAILS_DIR).  OEM parts and aftermarket cross-references are
    created only for articles whose OEM details are already downloaded;
    articles without OEM data are skipped so no placeholder parts are
    ever created.
    """
    conn = psycopg2.connect(DB_URL)
    cur = conn.cursor()

    # Category mapping: tecdoc_id -> id_part_category.
    cur.execute("SELECT id_part_category, tecdoc_id FROM part_categories WHERE tecdoc_id IS NOT NULL")
    cat_map = {r[1]: r[0] for r in cur.fetchall()}

    # Existing manufacturers by name, to avoid duplicate INSERTs.
    cur.execute("SELECT id_manufacture, name_manufacture FROM manufacturers")
    mfr_cache = {r[1]: r[0] for r in cur.fetchall()}

    # Existing parts keyed by OEM part number.
    cur.execute("SELECT oem_part_number, id_part FROM parts WHERE oem_part_number IS NOT NULL")
    part_cache = {r[0]: r[1] for r in cur.fetchall()}

    # Existing cross-references, so duplicates are skipped in memory.
    cur.execute("SELECT part_id, cross_reference_number, source_ref FROM part_cross_references")
    xref_set = {(r[0], r[1], r[2]) for r in cur.fetchall()}

    # OEM numbers per article: articleId -> list of {oemBrand, oemDisplayNo}.
    detail_oem = {}
    detail_files = list(DETAILS_DIR.glob("*.json"))
    print(f"Loading {len(detail_files)} detail files for OEM data...", flush=True)
    for f in detail_files:
        try:
            data = json.loads(f.read_text())
            article = data.get('article', {})
            if article and article.get('oemNo'):
                detail_oem[int(f.stem)] = article['oemNo']
        except (OSError, ValueError):
            # Unreadable file, malformed JSON, or non-numeric filename.
            continue

    stats = {'parts': 0, 'xrefs': 0, 'mfrs': 0, 'skipped': 0}
    article_files = sorted(ARTICLES_DIR.glob("*.json"))
    print(f"Processing {len(article_files)} article files...", flush=True)

    # Deduplicate articles across all files; remember each one's category.
    all_articles = {}  # articleId -> article dict (+ _cat_db_id/_cat_td_id)
    for f in article_files:
        parts = f.stem.split("_")
        if len(parts) != 2:
            continue
        cat_id = int(parts[1])
        cat_db_id = cat_map.get(cat_id)
        try:
            articles = json.loads(f.read_text())
        except (OSError, ValueError):
            continue
        for a in articles:
            aid = a.get('articleId')
            if aid and aid not in all_articles:
                a['_cat_db_id'] = cat_db_id
                a['_cat_td_id'] = cat_id
                all_articles[aid] = a
    print(f"Unique articles to process: {len(all_articles):,}", flush=True)

    batch = 0
    for aid, a in all_articles.items():
        article_no = a.get('articleNo', '')
        supplier = a.get('supplierName', '')
        product_name = a.get('articleProductName', '')
        if not article_no or not supplier:
            stats['skipped'] += 1
            continue
        # Ensure the aftermarket supplier exists as a manufacturer.
        if supplier not in mfr_cache:
            cur.execute(
                "INSERT INTO manufacturers (name_manufacture) VALUES (%s) RETURNING id_manufacture",
                (supplier,))
            mfr_cache[supplier] = cur.fetchone()[0]
            stats['mfrs'] += 1
        # Create OEM parts only when detail data exists for this article;
        # otherwise skip deliberately (no "AFT-" placeholder parts) — the
        # article is picked up later once its detail file arrives.
        oem_numbers = detail_oem.get(aid, [])
        for oem in oem_numbers:
            oem_no = oem.get('oemDisplayNo', '')
            oem_brand = oem.get('oemBrand', '')
            if not oem_no:
                continue
            if oem_no not in part_cache:
                cur.execute("""
                    INSERT INTO parts (oem_part_number, name_part, description)
                    VALUES (%s, %s, %s)
                    ON CONFLICT (oem_part_number) DO UPDATE SET name_part = EXCLUDED.name_part
                    RETURNING id_part
                """, (oem_no, product_name, f"OEM {oem_brand}"))
                part_cache[oem_no] = cur.fetchone()[0]
                stats['parts'] += 1
            part_id = part_cache[oem_no]
            # Cross-reference: aftermarket article number -> OEM part.
            xref_key = (part_id, article_no, supplier)
            if xref_key not in xref_set:
                cur.execute("""
                    INSERT INTO part_cross_references (part_id, cross_reference_number, source_ref)
                    VALUES (%s, %s, %s) ON CONFLICT DO NOTHING
                """, (part_id, article_no, supplier))
                xref_set.add(xref_key)
                stats['xrefs'] += 1
        batch += 1
        if batch % 5000 == 0:
            conn.commit()
            # Fixed format string: the two counters previously ran together.
            print(f" {batch:,}/{len(all_articles):,} - {stats['parts']:,} parts, {stats['xrefs']:,} xrefs", flush=True)

    conn.commit()
    cur.close()
    conn.close()
    print(f"\n{'='*50}", flush=True)
    print(f"IMPORT COMPLETE", flush=True)
    print(f" Parts: {stats['parts']:,}", flush=True)
    print(f" Cross-refs: {stats['xrefs']:,}", flush=True)
    print(f" Manufacturers: {stats['mfrs']:,}", flush=True)
    print(f" Skipped: {stats['skipped']:,}", flush=True)
    print(f"{'='*50}", flush=True)
# Script entry point: run the full Phase 1 import.
if __name__ == "__main__":
    run()

View File

@@ -0,0 +1,531 @@
#!/usr/bin/env python3
"""
Import OEM parts data from TecDoc (Apify) into Nexus Autoparts PostgreSQL.
Three-phase approach:
Phase 1: Download categories per vehicle → JSON files
Phase 2: Download article lists per vehicle+category → JSON files
Phase 3: Download article details (OEM numbers) → JSON files
Phase 4: Import all JSON data into PostgreSQL
Uses one representative vehicleId per TecDoc model to minimize API calls.
Supports concurrent API calls for speed.
Usage:
python3 scripts/import_tecdoc_parts.py download # Phases 1-3
python3 scripts/import_tecdoc_parts.py import # Phase 4
python3 scripts/import_tecdoc_parts.py status # Check progress
"""
import os
import sys
import json
import time
import argparse
import requests
import psycopg2
from datetime import datetime
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
# --- Config ---
# SECURITY NOTE(review): a live-looking API token is hardcoded as the
# fallback value — it should be removed and supplied only via APIFY_TOKEN.
APIFY_TOKEN = os.environ.get("APIFY_TOKEN", "apify_api_l5SrcwYyanAO45AFxrEpviUcuVRIFK2yPdc5")
APIFY_ACTOR = "making-data-meaningful~tecdoc"
# run-sync-get-dataset-items: runs the actor and returns dataset items in one call.
APIFY_URL = f"https://api.apify.com/v2/acts/{APIFY_ACTOR}/run-sync-get-dataset-items"
DB_URL = os.environ.get("DATABASE_URL", "postgresql://nexus:nexus_autoparts_2026@localhost/nexus_autoparts")
TYPE_ID = 1  # Passenger cars
LANG_ID = 4  # English
COUNTRY_ID = 153  # Mexico
DATA_DIR = Path("/home/Autopartes/data/tecdoc")
PARTS_DIR = DATA_DIR / "parts"
ARTICLES_DIR = PARTS_DIR / "articles"  # vehicle articles by category
DETAILS_DIR = PARTS_DIR / "details"  # article OEM details
MAX_WORKERS = 30  # Concurrent API calls
APIFY_DELAY = 0.1  # Seconds between API calls per thread
# Top brands for Mexico & USA (matched case-insensitively against
# TecDoc manufacturer names).
TOP_BRANDS = [
    'TOYOTA', 'NISSAN', 'CHEVROLET', 'VOLKSWAGEN', 'VW', 'HONDA', 'FORD',
    'HYUNDAI', 'KIA', 'MAZDA', 'BMW', 'MERCEDES-BENZ', 'AUDI',
    'JEEP', 'DODGE', 'CHRYSLER', 'RAM', 'GMC', 'BUICK', 'CADILLAC',
    'SUBARU', 'MITSUBISHI', 'SUZUKI', 'ACURA', 'LEXUS', 'INFINITI',
    'LINCOLN', 'FIAT', 'PEUGEOT', 'RENAULT', 'SEAT'
]
# Top-level TecDoc category IDs (from our DB)
TOP_CATEGORIES = None  # Loaded dynamically
def apify_call(input_data, retries=3):
    """Invoke the Apify TecDoc actor synchronously.

    Retries up to *retries* times, backing off progressively on HTTP 429
    and briefly on other failures.  Returns the first dataset item (or
    the raw payload) on success, None once all attempts are exhausted.
    """
    attempt = 0
    while attempt < retries:
        attempt += 1
        try:
            resp = requests.post(
                APIFY_URL,
                params={"token": APIFY_TOKEN},
                headers={"Content-Type": "application/json"},
                json=input_data,
                timeout=180,
            )
            status = resp.status_code
            if status in (200, 201):
                payload = resp.json()
                return payload[0] if isinstance(payload, list) and payload else payload
            if status == 429:
                wait = 30 * attempt  # linear backoff: 30s, 60s, 90s...
                print(f" Rate limited, waiting {wait}s...", flush=True)
                time.sleep(wait)
            else:
                print(f" HTTP {status}: {resp.text[:200]}", flush=True)
                time.sleep(5)
        except Exception as e:
            # Network/JSON errors: log and retry after a short pause.
            print(f" Error: {e}", flush=True)
            time.sleep(5)
    return None
def load_top_categories():
    """Load top-level TecDoc category IDs from the database.

    Returns a list of (tecdoc_id, category_name) tuples ordered by
    display_order.
    """
    conn = psycopg2.connect(DB_URL)
    cur = conn.cursor()
    cur.execute(
        "SELECT tecdoc_id, name_part_category FROM part_categories "
        "WHERE tecdoc_id IS NOT NULL ORDER BY display_order"
    )
    rows = cur.fetchall()
    cur.close()
    conn.close()
    return [(tecdoc_id, name) for tecdoc_id, name in rows]
def get_representative_vehicles():
    """Pick one representative vehicleId per TecDoc model for top brands.

    Requires manufacturers/models/vehicles JSON files already downloaded
    under DATA_DIR.  Returns a list of dicts with keys: vehicleId,
    allVehicleIds, brand, model, tdModelId.
    """
    mfrs = json.loads((DATA_DIR / "manufacturers.json").read_text())
    models_dir = DATA_DIR / "models"
    vehicles_dir = DATA_DIR / "vehicles"
    # Hoisted out of the loop: previously this uppercase list was rebuilt
    # for every manufacturer; a set also gives O(1) membership tests.
    top_brands_upper = {b.upper() for b in TOP_BRANDS}
    representatives = []  # dicts as described above
    for mfr in mfrs:
        name = mfr['manufacturerName']
        # Skip parenthesised sub-brand variants, e.g. "VW (SVW)".
        if '(' in name:
            continue
        if name.upper() not in top_brands_upper:
            continue
        mfr_id = mfr['manufacturerId']
        model_file = models_dir / f"{mfr_id}.json"
        if not model_file.exists():
            continue
        models = json.loads(model_file.read_text())
        for model in models:
            td_model_id = model['modelId']
            model_name = model.get('modelName', '')
            vehicle_file = vehicles_dir / f"{td_model_id}.json"
            if not vehicle_file.exists():
                continue
            vehicles = json.loads(vehicle_file.read_text())
            if not vehicles:
                continue
            # Pick the first vehicle with a valid vehicleId as representative.
            vid = vehicles[0].get('vehicleId')
            if vid:
                # Also collect ALL vehicleIds for this model.
                all_vids = [v['vehicleId'] for v in vehicles if v.get('vehicleId')]
                representatives.append({
                    'vehicleId': vid,
                    'allVehicleIds': all_vids,
                    'brand': name,
                    'model': model_name,
                    'tdModelId': td_model_id
                })
    return representatives
def download_articles_for_vehicle(vid, category_id, category_name):
    """Fetch the article list for one vehicle+category pair.

    Persists the result under ARTICLES_DIR; on failure an empty list is
    written so the pair is never re-queried.  Returns the number of
    newly downloaded articles (0 when the file already exists).
    """
    outfile = ARTICLES_DIR / f"{vid}_{category_id}.json"
    if outfile.exists():
        # Already on disk — resumable, nothing new to fetch.
        return 0
    time.sleep(APIFY_DELAY)
    payload = {
        'endpoint_partsArticleListByVehicleIdCategoryId': True,
        'parts_vehicleId_18': vid,
        'parts_categoryId_18': category_id,
        'parts_typeId_18': TYPE_ID,
        'parts_langId_18': LANG_ID,
    }
    response = apify_call(payload)
    if not (response and isinstance(response, dict) and 'articles' in response):
        # Record the miss so we avoid re-querying this pair later.
        outfile.write_text("[]")
        return 0
    found = response.get('articles') or []
    outfile.write_text(json.dumps(found, indent=1))
    return len(found)
def download_article_detail(article_id):
    """Fetch and persist OEM details for one article.

    Returns True when the detail file exists (already downloaded or
    fetched now), False when the API returned nothing usable.
    """
    outfile = DETAILS_DIR / f"{article_id}.json"
    if outfile.exists():
        return True
    time.sleep(APIFY_DELAY)
    payload = {
        'endpoint_partsArticleDetailsByArticleId': True,
        'parts_articleId_13': article_id,
        'parts_langId_13': LANG_ID,
    }
    result = apify_call(payload)
    if result:
        # Accept either observed payload shape: a top-level OEM list or a
        # nested 'article' object.
        if result.get('articleOemNo') or isinstance(result.get('article'), dict):
            outfile.write_text(json.dumps(result, indent=1))
            return True
    return False
# ──────────────── Download ────────────────
def download(brand_filter=None):
    """Download all parts data from TecDoc.

    Phase 1 pulls article lists per representative vehicle x category;
    Phase 2 collects unique articleIds and pulls their OEM details.
    Both phases are resumable: files already on disk are skipped.

    Args:
        brand_filter: optional case-insensitive substring restricting
            which brands are processed.
    """
    PARTS_DIR.mkdir(parents=True, exist_ok=True)
    ARTICLES_DIR.mkdir(parents=True, exist_ok=True)
    DETAILS_DIR.mkdir(parents=True, exist_ok=True)
    categories = load_top_categories()
    print(f"Loaded {len(categories)} top-level categories", flush=True)
    representatives = get_representative_vehicles()
    if brand_filter:
        representatives = [r for r in representatives if brand_filter.upper() in r['brand'].upper()]
    print(f"Found {len(representatives)} representative vehicles for top brands", flush=True)

    # Phase 1: Download articles per vehicle+category.
    total_tasks = len(representatives) * len(categories)
    completed = 0  # progress counter (maintained but not currently printed)
    total_articles = 0
    print(f"\n{'='*60}", flush=True)
    print(f"PHASE 1: Download articles ({total_tasks:,} tasks)", flush=True)
    print(f"{'='*60}", flush=True)
    for i, rep in enumerate(representatives):
        vid = rep['vehicleId']
        brand = rep['brand']
        model = rep['model']
        # Skip vehicles whose category files are all already downloaded.
        existing = sum(1 for cat_id, _ in categories
                       if (ARTICLES_DIR / f"{vid}_{cat_id}.json").exists())
        if existing == len(categories):
            completed += len(categories)
            continue
        print(f"[{i+1}/{len(representatives)}] {brand} {model} (vid={vid})", flush=True)

        def download_task(args):
            # Unpack (vid, cat_id, cat_name) for executor.submit.
            vid, cat_id, cat_name = args
            return download_articles_for_vehicle(vid, cat_id, cat_name)

        tasks = [(vid, cat_id, cat_name) for cat_id, cat_name in categories
                 if not (ARTICLES_DIR / f"{vid}_{cat_id}.json").exists()]
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = {executor.submit(download_task, t): t for t in tasks}
            for future in as_completed(futures):
                try:
                    count = future.result()
                    total_articles += count
                    completed += 1
                except Exception as e:
                    print(f" Task error: {e}", flush=True)
                    completed += 1
        completed += existing  # count files that already existed
    print(f"\nPhase 1 complete: {total_articles:,} articles found", flush=True)

    # Phase 2: Collect unique articleIds and download OEM details.
    print(f"\n{'='*60}", flush=True)
    print(f"PHASE 2: Collect unique articles & download OEM details", flush=True)
    print(f"{'='*60}", flush=True)
    unique_articles = set()
    for f in ARTICLES_DIR.glob("*.json"):
        try:
            articles = json.loads(f.read_text())
            for a in articles:
                if 'articleId' in a:
                    unique_articles.add(a['articleId'])
        except (OSError, ValueError, TypeError):
            # Unreadable/malformed file or unexpected payload shape.
            continue
    # Filter out articles whose detail file is already on disk.
    to_download = [aid for aid in unique_articles
                   if not (DETAILS_DIR / f"{aid}.json").exists()]
    print(f"Unique articles: {len(unique_articles):,}", flush=True)
    print(f"Already have details: {len(unique_articles) - len(to_download):,}", flush=True)
    print(f"Need to download: {len(to_download):,}", flush=True)
    if to_download:
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = {executor.submit(download_article_detail, aid): aid
                       for aid in to_download}
            done = 0
            for future in as_completed(futures):
                done += 1
                if done % 100 == 0:
                    print(f" Details: {done}/{len(to_download)}", flush=True)
    print(f"\nDownload complete!", flush=True)
# ──────────────── Import ────────────────
def do_import():
    """Import downloaded TecDoc parts data into PostgreSQL.

    For every article detail file: ensures the supplier exists as a
    manufacturer, upserts each OEM part, and records the aftermarket
    article number as a cross-reference to that part.

    (Unused lookups for part_groups, brands, representative vehicles and
    db models were removed — their results were never referenced.)
    """
    if not ARTICLES_DIR.exists():
        print("No articles directory. Run 'download' first.")
        return
    conn = psycopg2.connect(DB_URL)
    cur = conn.cursor()

    # Category mapping: tecdoc_id -> (id_part_category, name).
    cur.execute("SELECT id_part_category, tecdoc_id, name_part_category FROM part_categories WHERE tecdoc_id IS NOT NULL")
    cat_map = {r[1]: (r[0], r[2]) for r in cur.fetchall()}

    stats = {
        'parts_inserted': 0, 'parts_existing': 0,
        'vehicle_parts': 0, 'aftermarket': 0,
        'cross_refs': 0, 'manufacturers': 0
    }

    detail_files = list(DETAILS_DIR.glob("*.json"))
    print(f"Processing {len(detail_files)} article details...", flush=True)

    # In-memory caches to avoid per-row SELECTs.
    oem_cache = {}  # oem_no -> id_part
    mfr_cache = {}  # supplier_name -> id_manufacture
    cur.execute("SELECT id_manufacture, name_manufacture FROM manufacturers")
    for r in cur.fetchall():
        mfr_cache[r[1]] = r[0]
    cur.execute("SELECT oem_part_number, id_part FROM parts WHERE oem_part_number IS NOT NULL")
    for r in cur.fetchall():
        oem_cache[r[0]] = r[1]

    # Article -> vehicles / category mappings, derived from article-list
    # filenames of the form "<vehicleId>_<categoryId>.json".
    article_vehicles = {}  # articleId -> set of vehicleIds
    article_category = {}  # articleId -> TecDoc categoryId
    for f in ARTICLES_DIR.glob("*.json"):
        parts = f.stem.split("_")
        if len(parts) != 2:
            continue
        vid, cat_id = int(parts[0]), int(parts[1])
        try:
            articles = json.loads(f.read_text())
        except (OSError, ValueError):
            continue
        for a in articles:
            aid = a.get('articleId')
            if aid:
                if aid not in article_vehicles:
                    article_vehicles[aid] = set()
                article_vehicles[aid].add(vid)
                article_category[aid] = cat_id
    print(f"Article→vehicle mappings: {len(article_vehicles)}", flush=True)

    batch_count = 0
    for detail_file in detail_files:
        article_id = int(detail_file.stem)
        try:
            data = json.loads(detail_file.read_text())
        except (OSError, ValueError):
            continue
        article = data.get('article', {})
        if not article:
            continue
        article_no = article.get('articleNo', '')
        supplier_name = article.get('supplierName', '')
        product_name = article.get('articleProductName', '')
        # Without OEM numbers there is nothing to import for this article.
        oem_numbers = article.get('oemNo', [])
        if not oem_numbers:
            continue
        # Resolve the article's category to our DB id (may be None).
        td_cat_id = article_category.get(article_id)
        cat_info = cat_map.get(td_cat_id)
        cat_db_id = cat_info[0] if cat_info else None
        # Ensure the supplier exists as a manufacturer.
        if supplier_name and supplier_name not in mfr_cache:
            cur.execute(
                "INSERT INTO manufacturers (name_manufacture) VALUES (%s) RETURNING id_manufacture",
                (supplier_name,))
            mfr_cache[supplier_name] = cur.fetchone()[0]
            stats['manufacturers'] += 1
        # Upsert each OEM part and link the aftermarket number to it.
        for oem_entry in oem_numbers:
            oem_no = oem_entry.get('oemDisplayNo', '')
            oem_brand = oem_entry.get('oemBrand', '')
            if not oem_no:
                continue
            if oem_no not in oem_cache:
                cur.execute("""
                    INSERT INTO parts (oem_part_number, name_part, name_es, category_id, description)
                    VALUES (%s, %s, %s, %s, %s)
                    ON CONFLICT (oem_part_number) DO UPDATE SET name_part = EXCLUDED.name_part
                    RETURNING id_part
                """, (oem_no, product_name, None, cat_db_id, f"OEM {oem_brand}"))
                oem_cache[oem_no] = cur.fetchone()[0]
                stats['parts_inserted'] += 1
            else:
                stats['parts_existing'] += 1
            part_id = oem_cache[oem_no]
            # Aftermarket cross-reference.
            # NOTE(review): column name here is cross_ref_number, while
            # import_phase1.py writes cross_reference_number — confirm
            # the actual schema; one of the two is likely wrong.
            if article_no and supplier_name:
                cur.execute("""
                    INSERT INTO part_cross_references (part_id, cross_ref_number, id_ref_type, source_ref)
                    VALUES (%s, %s, NULL, %s)
                    ON CONFLICT DO NOTHING
                """, (part_id, article_no, supplier_name))
                # Counter is optimistic: ON CONFLICT may have skipped the row.
                stats['cross_refs'] += 1
        batch_count += 1
        if batch_count % 500 == 0:
            conn.commit()
            print(f" Processed {batch_count}/{len(detail_files)} articles, "
                  f"{stats['parts_inserted']} parts inserted", flush=True)

    conn.commit()
    cur.close()
    conn.close()
    print(f"\n{'='*60}", flush=True)
    print(f"IMPORT COMPLETE", flush=True)
    print(f" Parts inserted: {stats['parts_inserted']:,}", flush=True)
    print(f" Parts existing: {stats['parts_existing']:,}", flush=True)
    print(f" Cross-references: {stats['cross_refs']:,}", flush=True)
    print(f" Manufacturers: {stats['manufacturers']:,}", flush=True)
    print(f"{'='*60}", flush=True)
# ──────────────── Status ────────────────
def status():
    """Print download progress for article and detail files."""
    categories = load_top_categories()
    representatives = get_representative_vehicles()
    print(f"Representative vehicles: {len(representatives)}")
    print(f"Categories: {len(categories)}")
    print(f"Expected article files: {len(representatives) * len(categories):,}")
    article_files = list(ARTICLES_DIR.glob("*.json")) if ARTICLES_DIR.exists() else []
    detail_files = list(DETAILS_DIR.glob("*.json")) if DETAILS_DIR.exists() else []
    # Count unique articleIds across all article files.
    unique_articles = set()
    total_article_count = 0
    for f in article_files:
        try:
            articles = json.loads(f.read_text())
            for a in articles:
                if 'articleId' in a:
                    unique_articles.add(a['articleId'])
            total_article_count += len(articles)
        except (OSError, ValueError, TypeError):
            # Unreadable/malformed file or unexpected payload shape.
            continue
    expected = len(representatives) * len(categories)
    pct_articles = len(article_files) / expected * 100 if expected > 0 else 0
    print(f"\nArticle files: {len(article_files):,} / {expected:,} ({pct_articles:.1f}%)")
    print(f"Total articles: {total_article_count:,}")
    print(f"Unique articleIds: {len(unique_articles):,}")
    print(f"Detail files: {len(detail_files):,} / {len(unique_articles):,}")
    if expected > 0:
        remaining = expected - len(article_files)
        # Rough estimate: ~3s per call plus per-thread delay, spread over
        # MAX_WORKERS threads.
        est_minutes = remaining * (APIFY_DELAY + 3) / MAX_WORKERS / 60
        print(f"\nEst. remaining (articles): ~{est_minutes:.0f} min ({remaining:,} calls)")
    remaining_details = len(unique_articles) - len(detail_files)
    if remaining_details > 0:
        est_detail_min = remaining_details * (APIFY_DELAY + 3) / MAX_WORKERS / 60
        print(f"Est. remaining (details): ~{est_detail_min:.0f} min ({remaining_details:,} calls)")
    # Per-brand breakdown (TOP_BRANDS entries are already uppercase).
    print(f"\n{'Brand':20s} {'Models':>7} {'Done':>7} {'%':>6}")
    print("-" * 44)
    for brand in sorted(TOP_BRANDS):
        brand_reps = [r for r in representatives if r['brand'].upper() == brand]
        brand_done = sum(1 for r in brand_reps
                         for cat_id, _ in categories
                         if (ARTICLES_DIR / f"{r['vehicleId']}_{cat_id}.json").exists())
        brand_total = len(brand_reps) * len(categories)
        pct = brand_done / brand_total * 100 if brand_total > 0 else 0
        print(f" {brand:18s} {len(brand_reps):>7} {brand_done:>7} {pct:>5.1f}%")
# CLI entry point: dispatch to download / import / status.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="TecDoc parts import")
    parser.add_argument("command", choices=["download", "import", "status"])
    parser.add_argument("--brand", help="Filter by brand name")
    args = parser.parse_args()
    if args.command == "download":
        download(brand_filter=args.brand)
    elif args.command == "import":
        do_import()
    elif args.command == "status":
        status()

View File

@@ -0,0 +1,251 @@
#!/usr/bin/env python3
"""
Link parts to vehicles using TecDoc article files.
Maps: article file (vehicleId_categoryId.json) → parts → vehicle_parts (MYE ids)
Optimized v3: year+engine filtering + batch inserts.
"""
import json
import re
import psycopg2
from psycopg2.extras import execute_values
from pathlib import Path
# SECURITY NOTE(review): credentials are hardcoded — prefer a DATABASE_URL
# environment variable, as import_tecdoc_parts.py already supports.
DB_URL = "postgresql://nexus:nexus_autoparts_2026@localhost/nexus_autoparts"
DATA_DIR = Path("/home/Autopartes/data/tecdoc")
ARTICLES_DIR = DATA_DIR / "parts" / "articles"  # "<vehicleId>_<categoryId>.json" files
DETAILS_DIR = DATA_DIR / "parts" / "details"  # "<articleId>.json" OEM detail files
BATCH_SIZE = 50000  # pending rows accumulated before each batch insert
def parse_capacity_liters(cap):
    """Convert TecDoc capacityLt (e.g. '1998.0000' cc) to liters.

    Args:
        cap: capacity in cubic centimetres, as a string or number.

    Returns:
        Liters rounded to one decimal (e.g. 1998 -> 2.0), or None when
        *cap* is missing or not numeric.
    """
    try:
        cc = float(cap)
    except (TypeError, ValueError):
        # None, or a non-numeric string.
        return None
    return round(cc / 1000, 1)
def extract_engine_liters(engine_name):
    """Extract displacement in liters from an engine name.

    Generalized from re.match to re.search so the "<X.Y>L" token is
    found anywhere in the name (e.g. 'Turbo 1.8L'), not only at the
    start ('2.0L 4cyl 127hp').  Backward compatible: every name the
    anchored version matched still matches.

    Returns:
        Liters as a float rounded to one decimal, or None if no
        "<digits>.<digits>L" token is present.
    """
    m = re.search(r'(\d+\.\d+)L', engine_name)
    if m:
        return round(float(m.group(1)), 1)
    return None
def run():
    """Create vehicle_parts links from TecDoc article data.

    Pipeline:
      1. Map TecDoc vehicleId -> brand/model/year-range/engine-liters.
      2. Map (brand, model) -> candidate MYE rows from our DB.
      3. Map OEM part number -> id_part.
      4. Map articleId -> OEM numbers from detail files.
      5. Walk article files, filter candidate MYEs by construction-year
         range and engine displacement, and batch-insert the links.
    """
    conn = psycopg2.connect(DB_URL)
    cur = conn.cursor()

    # Step 1: Build vehicleId -> vehicle info from TecDoc files.
    print("Building vehicleId → vehicle info mapping...", flush=True)
    mfrs = json.loads((DATA_DIR / "manufacturers.json").read_text())
    vid_info = {}  # vehicleId -> {brand, model, year_start, year_end, liters}
    for mfr in mfrs:
        brand = mfr['manufacturerName']
        # Skip parenthesised sub-brand variants.
        if '(' in brand:
            continue
        mfr_id = mfr['manufacturerId']
        model_file = DATA_DIR / "models" / f"{mfr_id}.json"
        if not model_file.exists():
            continue
        models = json.loads(model_file.read_text())
        for model in models:
            model_name = model.get('modelName', '')
            if not model_name:
                continue
            vehicle_file = DATA_DIR / "vehicles" / f"{model['modelId']}.json"
            if not vehicle_file.exists():
                continue
            vehicles = json.loads(vehicle_file.read_text())
            if not vehicles:
                continue
            for v in vehicles:
                vid = v.get('vehicleId')
                if not vid:
                    continue
                # Construction interval strings begin with "YYYY".
                year_start = None
                year_end = None
                try:
                    cs = v.get('constructionIntervalStart', '')
                    if cs:
                        year_start = int(cs[:4])
                    ce = v.get('constructionIntervalEnd', '')
                    if ce:
                        year_end = int(ce[:4])
                except (TypeError, ValueError):
                    pass
                # Parse engine capacity (cc -> liters).
                liters = parse_capacity_liters(v.get('capacityLt') or v.get('capacityTax'))
                vid_info[vid] = {
                    'brand': brand,
                    'model': model_name,
                    'year_start': year_start,
                    'year_end': year_end,
                    'liters': liters,
                }
    print(f" {len(vid_info):,} vehicleIds mapped", flush=True)

    # Step 2: (brand, modelName) -> list of (mye_id, year, liters) from our DB.
    print("Building brand/model → MYE details mapping...", flush=True)
    # NOTE(review): this query joins on m.brand_id, while
    # import_tecdoc_parts.py joins on m.id_brand — confirm the actual
    # column name in the models table; one of the two is likely wrong.
    cur.execute("""
        SELECT b.name_brand, m.name_model, mye.id_mye, y.year_car, e.name_engine
        FROM model_year_engine mye
        JOIN models m ON mye.model_id = m.id_model
        JOIN brands b ON m.brand_id = b.id_brand
        JOIN years y ON mye.year_id = y.id_year
        JOIN engines e ON mye.engine_id = e.id_engine
    """)
    brand_model_to_myes = {}
    for brand, model, mye_id, year, engine_name in cur.fetchall():
        key = (brand, model)
        liters = extract_engine_liters(engine_name)
        if key not in brand_model_to_myes:
            brand_model_to_myes[key] = []
        brand_model_to_myes[key].append((mye_id, year, liters))
    print(f" {len(brand_model_to_myes):,} brand/model combos with {sum(len(v) for v in brand_model_to_myes.values()):,} MYEs", flush=True)

    # Step 3: OEM number -> part_id from DB.
    print("Loading parts cache...", flush=True)
    cur.execute("SELECT oem_part_number, id_part FROM parts WHERE oem_part_number IS NOT NULL")
    part_cache = {r[0]: r[1] for r in cur.fetchall()}
    print(f" {len(part_cache):,} parts cached", flush=True)

    # Step 4: articleId -> OEM numbers from detail files.
    print("Loading article detail OEM mappings...", flush=True)
    article_to_oems = {}
    for f in DETAILS_DIR.glob("*.json"):
        try:
            data = json.loads(f.read_text())
            oem_list = data.get('articleOemNo', [])
            if oem_list:
                oem_nos = [o.get('oemDisplayNo') for o in oem_list if o.get('oemDisplayNo')]
                if oem_nos:
                    article_to_oems[int(f.stem)] = oem_nos
        except (OSError, ValueError):
            # Unreadable/malformed file or non-numeric filename.
            continue
    print(f" {len(article_to_oems):,} articles with OEM data", flush=True)

    # Step 5: Process article files and create vehicle_parts.
    print("\nCreating vehicle_parts links (filtered + batch mode)...", flush=True)
    stats = {'links': 0, 'skipped_no_mye': 0, 'skipped_no_part': 0, 'files': 0, 'filtered_out': 0}
    pending = []

    def flush_batch():
        # Batch-insert accumulated (mye_id, part_id, qty) rows and commit.
        if not pending:
            return
        execute_values(cur, """
            INSERT INTO vehicle_parts (model_year_engine_id, part_id, quantity_required)
            VALUES %s ON CONFLICT DO NOTHING
        """, pending, page_size=10000)
        conn.commit()
        pending.clear()

    article_files = sorted(ARTICLES_DIR.glob("*.json"))
    for f in article_files:
        parts_split = f.stem.split("_")
        if len(parts_split) != 2:
            continue
        vid = int(parts_split[0])
        info = vid_info.get(vid)
        if not info:
            stats['skipped_no_mye'] += 1
            continue
        bm = (info['brand'], info['model'])
        all_myes = brand_model_to_myes.get(bm, [])
        if not all_myes:
            stats['skipped_no_mye'] += 1
            continue
        # Filter candidate MYEs by year range and engine capacity.
        td_ys = info['year_start']
        td_ye = info['year_end']
        td_lit = info['liters']
        filtered_myes = []
        for mye_id, mye_year, mye_liters in all_myes:
            # Year filter: the MYE year must fall within the TecDoc
            # construction interval (open-ended when no end year).
            if td_ys and td_ye:
                if mye_year < td_ys or mye_year > td_ye:
                    stats['filtered_out'] += 1
                    continue
            elif td_ys:
                if mye_year < td_ys:
                    stats['filtered_out'] += 1
                    continue
            # Engine capacity filter: must match within 0.2 L tolerance.
            if td_lit and mye_liters:
                if abs(td_lit - mye_liters) > 0.2:
                    stats['filtered_out'] += 1
                    continue
            filtered_myes.append(mye_id)
        if not filtered_myes:
            # Filtering removed every candidate; skip this vehicle.
            stats['skipped_no_mye'] += 1
            continue
        try:
            articles = json.loads(f.read_text())
        except (OSError, ValueError):
            continue
        for a in articles:
            aid = a.get('articleId')
            if not aid:
                continue
            # Resolve the article's OEM numbers to known part ids.
            part_ids = set()
            for oem_no in article_to_oems.get(aid, []):
                pid = part_cache.get(oem_no)
                if pid:
                    part_ids.add(pid)
            if not part_ids:
                stats['skipped_no_part'] += 1
                continue
            for mye_id in filtered_myes:
                for part_id in part_ids:
                    pending.append((mye_id, part_id, 1))
                    stats['links'] += 1
                    if len(pending) >= BATCH_SIZE:
                        flush_batch()
        stats['files'] += 1
        if stats['files'] % 500 == 0:
            flush_batch()
            print(f" {stats['files']:,}/{len(article_files):,} files | "
                  f"{stats['links']:,} links | {stats['filtered_out']:,} filtered out", flush=True)

    flush_batch()
    cur.close()
    conn.close()
    print(f"\n{'='*50}", flush=True)
    print(f"LINKING COMPLETE", flush=True)
    print(f" Files processed: {stats['files']:,}", flush=True)
    print(f" Links created: {stats['links']:,}", flush=True)
    print(f" Filtered out: {stats['filtered_out']:,}", flush=True)
    print(f" Skipped (no MYE): {stats['skipped_no_mye']:,}", flush=True)
    print(f" Skipped (no part):{stats['skipped_no_part']:,}", flush=True)
    print(f"{'='*50}", flush=True)
# Script entry point: run the full linking pipeline.
if __name__ == "__main__":
    run()