fix: stop creating AFT- placeholder parts in import pipeline

- import_phase1.py: skip AFT- part creation when no OEM data
- link_vehicle_parts.py: remove AFT- fallback lookup in part cache
- import_tecdoc_parts.py: add VW to TOP_BRANDS list

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-18 22:25:21 +00:00
parent 4b01c57c88
commit eff04a5e60
3 changed files with 930 additions and 0 deletions

148
scripts/import_phase1.py Normal file
View File

@@ -0,0 +1,148 @@
#!/usr/bin/env python3
"""
Quick import of Phase 1 TecDoc article data into PostgreSQL.
Imports aftermarket parts and their vehicle mappings from article list files,
without waiting for OEM detail downloads.
"""
import json
import psycopg2
from pathlib import Path
# Target PostgreSQL connection string.
# NOTE(review): credentials are hardcoded here — consider reading from an
# environment variable instead of committing them to the repo.
DB_URL = "postgresql://nexus:nexus_autoparts_2026@localhost/nexus_autoparts"
# Per-category article list files, named "<prefix>_<tecdocCategoryId>.json"
# (the suffix is parsed as the category id in run()).
ARTICLES_DIR = Path("/home/Autopartes/data/tecdoc/parts/articles")
# Per-article detail files; the filename stem is parsed as the articleId.
DETAILS_DIR = Path("/home/Autopartes/data/tecdoc/parts/details")
def _load_detail_oem():
    """Scan detail JSON files and return {articleId: [{oemBrand, oemDisplayNo}, ...]}.

    Files whose stem is not an integer, cannot be read, or are not valid JSON
    are skipped (best-effort load).
    """
    detail_oem = {}
    detail_files = list(DETAILS_DIR.glob("*.json"))
    print(f"Loading {len(detail_files)} detail files for OEM data...", flush=True)
    for f in detail_files:
        try:
            data = json.loads(f.read_text())
            article = data.get('article', {})
            if article and article.get('oemNo'):
                # ValueError from int() is also caught below for odd stems.
                detail_oem[int(f.stem)] = article['oemNo']
        except (OSError, ValueError):
            # OSError: unreadable file; ValueError: bad JSON (JSONDecodeError
            # subclasses it) or non-numeric stem. Narrowed from a bare except
            # so KeyboardInterrupt/SystemExit are no longer swallowed.
            continue
    return detail_oem


def _collect_articles(cat_map):
    """Merge all article list files into {articleId: article dict}.

    Filenames must look like "<prefix>_<tecdocCategoryId>.json"; others are
    skipped. The first occurrence of an articleId wins. Each kept article is
    tagged with '_cat_db_id' / '_cat_td_id' (reserved for later category
    linking; the insert loop does not use them yet).
    """
    all_articles = {}
    article_files = sorted(ARTICLES_DIR.glob("*.json"))
    print(f"Processing {len(article_files)} article files...", flush=True)
    for f in article_files:
        name_parts = f.stem.split("_")
        if len(name_parts) != 2:
            continue
        try:
            # int() moved inside the try: a non-numeric suffix used to crash
            # the whole import before the bare except could catch anything.
            cat_id = int(name_parts[1])
            articles = json.loads(f.read_text())
        except (OSError, ValueError):
            continue
        for a in articles:
            aid = a.get('articleId')
            if aid and aid not in all_articles:
                a['_cat_db_id'] = cat_map.get(cat_id)
                a['_cat_td_id'] = cat_id
                all_articles[aid] = a
    return all_articles


def run():
    """Import Phase 1 TecDoc article data into PostgreSQL.

    Loads lookup caches from the database, merges the downloaded article and
    detail JSON files, and inserts:
      * OEM parts — only for articles whose detail file carries OEM numbers
        (no AFT- placeholder parts are created),
      * aftermarket → OEM cross-references,
      * any manufacturers not yet present.

    Articles without OEM detail data are skipped for now and will be imported
    once their detail file arrives. Commits every 5000 articles.
    """
    conn = psycopg2.connect(DB_URL)
    cur = conn.cursor()
    # Category mapping: tecdoc_id -> id_part_category.
    cur.execute("SELECT id_part_category, tecdoc_id FROM part_categories WHERE tecdoc_id IS NOT NULL")
    cat_map = {r[1]: r[0] for r in cur.fetchall()}
    # Existing manufacturers: name -> id.
    cur.execute("SELECT id_manufacture, name_manufacture FROM manufacturers")
    mfr_cache = {r[1]: r[0] for r in cur.fetchall()}
    # Existing parts keyed by OEM number.
    cur.execute("SELECT oem_part_number, id_part FROM parts WHERE oem_part_number IS NOT NULL")
    part_cache = {r[0]: r[1] for r in cur.fetchall()}
    # Existing cross-references, to avoid duplicate inserts.
    cur.execute("SELECT part_id, cross_reference_number, source_ref FROM part_cross_references")
    xref_set = {(r[0], r[1], r[2]) for r in cur.fetchall()}

    detail_oem = _load_detail_oem()
    all_articles = _collect_articles(cat_map)
    print(f"Unique articles to process: {len(all_articles):,}", flush=True)

    stats = {'parts': 0, 'xrefs': 0, 'mfrs': 0, 'skipped': 0}
    batch = 0
    for aid, a in all_articles.items():
        article_no = a.get('articleNo', '')
        supplier = a.get('supplierName', '')
        product_name = a.get('articleProductName', '')
        if not article_no or not supplier:
            stats['skipped'] += 1
            continue
        # Ensure the supplier exists as a manufacturer.
        if supplier not in mfr_cache:
            cur.execute(
                "INSERT INTO manufacturers (name_manufacture) VALUES (%s) RETURNING id_manufacture",
                (supplier,))
            mfr_cache[supplier] = cur.fetchone()[0]
            stats['mfrs'] += 1
        # Create OEM parts and cross-refs only when detail data is available;
        # articles with no OEM numbers simply fall through (no placeholder).
        for oem in detail_oem.get(aid, []):
            oem_no = oem.get('oemDisplayNo', '')
            oem_brand = oem.get('oemBrand', '')
            if not oem_no:
                continue
            if oem_no not in part_cache:
                cur.execute("""
                    INSERT INTO parts (oem_part_number, name_part, description)
                    VALUES (%s, %s, %s)
                    ON CONFLICT (oem_part_number) DO UPDATE SET name_part = EXCLUDED.name_part
                    RETURNING id_part
                """, (oem_no, product_name, f"OEM {oem_brand}"))
                part_cache[oem_no] = cur.fetchone()[0]
                stats['parts'] += 1
            part_id = part_cache[oem_no]
            # Aftermarket -> OEM cross-reference.
            xref_key = (part_id, article_no, supplier)
            if xref_key not in xref_set:
                cur.execute("""
                    INSERT INTO part_cross_references (part_id, cross_reference_number, source_ref)
                    VALUES (%s, %s, %s) ON CONFLICT DO NOTHING
                """, (part_id, article_no, supplier))
                xref_set.add(xref_key)
                stats['xrefs'] += 1
        batch += 1
        if batch % 5000 == 0:
            conn.commit()
            # Separator restored: the numbers previously ran together.
            print(f"  {batch:,}/{len(all_articles):,} — {stats['parts']:,} parts, {stats['xrefs']:,} xrefs", flush=True)

    conn.commit()
    cur.close()
    conn.close()
    print(f"\n{'='*50}", flush=True)
    print(f"IMPORT COMPLETE", flush=True)
    print(f"  Parts: {stats['parts']:,}", flush=True)
    print(f"  Cross-refs: {stats['xrefs']:,}", flush=True)
    print(f"  Manufacturers: {stats['mfrs']:,}", flush=True)
    print(f"  Skipped: {stats['skipped']:,}", flush=True)
    print(f"{'='*50}", flush=True)
# Script entry point: run the import only when executed directly.
if __name__ == "__main__":
    run()