fix: stop creating AFT- placeholder parts in import pipeline
- import_phase1.py: skip AFT- part creation when no OEM data
- link_vehicle_parts.py: remove AFT- fallback lookup in part cache
- import_tecdoc_parts.py: add VW to TOP_BRANDS list

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
148
scripts/import_phase1.py
Normal file
148
scripts/import_phase1.py
Normal file
@@ -0,0 +1,148 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Quick import of Phase 1 TecDoc article data into PostgreSQL.
|
||||
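Reads aftermarket articles from the article list files and, for those whose
OEM detail file has already been downloaded, creates the OEM parts and the
aftermarket → OEM cross-references. Articles without OEM data are skipped
until their detail file arrives.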
|
||||
"""
|
||||
|
||||
import json
|
||||
import psycopg2
|
||||
from pathlib import Path
|
||||
|
||||
# Connection string handed to psycopg2.connect() in run().
# NOTE(review): the password is embedded in source; consider reading the URL
# from the environment instead — confirm against the deployment setup.
DB_URL = "postgresql://nexus:nexus_autoparts_2026@localhost/nexus_autoparts"
|
||||
# Directory of article list files (*.json). run() only processes files whose
# stem splits on "_" into exactly two parts, the second being the integer
# TecDoc category id.
ARTICLES_DIR = Path("/home/Autopartes/data/tecdoc/parts/articles")
|
||||
# Directory of per-article detail files (*.json). run() uses the integer file
# stem as the articleId and reads OEM numbers from the "article" → "oemNo" field.
DETAILS_DIR = Path("/home/Autopartes/data/tecdoc/parts/details")
|
||||
|
||||
def _fetch_caches(cur):
    """Load the lookup tables used to resolve ids and avoid duplicate inserts.

    Returns a tuple ``(cat_map, mfr_cache, part_cache, xref_set)``:
    tecdoc_id → id_part_category, manufacturer name → id_manufacture,
    oem_part_number → id_part, and the set of existing
    ``(part_id, cross_reference_number, source_ref)`` triples.
    """
    cur.execute("SELECT id_part_category, tecdoc_id FROM part_categories WHERE tecdoc_id IS NOT NULL")
    cat_map = {tecdoc_id: cat_db_id for cat_db_id, tecdoc_id in cur.fetchall()}

    cur.execute("SELECT id_manufacture, name_manufacture FROM manufacturers")
    mfr_cache = {name: mfr_id for mfr_id, name in cur.fetchall()}

    cur.execute("SELECT oem_part_number, id_part FROM parts WHERE oem_part_number IS NOT NULL")
    part_cache = {oem_no: part_id for oem_no, part_id in cur.fetchall()}

    cur.execute("SELECT part_id, cross_reference_number, source_ref FROM part_cross_references")
    xref_set = {(row[0], row[1], row[2]) for row in cur.fetchall()}

    return cat_map, mfr_cache, part_cache, xref_set


def _load_detail_oem(details_dir):
    """Return ``{articleId: oemNo list}`` read from the detail files.

    Every ``*.json`` file under *details_dir* is parsed; when its
    ``article.oemNo`` value is a non-empty list it is stored under the
    integer file stem. Loading is best effort: a file that cannot be read,
    parsed, or keyed by an integer stem is skipped and counted in one
    summary line instead of aborting the import.
    """
    detail_oem = {}  # articleId → list of {oemBrand, oemDisplayNo}
    detail_files = list(details_dir.glob("*.json"))
    print(f"Loading {len(detail_files)} detail files for OEM data...", flush=True)

    unreadable = 0
    for path in detail_files:
        try:
            data = json.loads(path.read_text(encoding="utf-8"))
            article = data.get('article', {})
            oem_entries = article.get('oemNo') if article else None
            if oem_entries and isinstance(oem_entries, list):
                detail_oem[int(path.stem)] = oem_entries
        # Narrow catch (not bare ``except``) so Ctrl-C still stops the script.
        # ValueError covers JSON/Unicode decode errors and a non-numeric stem;
        # AttributeError/TypeError cover payloads that are not JSON objects.
        except (OSError, ValueError, AttributeError, TypeError):
            unreadable += 1

    if unreadable:
        print(f"  Skipped {unreadable:,} unreadable detail files", flush=True)
    return detail_oem


def _collect_articles(articles_dir, cat_map):
    """Return ``{articleId: article dict}`` with one entry per unique article.

    Only files whose stem has the form ``<prefix>_<category id>`` are read.
    The first occurrence of an articleId wins; it is tagged with
    ``_cat_db_id`` (looked up in *cat_map*, may be ``None``) and
    ``_cat_td_id``. These tags are recorded but not consumed by the import
    loop in this script.
    """
    article_files = sorted(articles_dir.glob("*.json"))
    print(f"Processing {len(article_files)} article files...", flush=True)

    all_articles = {}  # articleId → article data + category
    unreadable = 0
    for path in article_files:
        stem_parts = path.stem.split("_")
        if len(stem_parts) != 2:
            continue

        try:
            # int() is inside the try so a non-numeric suffix skips the file
            # instead of crashing the whole import.
            cat_id = int(stem_parts[1])
            articles = json.loads(path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            unreadable += 1
            continue

        if not isinstance(articles, list):
            unreadable += 1
            continue

        cat_db_id = cat_map.get(cat_id)
        for article in articles:
            if not isinstance(article, dict):
                continue
            aid = article.get('articleId')
            if aid and aid not in all_articles:
                article['_cat_db_id'] = cat_db_id
                article['_cat_td_id'] = cat_id
                all_articles[aid] = article

    if unreadable:
        print(f"  Skipped {unreadable:,} unreadable article files", flush=True)
    print(f"Unique articles to process: {len(all_articles):,}", flush=True)
    return all_articles


def _import_articles(conn, cur, all_articles, detail_oem, mfr_cache, part_cache, xref_set):
    """Insert manufacturers, OEM parts and cross-references; return counters.

    The caches are updated in place as rows are inserted. The transaction is
    committed every 5000 processed (non-skipped) articles; the caller is
    responsible for the final commit.
    """
    stats = {'parts': 0, 'xrefs': 0, 'mfrs': 0, 'skipped': 0}
    total = len(all_articles)
    processed = 0

    for aid, article in all_articles.items():
        article_no = article.get('articleNo', '')
        supplier = article.get('supplierName', '')
        product_name = article.get('articleProductName', '')

        if not article_no or not supplier:
            stats['skipped'] += 1
            continue

        # Ensure manufacturer exists
        if supplier not in mfr_cache:
            cur.execute(
                "INSERT INTO manufacturers (name_manufacture) VALUES (%s) RETURNING id_manufacture",
                (supplier,))
            mfr_cache[supplier] = cur.fetchone()[0]
            stats['mfrs'] += 1

        # Articles with no OEM detail data produce no rows here; they are
        # picked up on a later run once their detail file exists.
        for oem in detail_oem.get(aid, []):
            if not isinstance(oem, dict):
                continue
            oem_no = oem.get('oemDisplayNo', '')
            if not oem_no:
                continue
            oem_brand = oem.get('oemBrand', '')

            if oem_no not in part_cache:
                cur.execute("""
                    INSERT INTO parts (oem_part_number, name_part, description)
                    VALUES (%s, %s, %s)
                    ON CONFLICT (oem_part_number) DO UPDATE SET name_part = EXCLUDED.name_part
                    RETURNING id_part
                """, (oem_no, product_name, f"OEM {oem_brand}"))
                part_cache[oem_no] = cur.fetchone()[0]
                stats['parts'] += 1

            part_id = part_cache[oem_no]

            # Add cross-reference (aftermarket → OEM)
            xref_key = (part_id, article_no, supplier)
            if xref_key not in xref_set:
                cur.execute("""
                    INSERT INTO part_cross_references (part_id, cross_reference_number, source_ref)
                    VALUES (%s, %s, %s) ON CONFLICT DO NOTHING
                """, xref_key)
                xref_set.add(xref_key)
                stats['xrefs'] += 1

        processed += 1
        if processed % 5000 == 0:
            conn.commit()
            print(f"  {processed:,}/{total:,} — {stats['parts']:,} parts, {stats['xrefs']:,} xrefs", flush=True)

    return stats


def run():
    """Import OEM parts and aftermarket cross-references into PostgreSQL.

    Connects using ``DB_URL``, loads the existing rows needed for
    de-duplication, reads the detail files in ``DETAILS_DIR`` and the article
    list files in ``ARTICLES_DIR``, then inserts manufacturers, OEM parts and
    aftermarket → OEM cross-references. A summary of the counters is printed
    at the end.

    The cursor and connection are always closed, including when an error
    interrupts the import; work since the last commit is then not committed.
    """
    conn = psycopg2.connect(DB_URL)
    try:
        cur = conn.cursor()
        try:
            cat_map, mfr_cache, part_cache, xref_set = _fetch_caches(cur)
            detail_oem = _load_detail_oem(DETAILS_DIR)
            all_articles = _collect_articles(ARTICLES_DIR, cat_map)
            stats = _import_articles(
                conn, cur, all_articles, detail_oem,
                mfr_cache, part_cache, xref_set)
            conn.commit()
        finally:
            cur.close()
    finally:
        conn.close()

    print(f"\n{'='*50}", flush=True)
    print("IMPORT COMPLETE", flush=True)
    print(f"  Parts: {stats['parts']:,}", flush=True)
    print(f"  Cross-refs: {stats['xrefs']:,}", flush=True)
    print(f"  Manufacturers: {stats['mfrs']:,}", flush=True)
    print(f"  Skipped: {stats['skipped']:,}", flush=True)
    print(f"{'='*50}", flush=True)


if __name__ == "__main__":
    run()
|
||||
Reference in New Issue
Block a user