fix: stop creating AFT- placeholder parts in import pipeline

- import_phase1.py: skip AFT- part creation when no OEM data
- link_vehicle_parts.py: remove AFT- fallback lookup in part cache
- import_tecdoc_parts.py: add VW to TOP_BRANDS list

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-18 22:25:21 +00:00
parent 4b01c57c88
commit eff04a5e60
3 changed files with 930 additions and 0 deletions

148
scripts/import_phase1.py Normal file
View File

@@ -0,0 +1,148 @@
#!/usr/bin/env python3
"""
Quick import of Phase 1 TecDoc article data into PostgreSQL.
Imports aftermarket parts and their vehicle mappings from article list files,
without waiting for OEM detail downloads.
"""
import json
import psycopg2
from pathlib import Path
# Target PostgreSQL connection string.
# NOTE(review): credentials are hardcoded here — consider reading from an
# environment variable instead of committing them to the repo.
DB_URL = "postgresql://nexus:nexus_autoparts_2026@localhost/nexus_autoparts"
# Per-category article list files, named "<prefix>_<tecdocCategoryId>.json"
# (the suffix is parsed as the category id in run()).
ARTICLES_DIR = Path("/home/Autopartes/data/tecdoc/parts/articles")
# Per-article detail files; the filename stem is parsed as the articleId.
DETAILS_DIR = Path("/home/Autopartes/data/tecdoc/parts/details")
def _load_detail_oem():
    """Scan detail JSON files and return {articleId: [{oemBrand, oemDisplayNo}, ...]}.

    Files whose stem is not an integer, cannot be read, or are not valid JSON
    are skipped (best-effort load).
    """
    detail_oem = {}
    detail_files = list(DETAILS_DIR.glob("*.json"))
    print(f"Loading {len(detail_files)} detail files for OEM data...", flush=True)
    for f in detail_files:
        try:
            data = json.loads(f.read_text())
            article = data.get('article', {})
            if article and article.get('oemNo'):
                # ValueError from int() is also caught below for odd stems.
                detail_oem[int(f.stem)] = article['oemNo']
        except (OSError, ValueError):
            # OSError: unreadable file; ValueError: bad JSON (JSONDecodeError
            # subclasses it) or non-numeric stem. Narrowed from a bare except
            # so KeyboardInterrupt/SystemExit are no longer swallowed.
            continue
    return detail_oem


def _collect_articles(cat_map):
    """Merge all article list files into {articleId: article dict}.

    Filenames must look like "<prefix>_<tecdocCategoryId>.json"; others are
    skipped. The first occurrence of an articleId wins. Each kept article is
    tagged with '_cat_db_id' / '_cat_td_id' (reserved for later category
    linking; the insert loop does not use them yet).
    """
    all_articles = {}
    article_files = sorted(ARTICLES_DIR.glob("*.json"))
    print(f"Processing {len(article_files)} article files...", flush=True)
    for f in article_files:
        name_parts = f.stem.split("_")
        if len(name_parts) != 2:
            continue
        try:
            # int() moved inside the try: a non-numeric suffix used to crash
            # the whole import before the bare except could catch anything.
            cat_id = int(name_parts[1])
            articles = json.loads(f.read_text())
        except (OSError, ValueError):
            continue
        for a in articles:
            aid = a.get('articleId')
            if aid and aid not in all_articles:
                a['_cat_db_id'] = cat_map.get(cat_id)
                a['_cat_td_id'] = cat_id
                all_articles[aid] = a
    return all_articles


def run():
    """Import Phase 1 TecDoc article data into PostgreSQL.

    Loads lookup caches from the database, merges the downloaded article and
    detail JSON files, and inserts:
      * OEM parts — only for articles whose detail file carries OEM numbers
        (no AFT- placeholder parts are created),
      * aftermarket → OEM cross-references,
      * any manufacturers not yet present.

    Articles without OEM detail data are skipped for now and will be imported
    once their detail file arrives. Commits every 5000 articles.
    """
    conn = psycopg2.connect(DB_URL)
    cur = conn.cursor()
    # Category mapping: tecdoc_id -> id_part_category.
    cur.execute("SELECT id_part_category, tecdoc_id FROM part_categories WHERE tecdoc_id IS NOT NULL")
    cat_map = {r[1]: r[0] for r in cur.fetchall()}
    # Existing manufacturers: name -> id.
    cur.execute("SELECT id_manufacture, name_manufacture FROM manufacturers")
    mfr_cache = {r[1]: r[0] for r in cur.fetchall()}
    # Existing parts keyed by OEM number.
    cur.execute("SELECT oem_part_number, id_part FROM parts WHERE oem_part_number IS NOT NULL")
    part_cache = {r[0]: r[1] for r in cur.fetchall()}
    # Existing cross-references, to avoid duplicate inserts.
    cur.execute("SELECT part_id, cross_reference_number, source_ref FROM part_cross_references")
    xref_set = {(r[0], r[1], r[2]) for r in cur.fetchall()}

    detail_oem = _load_detail_oem()
    all_articles = _collect_articles(cat_map)
    print(f"Unique articles to process: {len(all_articles):,}", flush=True)

    stats = {'parts': 0, 'xrefs': 0, 'mfrs': 0, 'skipped': 0}
    batch = 0
    for aid, a in all_articles.items():
        article_no = a.get('articleNo', '')
        supplier = a.get('supplierName', '')
        product_name = a.get('articleProductName', '')
        if not article_no or not supplier:
            stats['skipped'] += 1
            continue
        # Ensure the supplier exists as a manufacturer.
        if supplier not in mfr_cache:
            cur.execute(
                "INSERT INTO manufacturers (name_manufacture) VALUES (%s) RETURNING id_manufacture",
                (supplier,))
            mfr_cache[supplier] = cur.fetchone()[0]
            stats['mfrs'] += 1
        # Create OEM parts and cross-refs only when detail data is available;
        # articles with no OEM numbers simply fall through (no placeholder).
        for oem in detail_oem.get(aid, []):
            oem_no = oem.get('oemDisplayNo', '')
            oem_brand = oem.get('oemBrand', '')
            if not oem_no:
                continue
            if oem_no not in part_cache:
                cur.execute("""
                    INSERT INTO parts (oem_part_number, name_part, description)
                    VALUES (%s, %s, %s)
                    ON CONFLICT (oem_part_number) DO UPDATE SET name_part = EXCLUDED.name_part
                    RETURNING id_part
                """, (oem_no, product_name, f"OEM {oem_brand}"))
                part_cache[oem_no] = cur.fetchone()[0]
                stats['parts'] += 1
            part_id = part_cache[oem_no]
            # Aftermarket -> OEM cross-reference.
            xref_key = (part_id, article_no, supplier)
            if xref_key not in xref_set:
                cur.execute("""
                    INSERT INTO part_cross_references (part_id, cross_reference_number, source_ref)
                    VALUES (%s, %s, %s) ON CONFLICT DO NOTHING
                """, (part_id, article_no, supplier))
                xref_set.add(xref_key)
                stats['xrefs'] += 1
        batch += 1
        if batch % 5000 == 0:
            conn.commit()
            # Separator restored: the numbers previously ran together.
            print(f"  {batch:,}/{len(all_articles):,} — {stats['parts']:,} parts, {stats['xrefs']:,} xrefs", flush=True)

    conn.commit()
    cur.close()
    conn.close()
    print(f"\n{'='*50}", flush=True)
    print(f"IMPORT COMPLETE", flush=True)
    print(f"  Parts: {stats['parts']:,}", flush=True)
    print(f"  Cross-refs: {stats['xrefs']:,}", flush=True)
    print(f"  Manufacturers: {stats['mfrs']:,}", flush=True)
    print(f"  Skipped: {stats['skipped']:,}", flush=True)
    print(f"{'='*50}", flush=True)
# Script entry point: run the import only when executed directly.
if __name__ == "__main__":
    run()