- import_phase1.py: skip AFT- part creation when no OEM data - link_vehicle_parts.py: remove AFT- fallback lookup in part cache - import_tecdoc_parts.py: add VW to TOP_BRANDS list Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
149 lines
5.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Quick import of Phase 1 TecDoc article data into PostgreSQL.
|
|
Imports aftermarket parts and their vehicle mappings from article list files,
|
|
without waiting for OEM detail downloads.
|
|
"""
|
|
|
|
import json
|
|
import psycopg2
|
|
from pathlib import Path
|
|
|
|
DB_URL = "postgresql://nexus:nexus_autoparts_2026@localhost/nexus_autoparts"
|
|
ARTICLES_DIR = Path("/home/Autopartes/data/tecdoc/parts/articles")
|
|
DETAILS_DIR = Path("/home/Autopartes/data/tecdoc/parts/details")
|
|
|
|
def _load_detail_oem():
    """Load OEM numbers from downloaded article detail files.

    Returns a dict mapping articleId -> list of {oemBrand, oemDisplayNo}
    dicts. Files that are unreadable, contain invalid JSON, lack OEM data,
    or whose stem is not an integer articleId are skipped silently.
    """
    detail_oem = {}
    detail_files = list(DETAILS_DIR.glob("*.json"))
    print(f"Loading {len(detail_files)} detail files for OEM data...", flush=True)
    for f in detail_files:
        try:
            data = json.loads(f.read_text())
        except (OSError, ValueError):
            # Unreadable file or malformed JSON -- best-effort, skip it.
            continue
        article = data.get('article', {})
        if article and article.get('oemNo'):
            try:
                detail_oem[int(f.stem)] = article['oemNo']
            except ValueError:
                # Filename stem is not a numeric articleId -- skip.
                continue
    return detail_oem


def _collect_articles(cat_map):
    """Collect unique articles across all article list files.

    Article files are named "<prefix>_<tecdoc_category_id>.json"; files
    not matching that pattern (or with a non-numeric category part) are
    skipped.  Returns a dict mapping articleId -> article dict, annotated
    with '_cat_db_id' (DB category id or None) and '_cat_td_id' (TecDoc
    category id).  First occurrence of an articleId wins.
    """
    article_files = sorted(ARTICLES_DIR.glob("*.json"))
    print(f"Processing {len(article_files)} article files...", flush=True)

    all_articles = {}
    for f in article_files:
        name_parts = f.stem.split("_")
        if len(name_parts) != 2:
            continue
        try:
            cat_id = int(name_parts[1])
        except ValueError:
            # Second segment is not a TecDoc category id -- skip file.
            continue
        try:
            articles = json.loads(f.read_text())
        except (OSError, ValueError):
            # Unreadable or malformed article list -- best-effort, skip.
            continue

        cat_db_id = cat_map.get(cat_id)
        for a in articles:
            aid = a.get('articleId')
            if aid and aid not in all_articles:
                a['_cat_db_id'] = cat_db_id
                a['_cat_td_id'] = cat_id
                all_articles[aid] = a
    return all_articles


def run():
    """Import Phase 1 TecDoc aftermarket article data into PostgreSQL.

    For every unique article that already has downloaded OEM detail data,
    ensures the manufacturer and OEM part rows exist and records an
    aftermarket -> OEM cross-reference.  Articles without OEM data are
    skipped (they are imported later, once their detail file arrives).
    Commits in batches of 5000 and prints summary stats at the end.
    """
    conn = psycopg2.connect(DB_URL)
    try:
        cur = conn.cursor()

        # Load category mapping: tecdoc_id -> id_part_category
        cur.execute("SELECT id_part_category, tecdoc_id FROM part_categories WHERE tecdoc_id IS NOT NULL")
        cat_map = {r[1]: r[0] for r in cur.fetchall()}

        # Load existing manufacturers (name -> id) to avoid duplicate inserts.
        cur.execute("SELECT id_manufacture, name_manufacture FROM manufacturers")
        mfr_cache = {r[1]: r[0] for r in cur.fetchall()}

        # Load existing parts keyed by OEM number.
        cur.execute("SELECT oem_part_number, id_part FROM parts WHERE oem_part_number IS NOT NULL")
        part_cache = {r[0]: r[1] for r in cur.fetchall()}

        # Load existing cross-refs to avoid duplicate insert attempts.
        cur.execute("SELECT part_id, cross_reference_number, source_ref FROM part_cross_references")
        xref_set = {(r[0], r[1], r[2]) for r in cur.fetchall()}

        detail_oem = _load_detail_oem()
        all_articles = _collect_articles(cat_map)
        print(f"Unique articles to process: {len(all_articles):,}", flush=True)

        stats = {'parts': 0, 'xrefs': 0, 'mfrs': 0, 'skipped': 0}
        batch = 0
        for aid, a in all_articles.items():
            article_no = a.get('articleNo', '')
            supplier = a.get('supplierName', '')
            product_name = a.get('articleProductName', '')

            if not article_no or not supplier:
                stats['skipped'] += 1
                continue

            # Ensure the aftermarket supplier exists as a manufacturer.
            if supplier not in mfr_cache:
                cur.execute(
                    "INSERT INTO manufacturers (name_manufacture) VALUES (%s) RETURNING id_manufacture",
                    (supplier,))
                mfr_cache[supplier] = cur.fetchone()[0]
                stats['mfrs'] += 1

            # Create OEM parts + cross-refs only when detail data exists;
            # articles without OEM data are imported when details arrive.
            for oem in detail_oem.get(aid, []):
                oem_no = oem.get('oemDisplayNo', '')
                oem_brand = oem.get('oemBrand', '')
                if not oem_no:
                    continue

                if oem_no not in part_cache:
                    cur.execute("""
                        INSERT INTO parts (oem_part_number, name_part, description)
                        VALUES (%s, %s, %s)
                        ON CONFLICT (oem_part_number) DO UPDATE SET name_part = EXCLUDED.name_part
                        RETURNING id_part
                    """, (oem_no, product_name, f"OEM {oem_brand}"))
                    part_cache[oem_no] = cur.fetchone()[0]
                    stats['parts'] += 1

                part_id = part_cache[oem_no]

                # Add cross-reference (aftermarket article -> OEM part).
                xref_key = (part_id, article_no, supplier)
                if xref_key not in xref_set:
                    cur.execute("""
                        INSERT INTO part_cross_references (part_id, cross_reference_number, source_ref)
                        VALUES (%s, %s, %s) ON CONFLICT DO NOTHING
                    """, (part_id, article_no, supplier))
                    xref_set.add(xref_key)
                    stats['xrefs'] += 1

            batch += 1
            if batch % 5000 == 0:
                conn.commit()
                print(f"  {batch:,}/{len(all_articles):,} — {stats['parts']:,} parts, {stats['xrefs']:,} xrefs", flush=True)

        conn.commit()
        cur.close()
    finally:
        # Always release the connection, even if an insert fails mid-run.
        conn.close()

    print(f"\n{'='*50}", flush=True)
    print(f"IMPORT COMPLETE", flush=True)
    print(f"  Parts: {stats['parts']:,}", flush=True)
    print(f"  Cross-refs: {stats['xrefs']:,}", flush=True)
    print(f"  Manufacturers: {stats['mfrs']:,}", flush=True)
    print(f"{'='*50}", flush=True)
|
|
|
|
if __name__ == "__main__":
    # Script entry point; run() returns None, so the exit code stays 0.
    raise SystemExit(run())
|