#!/usr/bin/env python3
"""
Quick import of Phase 1 TecDoc article data into PostgreSQL.

Reads the article list files and whatever OEM detail files have already been
downloaded, then:

* creates a manufacturer row for every article supplier not yet known;
* for each article whose detail file lists OEM numbers, upserts one OEM part
  per number and adds an aftermarket -> OEM cross-reference.

Articles without a detail file are left for a later run.
"""

import json
from pathlib import Path

import psycopg2

# NOTE(review): credentials are hard-coded in source; consider loading the
# connection string from the environment or a secrets store.
DB_URL = "postgresql://nexus:nexus_autoparts_2026@localhost/nexus_autoparts"
ARTICLES_DIR = Path("/home/Autopartes/data/tecdoc/parts/articles")
DETAILS_DIR = Path("/home/Autopartes/data/tecdoc/parts/details")

# Number of processed articles between intermediate commits.
_COMMIT_EVERY = 5000


def _load_detail_oem(details_dir):
    """Return {articleId: [oem entry, ...]} read from the detail JSON files.

    The article id is taken from the file name stem. Files that cannot be
    read, are not valid JSON, have a non-numeric stem, or do not contain a
    non-empty ``article.oemNo`` list are skipped (best effort).
    """
    detail_files = list(details_dir.glob("*.json"))
    print(f"Loading {len(detail_files)} detail files for OEM data...", flush=True)

    detail_oem = {}
    for f in detail_files:
        try:
            article_id = int(f.stem)
            data = json.loads(f.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # ValueError covers a non-numeric stem, JSONDecodeError and
            # UnicodeDecodeError.
            continue

        article = data.get('article') if isinstance(data, dict) else None
        oem_list = article.get('oemNo') if isinstance(article, dict) else None
        if isinstance(oem_list, list) and oem_list:
            detail_oem[article_id] = oem_list
    return detail_oem


def _collect_articles(articles_dir, cat_map):
    """Return {articleId: article dict}, de-duplicated across all list files.

    Only files whose stem has the form ``<prefix>_<tecdoc category id>`` are
    considered. The first occurrence of an article id wins (files are visited
    in sorted order). Each kept article is annotated with ``_cat_db_id`` (the
    database category id from *cat_map*, or None) and ``_cat_td_id``.
    """
    article_files = sorted(articles_dir.glob("*.json"))
    print(f"Processing {len(article_files)} article files...", flush=True)

    all_articles = {}
    for f in article_files:
        parts = f.stem.split("_")
        if len(parts) != 2:
            continue
        try:
            cat_id = int(parts[1])
            articles = json.loads(f.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # Unreadable/invalid file or a non-numeric category suffix:
            # skip this file instead of aborting the whole import.
            continue
        if not isinstance(articles, list):
            continue

        cat_db_id = cat_map.get(cat_id)
        for a in articles:
            if not isinstance(a, dict):
                continue
            aid = a.get('articleId')
            if aid and aid not in all_articles:
                a['_cat_db_id'] = cat_db_id
                a['_cat_td_id'] = cat_id
                all_articles[aid] = a
    return all_articles


def _import_articles(conn, cur, all_articles, detail_oem,
                     mfr_cache, part_cache, xref_set, stats):
    """Insert manufacturers, OEM parts and cross-references for the articles.

    The caches and *stats* are updated in place. The transaction is committed
    every ``_COMMIT_EVERY`` processed (non-skipped) articles; the caller is
    responsible for the final commit.
    """
    total = len(all_articles)
    batch = 0
    for aid, a in all_articles.items():
        article_no = a.get('articleNo', '')
        supplier = a.get('supplierName', '')
        product_name = a.get('articleProductName', '')

        if not article_no or not supplier:
            stats['skipped'] += 1
            continue

        # Ensure manufacturer exists
        if supplier not in mfr_cache:
            cur.execute(
                "INSERT INTO manufacturers (name_manufacture) VALUES (%s) "
                "RETURNING id_manufacture",
                (supplier,))
            mfr_cache[supplier] = cur.fetchone()[0]
            stats['mfrs'] += 1

        # If we have OEM details for this article, create OEM parts.
        # Articles without OEM data yet are skipped here; they will be
        # imported when their detail file arrives.
        # NOTE(review): detail_oem is keyed by int(file stem); this lookup
        # assumes articleId in the list files is also an int - confirm.
        for oem in detail_oem.get(aid, []):
            if not isinstance(oem, dict):
                continue
            oem_no = oem.get('oemDisplayNo', '')
            oem_brand = oem.get('oemBrand', '')
            if not oem_no:
                continue

            if oem_no not in part_cache:
                cur.execute("""
                    INSERT INTO parts (oem_part_number, name_part, description)
                    VALUES (%s, %s, %s)
                    ON CONFLICT (oem_part_number) DO UPDATE SET name_part = EXCLUDED.name_part
                    RETURNING id_part
                """, (oem_no, product_name, f"OEM {oem_brand}"))
                part_cache[oem_no] = cur.fetchone()[0]
                stats['parts'] += 1

            part_id = part_cache[oem_no]

            # Add cross-reference (aftermarket → OEM)
            xref_key = (part_id, article_no, supplier)
            if xref_key not in xref_set:
                cur.execute("""
                    INSERT INTO part_cross_references (part_id, cross_reference_number, source_ref)
                    VALUES (%s, %s, %s)
                    ON CONFLICT DO NOTHING
                """, (part_id, article_no, supplier))
                xref_set.add(xref_key)
                # ON CONFLICT DO NOTHING may insert zero rows; only count
                # rows that were really written.
                if cur.rowcount > 0:
                    stats['xrefs'] += 1

        batch += 1
        if batch % _COMMIT_EVERY == 0:
            conn.commit()
            print(f" {batch:,}/{total:,} — {stats['parts']:,} parts, {stats['xrefs']:,} xrefs", flush=True)


def run():
    """Run the import end to end and print a summary.

    Connects to ``DB_URL``, preloads the existing categories, manufacturers,
    parts and cross-references, reads the JSON files under ``DETAILS_DIR``
    and ``ARTICLES_DIR``, and writes the new rows. Database errors propagate
    to the caller; the connection is always closed, which discards any
    uncommitted work of the failing batch.
    """
    stats = {'parts': 0, 'xrefs': 0, 'mfrs': 0, 'skipped': 0}

    conn = psycopg2.connect(DB_URL)
    try:
        with conn.cursor() as cur:
            # Load category mapping: tecdoc_id → id_part_category
            cur.execute("SELECT id_part_category, tecdoc_id FROM part_categories WHERE tecdoc_id IS NOT NULL")
            cat_map = {r[1]: r[0] for r in cur.fetchall()}

            # Load existing manufacturers
            cur.execute("SELECT id_manufacture, name_manufacture FROM manufacturers")
            mfr_cache = {r[1]: r[0] for r in cur.fetchall()}

            # Load existing parts by OEM
            cur.execute("SELECT oem_part_number, id_part FROM parts WHERE oem_part_number IS NOT NULL")
            part_cache = {r[0]: r[1] for r in cur.fetchall()}

            # Load existing cross-refs to avoid duplicates
            cur.execute("SELECT part_id, cross_reference_number, source_ref FROM part_cross_references")
            xref_set = {(r[0], r[1], r[2]) for r in cur.fetchall()}

            # Also check detail files for OEM numbers
            detail_oem = _load_detail_oem(DETAILS_DIR)

            # Collect all unique articles across all files
            all_articles = _collect_articles(ARTICLES_DIR, cat_map)
            print(f"Unique articles to process: {len(all_articles):,}", flush=True)

            _import_articles(conn, cur, all_articles, detail_oem,
                             mfr_cache, part_cache, xref_set, stats)
            conn.commit()
    finally:
        # Closing without commit discards the pending transaction, so a
        # failure never leaves a half-written batch or a leaked connection.
        conn.close()

    print(f"\n{'='*50}", flush=True)
    print(f"IMPORT COMPLETE", flush=True)
    print(f" Parts: {stats['parts']:,}", flush=True)
    print(f" Cross-refs: {stats['xrefs']:,}", flush=True)
    print(f" Manufacturers: {stats['mfrs']:,}", flush=True)
    print(f" Skipped: {stats['skipped']:,}", flush=True)
    print(f"{'='*50}", flush=True)


if __name__ == "__main__":
    run()