# Autoparts-DB/scripts/import_vazlo_catalog.py

#!/usr/bin/env python3
"""
Import VAZLO catalog from Excel into supplier_catalog tables.

Usage:
    python scripts/import_vazlo_catalog.py
"""

import os
import re
import sys
from collections import defaultdict
from datetime import datetime

import psycopg2
from openpyxl import load_workbook

# Database connection strings; each one can be overridden through the
# environment variable of the same name.
MASTER_DB_URL = os.getenv('MASTER_DB_URL', 'postgresql://postgres@localhost/nexus_autoparts')
TENANT_DB_URL = os.getenv('TENANT_DB_URL', 'postgresql://postgres@localhost/tenant_refaccionaria_rached')

# Supplier identity used for the imported rows.
SUPPLIER_NAME = 'VAZLO'
TENANT_ID = 31

# Source workbook, resolved relative to this script's directory.
EXCEL_PATH = os.path.join(os.path.dirname(__file__), '..', 'data', 'VAZLO (1).xlsx')

# Keywords that are listed both with and without a trailing dot.
_DOTTABLE_KEYWORDS = (
    'DEL', 'TRAS', 'FRONT', 'EXT', 'IZQ', 'DER', 'INF', 'SUP', 'TRANS',
)

# Keywords that are listed in a single spelling only.
_PLAIN_KEYWORDS = (
    'RUEDA', 'CAJA', 'STD', 'AWD', '2/4WD', '4WD', 'FWD', 'RWD',
    '4X4', 'TURBO', 'GASOLINA', 'DIESEL',
)

# Upper-case tokens that parse_carro strips from the end of the vehicle text
# and reports as the position.
POS_KEYWORDS = (
    set(_DOTTABLE_KEYWORDS)
    | {f'{keyword}.' for keyword in _DOTTABLE_KEYWORDS}
    | set(_PLAIN_KEYWORDS)
)

# Two-word makes, keyed by their upper-case (first word, second word) pair.
MULTI_WORD_MAKES = {
    tuple(make.split()): make
    for make in (
        'MERCEDES BENZ',
        'LAND ROVER',
        'ALFA ROMEO',
        'AMERICAN MOTORS',
        'ROLLS ROYCE',
        'ASTON MARTIN',
        'GREAT WALL',
    )
}


def connect_master():
    """Open and return a psycopg2 connection using MASTER_DB_URL."""
    connection = psycopg2.connect(MASTER_DB_URL)
    return connection


def connect_tenant():
    """Open and return a psycopg2 connection using TENANT_DB_URL."""
    connection = psycopg2.connect(TENANT_DB_URL)
    return connection


def collect_all_skus(wb):
    """
    Pre-scan every sheet and collect all SKUs, to detect SKU-in-model cases.

    Args:
        wb: Workbook-like object exposing ``sheetnames`` and ``wb[name]``,
            where each sheet provides ``iter_rows(min_row=..., values_only=...)``.

    Returns:
        set of stripped SKU strings taken from the second column (index 1)
        of every row after the first row of each sheet.
    """
    skus = set()
    for sheet_name in wb.sheetnames:
        for row in wb[sheet_name].iter_rows(min_row=2, values_only=True):
            # Blank or truncated rows have no SKU cell; skip them instead of
            # raising IndexError (main() likewise skips rows that are empty).
            if len(row) < 2 or not row[1]:
                continue
            sku = str(row[1]).strip()
            if sku:
                skus.add(sku)
    return skus


def parse_carro(carro, all_skus):
    """
    Split a CARRO_PERTENECIENTE cell into make, model, year and position.

    The text is read as ``MAKE MODEL... [POSITION...] [YEAR | --]``:
      'ACURA TL DEL. 2015'         -> make ACURA, model TL, position 'DEL.', year 2015
      'ACURA TL FRONT. DER. 2004'  -> position 'FRONT. DER.'
      'AUDI 4000S CAJA 1980'       -> position 'CAJA'
      'MERCEDES BENZ C350 E --'    -> two-word make, '--' means no year
      'ACURA TLX 3429'             -> '3429' is dropped when it is a known SKU

    Args:
        carro: Raw cell value. Any falsy value yields an all-None result.
        all_skus: Container of known SKU strings, used to recognise a SKU
            that was appended to the model text.

    Returns:
        dict with keys make, model, year, position, raw. For text that has at
        least one token, make/model/position are strings (possibly empty),
        year is an int or None, and raw is the stripped input text.
    """
    if not carro:
        return dict(make=None, model=None, year=None, position=None, raw=carro)

    raw_text = str(carro).strip()
    tokens = raw_text.split()
    if not tokens:
        return dict(make=None, model=None, year=None, position=None, raw=raw_text)

    # A trailing 19xx/20xx token is the year.
    year = None
    if re.match(r'^(19|20)\d{2}$', tokens[-1]):
        year = int(tokens.pop())

    # '--' is the no-year marker and carries no data of its own.
    if tokens and tokens[-1] == '--':
        tokens.pop()

    # The make is the first token, or the first two for a known two-word make.
    make = tokens[0] if tokens else ''
    make_width = 1
    if len(tokens) >= 2:
        pair = (tokens[0].upper(), tokens[1].upper())
        if pair in MULTI_WORD_MAKES:
            make = MULTI_WORD_MAKES[pair]
            make_width = 2
    remainder = tokens[make_width:]

    # Walk back from the end for as long as the tokens are position keywords.
    cut = len(remainder)
    while cut and remainder[cut - 1].upper() in POS_KEYWORDS:
        cut -= 1
    model_tokens = remainder[:cut]
    position_tokens = remainder[cut:]

    # A 3-4 digit token ending the model is a stray SKU if it is a known one,
    # e.g. "ACURA TLX 3429" -> model "TLX".
    if (
        model_tokens
        and re.match(r'^\d{3,4}$', model_tokens[-1])
        and model_tokens[-1] in all_skus
    ):
        model_tokens = model_tokens[:-1]

    return dict(
        make=make,
        model=' '.join(model_tokens),
        year=year,
        position=' '.join(position_tokens),
        raw=raw_text,
    )


def extract_interchanges(row):
    """
    Collect the (brand, part_number) pairs from the 11 interchange column pairs.

    Brands sit at the even indices 2, 4, ..., 22 and each part number is in
    the column right after its brand. A pair is kept only when both cells
    exist, are truthy, and are non-empty after being stripped as strings.

    Returns:
        list of (brand, part_number) string tuples in column order.
    """
    width = len(row)
    pairs = []
    for brand_idx in range(2, 2 + 11 * 2, 2):
        pn_idx = brand_idx + 1
        if brand_idx >= width or not row[brand_idx]:
            continue
        if pn_idx >= width or not row[pn_idx]:
            continue
        brand = str(row[brand_idx]).strip()
        part_number = str(row[pn_idx]).strip()
        if brand and part_number:
            pairs.append((brand, part_number))
    return pairs


def normalize_name(name):
    """
    Return the piece name as a single line with whitespace collapsed.

    Newlines become spaces and every run of whitespace is reduced to one
    space, with none left at either end. Falsy input yields ''.
    """
    if name:
        single_line = str(name).replace('\n', ' ')
        words = single_line.split()
        return ' '.join(words)
    return ''


# Zero-based positions of the columns main() reads from each worksheet row.
_SKU_COL = 1
_NAME_COL = 24
_CARRO_COL = 25


def _cell(row, index):
    """Return row[index], or None when the row has no such column."""
    return row[index] if index < len(row) else None


def main():
    """
    Import every worksheet of the VAZLO workbook into the master database.

    The first row of each sheet is treated as a header. For every following
    row that has a SKU this:
      * upserts one supplier_catalog item per (SKU, sheet name), using the
        sheet name as the category;
      * inserts the vehicle parsed from the CARRO column into
        supplier_catalog_compat (rows without vehicle text are counted as
        skipped instead of producing an all-NULL compat row);
      * inserts the interchange pairs into supplier_catalog_interchange.

    Work is committed once per sheet and a summary is printed at the end.
    The compat/interchange counters count attempted inserts; rows dropped by
    ON CONFLICT DO NOTHING are still counted.

    Exits with status 1 if the workbook file does not exist.
    """
    print(f"[{datetime.now().isoformat()}] Starting VAZLO import...")

    if not os.path.exists(EXCEL_PATH):
        print(f"ERROR: Excel not found at {EXCEL_PATH}")
        sys.exit(1)

    print(f"Loading {EXCEL_PATH}...")
    wb = load_workbook(EXCEL_PATH, read_only=True, data_only=True)

    upsert_catalog_sql = """
        INSERT INTO supplier_catalog (supplier_name, sku, name, category, is_active)
        VALUES (%s, %s, %s, %s, true)
        ON CONFLICT (supplier_name, sku, category) DO UPDATE SET
            name = EXCLUDED.name,
            category = EXCLUDED.category,
            is_active = true
        RETURNING id
    """

    insert_compat_sql = """
        INSERT INTO supplier_catalog_compat
            (catalog_id, make, model, year, engine, model_year_engine_id, source)
        VALUES (%s, %s, %s, %s, %s, %s, %s)
        ON CONFLICT (catalog_id, make, model, year, engine) DO NOTHING
    """

    insert_interchange_sql = """
        INSERT INTO supplier_catalog_interchange (catalog_id, brand, part_number)
        VALUES (%s, %s, %s)
        ON CONFLICT DO NOTHING
    """

    stats = {
        'sheets': 0,
        'rows': 0,
        'catalog_items': 0,
        'compat_rows': 0,
        'interchange_rows': 0,
        'vehicles_parsed': 0,
        'skipped_no_sku': 0,
        'skipped_no_carro': 0,
    }

    master_conn = None
    master_cur = None
    try:
        # Pre-scan SKUs for SKU-in-model detection
        print("Pre-scanning SKUs...")
        all_skus = collect_all_skus(wb)
        print(f"  Found {len(all_skus)} unique SKUs")

        # Exactly one connection: a second connect() would leak the first.
        master_conn = connect_master()
        master_cur = master_conn.cursor()

        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]
            rows = list(ws.iter_rows(values_only=True))
            if not rows:
                continue
            data_rows = rows[1:]
            stats['sheets'] += 1
            print(f"\nProcessing sheet '{sheet_name}' with {len(data_rows)} rows...")

            # Cache catalog_id per (sku, sheet_name) to avoid repeated upserts
            catalog_id_cache = {}

            for idx, row in enumerate(data_rows):
                if idx % 2000 == 0 and idx > 0:
                    print(f"  ...{idx} rows processed")

                # Rows may be empty or shorter than the columns read here, so
                # every cell access goes through the bounds-checked _cell().
                sku_cell = _cell(row, _SKU_COL) if row else None
                sku = str(sku_cell).strip() if sku_cell else ''
                if not sku:
                    stats['skipped_no_sku'] += 1
                    continue

                name = normalize_name(_cell(row, _NAME_COL))
                carro_cell = _cell(row, _CARRO_COL)
                carro_raw = str(carro_cell).strip() if carro_cell else ''

                stats['rows'] += 1

                # Upsert catalog item (keyed by sku + category)
                cache_key = (sku, sheet_name)
                catalog_id = catalog_id_cache.get(cache_key)
                if catalog_id is None:
                    master_cur.execute(upsert_catalog_sql, (SUPPLIER_NAME, sku, name, sheet_name))
                    catalog_id = master_cur.fetchone()[0]
                    catalog_id_cache[cache_key] = catalog_id
                    stats['catalog_items'] += 1

                if carro_raw:
                    # Parse vehicle
                    parsed = parse_carro(carro_raw, all_skus)
                    stats['vehicles_parsed'] += 1

                    # Insert compatibility (text-only, no MYE matching during import)
                    master_cur.execute(insert_compat_sql, (
                        catalog_id,
                        parsed['make'],
                        parsed['model'],
                        parsed['year'],
                        parsed['position'] or None,
                        None,
                        'import_text',
                    ))
                    stats['compat_rows'] += 1
                else:
                    # No vehicle text: there is nothing to record as a
                    # compatibility, so no compat row is written.
                    stats['skipped_no_carro'] += 1

                # Insert interchanges
                for brand, pn in extract_interchanges(row):
                    master_cur.execute(insert_interchange_sql, (catalog_id, brand, pn))
                    stats['interchange_rows'] += 1

            # Commit per sheet
            master_conn.commit()
            print(f"  Sheet '{sheet_name}' committed.")

        print(f"\n{'='*60}")
        print("IMPORT COMPLETE")
        print(f"{'='*60}")
        print(f"Sheets processed:      {stats['sheets']}")
        print(f"Total rows read:       {stats['rows']}")
        print(f"Catalog items:         {stats['catalog_items']}")
        print(f"Compat rows:           {stats['compat_rows']}")
        print(f"Interchange rows:      {stats['interchange_rows']}")
        print(f"Vehicles parsed:       {stats['vehicles_parsed']}")
        print(f"Skipped (no SKU):      {stats['skipped_no_sku']}")
        print(f"Skipped (no vehicle):  {stats['skipped_no_carro']}")
    finally:
        # Release everything even when a sheet fails part-way. Closing the
        # connection without a commit leaves that sheet's pending work
        # uncommitted.
        if master_cur is not None:
            master_cur.close()
        if master_conn is not None:
            master_conn.close()
        wb.close()


# Run the import only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()