# Autoparts-DB/scripts/import_yokomitsu_catalog.py

#!/usr/bin/env python3
"""
Import Yokomitsu catalog from Excel into supplier_catalog tables.

Usage:
    python scripts/import_yokomitsu_catalog.py
"""

import os
import re
import sys
from datetime import datetime

import psycopg2
from openpyxl import load_workbook

# Database connection URLs; each one can be overridden through the
# environment variable of the same name.
MASTER_DB_URL = os.environ.get(
    'MASTER_DB_URL',
    'postgresql://postgres@localhost/nexus_autoparts',
)
TENANT_DB_URL = os.environ.get(
    'TENANT_DB_URL',
    'postgresql://postgres@localhost/tenant_refaccionaria_rached',
)

# Source workbook, resolved relative to this script's directory (../data).
EXCEL_PATH = os.path.join(
    os.path.dirname(__file__),
    '..',
    'data',
    'YOKOMITSU_CATALOGOS_COMPLETOS_TODOS.xlsx',
)
# Value written to supplier_catalog.supplier_name for every imported item.
SUPPLIER_NAME = 'YOKOMITSU'
# NOTE(review): not referenced by the code visible in this file -- confirm use.
TENANT_ID = 31


def connect_master():
    """Open and return a new psycopg2 connection using MASTER_DB_URL."""
    dsn = MASTER_DB_URL
    return psycopg2.connect(dsn)


def connect_tenant():
    """Open and return a new psycopg2 connection using TENANT_DB_URL."""
    dsn = TENANT_DB_URL
    return psycopg2.connect(dsn)


def parse_year(token):
    """Parse a 2-digit or 4-digit year token into a full year.

    A range such as ``'08-13'`` is reduced to its first year.  Two-digit
    values below 50 map to 20xx and the rest to 19xx; four-digit values are
    accepted only between 1900 and 2050 inclusive.

    Args:
        token: Candidate year text (``None`` is tolerated).

    Returns:
        The year as an ``int``, or ``None`` when the token is not a year.
    """
    if token is None:
        return None
    text = str(token).strip()
    if not text:
        return None
    # Ranges like 08-13 or 08-15 -> keep only the first year.
    text = text.partition('-')[0].strip()
    # isdecimal() only accepts characters int() can convert; isdigit() also
    # accepts e.g. superscript digits, which would make int() raise.
    if not text.isdecimal():
        return None
    if len(text) == 2:
        short_year = int(text)
        return 2000 + short_year if short_year < 50 else 1900 + short_year
    if len(text) == 4:
        full_year = int(text)
        return full_year if 1900 <= full_year <= 2050 else None
    # Any other length (e.g. model numbers such as "3" or "720") is not a year.
    return None


def parse_vehicle(vehicle_raw):
    """
    Parse a vehicle string like:
      'Chevrolet AVEO 1.5L 18'
      'Audi A4 1.8L/2.0L 09'
      'Dodge GRAND CHEROKEE 2/4WD 3.0L/3.7L/4.7L 08'
      'Volkswagen JETTA A4/CLASICO 1.8L/2.0L 06 V'
      'NISSAN 720 1988'
      'Dodge CARAVAN/VOYAGER 00'
      'ER 08-15 10'  (garbage/unknown)

    The year is read from the last token or, when that token is not a year
    and there are at least three tokens, from the second-to-last one (the
    trailing token is then treated as a suffix and dropped).  The first
    remaining token is the make; a displacement/drivetrain pattern at the
    end of the rest becomes the engine and what is left is the model.

    Returns dict with make, model, year, engine, vehicle_raw.  Empty input
    and single-token input yield all-``None`` fields; when no year is found
    the make is the first token and the model is everything after it.
    """
    if not vehicle_raw:
        return {'make': None, 'model': None, 'year': None, 'engine': None, 'vehicle_raw': vehicle_raw}

    s = str(vehicle_raw).strip()
    # Remove trailing 'V' (variant marker)
    s = re.sub(r'\s+V$', '', s)

    tokens = s.split()
    if len(tokens) < 2:
        return {'make': None, 'model': None, 'year': None, 'engine': None, 'vehicle_raw': s}

    # Last token is usually the year.
    year = parse_year(tokens[-1])
    descriptive = tokens[:-1]
    if year is None and len(tokens) >= 3:
        # Year followed by a suffix: keep the year that was just parsed and
        # drop both the year token and the suffix.  (Re-parsing tokens[-2]
        # after removing the year token would read a different token and
        # lose the year.)
        year = parse_year(tokens[-2])
        if year is not None:
            descriptive = tokens[:-2]
    if year is None:
        # No year found; keep raw and do a best-effort make/model split.
        return {'make': tokens[0], 'model': ' '.join(tokens[1:]),
                'year': None, 'engine': None, 'vehicle_raw': s}

    make = descriptive[0]
    remaining = ' '.join(descriptive[1:])

    # Heuristic: look for engine tokens at the END of the remaining string.
    # Common patterns: "1.5L", "1.8L/2.0L", "2/4WD", "3.0L/3.7L/4.7L", "1.9L DIESEL"
    engine = None
    model = remaining

    engine_match = re.search(r'(\d+(?:\.\d+)?\s*L(?:/\d+(?:\.\d+)?\s*L)*|\d+/\d+WD|\d+\.\d+L\s+DIESEL|\d+\.\d+L\s+TURBO)$', remaining, re.IGNORECASE)
    if engine_match:
        engine = engine_match.group(1)
        model = remaining[:engine_match.start()].strip()
    else:
        # Simpler fallback: a final word with a digit and 'L' or 'WD' in it.
        parts = remaining.split()
        if parts and re.search(r'\d', parts[-1]) and ('L' in parts[-1].upper() or 'WD' in parts[-1].upper()):
            engine = parts[-1]
            model = ' '.join(parts[:-1])

    return {
        'make': make,
        'model': model,
        'year': year,
        'engine': engine,
        'vehicle_raw': s,
    }


def build_brand_cache(cur):
    """Load the ``brands`` table into a dict of upper-cased name -> id_brand."""
    cur.execute("SELECT id_brand, name_brand FROM brands")
    brands = {}
    for brand_id, brand_name in cur.fetchall():
        # Keys are upper-cased so lookups can be case-insensitive.
        brands[brand_name.upper()] = brand_id
    return brands


def build_model_cache(cur):
    """Load the ``models`` table grouped by brand.

    Returns a dict mapping brand_id to a list of ``(id_model, name_model)``
    tuples, in the order the rows were fetched.
    """
    cur.execute("SELECT id_model, brand_id, name_model FROM models")
    by_brand = {}
    for model_id, brand_id, model_name in cur.fetchall():
        if brand_id not in by_brand:
            by_brand[brand_id] = []
        by_brand[brand_id].append((model_id, model_name))
    return by_brand


def build_year_cache(cur):
    """Load the ``years`` table into a dict of year_car -> id_year."""
    cur.execute("SELECT id_year, year_car FROM years")
    years = {}
    for year_id, year_value in cur.fetchall():
        years[year_value] = year_id
    return years


def build_mye_cache(cur):
    """Load ``model_year_engine`` grouped by (model_id, year_id).

    Returns a dict mapping each ``(model_id, year_id)`` pair to the list of
    id_mye values found for it, in fetch order.
    """
    cur.execute("SELECT id_mye, model_id, year_id FROM model_year_engine")
    grouped = {}
    for record in cur.fetchall():
        mye_id, model_id, year_id = record
        key = (model_id, year_id)
        if key in grouped:
            grouped[key].append(mye_id)
        else:
            grouped[key] = [mye_id]
    return grouped


def fuzzy_match_vehicle(parsed, brand_cache, model_cache, year_cache, mye_cache):
    """
    Resolve a parsed vehicle to model_year_engine ids.

    Steps:
      1. Brand: exact upper-cased lookup, otherwise the first cached brand
         whose name contains the make or is contained in it.
      2. Models of that brand whose upper-cased name contains the whole
         model text; if none, models containing any word of the model text
         that is at least 3 characters long.
      3. Year: exact lookup in ``year_cache``.
      4. All MYE ids cached for each matched (model, year) pair.

    Returns list of mye_ids (may be empty).
    """
    make = parsed.get('make')
    model_text = parsed.get('model')
    year = parsed.get('year')
    if not (make and model_text and year):
        return []

    # --- brand -----------------------------------------------------------
    make_up = make.upper()
    brand_id = brand_cache.get(make_up)
    if not brand_id:
        # Substring match in either direction; first hit wins.
        brand_id = next(
            (bid for name, bid in brand_cache.items()
             if make_up in name or name in make_up),
            None,
        )
    if not brand_id:
        return []

    # --- models ----------------------------------------------------------
    candidates = model_cache.get(brand_id, [])
    keyword = model_text.upper()
    model_ids = [mid for mid, mname in candidates if keyword in mname.upper()]
    if not model_ids:
        # Fall back to matching on any sufficiently long individual word.
        words = [w for w in keyword.split() if len(w) >= 3]
        model_ids = [
            mid for mid, mname in candidates
            if any(w in mname.upper() for w in words)
        ]
    if not model_ids:
        return []

    # --- year ------------------------------------------------------------
    year_id = year_cache.get(year)
    if not year_id:
        return []

    # --- MYE ids for every matched model in that year --------------------
    mye_ids = []
    for mid in model_ids:
        mye_ids += mye_cache.get((mid, year_id), [])
    return mye_ids


def extract_interchanges(row):
    """Collect the non-empty (brand, part_number) pairs of a sheet row.

    The six pairs sit in adjacent cells: brand at indexes 2, 4, ..., 12 and
    the matching part number right after it at 3, 5, ..., 13.  A pair is
    kept only when both cells are truthy and still non-empty after being
    converted to ``str`` and stripped.
    """
    # Read all twelve cells up front (MARCA.n / INTERCAMBIO.n columns).
    cells = [(row[col], row[col + 1]) for col in range(2, 14, 2)]

    found = []
    for raw_brand, raw_pn in cells:
        if not raw_brand or not raw_pn:
            continue
        brand_text = str(raw_brand).strip()
        pn_text = str(raw_pn).strip()
        if brand_text and pn_text:
            found.append((brand_text, pn_text))
    return found


def main():
    """Import every sheet of the Yokomitsu workbook into the master database.

    The first row of each sheet is treated as a header.  For every data row
    with a SKU (cell 1) and a name (cell 14) the function:

      * upserts a ``supplier_catalog`` item, using the sheet name as category;
      * parses the vehicle text (cell 15) and writes
        ``supplier_catalog_compat`` rows -- one per matched MYE id, or one
        text-only row when nothing matched;
      * writes the interchange pairs (cells 2-13) to
        ``supplier_catalog_interchange``.

    Work is committed once per sheet and a summary is printed at the end.

    Exits with status 1 when the workbook file does not exist.  Any other
    error propagates after the cursor, connection and workbook are released.
    """
    print(f"[{datetime.now().isoformat()}] Starting import...")

    if not os.path.exists(EXCEL_PATH):
        print(f"ERROR: Excel not found at {EXCEL_PATH}")
        sys.exit(1)

    print(f"Loading {EXCEL_PATH}...")
    wb = load_workbook(EXCEL_PATH, read_only=True, data_only=True)

    # One placeholder per listed column (four of each).
    upsert_catalog_sql = """
        INSERT INTO supplier_catalog (supplier_name, sku, name, category)
        VALUES (%s, %s, %s, %s)
        ON CONFLICT (supplier_name, sku, category) DO UPDATE SET
            name = EXCLUDED.name,
            category = EXCLUDED.category
        RETURNING id
    """

    insert_compat_sql = """
        INSERT INTO supplier_catalog_compat
            (catalog_id, make, model, year, engine, model_year_engine_id, source)
        VALUES (%s, %s, %s, %s, %s, %s, %s)
        ON CONFLICT (catalog_id, make, model, year, engine) DO NOTHING
    """

    insert_interchange_sql = """
        INSERT INTO supplier_catalog_interchange (catalog_id, brand, part_number)
        VALUES (%s, %s, %s)
        ON CONFLICT DO NOTHING
    """

    # Counters count attempted statements; rows skipped by ON CONFLICT are
    # still included.
    stats = {
        'sheets': 0,
        'rows': 0,
        'catalog_items': 0,
        'compat_rows': 0,
        'interchange_rows': 0,
        'vehicles_parsed': 0,
        'vehicles_matched': 0,
        'mye_matches': 0,
    }

    # Highest cell index read below is 15 (vehicle text).
    min_columns = 16

    master_conn = None
    master_cur = None
    try:
        master_conn = connect_master()
        master_cur = master_conn.cursor()

        print("Building caches...")
        brand_cache = build_brand_cache(master_cur)
        model_cache = build_model_cache(master_cur)
        year_cache = build_year_cache(master_cur)
        mye_cache = build_mye_cache(master_cur)
        print(f"  Brands: {len(brand_cache)}, Models: {sum(len(v) for v in model_cache.values())}, Years: {len(year_cache)}, MYE combos: {len(mye_cache)}")

        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]
            rows = list(ws.iter_rows(values_only=True))
            if not rows:
                continue
            data_rows = rows[1:]  # rows[0] is the header row
            stats['sheets'] += 1
            print(f"\nProcessing sheet '{sheet_name}' with {len(data_rows)} rows...")

            for idx, row in enumerate(data_rows):
                if idx % 1000 == 0 and idx > 0:
                    print(f"  ...{idx} rows processed")

                if not row:
                    continue
                if len(row) < min_columns:
                    # Pad short rows so the fixed cell indexes never raise.
                    row = tuple(row) + (None,) * (min_columns - len(row))
                if not row[1]:
                    continue

                sku = str(row[1]).strip()
                name = str(row[14]).strip() if row[14] else ''
                vehicle_raw = str(row[15]).strip() if row[15] else ''

                if not sku or not name:
                    continue

                stats['rows'] += 1

                # Upsert catalog item
                master_cur.execute(upsert_catalog_sql, (SUPPLIER_NAME, sku, name, sheet_name))
                catalog_id = master_cur.fetchone()[0]
                stats['catalog_items'] += 1

                # Parse vehicle
                parsed = parse_vehicle(vehicle_raw)
                stats['vehicles_parsed'] += 1

                mye_ids = fuzzy_match_vehicle(parsed, brand_cache, model_cache, year_cache, mye_cache)
                if mye_ids:
                    stats['vehicles_matched'] += 1
                    stats['mye_matches'] += len(mye_ids)

                # One compat row per matched MYE id, or a single text-only
                # row (model_year_engine_id NULL) when nothing matched.
                if mye_ids:
                    compat_targets = [(mye_id, 'fuzzy_match') for mye_id in mye_ids]
                else:
                    compat_targets = [(None, 'import_text')]
                for mye_id, source in compat_targets:
                    master_cur.execute(insert_compat_sql, (
                        catalog_id,
                        parsed['make'],
                        parsed['model'],
                        parsed['year'],
                        parsed['engine'],
                        mye_id,
                        source,
                    ))
                    stats['compat_rows'] += 1

                # Insert interchanges
                for brand, pn in extract_interchanges(row):
                    master_cur.execute(insert_interchange_sql, (catalog_id, brand, pn))
                    stats['interchange_rows'] += 1

            # Commit per sheet
            master_conn.commit()
            print(f"  Sheet '{sheet_name}' committed.")
    finally:
        # Closing the connection without a commit discards the sheet that
        # was in progress, so a failure never leaves a half-imported sheet.
        if master_cur is not None:
            master_cur.close()
        if master_conn is not None:
            master_conn.close()
        # Read-only workbooks keep the file open until closed explicitly.
        wb.close()

    # Final stats
    print(f"\n{'='*60}")
    print("IMPORT COMPLETE")
    print(f"{'='*60}")
    print(f"Sheets processed:      {stats['sheets']}")
    print(f"Total rows read:       {stats['rows']}")
    print(f"Catalog items:         {stats['catalog_items']}")
    print(f"Compat rows:           {stats['compat_rows']}")
    print(f"Interchange rows:      {stats['interchange_rows']}")
    print(f"Vehicles parsed:       {stats['vehicles_parsed']}")
    print(f"Vehicles with MYE:     {stats['vehicles_matched']}")
    print(f"Total MYE matches:     {stats['mye_matches']}")


# Run the import only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()