feat(catalog): supplier catalog cleanup, fuzzy matching, and navigation fixes

- Cleaned 137+ fake engine-displacement models from supplier imports (v3/v4 scripts: Chevrolet, Ford, Chrysler, Dodge, Jeep, Nissan, etc.)
- Removed 1,251+ corrupted models (INT. prefixes, year-suffix, torque specs, empty names, trailing-year variants)
- Migrated supplier tables to master DB (supplier_catalog, supplier_catalog_compat, supplier_catalog_interchange)
- Fixed _get_mye_ids_with_parts() to query supplier_catalog_compat from master DB so supplier-only vehicles appear for all tenants
- Added fuzzy model matcher with parenthesis stripping, noise suffix removal, compact matching, prefix/substring fallback, model aliases, and ±3 year proximity
- Matched compat rows: KEEP GREEN +14,152, KNADIAN +3,021, VAZLO +127,500, LUK +477, RAYBESTOS +1,743
- Added KNADIAN catalog importer with year-range expansion and future-year filtering
- Added VAZLO catalog importer with position parsing and SKU-in-model cleanup
- Added Keep Green, LUK, Yokomitsu, Raybestos catalog importers
- Cache clearing after cleanups (_classify_cache_*, nexus:mye_ids:*, nexus:brand_mye_counts:*)

Final match rates:
- KEEP GREEN: 90.3%
- VAZLO: 93.6%
- YOKOMITSU: 100.0%
- KNADIAN: 57.4%
- LUK: 51.0%
- RAYBESTOS: 55.9%
2026-06-09 07:47:42 +00:00
parent 5ea667b80e
commit ea29cc31c0
53 changed files with 7727 additions and 548 deletions
--- a/scripts/import_yokomitsu_catalog.py
+++ b/scripts/import_yokomitsu_catalog.py
@@ -0,0 +1,393 @@
+#!/usr/bin/env python3
+"""
+Import Yokomitsu catalog from Excel into supplier_catalog tables.
+
+Usage:
+    python scripts/import_yokomitsu_catalog.py
+"""
+
+import os
+import re
+import sys
+from datetime import datetime
+
+import psycopg2
+from openpyxl import load_workbook
+
+# DB connections
+# Each URL is read from the environment, falling back to a local default.
+MASTER_DB_URL = os.environ.get('MASTER_DB_URL', 'postgresql://postgres@localhost/nexus_autoparts')
+TENANT_DB_URL = os.environ.get('TENANT_DB_URL', 'postgresql://postgres@localhost/tenant_refaccionaria_rached')
+
+# Source workbook, resolved relative to this script's directory (../data/).
+EXCEL_PATH = os.path.join(os.path.dirname(__file__), '..', 'data', 'YOKOMITSU_CATALOGOS_COMPLETOS_TODOS.xlsx')
+# Written to supplier_catalog.supplier_name for every imported item.
+SUPPLIER_NAME = 'YOKOMITSU'
+# NOTE(review): TENANT_ID is not referenced anywhere else in this script;
+# confirm whether it is still needed now that the import targets the master DB.
+TENANT_ID = 31
+
+
+def connect_master():
+    """Open and return a new psycopg2 connection using MASTER_DB_URL."""
+    dsn = MASTER_DB_URL
+    return psycopg2.connect(dsn)
+
+
+def connect_tenant():
+    """Open and return a new psycopg2 connection using TENANT_DB_URL."""
+    dsn = TENANT_DB_URL
+    return psycopg2.connect(dsn)
+
+
+def parse_year(token):
+    """
+    Parse a 2-digit or 4-digit year string into a 4-digit year.
+
+    A range such as '08-13' is reduced to its first year. Two-digit values
+    below 50 map to 20xx and values 50-99 map to 19xx; larger values are
+    accepted only when they fall within 1900-2050.
+
+    Returns the year as an int, or None when the token is empty, is not a
+    plain number, or is out of range (for example a model name like '720').
+    """
+    token = token.strip()
+    if not token:
+        return None
+    # Handle ranges like 08-13 or 08-15 -> use first year
+    if '-' in token:
+        token = token.split('-')[0].strip()
+    # isdecimal() rather than isdigit(): isdigit() also accepts characters
+    # such as a superscript two that int() rejects, which would raise
+    # ValueError in the middle of an import.
+    if not token.isdecimal():
+        return None
+    n = int(token)
+    if n < 50:
+        return 2000 + n
+    if n < 100:
+        return 1900 + n
+    if 1900 <= n <= 2050:
+        return n
+    return None
+
+
+def parse_vehicle(vehicle_raw):
+    """
+    Split a free-text vehicle description into make, model, year and engine.
+
+    Examples of input strings:
+      'Chevrolet AVEO 1.5L 18'
+      'Audi A4 1.8L/2.0L 09'
+      'Dodge GRAND CHEROKEE 2/4WD 3.0L/3.7L/4.7L 08'
+      'Volkswagen JETTA A4/CLASICO 1.8L/2.0L 06 V'
+      'NISSAN 720 1988'
+      'Dodge CARAVAN/VOYAGER 00'
+      'ER 08-15 10'  (garbage/unknown)
+
+    The first token is taken as the make. The year is read from the last
+    token, or from the second-to-last token when the last one is not a year
+    (the trailing token is then discarded as an unrecognized suffix). An
+    engine spec at the end of what remains (e.g. '1.5L', '1.8L/2.0L',
+    '2/4WD') is split off, and the rest is the model.
+
+    Returns dict with make, model, year, engine, vehicle_raw. For empty or
+    single-token input every field except vehicle_raw is None. When no year
+    is found, everything after the make is returned as the model and engine
+    is None.
+    """
+    if not vehicle_raw:
+        return {'make': None, 'model': None, 'year': None, 'engine': None, 'vehicle_raw': vehicle_raw}
+
+    s = str(vehicle_raw).strip()
+    # Remove trailing 'V' (variant marker)
+    s = re.sub(r'\s+V$', '', s)
+
+    tokens = s.split()
+    if len(tokens) < 2:
+        return {'make': None, 'model': None, 'year': None, 'engine': None, 'vehicle_raw': s}
+
+    # Last token is usually the year
+    year = parse_year(tokens[-1])
+    tokens_without_year = tokens[:-1]
+    if year is None and len(tokens) >= 3:
+        # Fall back to the second-to-last token. Keep the year that was found
+        # there and drop both it and the trailing suffix; re-parsing a
+        # different token afterwards would lose or corrupt the year.
+        year = parse_year(tokens[-2])
+        if year is not None:
+            tokens_without_year = tokens[:-2]
+    if year is None:
+        # No year found; keep raw and try best-effort
+        return {'make': tokens[0], 'model': ' '.join(tokens[1:]),
+                'year': None, 'engine': None, 'vehicle_raw': s}
+
+    make = tokens_without_year[0]
+    remaining = ' '.join(tokens_without_year[1:])
+
+    # Heuristic: look for engine tokens at the END of the remaining string.
+    # Common patterns: "1.5L", "1.8L/2.0L", "2/4WD", "3.0L/3.7L/4.7L"
+    engine = None
+    model = remaining
+
+    engine_match = re.search(r'(\d+(?:\.\d+)?\s*L(?:/\d+(?:\.\d+)?\s*L)*|\d+/\d+WD|\d+\.\d+L\s+DIESEL|\d+\.\d+L\s+TURBO)$', remaining, re.IGNORECASE)
+    if engine_match:
+        engine = engine_match.group(1)
+        model = remaining[:engine_match.start()].strip()
+    else:
+        # Try simpler: anything with digits and 'L' or 'WD' at the very end
+        parts = remaining.split()
+        if parts and re.search(r'\d', parts[-1]) and ('L' in parts[-1].upper() or 'WD' in parts[-1].upper()):
+            engine = parts[-1]
+            model = ' '.join(parts[:-1])
+
+    return {
+        'make': make,
+        'model': model,
+        'year': year,
+        'engine': engine,
+        'vehicle_raw': s,
+    }
+
+
+def build_brand_cache(cur):
+    """Load all brands via the given cursor as {UPPER-CASED name_brand: id_brand}."""
+    cur.execute("SELECT id_brand, name_brand FROM brands")
+    brands = {}
+    for brand_id, brand_name in cur.fetchall():
+        # Keys are upper-cased so lookups can be case-insensitive.
+        brands[brand_name.upper()] = brand_id
+    return brands
+
+
+def build_model_cache(cur):
+    """Load all models via the given cursor as {brand_id: [(id_model, name_model), ...]}."""
+    cur.execute("SELECT id_model, brand_id, name_model FROM models")
+    # Grouped by brand so a brand's models can be scanned without touching the rest.
+    by_brand = {}
+    for model_id, brand_id, model_name in cur.fetchall():
+        if brand_id not in by_brand:
+            by_brand[brand_id] = []
+        by_brand[brand_id].append((model_id, model_name))
+    return by_brand
+
+
+def build_year_cache(cur):
+    """Load all years via the given cursor as {year_car: id_year}."""
+    cur.execute("SELECT id_year, year_car FROM years")
+    years = {}
+    for year_id, year_value in cur.fetchall():
+        years[year_value] = year_id
+    return years
+
+
+def build_mye_cache(cur):
+    """Load all model_year_engine rows as {(model_id, year_id): [id_mye, ...]}."""
+    cur.execute("SELECT id_mye, model_id, year_id FROM model_year_engine")
+    by_model_year = {}
+    for row in cur.fetchall():
+        mye_id, model_id, year_id = row
+        key = (model_id, year_id)
+        # Several MYE rows may share one (model, year) pair, so values are lists.
+        if key not in by_model_year:
+            by_model_year[key] = []
+        by_model_year[key].append(mye_id)
+    return by_model_year
+
+
+def fuzzy_match_vehicle(parsed, brand_cache, model_cache, year_cache, mye_cache):
+    """
+    Resolve a parsed vehicle to model_year_engine IDs by substring matching.
+
+    The make is looked up upper-cased in brand_cache, falling back to the
+    first brand whose name contains the make or is contained in it. A model
+    of that brand matches when the upper-cased parsed model text is a
+    substring of its name. If no model matches that way, any model whose name
+    contains one of the parsed model's words of three or more characters is
+    accepted instead.
+
+    Returns the MYE ids of every matched model for the parsed year. The list
+    is empty when make, model or year is missing, or when the brand, model or
+    year cannot be resolved.
+    """
+    make = parsed.get('make')
+    model_text = parsed.get('model')
+    year = parsed.get('year')
+    if not (make and model_text and year):
+        return []
+
+    # Brand: exact (case-insensitive) lookup, then first partial match.
+    make_up = make.upper()
+    brand_id = brand_cache.get(make_up)
+    if not brand_id:
+        brand_id = next(
+            (bid for name, bid in brand_cache.items()
+             if make_up in name or name in make_up),
+            None,
+        )
+    if not brand_id:
+        return []
+
+    # Models: whole model text as a substring first.
+    candidates = model_cache.get(brand_id, [])
+    needle = model_text.upper()
+    model_ids = [mid for mid, mname in candidates if needle in mname.upper()]
+
+    if not model_ids:
+        # Fallback: any single word (3+ chars) of the model text.
+        words = [w for w in needle.split() if len(w) >= 3]
+        model_ids = [
+            mid for mid, mname in candidates
+            if any(w in mname.upper() for w in words)
+        ]
+
+    if not model_ids:
+        return []
+
+    year_id = year_cache.get(year)
+    if not year_id:
+        return []
+
+    # Collect MYEs for all matched model+year combos
+    return [
+        mye_id
+        for mid in model_ids
+        for mye_id in mye_cache.get((mid, year_id), [])
+    ]
+
+
+def extract_interchanges(row):
+    """
+    Collect the (brand, part_number) interchange pairs from a sheet row.
+
+    Pairs are read from the six adjacent column pairs at indexes 2-13. A pair
+    is kept only when both cells are truthy and still non-empty after being
+    converted to str and stripped.
+    """
+    # Columns: MARCA.1=2, INTERCAMBIO=3, MARCA.2=4, INTERCAMBIO.1=5, ... up to MARCA.6=12, INTERCAMBIO.5=13
+    raw_pairs = [(row[i], row[i + 1]) for i in range(2, 14, 2)]
+
+    found = []
+    for raw_brand, raw_pn in raw_pairs:
+        if not (raw_brand and raw_pn):
+            continue
+        brand_text = str(raw_brand).strip()
+        pn_text = str(raw_pn).strip()
+        if brand_text and pn_text:
+            found.append((brand_text, pn_text))
+    return found
+
+
+def main():
+    """
+    Import every sheet of the Yokomitsu workbook into the supplier_catalog tables.
+
+    For each data row with a SKU (column index 1) and a name (index 14):
+      * upsert the part into supplier_catalog, using the sheet name as category;
+      * parse the vehicle text (index 15) and try to resolve it to MYE ids;
+      * write supplier_catalog_compat rows: one per matched MYE with source
+        'fuzzy_match', or a single text-only row with source 'import_text';
+      * write the interchange part numbers.
+
+    Work is committed once per sheet and summary statistics are printed at
+    the end. Exits with status 1 if the Excel file does not exist.
+    """
+    # Imported locally so this function does not depend on the module's import
+    # block. closing() guarantees close() is called on the workbook, the
+    # connection and the cursor even when the import fails midway.
+    from contextlib import closing
+
+    print(f"[{datetime.now().isoformat()}] Starting import...")
+
+    if not os.path.exists(EXCEL_PATH):
+        print(f"ERROR: Excel not found at {EXCEL_PATH}")
+        sys.exit(1)
+
+    # Prepare UPSERT statements
+    # Four columns, four placeholders: must match the 4-tuple passed to execute().
+    upsert_catalog_sql = """
+        INSERT INTO supplier_catalog (supplier_name, sku, name, category)
+        VALUES (%s, %s, %s, %s)
+        ON CONFLICT (supplier_name, sku, category) DO UPDATE SET
+            name = EXCLUDED.name,
+            category = EXCLUDED.category
+        RETURNING id
+    """
+
+    # NOTE(review): the conflict target does not include model_year_engine_id,
+    # so if a matching unique constraint exists, only the first MYE row per
+    # (catalog_id, make, model, year, engine) is stored — confirm the schema.
+    insert_compat_sql = """
+        INSERT INTO supplier_catalog_compat
+            (catalog_id, make, model, year, engine, model_year_engine_id, source)
+        VALUES (%s, %s, %s, %s, %s, %s, %s)
+        ON CONFLICT (catalog_id, make, model, year, engine) DO NOTHING
+    """
+
+    insert_interchange_sql = """
+        INSERT INTO supplier_catalog_interchange (catalog_id, brand, part_number)
+        VALUES (%s, %s, %s)
+        ON CONFLICT DO NOTHING
+    """
+
+    # Track stats (compat/interchange counts are statements executed, which
+    # may exceed rows stored when ON CONFLICT skips an insert)
+    stats = {
+        'sheets': 0,
+        'rows': 0,
+        'catalog_items': 0,
+        'compat_rows': 0,
+        'interchange_rows': 0,
+        'vehicles_parsed': 0,
+        'vehicles_matched': 0,
+        'mye_matches': 0,
+    }
+
+    print(f"Loading {EXCEL_PATH}...")
+    with closing(load_workbook(EXCEL_PATH, read_only=True, data_only=True)) as wb, \
+            closing(connect_master()) as master_conn, \
+            closing(master_conn.cursor()) as master_cur:
+
+        print("Building caches...")
+        brand_cache = build_brand_cache(master_cur)
+        model_cache = build_model_cache(master_cur)
+        year_cache = build_year_cache(master_cur)
+        mye_cache = build_mye_cache(master_cur)
+        print(f"  Brands: {len(brand_cache)}, Models: {sum(len(v) for v in model_cache.values())}, Years: {len(year_cache)}, MYE combos: {len(mye_cache)}")
+
+        # Process each sheet
+        for sheet_name in wb.sheetnames:
+            ws = wb[sheet_name]
+            rows = list(ws.iter_rows(values_only=True))
+            if not rows:
+                continue
+            data_rows = rows[1:]  # rows[0] is the header row
+            stats['sheets'] += 1
+            print(f"\nProcessing sheet '{sheet_name}' with {len(data_rows)} rows...")
+
+            for idx, row in enumerate(data_rows):
+                if idx % 1000 == 0 and idx > 0:
+                    print(f"  ...{idx} rows processed")
+
+                # Skip empty rows
+                if not row or not row[1]:
+                    continue
+
+                sku = str(row[1]).strip()
+                name = str(row[14]).strip() if row[14] else ''
+                vehicle_raw = str(row[15]).strip() if row[15] else ''
+
+                if not sku or not name:
+                    continue
+
+                stats['rows'] += 1
+
+                # Upsert catalog item
+                master_cur.execute(upsert_catalog_sql, (SUPPLIER_NAME, sku, name, sheet_name))
+                catalog_id = master_cur.fetchone()[0]
+                stats['catalog_items'] += 1
+
+                # Parse vehicle
+                parsed = parse_vehicle(vehicle_raw)
+                stats['vehicles_parsed'] += 1
+
+                mye_ids = fuzzy_match_vehicle(parsed, brand_cache, model_cache, year_cache, mye_cache)
+                if mye_ids:
+                    stats['vehicles_matched'] += 1
+                    stats['mye_matches'] += len(mye_ids)
+
+                # Insert compatibility rows: one per matched MYE, or a single
+                # text-only row (no MYE id) when nothing matched.
+                compat_targets = [(mye_id, 'fuzzy_match') for mye_id in mye_ids] or [(None, 'import_text')]
+                for mye_id, source in compat_targets:
+                    master_cur.execute(insert_compat_sql, (
+                        catalog_id,
+                        parsed['make'],
+                        parsed['model'],
+                        parsed['year'],
+                        parsed['engine'],
+                        mye_id,
+                        source,
+                    ))
+                    stats['compat_rows'] += 1
+
+                # Insert interchanges
+                for brand, pn in extract_interchanges(row):
+                    master_cur.execute(insert_interchange_sql, (catalog_id, brand, pn))
+                    stats['interchange_rows'] += 1
+
+            # Commit per sheet
+            master_conn.commit()
+            print(f"  Sheet '{sheet_name}' committed.")
+
+        # Final stats
+        print(f"\n{'='*60}")
+        print("IMPORT COMPLETE")
+        print(f"{'='*60}")
+        print(f"Sheets processed:      {stats['sheets']}")
+        print(f"Total rows read:       {stats['rows']}")
+        print(f"Catalog items:         {stats['catalog_items']}")
+        print(f"Compat rows:           {stats['compat_rows']}")
+        print(f"Interchange rows:      {stats['interchange_rows']}")
+        print(f"Vehicles parsed:       {stats['vehicles_parsed']}")
+        print(f"Vehicles with MYE:     {stats['vehicles_matched']}")
+        print(f"Total MYE matches:     {stats['mye_matches']}")
+
+
+# Script entry point: run the import only when executed directly, not on import.
+if __name__ == '__main__':
+    main()