#!/usr/bin/env python3
"""
Import Yokomitsu catalog from Excel into supplier_catalog tables.

Usage:
    python scripts/import_yokomitsu_catalog.py
"""

import os
import re
import sys
from datetime import datetime

import psycopg2
from openpyxl import load_workbook

# DB connections
MASTER_DB_URL = os.environ.get('MASTER_DB_URL', 'postgresql://postgres@localhost/nexus_autoparts')
TENANT_DB_URL = os.environ.get('TENANT_DB_URL', 'postgresql://postgres@localhost/tenant_refaccionaria_rached')

EXCEL_PATH = os.path.join(os.path.dirname(__file__), '..', 'data', 'YOKOMITSU_CATALOGOS_COMPLETOS_TODOS.xlsx')
SUPPLIER_NAME = 'YOKOMITSU'
TENANT_ID = 31

# Zero-based column positions read from every worksheet row.
SKU_COL = 1
# Interchange (brand, part number) pairs occupy columns 2..13, brand first.
INTERCHANGE_FIRST_COL = 2
INTERCHANGE_END_COL = 14  # exclusive
NAME_COL = 14
VEHICLE_COL = 15
MIN_ROW_WIDTH = VEHICLE_COL + 1

# Engine descriptor anchored at the end of the "model + engine" text, e.g.
# "1.5L", "1.8L/2.0L", "2/4WD", "1.9L DIESEL", "2.0L TURBO".
ENGINE_RE = re.compile(
    r'(\d+(?:\.\d+)?\s*L(?:/\d+(?:\.\d+)?\s*L)*|\d+/\d+WD|\d+\.\d+L\s+DIESEL|\d+\.\d+L\s+TURBO)$',
    re.IGNORECASE,
)

# One placeholder per listed column (four), matching the parameters passed.
UPSERT_CATALOG_SQL = """
    INSERT INTO supplier_catalog (supplier_name, sku, name, category)
    VALUES (%s, %s, %s, %s)
    ON CONFLICT (supplier_name, sku, category)
    DO UPDATE SET name = EXCLUDED.name, category = EXCLUDED.category
    RETURNING id
"""

# NOTE(review): the conflict target does not include model_year_engine_id, so
# when one vehicle matches several MYEs only the first row may actually be
# inserted -- confirm against the table's unique constraint.
INSERT_COMPAT_SQL = """
    INSERT INTO supplier_catalog_compat
        (catalog_id, make, model, year, engine, model_year_engine_id, source)
    VALUES (%s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (catalog_id, make, model, year, engine) DO NOTHING
"""

INSERT_INTERCHANGE_SQL = """
    INSERT INTO supplier_catalog_interchange (catalog_id, brand, part_number)
    VALUES (%s, %s, %s)
    ON CONFLICT DO NOTHING
"""


def connect_master():
    """Open a new connection to the master database (MASTER_DB_URL)."""
    return psycopg2.connect(MASTER_DB_URL)


def connect_tenant():
    """Open a new connection to the tenant database (TENANT_DB_URL)."""
    return psycopg2.connect(TENANT_DB_URL)


def parse_year(token):
    """Parse a 2-digit or 4-digit year string.

    Ranges such as '08-13' use the first year. Two-digit values below 50 map
    to 20xx and the rest to 19xx; four-digit values are accepted only within
    1900..2050. Returns an int, or None when the token is not a usable year.
    """
    token = token.strip()
    if not token:
        return None
    # Handle ranges like 08-13 or 08-15 -> use first year
    if '-' in token:
        token = token.split('-')[0].strip()
    # isdigit() alone accepts characters such as superscripts that int()
    # rejects, so restrict to ASCII digits.
    if not (token.isascii() and token.isdigit()):
        return None
    n = int(token)
    if n < 50:
        return 2000 + n
    if n < 100:
        return 1900 + n
    if 1900 <= n <= 2050:
        return n
    return None


def parse_vehicle(vehicle_raw):
    """
    Parse a vehicle string like:
        'Chevrolet AVEO 1.5L 18'
        'Audi A4 1.8L/2.0L 09'
        'Dodge GRAND CHEROKEE 2/4WD 3.0L/3.7L/4.7L 08'
        'Volkswagen JETTA A4/CLASICO 1.8L/2.0L 06 V'
        'NISSAN 720 1988'
        'Dodge CARAVAN/VOYAGER 00'
        'ER 08-15 10'  (garbage/unknown)

    Returns dict with make, model, year, engine, vehicle_raw. Fields that
    cannot be determined are None (model may be an empty string when only a
    make and a year are present).
    """
    if not vehicle_raw:
        return {'make': None, 'model': None, 'year': None, 'engine': None, 'vehicle_raw': vehicle_raw}

    s = str(vehicle_raw).strip()
    # Remove trailing 'V' (variant marker)
    s = re.sub(r'\s+V$', '', s)
    tokens = s.split()

    if len(tokens) < 2:
        return {'make': None, 'model': None, 'year': None, 'engine': None, 'vehicle_raw': s}

    # Last token is usually the year.
    year = parse_year(tokens[-1])
    tokens_without_year = tokens[:-1]
    if year is None and len(tokens) >= 3:
        # Otherwise the year may sit second-to-last, followed by a suffix.
        # Keep the year that was found and drop both the year token and the
        # trailing suffix from the make/model/engine text.
        year = parse_year(tokens[-2])
        if year is not None:
            tokens_without_year = tokens[:-2]

    if year is None:
        # No year found; keep raw and try best-effort
        return {
            'make': tokens[0],
            'model': ' '.join(tokens[1:]),
            'year': None,
            'engine': None,
            'vehicle_raw': s,
        }

    make = tokens_without_year[0] if tokens_without_year else None
    remaining = ' '.join(tokens_without_year[1:])

    # Heuristic: look for an engine descriptor at the END of the remaining text.
    engine = None
    model = remaining
    engine_match = ENGINE_RE.search(remaining)
    if engine_match:
        engine = engine_match.group(1)
        model = remaining[:engine_match.start()].strip()
    else:
        # Simpler fallback: a last word containing a digit plus 'L' or 'WD'.
        parts = remaining.split()
        if parts:
            last = parts[-1].upper()
            if re.search(r'\d', last) and ('L' in last or 'WD' in last):
                engine = parts[-1]
                model = ' '.join(parts[:-1])

    return {
        'make': make,
        'model': model,
        'year': year,
        'engine': engine,
        'vehicle_raw': s,
    }


def build_brand_cache(cur):
    """Fetch all brands from master DB as {UPPERCASE name: id_brand}."""
    cur.execute("SELECT id_brand, name_brand FROM brands")
    # Rows with a NULL/empty name cannot be matched by name; skip them
    # instead of failing on None.upper().
    return {name.upper(): brand_id for brand_id, name in cur.fetchall() if name}


def build_model_cache(cur):
    """Fetch all models from master DB as {brand_id: [(id_model, name_model), ...]}."""
    cur.execute("SELECT id_model, brand_id, name_model FROM models")
    # Index by brand_id for fast lookup
    cache = {}
    for model_id, brand_id, name in cur.fetchall():
        cache.setdefault(brand_id, []).append((model_id, name))
    return cache


def build_year_cache(cur):
    """Fetch all years from master DB as {year_car: id_year}."""
    cur.execute("SELECT id_year, year_car FROM years")
    return {year_car: year_id for year_id, year_car in cur.fetchall()}


def build_mye_cache(cur):
    """Fetch all MYE entries as {(model_id, year_id): [id_mye, ...]}."""
    cur.execute("SELECT id_mye, model_id, year_id FROM model_year_engine")
    cache = {}
    for mye_id, model_id, year_id in cur.fetchall():
        cache.setdefault((model_id, year_id), []).append(mye_id)
    return cache


def fuzzy_match_vehicle(parsed, brand_cache, model_cache, year_cache, mye_cache):
    """
    Try to match parsed vehicle to MYE IDs.

    parsed is a dict as returned by parse_vehicle; the caches are the dicts
    produced by the build_*_cache functions.
    Returns list of mye_ids (may be empty).
    """
    make = parsed.get('make')
    model_keyword = parsed.get('model')
    year = parsed.get('year')

    if not make or not model_keyword or not year:
        return []

    # Find brand: exact (case-insensitive) name first, then substring either way.
    make_up = make.upper()
    brand_id = brand_cache.get(make_up)
    if brand_id is None:
        for name, bid in brand_cache.items():
            # An empty name is a substring of everything; never match on it.
            if name and (make_up in name or name in make_up):
                brand_id = bid
                break
    if brand_id is None:
        return []

    models = model_cache.get(brand_id, [])
    keyword = model_keyword.upper()

    # First try the whole model text as a substring of the catalog model name.
    matched_model_ids = [mid for mid, mname in models if mname and keyword in mname.upper()]

    if not matched_model_ids:
        # Fall back to any individual word of at least 3 characters.
        words = [w for w in keyword.split() if len(w) >= 3]
        matched_model_ids = [
            mid for mid, mname in models
            if mname and any(w in mname.upper() for w in words)
        ]

    if not matched_model_ids:
        return []

    # Find year_id
    year_id = year_cache.get(year)
    if year_id is None:
        return []

    # Collect MYEs for all matched model+year combos
    mye_ids = []
    for mid in matched_model_ids:
        mye_ids.extend(mye_cache.get((mid, year_id), []))
    return mye_ids


def extract_interchanges(row):
    """Extract (brand, part_number) pairs from the interchange columns.

    Columns: MARCA.1=2, INTERCAMBIO=3, MARCA.2=4, INTERCAMBIO.1=5, ...
    up to MARCA.6=12, INTERCAMBIO.5=13. Pairs with a missing or blank side
    are skipped. Rows shorter than 14 cells yield only the complete pairs
    they contain.
    """
    brands = row[INTERCHANGE_FIRST_COL:INTERCHANGE_END_COL:2]
    part_numbers = row[INTERCHANGE_FIRST_COL + 1:INTERCHANGE_END_COL:2]

    interchanges = []
    for brand, pn in zip(brands, part_numbers):
        if brand and pn:
            brand = str(brand).strip()
            pn = str(pn).strip()
            if brand and pn:
                interchanges.append((brand, pn))
    return interchanges


def _import_sheet(cur, sheet_name, data_rows, caches, stats):
    """Upsert every usable data row of one sheet; the caller commits."""
    brand_cache, model_cache, year_cache, mye_cache = caches

    for idx, row in enumerate(data_rows):
        if idx % 1000 == 0 and idx > 0:
            print(f" ...{idx} rows processed")

        # Skip empty rows
        if not row or len(row) <= SKU_COL or not row[SKU_COL]:
            continue
        # Pad short rows so the fixed column positions below cannot raise.
        if len(row) < MIN_ROW_WIDTH:
            row = tuple(row) + (None,) * (MIN_ROW_WIDTH - len(row))

        sku = str(row[SKU_COL]).strip()
        name = str(row[NAME_COL]).strip() if row[NAME_COL] else ''
        vehicle_raw = str(row[VEHICLE_COL]).strip() if row[VEHICLE_COL] else ''

        if not sku or not name:
            continue

        stats['rows'] += 1

        # Upsert catalog item; the sheet name is stored as the category.
        cur.execute(UPSERT_CATALOG_SQL, (SUPPLIER_NAME, sku, name, sheet_name))
        catalog_id = cur.fetchone()[0]
        stats['catalog_items'] += 1

        # Parse vehicle
        parsed = parse_vehicle(vehicle_raw)
        stats['vehicles_parsed'] += 1

        mye_ids = fuzzy_match_vehicle(parsed, brand_cache, model_cache, year_cache, mye_cache)
        if mye_ids:
            stats['vehicles_matched'] += 1
            stats['mye_matches'] += len(mye_ids)

        # Insert compatibility rows: one per MYE match, or a single text-only
        # row when nothing matched. The counters track attempted inserts.
        if mye_ids:
            targets = [(mye_id, 'fuzzy_match') for mye_id in mye_ids]
        else:
            targets = [(None, 'import_text')]
        for mye_id, source in targets:
            cur.execute(INSERT_COMPAT_SQL, (
                catalog_id,
                parsed['make'],
                parsed['model'],
                parsed['year'],
                parsed['engine'],
                mye_id,
                source,
            ))
            stats['compat_rows'] += 1

        # Insert interchanges
        for brand, pn in extract_interchanges(row):
            cur.execute(INSERT_INTERCHANGE_SQL, (catalog_id, brand, pn))
            stats['interchange_rows'] += 1


def _print_stats(stats):
    """Print the end-of-run summary."""
    print(f"\n{'='*60}")
    print("IMPORT COMPLETE")
    print(f"{'='*60}")
    print(f"Sheets processed: {stats['sheets']}")
    print(f"Total rows read: {stats['rows']}")
    print(f"Catalog items: {stats['catalog_items']}")
    print(f"Compat rows: {stats['compat_rows']}")
    print(f"Interchange rows: {stats['interchange_rows']}")
    print(f"Vehicles parsed: {stats['vehicles_parsed']}")
    print(f"Vehicles with MYE: {stats['vehicles_matched']}")
    print(f"Total MYE matches: {stats['mye_matches']}")


def main():
    """Load the Excel workbook and import every sheet into the master DB.

    Each sheet is committed on its own, so a failure part-way through leaves
    earlier sheets imported. Exits with status 1 when the workbook is missing.
    """
    print(f"[{datetime.now().isoformat()}] Starting import...")

    if not os.path.exists(EXCEL_PATH):
        print(f"ERROR: Excel not found at {EXCEL_PATH}")
        sys.exit(1)

    print(f"Loading {EXCEL_PATH}...")
    wb = load_workbook(EXCEL_PATH, read_only=True, data_only=True)

    master_conn = None
    try:
        master_conn = connect_master()
        with master_conn.cursor() as master_cur:
            print("Building caches...")
            brand_cache = build_brand_cache(master_cur)
            model_cache = build_model_cache(master_cur)
            year_cache = build_year_cache(master_cur)
            mye_cache = build_mye_cache(master_cur)
            print(f" Brands: {len(brand_cache)}, Models: {sum(len(v) for v in model_cache.values())}, Years: {len(year_cache)}, MYE combos: {len(mye_cache)}")
            caches = (brand_cache, model_cache, year_cache, mye_cache)

            # Track stats
            stats = {
                'sheets': 0,
                'rows': 0,
                'catalog_items': 0,
                'compat_rows': 0,
                'interchange_rows': 0,
                'vehicles_parsed': 0,
                'vehicles_matched': 0,
                'mye_matches': 0,
            }

            # Process each sheet
            for sheet_name in wb.sheetnames:
                ws = wb[sheet_name]
                rows = list(ws.iter_rows(values_only=True))
                if not rows:
                    continue

                # The first row holds the column headers.
                data_rows = rows[1:]
                stats['sheets'] += 1
                print(f"\nProcessing sheet '{sheet_name}' with {len(data_rows)} rows...")

                _import_sheet(master_cur, sheet_name, data_rows, caches, stats)

                # Commit per sheet
                master_conn.commit()
                print(f" Sheet '{sheet_name}' committed.")

            _print_stats(stats)
    finally:
        # Closing the connection discards any uncommitted (failed) sheet.
        if master_conn is not None:
            master_conn.close()
        # Read-only workbooks keep the file open until closed explicitly.
        wb.close()


if __name__ == '__main__':
    main()