#!/usr/bin/env python3
"""
Import KNADIAN catalog from Excel into supplier_catalog tables.

Usage:
    python scripts/import_knadian_catalog.py
"""

import os
import re
import sys
from collections import defaultdict
from datetime import datetime

import psycopg2
from openpyxl import load_workbook

MASTER_DB_URL = os.environ.get('MASTER_DB_URL', 'postgresql://postgres@localhost/nexus_autoparts')
EXCEL_PATH = os.path.join(os.path.dirname(__file__), '..', 'data', 'KNADIAN.xlsx')
SUPPLIER_NAME = 'KNADIAN'
MAX_IMPORT_YEAR = datetime.now().year + 1  # reject future years from bad supplier data

MULTI_WORD_MAKES = {
    ('MERCEDES', 'BENZ'): 'MERCEDES BENZ',
    ('LAND', 'ROVER'): 'LAND ROVER',
    ('ALFA', 'ROMEO'): 'ALFA ROMEO',
    ('AMERICAN', 'MOTORS'): 'AMERICAN MOTORS',
    ('ROLLS', 'ROYCE'): 'ROLLS ROYCE',
    ('ASTON', 'MARTIN'): 'ASTON MARTIN',
    ('GREAT', 'WALL'): 'GREAT WALL',
}

# Remainders of a part name that are position qualifiers, not engine descriptions.
_NON_ENGINE_TOKENS = frozenset({
    'DEL.', 'TRAS.', 'FRONT.', 'EXT.', 'IZQ.', 'DER.', 'INF.', 'SUP.', 'TRANS.',
})

# Compiled once at import time; these are used for every spreadsheet row.
_DIGITS_RE = re.compile(r'^\d+$')
_YEAR_RANGE_RE = re.compile(r'\s+(\d{2,4})\s*[-/]\s*(\d{2,4})$')
_FULL_YEAR_RE = re.compile(r'\s+((?:19|20)\d{2})$')
_MERGED_RANGE_RE = re.compile(r'\s+(\d{4})$')

_UPSERT_CATALOG_SQL = """
    INSERT INTO supplier_catalog (supplier_name, sku, name, category, is_active)
    VALUES (%s, %s, %s, %s, true)
    ON CONFLICT (supplier_name, sku, category)
    DO UPDATE SET name = EXCLUDED.name, category = EXCLUDED.category, is_active = true
    RETURNING id
"""

# NOTE(review): year and engine can be NULL here. PostgreSQL unique indexes
# treat NULLs as distinct by default, so this ON CONFLICT target may not
# de-duplicate such rows on re-import -- confirm against the table's index.
_INSERT_COMPAT_SQL = """
    INSERT INTO supplier_catalog_compat
        (catalog_id, make, model, year, engine, model_year_engine_id, source)
    VALUES (%s, %s, %s, %s, %s, NULL, %s)
    ON CONFLICT (catalog_id, make, model, year, engine) DO NOTHING
"""

_INSERT_INTERCHANGE_SQL = """
    INSERT INTO supplier_catalog_interchange (catalog_id, brand, part_number)
    VALUES (%s, %s, %s)
    ON CONFLICT DO NOTHING
"""


def connect_master():
    """Open a new psycopg2 connection to the database at MASTER_DB_URL."""
    return psycopg2.connect(MASTER_DB_URL)


def normalize_name(name):
    """Collapse newlines and runs of whitespace in *name* to single spaces.

    Returns '' for any falsy input (None, '', 0).
    """
    if not name:
        return ''
    return ' '.join(str(name).replace('\n', ' ').split())


def parse_year_token(token):
    """Parse a year token like '05', '1998', '2015'.

    Four-digit values in 1000-2100 are returned unchanged; 70-99 map to
    1970-1999 and 0-69 map to 2000-2069. Anything else (non-digit text,
    falsy input, out-of-range numbers) yields None.
    """
    if not token or not _DIGITS_RE.match(str(token)):
        return None
    val = int(token)
    if 1000 <= val <= 2100:
        return val
    if 70 <= val <= 99:
        return 1900 + val
    if 0 <= val <= 69:
        return 2000 + val
    return None


def extract_years(text):
    """Extract trailing year(s) from strings like 'TL 05/10' or 'CIVIC 1315'.

    The year suffix must be preceded by whitespace, so a string that consists
    of nothing but digits is never treated as a year.

    Returns a ``(years, rest)`` tuple: ``years`` is a list of ints, or
    ``[None]`` when no year was recognised, and ``rest`` is the text with the
    year suffix removed (the stripped input when nothing matched, '' for
    falsy input).
    """
    if not text:
        return [None], ''
    s = str(text).strip()

    # Try trailing range with / or -: YY/YY, YYYY-YYYY, YY-YY
    m = _YEAR_RANGE_RE.search(s)
    if m:
        start = parse_year_token(m.group(1))
        end = parse_year_token(m.group(2))
        if start and end:
            if end < start:
                start, end = end, start
            # Implausibly wide spans fall through to the later heuristics.
            if end - start <= 100:
                return list(range(start, end + 1)), s[:m.start()].strip()

    # Try trailing 4-digit year
    m = _FULL_YEAR_RE.search(s)
    if m:
        return [int(m.group(1))], s[:m.start()].strip()

    # Try trailing 4 consecutive digits that look like a merged range:
    # 1315 -> 2013, 2014, 2015
    m = _MERGED_RANGE_RE.search(s)
    if m:
        digits = m.group(1)
        # If first two and last two are valid years, treat as range
        y1 = parse_year_token(digits[:2])
        y2 = parse_year_token(digits[2:])
        if y1 and y2 and y1 <= y2 and y2 - y1 <= 30:
            return list(range(y1, y2 + 1)), s[:m.start()].strip()

    return [None], s


def parse_carro(carro):
    """Parse CARRO_PERTENECIENTE like 'ACURA TL 05/10' -> make, model, years.

    Returns a dict with keys 'make' (upper-cased, or None), 'model' (or None),
    'years' (as returned by extract_years) and 'raw' (the stripped input, or
    the original falsy value).
    """
    if not carro:
        return {'make': None, 'model': None, 'years': [None], 'raw': carro}

    s = str(carro).strip()
    years, rest = extract_years(s)
    parts = rest.split()
    if not parts:
        return {'make': None, 'model': None, 'years': years, 'raw': s}

    # Extract make; two-word makes consume the first two tokens.
    make = parts[0].upper()
    consumed = 1
    if len(parts) >= 2:
        key = (parts[0].upper(), parts[1].upper())
        if key in MULTI_WORD_MAKES:
            make = MULTI_WORD_MAKES[key]
            consumed = 2
    parts = parts[consumed:]

    return {
        'make': make,
        'model': ' '.join(parts) if parts else None,
        'years': years,
        'raw': s,
    }


def extract_engine(name):
    """Extract engine description from NOMBRE_PIEZA like 'BOMBA_REFRIGERANTE L4 2.0'.

    Everything after the first whitespace-separated word is treated as the
    engine description. Returns None when there is no such remainder or when
    the remainder is only a position qualifier such as 'DEL.' or 'IZQ.'.
    """
    if not name:
        return None
    parts = normalize_name(name).split()
    if len(parts) <= 1:
        return None
    # Everything after first word
    engine = ' '.join(parts[1:])
    # Filter out meaningless tokens that should not be engines
    if engine.upper() in _NON_ENGINE_TOKENS:
        return None
    return engine or None


def extract_interchanges(row):
    """Extract (brand, part_number) pairs from interchange columns.

    Reads six brand/part-number column pairs at zero-based indexes
    (3, 4), (5, 6), ... (13, 14). A pair is kept only when both cells are
    present and non-empty after stripping.
    """
    interchanges = []
    for marca_col in range(3, 15, 2):
        inter_col = marca_col + 1
        if marca_col >= len(row) or not row[marca_col]:
            continue
        brand = str(row[marca_col]).strip()
        if inter_col < len(row) and row[inter_col]:
            pn = str(row[inter_col]).strip()
        else:
            pn = ''
        if brand and pn:
            interchanges.append((brand, pn))
    return interchanges


def _filter_years(years):
    """Drop years beyond MAX_IMPORT_YEAR and de-duplicate, preserving order."""
    kept = dict.fromkeys(
        y for y in years if y is None or y <= MAX_IMPORT_YEAR
    )
    return list(kept) or [None]


def _import_sheet(cur, ws, sheet_name, stats):
    """Import one worksheet through cursor *cur*, updating *stats* in place.

    The first row is treated as a header. Returns False when the sheet has no
    rows at all (nothing was executed), True otherwise. The caller commits.
    """
    rows = list(ws.iter_rows(values_only=True))
    if not rows:
        return False
    data_rows = rows[1:]
    stats['sheets'] += 1
    print(f"\nProcessing sheet '{sheet_name}' with {len(data_rows)} rows...")

    # sku -> catalog id, so repeated SKUs within a sheet are upserted once.
    catalog_id_cache = {}

    for idx, row in enumerate(data_rows):
        if idx % 2000 == 0 and idx > 0:
            print(f"  ...{idx} rows processed")

        if not row or len(row) < 3 or not row[2]:
            stats['skipped_no_sku'] += 1
            continue

        make_col = str(row[0]).strip().upper() if row[0] else ''
        model_col = str(row[1]).strip() if row[1] else ''
        sku = str(row[2]).strip()
        name = normalize_name(row[15]) if len(row) > 15 and row[15] else sheet_name
        carro = str(row[16]).strip() if len(row) > 16 and row[16] else ''

        if not sku:
            stats['skipped_no_sku'] += 1
            continue

        # Always try to parse year from CARRO_PERTENECIENTE
        parsed = parse_carro(carro)
        years = parsed['years']

        # Prefer explicit make/model columns; fallback to parsed carro
        make = make_col or parsed['make']
        model = model_col or parsed['model']

        # If year still missing, maybe the model column itself contains a year
        if years == [None] and model_col:
            years, _ = extract_years(model_col)

        if not make or not model:
            stats['skipped_no_vehicle'] += 1
            continue

        # Filter out future years and de-duplicate
        years = _filter_years(years)

        stats['rows'] += 1

        # Upsert catalog item (keyed by sku)
        catalog_id = catalog_id_cache.get(sku)
        if catalog_id is None:
            cur.execute(_UPSERT_CATALOG_SQL, (SUPPLIER_NAME, sku, name, sheet_name))
            row_result = cur.fetchone()
            catalog_id = row_result[0] if row_result else None
            catalog_id_cache[sku] = catalog_id
            stats['catalog_items'] += 1

        if catalog_id is None:
            stats['skipped_no_catalog'] += 1
            continue

        engine = extract_engine(name)

        for year in years:
            cur.execute(_INSERT_COMPAT_SQL, (
                catalog_id,
                make,
                model,
                year,
                engine,
                'import_text',
            ))
            stats['compat_rows'] += 1

        for brand, pn in extract_interchanges(row):
            cur.execute(_INSERT_INTERCHANGE_SQL, (catalog_id, brand, pn))
            stats['interchange_rows'] += 1

    return True


def main():
    """Import every sheet of the KNADIAN workbook, committing once per sheet.

    Exits with status 1 when the Excel file is missing. On any error the
    uncommitted work of the current sheet is rolled back and the exception
    propagates; the workbook, cursor and connection are always closed.
    """
    print(f"[{datetime.now().isoformat()}] Starting KNADIAN import...")

    if not os.path.exists(EXCEL_PATH):
        print(f"ERROR: Excel not found at {EXCEL_PATH}")
        sys.exit(1)

    print(f"Loading {EXCEL_PATH}...")
    wb = load_workbook(EXCEL_PATH, read_only=True, data_only=True)

    stats = defaultdict(int)
    master_conn = None
    try:
        master_conn = connect_master()
        with master_conn.cursor() as master_cur:
            for sheet_name in wb.sheetnames:
                if not _import_sheet(master_cur, wb[sheet_name], sheet_name, stats):
                    continue
                master_conn.commit()
                print(f"  Sheet '{sheet_name}' committed.")
    except Exception:
        # Discard the partially imported sheet explicitly before re-raising.
        if master_conn is not None:
            master_conn.rollback()
        raise
    finally:
        if master_conn is not None:
            master_conn.close()
        # Read-only workbooks keep the file open until closed explicitly.
        wb.close()

    print(f"\n{'='*60}")
    print("IMPORT COMPLETE")
    print(f"{'='*60}")
    for k, v in sorted(stats.items()):
        print(f"{k:25s}: {v}")


if __name__ == '__main__':
    main()