#!/usr/bin/env python3
"""
Match supplier_catalog_compat rows to model_year_engine ids by fuzzy (make, model, year).

Supports exact match, parenthesis-stripped match, whitespace/dash normalization,
prefix/substring fallback, model aliases, and year proximity (±3 years).

Usage:
    python scripts/match_supplier_compat_to_mye.py [--dry-run] <supplier_name|--all>
"""

import os
import re
import sys
from collections import defaultdict

MASTER_DB_URL = os.environ.get('MASTER_DB_URL', 'postgresql://postgres@localhost/nexus_autoparts')

# Alternate make spellings -> the canonical make used in every index key.
MAKE_ALIASES = {
    'VOLKSWAGEN': 'VW',
    'VOLKWAGEN': 'VW',
    'MERCEDES BENZ': 'MERCEDES BENZ',
    'MERCEDES-BENZ': 'MERCEDES BENZ',
    'BMW MOTORRAD': 'BMW',
}

# Tokens at which strip_noise_suffixes() truncates a model name.
NOISE_SUFFIXES = {
    'SEDAN', 'SALOON', 'COUPE', 'HATCHBACK', 'HATCH', 'WAGON', 'ESTATE',
    'SUV', 'VAN', 'PICK', 'UP', 'PICKUP', 'CABRIOLET', 'CONVERTIBLE',
    'LATINO', 'BRASIL', 'MEXICO', 'USA', 'EUROPA', 'EUROPE', 'NACIO',
    'LIMITED', 'LTD', 'XLT', 'LE', 'SE', 'XLE', 'SPORT', 'LX', 'EX',
    '4X2', '4X4', '4WD', 'AWD', 'FWD', 'RWD', '2WD',
}

# Specific model aliases: (make, supplier_model) -> list of possible master model substrings
MODEL_ALIASES = {
    ('INFINITI', 'JX35'): ['JX SUV'],
    ('INFINITI', 'G35'): ['G Coupe', 'G Saloon', 'G37'],
    ('INFINITI', 'G37'): ['G Coupe', 'G Saloon', 'G37'],
    ('HONDA', 'CRX'): ['CRX'],
    ('MAZDA', 'PROTEGE'): ['PROTEGE'],
    ('MAZDA', 'PROTEGE5'): ['PROTEGE'],
    ('KIA', 'SPECTRA'): ['SPECTRA', 'SEPHIA'],
    ('KIA', 'FORTE5'): ['FORTE'],
    ('CHEVROLET', 'OPTRA'): ['OPTRA', 'LACETTI'],
    ('CHEVROLET', 'AGILE'): ['AGILE'],
    ('FIAT', 'SIENA'): ['SIENA'],
    ('PONTIAC', 'G4'): ['G4', 'PURSUIT'],
    ('FORD', 'FIVE HUNDRED'): ['FIVE HUNDRED', '500', 'TAURUS'],
    ('FORD', 'POLICE INTERCEPTOR UTILITY'): ['POLICE INTERCEPTOR UTILITY', 'EXPLORER'],
    ('FORD', 'POLICE INTERCEPTOR SEDAN'): ['POLICE INTERCEPTOR SEDAN', 'TAURUS'],
    ('SCION', 'XA'): ['XA'],
    ('SAAB', '9-2X'): ['9-2X'],
    ('BUICK', 'LACROSSE'): ['LACROSSE'],
    ('DODGE', 'CALIBER'): ['CALIBER'],
    ('SUZUKI', 'EQUATOR'): ['EQUATOR'],
    ('CHRYSLER', 'LEBARON K'): ['LEBARON'],
    ('MERCEDES BENZ', 'A170'): ['A-CLASS'],
    ('MERCEDES BENZ', 'A210'): ['A-CLASS'],
}

# Regex-based class extraction for Mercedes: e.g. C350E -> C-Class, SL600 -> SL
MERCEDES_CLASS_PATTERNS = [
    # These Mercedes classes use "X-CLASS" in master (C-CLASS, E-CLASS, S-CLASS, etc.)
    (r'^(A|B|C|E|G|GL|GLA|GLB|GLC|GLE|GLK|GLS|M|R|S|V|X)\d', 'CLASS'),
    # These use just the letters (SL, SLK, CLS, CL, CLK) without -CLASS
    (r'^(SL|SLK|CLS|CL|CLK)\d', 'LETTERS'),
    (r'^(260E|300E|320E|400E|500E)$', 'E-CLASS'),
    (r'^(300SL|500SL)$', 'SL'),
    (r'^(400SEL|500SEL|600SEL)$', 'S-CLASS'),
]

# Precompiled once; both are applied to every master and supplier row.
_PAREN_GROUP_RE = re.compile(r'\s*\([^)]*\)')
_NON_ALNUM_RE = re.compile(r'[^A-Z0-9]')


def normalize_make(make):
    """Return the canonical upper-case make, or '' for a missing make.

    Whitespace is trimmed and internal runs are collapsed (as normalize_model
    does) before the MAKE_ALIASES lookup, so 'Mercedes  Benz' still resolves.
    """
    if not make:
        return ''
    key = ' '.join(str(make).upper().split())
    return MAKE_ALIASES.get(key, key)


def normalize_model(model):
    """Return the model upper-cased with whitespace collapsed, or '' if missing."""
    if not model:
        return ''
    return ' '.join(str(model).upper().split())


def strip_parentheses(text):
    """Remove every '(...)' group, plus the whitespace before it, from text."""
    return _PAREN_GROUP_RE.sub('', text).strip()


def strip_noise_suffixes(text):
    """Keep the leading words of text up to the first NOISE_SUFFIXES token.

    Everything from the first noise word onward is dropped, so the result is
    '' when the very first word is itself a noise token.
    """
    kept = []
    for word in text.split():
        if word in NOISE_SUFFIXES:
            break
        kept.append(word)
    return ' '.join(kept)


def compact_alnum(text):
    """Drop every character that is not A-Z or 0-9 (expects upper-cased text)."""
    return _NON_ALNUM_RE.sub('', text)


def build_model_variants(model_name):
    """Return the set of non-empty lookup spellings derived from model_name.

    Variants are: the normalized name, the name without parenthesised parts,
    that name truncated at the first noise word, and the alphanumeric-only
    form of each of those three. Returns an empty set for a missing name.
    """
    base = normalize_model(model_name)
    if not base:
        return set()

    no_paren = strip_parentheses(base)
    no_noise = strip_noise_suffixes(no_paren)
    candidates = (
        base,
        no_paren,
        no_noise,
        compact_alnum(no_noise),
        compact_alnum(no_paren),
        compact_alnum(base),
    )
    return {candidate for candidate in candidates if candidate}


def mercedes_class_alias(model):
    """Return a master model substring for Mercedes class-based models.

    The first matching entry of MERCEDES_CLASS_PATTERNS wins: 'CLASS' yields
    '<letters>-CLASS', 'LETTERS' yields the captured letters alone, and any
    other replacement is returned literally. Returns None when nothing matches.
    """
    name = normalize_model(model)
    for pattern, replacement in MERCEDES_CLASS_PATTERNS:
        match = re.match(pattern, name)
        if not match:
            continue
        if replacement == 'CLASS':
            return match.group(1) + '-CLASS'
        if replacement == 'LETTERS':
            return match.group(1)
        return replacement
    return None


def connect():
    """Open a new psycopg2 connection to MASTER_DB_URL."""
    # Imported here (its only user) so the pure matching helpers above can be
    # imported and tested on machines without the PostgreSQL driver installed.
    import psycopg2

    return psycopg2.connect(MASTER_DB_URL)
def build_mye_index(cur):
    """Load every model_year_engine row and build the in-memory lookup indexes.

    Args:
        cur: an open DB cursor; it is used for one SELECT and fully fetched.

    Returns:
        A 4-tuple ``(exact_index, compact_index, models_by_make, year_range_index)``:
          * exact_index: (make, model_variant, year) -> [mye_id, ...]
          * compact_index: (make, compact_model, year) -> [mye_id, ...]
          * models_by_make: make -> [(normalized_model, mye_id, year, compact_model), ...]
          * year_range_index: make -> compact_model -> year -> [mye_id, ...]
        Rows with an empty make, empty model or NULL year are skipped.
    """
    print('Building MYE index...')
    # ORDER BY keeps "first id in the list" picks reproducible between runs.
    cur.execute('''
        SELECT b.name_brand, m.name_model, y.year_car, mye.id_mye
        FROM model_year_engine mye
        JOIN models m ON m.id_model = mye.model_id
        JOIN brands b ON b.id_brand = m.brand_id
        JOIN years y ON y.id_year = mye.year_id
        ORDER BY mye.id_mye
    ''')

    exact_index = defaultdict(list)
    compact_index = defaultdict(list)
    models_by_make = defaultdict(list)
    # For year proximity: make -> compact_model -> {year: [mye_ids]}
    year_range_index = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

    for make, model, year, mye_id in cur.fetchall():
        nmake = normalize_make(make)
        if not nmake or not model or year is None:
            continue

        for variant in build_model_variants(model):
            exact_index[(nmake, variant, year)].append(mye_id)

        nmodel = normalize_model(model)
        compact = compact_alnum(strip_parentheses(nmodel))
        if compact:
            compact_index[(nmake, compact, year)].append(mye_id)
            year_range_index[nmake][compact][year].append(mye_id)
        # Recorded even when compact is empty; consumers skip empty compacts.
        models_by_make[nmake].append((nmodel, mye_id, year, compact))

    total_myes = sum(len(ids) for ids in exact_index.values())
    print(f' {len(exact_index):,} exact keys, {total_myes:,} MYE entries')
    return exact_index, compact_index, models_by_make, year_range_index


def find_by_alias(nmake, nmodel, year, models_by_make):
    """Try specific model aliases and Mercedes class patterns.

    Aliases come from MODEL_ALIASES[(nmake, nmodel)], plus the result of
    mercedes_class_alias() when nmake is 'MERCEDES BENZ'. Returns the mye_id of
    the first same-year master model that contains an alias (verbatim, or in
    alphanumeric-only form), or None.
    """
    aliases = list(MODEL_ALIASES.get((nmake, nmodel), []))

    # Mercedes fallback
    if nmake == 'MERCEDES BENZ':
        cls = mercedes_class_alias(nmodel)
        if cls and cls not in aliases:
            aliases.append(cls)

    if not aliases:
        return None

    candidates = models_by_make.get(nmake, [])
    # Aliases are tried in listed order, so earlier aliases take priority.
    for alias in aliases:
        alias_compact = compact_alnum(alias)
        for master_model, mye_id, mye_year, master_compact in candidates:
            if mye_year != year:
                continue
            if alias in master_model or alias_compact in master_compact:
                return mye_id
    return None


def find_by_year_proximity(nmake, supplier_compact, year, year_range_index, max_diff=2):
    """If exact year missing, find closest year within ±max_diff for same model.

    Args:
        nmake: normalized make.
        supplier_compact: alphanumeric-only model key.
        year: requested year; None never matches.
        year_range_index: make -> compact_model -> year -> [mye_id, ...].
        max_diff: largest accepted absolute year difference.

    Returns:
        The first mye_id of the nearest year, or None. When two years are
        equally near, the earlier one wins so the result does not depend on
        the order in which rows were indexed.
    """
    if year is None:
        return None
    years = year_range_index.get(nmake, {}).get(supplier_compact)
    if not years:
        return None

    in_range = [y for y in years if abs(y - year) <= max_diff]
    if not in_range:
        return None
    best_year = min(in_range, key=lambda y: (abs(y - year), y))
    return years[best_year][0]


def find_mye_id(make, model, year, exact_index, compact_index, models_by_make, year_range_index):
    """Resolve a supplier (make, model, year) to a model_year_engine id.

    Strategies are tried in order and the first hit wins:
      1. exact lookup on any model variant,
      2. alphanumeric-only ("compact") lookup,
      3. compact-name containment in either direction (same make and year),
      4. MODEL_ALIASES / Mercedes class aliases,
      5. same compact model within ±3 years.

    Returns:
        A mye_id, or None when the make, model or year is missing or nothing
        matches.
    """
    nmake = normalize_make(make)
    nmodel = normalize_model(model)
    # No index key ever carries a None year, so a missing year cannot match.
    if not nmake or not nmodel or year is None:
        return None

    # 1) Exact/near-exact on any variant.
    # Variants arrive as a set; try them longest (most specific) first so the
    # outcome does not depend on string-hash iteration order.
    for variant in sorted(build_model_variants(nmodel), key=lambda v: (-len(v), v)):
        myes = exact_index.get((nmake, variant, year))
        if myes:
            return myes[0]

    supplier_compact = compact_alnum(strip_parentheses(nmodel))
    # An empty compact key (model was only punctuation/parentheses) would be a
    # substring of every master model, so the compact-based steps are skipped.
    if supplier_compact:
        # 2) Compact match
        myes = compact_index.get((nmake, supplier_compact, year))
        if myes:
            return myes[0]

        # 3) Prefix/substring containment
        for _master_model, mye_id, mye_year, master_compact in models_by_make.get(nmake, []):
            if mye_year != year or not master_compact:
                continue
            if supplier_compact in master_compact or master_compact in supplier_compact:
                return mye_id

    # 4) Model aliases
    mye_id = find_by_alias(nmake, nmodel, year, models_by_make)
    if mye_id is not None:
        return mye_id

    # 5) Year proximity ±3 years (same compact model)
    if supplier_compact:
        mye_id = find_by_year_proximity(nmake, supplier_compact, year, year_range_index, max_diff=3)
        if mye_id is not None:
            return mye_id

    return None


def _fetch_unmatched_rows(cur, suppliers):
    """Return (id, make, model, year) compat rows that have no MYE id yet."""
    if suppliers:
        cur.execute('''
            SELECT scc.id, scc.make, scc.model, scc.year
            FROM supplier_catalog_compat scc
            JOIN supplier_catalog sc ON sc.id = scc.catalog_id
            WHERE sc.supplier_name = ANY(%s)
              AND scc.model_year_engine_id IS NULL
        ''', (suppliers,))
    else:
        cur.execute('''
            SELECT scc.id, scc.make, scc.model, scc.year
            FROM supplier_catalog_compat scc
            WHERE scc.model_year_engine_id IS NULL
        ''')
    return cur.fetchall()


def _print_samples(sample_matches, sample_unmatched):
    """Print the collected example matches and misses, if any."""
    if sample_matches:
        print('\nSample matches:')
        for make, model, year, mye_id in sample_matches:
            print(f' {make} {model} {year} -> mye_id={mye_id}')
    if sample_unmatched:
        print('\nSample unmatched:')
        for make, model, year in sample_unmatched:
            print(f' {make} {model} {year}')


def _match_and_apply(conn, suppliers, dry_run):
    """Match every unmatched compat row and, unless dry_run, write the results."""
    cur = conn.cursor()
    try:
        exact_index, compact_index, models_by_make, year_range_index = build_mye_index(cur)
        rows = _fetch_unmatched_rows(cur, suppliers)
        print(f'\nMatching {len(rows):,} compat rows...')

        sample_matches = []
        sample_unmatched = []
        updates = []
        for scc_id, make, model, year in rows:
            mye_id = find_mye_id(make, model, year, exact_index, compact_index,
                                 models_by_make, year_range_index)
            if mye_id is not None:
                updates.append((mye_id, scc_id))
                if len(sample_matches) < 10:
                    sample_matches.append((make, model, year, mye_id))
            elif len(sample_unmatched) < 10:
                sample_unmatched.append((make, model, year))

        matched = len(updates)
        unmatched = len(rows) - matched
        print(f'Matched: {matched:,}')
        print(f'Unmatched: {unmatched:,}')
        _print_samples(sample_matches, sample_unmatched)

        if dry_run:
            print('\n' + '=' * 60)
            print('DRY RUN complete. Run without --dry-run to apply.')
            print('=' * 60)
            return
        if not updates:
            return

        print(f'\nApplying {len(updates):,} updates...')
        cur.executemany('''
            UPDATE supplier_catalog_compat
            SET model_year_engine_id = %s, source = 'matched_fuzzy'
            WHERE id = %s
        ''', updates)
        conn.commit()
        print('Updates committed.')
    finally:
        cur.close()


def main():
    """CLI entry point: ``[--dry-run] <supplier_name|--all>``.

    Matches unmatched supplier_catalog_compat rows for one supplier (or all
    suppliers with ``--all``) and writes the resolved ids back unless
    ``--dry-run`` is given. Exits with status 1 when the supplier argument is
    missing.
    """
    args = sys.argv[1:]
    dry_run = '--dry-run' in args
    if dry_run:
        args.remove('--dry-run')

    if len(args) < 1:
        print('Usage: match_supplier_compat_to_mye.py [--dry-run] <supplier_name|--all>')
        sys.exit(1)

    supplier_arg = args[0]
    suppliers = None if supplier_arg == '--all' else [supplier_arg]

    if dry_run:
        print('=' * 60)
        print('DRY RUN MODE — no changes will be made')
        print('=' * 60)

    conn = connect()
    try:
        _match_and_apply(conn, suppliers, dry_run)
    finally:
        # Always release the connection; closing without commit discards any
        # partially applied updates.
        conn.close()


if __name__ == '__main__':
    main()