Files
Autoparts-DB/scripts/match_supplier_compat_to_mye.py
consultoria-as ea29cc31c0 feat(catalog): supplier catalog cleanup, fuzzy matching, and navigation fixes
- Cleaned 137+ fake engine-displacement models from supplier imports
  (v3/v4 scripts: Chevrolet, Ford, Chrysler, Dodge, Jeep, Nissan, etc.)
- Removed 1,251+ corrupted models (INT. prefixes, year-suffix, torque specs,
  empty names, trailing-year variants)
- Migrated supplier tables to master DB (supplier_catalog,
  supplier_catalog_compat, supplier_catalog_interchange)
- Fixed _get_mye_ids_with_parts() to query supplier_catalog_compat from
  master DB so supplier-only vehicles appear for all tenants
- Added fuzzy model matcher with parenthesis stripping, noise suffix removal,
  compact matching, prefix/substring fallback, model aliases, and ±3 year
  proximity
- Matched compat rows: KEEP GREEN +14,152, KNADIAN +3,021, VAZLO +127,500,
  LUK +477, RAYBESTOS +1,743
- Added KNADIAN catalog importer with year-range expansion and future-year
  filtering
- Added VAZLO catalog importer with position parsing and SKU-in-model cleanup
- Added Keep Green, LUK, Yokomitsu, Raybestos catalog importers
- Cache clearing after cleanups (_classify_cache_*, nexus:mye_ids:*,
  nexus:brand_mye_counts:*)

Final match rates:
- KEEP GREEN: 90.3%
- VAZLO: 93.6%
- YOKOMITSU: 100.0%
- KNADIAN: 57.4%
- LUK: 51.0%
- RAYBESTOS: 55.9%
2026-06-09 07:47:42 +00:00

370 lines
11 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Match supplier_catalog_compat rows to model_year_engine ids by fuzzy (make, model, year).
Supports exact match, parenthesis-stripped match, whitespace/dash normalization,
prefix/substring fallback, model aliases, and year proximity (±3 years).
Usage:
python scripts/match_supplier_compat_to_mye.py [--dry-run] <supplier_name|--all>
"""
import os
import re
import sys
from collections import defaultdict
import psycopg2
# Connection string for the master database. Taken from the MASTER_DB_URL
# environment variable, falling back to a local nexus_autoparts database.
MASTER_DB_URL = os.environ.get('MASTER_DB_URL', 'postgresql://postgres@localhost/nexus_autoparts')
# Make spellings that normalize_make() rewrites to a single canonical name.
# Keys are compared against the stripped, upper-cased make; makes that are not
# listed are used unchanged.
MAKE_ALIASES = {
'VOLKSWAGEN': 'VW',
'VOLKWAGEN': 'VW',
'MERCEDES BENZ': 'MERCEDES BENZ',
'MERCEDES-BENZ': 'MERCEDES BENZ',
'BMW MOTORRAD': 'BMW',
}
# Whitespace-separated tokens (body style, market, trim level, drivetrain) at
# which strip_noise_suffixes() cuts a model name: the first token found in this
# set and everything after it are dropped.
NOISE_SUFFIXES = {
'SEDAN', 'SALOON', 'COUPE', 'HATCHBACK', 'HATCH', 'WAGON', 'ESTATE',
'SUV', 'VAN', 'PICK', 'UP', 'PICKUP', 'CABRIOLET', 'CONVERTIBLE',
'LATINO', 'BRASIL', 'MEXICO', 'USA', 'EUROPA', 'EUROPE', 'NACIO',
'LIMITED', 'LTD', 'XLT', 'LE', 'SE', 'XLE', 'SPORT', 'LX', 'EX',
'4X2', '4X4', '4WD', 'AWD', 'FWD', 'RWD', '2WD',
}
# Specific model aliases: (make, supplier_model) -> list of possible master model substrings.
# find_by_alias() looks entries up with the normalized make and normalized
# supplier model, then searches master models of the same make and year for
# any of the listed substrings, trying them in list order.
MODEL_ALIASES = {
('INFINITI', 'JX35'): ['JX SUV'],
('INFINITI', 'G35'): ['G Coupe', 'G Saloon', 'G37'],
('INFINITI', 'G37'): ['G Coupe', 'G Saloon', 'G37'],
('HONDA', 'CRX'): ['CRX'],
('MAZDA', 'PROTEGE'): ['PROTEGE'],
('MAZDA', 'PROTEGE5'): ['PROTEGE'],
('KIA', 'SPECTRA'): ['SPECTRA', 'SEPHIA'],
('KIA', 'FORTE5'): ['FORTE'],
('CHEVROLET', 'OPTRA'): ['OPTRA', 'LACETTI'],
('CHEVROLET', 'AGILE'): ['AGILE'],
('FIAT', 'SIENA'): ['SIENA'],
('PONTIAC', 'G4'): ['G4', 'PURSUIT'],
('FORD', 'FIVE HUNDRED'): ['FIVE HUNDRED', '500', 'TAURUS'],
('FORD', 'POLICE INTERCEPTOR UTILITY'): ['POLICE INTERCEPTOR UTILITY', 'EXPLORER'],
('FORD', 'POLICE INTERCEPTOR SEDAN'): ['POLICE INTERCEPTOR SEDAN', 'TAURUS'],
('SCION', 'XA'): ['XA'],
('SAAB', '9-2X'): ['9-2X'],
('BUICK', 'LACROSSE'): ['LACROSSE'],
('DODGE', 'CALIBER'): ['CALIBER'],
('SUZUKI', 'EQUATOR'): ['EQUATOR'],
('CHRYSLER', 'LEBARON K'): ['LEBARON'],
('MERCEDES BENZ', 'A170'): ['A-CLASS'],
('MERCEDES BENZ', 'A210'): ['A-CLASS'],
}
# Regex-based class extraction for Mercedes: e.g. C350E -> C-Class, SL600 -> SL.
# Each entry is (pattern, tag). mercedes_class_alias() applies the patterns in
# order with re.match and stops at the first hit:
#   tag 'CLASS'   -> captured letters + '-CLASS'
#   tag 'LETTERS' -> captured letters only
#   any other tag -> the tag itself is returned as the master model substring
MERCEDES_CLASS_PATTERNS = [
# These Mercedes classes use "X-CLASS" in master (C-CLASS, E-CLASS, S-CLASS, etc.)
(r'^(A|B|C|E|G|GL|GLA|GLB|GLC|GLE|GLK|GLS|M|R|S|V|X)\d', 'CLASS'),
# These use just the letters (SL, SLK, CLS, CL, CLK) without -CLASS
(r'^(SL|SLK|CLS|CL|CLK)\d', 'LETTERS'),
# Digit-first model names that are matched as whole strings and mapped to a
# fixed master substring.
(r'^(260E|300E|320E|400E|500E)$', 'E-CLASS'),
(r'^(300SL|500SL)$', 'SL'),
(r'^(400SEL|500SEL|600SEL)$', 'S-CLASS'),
]
def normalize_make(make):
    """Return the canonical, upper-cased form of a vehicle make.

    Falsy input (None, empty string, ...) yields ''. Otherwise the value is
    converted to str, stripped and upper-cased, then replaced by its entry in
    MAKE_ALIASES when one exists.
    """
    if make:
        key = str(make).strip().upper()
        return MAKE_ALIASES.get(key, key)
    return ''
def normalize_model(model):
    """Return the model name upper-cased with whitespace runs collapsed.

    Falsy input yields ''. Leading and trailing whitespace is removed and
    every internal run of whitespace becomes a single space.
    """
    if model:
        tokens = str(model).upper().split()
        return ' '.join(tokens)
    return ''
def strip_parentheses(text):
    """Remove every '(...)' group, plus the whitespace before it, from text.

    The result is stripped of leading and trailing whitespace. An opening
    parenthesis without a closing one is left untouched.
    """
    without_groups = re.sub(r'\s*\([^)]*\)', '', text)
    return without_groups.strip()
def strip_noise_suffixes(text):
    """Truncate text at its first whitespace-separated token in NOISE_SUFFIXES.

    The noise token and everything after it are dropped; the remaining tokens
    are joined with single spaces. If the first token is itself a noise word
    the result is ''.
    """
    tokens = text.split()
    # Index of the first noise token, or len(tokens) when there is none.
    cut = next(
        (pos for pos, token in enumerate(tokens) if token in NOISE_SUFFIXES),
        len(tokens),
    )
    return ' '.join(tokens[:cut])
def compact_alnum(text):
    """Return text with every character other than A-Z and 0-9 removed.

    Lower-case letters are removed too, so callers are expected to pass
    already upper-cased text.
    """
    return re.sub(pattern=r'[^A-Z0-9]', repl='', string=text)
def build_model_variants(model_name):
    """Return the set of lookup spellings derived from a model name.

    The set holds the normalized name, the name without parenthesised groups,
    that name truncated at the first noise suffix, and the compact
    (A-Z/0-9 only) form of each of those three. Empty spellings are left out.
    An empty or falsy model name yields an empty set.
    """
    base = normalize_model(model_name)
    if not base:
        return set()

    paren_free = strip_parentheses(base)
    trimmed = strip_noise_suffixes(paren_free)
    candidates = (
        base,
        paren_free,
        trimmed,
        compact_alnum(trimmed),
        compact_alnum(paren_free),
        compact_alnum(base),
    )
    return {candidate for candidate in candidates if candidate}
def mercedes_class_alias(model):
    """Return the master-model substring for a Mercedes model, or None.

    The normalized model is tested against MERCEDES_CLASS_PATTERNS in order
    and the first matching pattern decides the result: tag 'CLASS' gives the
    captured letters followed by '-CLASS', tag 'LETTERS' gives the captured
    letters alone, and any other tag is returned as-is.
    """
    name = normalize_model(model)
    for pattern, tag in MERCEDES_CLASS_PATTERNS:
        hit = re.match(pattern, name)
        if hit is None:
            continue
        if tag == 'CLASS':
            return f'{hit.group(1)}-CLASS'
        if tag == 'LETTERS':
            return hit.group(1)
        return tag
    return None
def connect():
    """Open and return a new psycopg2 connection to MASTER_DB_URL."""
    connection = psycopg2.connect(MASTER_DB_URL)
    return connection
def build_mye_index(cur):
    """Load every model_year_engine row and build the in-memory lookup tables.

    Rows whose normalized make is empty, whose model is falsy, or whose year
    is None are skipped.

    Returns a 4-tuple:
      exact_index      -- (make, model variant, year) -> [mye ids]
      compact_index    -- (make, compact model, year) -> [mye ids]
      models_by_make   -- make -> [(normalized model, mye id, year, compact model)]
      year_range_index -- make -> compact model -> year -> [mye ids]
    """
    print('Building MYE index...')
    cur.execute('''
SELECT b.name_brand, m.name_model, y.year_car, mye.id_mye
FROM model_year_engine mye
JOIN models m ON m.id_model = mye.model_id
JOIN brands b ON b.id_brand = m.brand_id
JOIN years y ON y.id_year = mye.year_id
''')

    exact_index = defaultdict(list)
    compact_index = defaultdict(list)
    models_by_make = defaultdict(list)
    # For year proximity: make -> compact_model -> {year: [mye_ids]}
    year_range_index = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

    for brand_name, model_name, model_year, mye_id in cur.fetchall():
        nmake = normalize_make(brand_name)
        if not (nmake and model_name) or model_year is None:
            continue

        for variant in build_model_variants(model_name):
            exact_index[(nmake, variant, model_year)].append(mye_id)

        normalized = normalize_model(model_name)
        compact = compact_alnum(strip_parentheses(normalized))
        if compact:
            compact_index[(nmake, compact, model_year)].append(mye_id)
            year_range_index[nmake][compact][model_year].append(mye_id)
        # Recorded even when compact is empty; consumers skip such entries.
        models_by_make[nmake].append((normalized, mye_id, model_year, compact))

    total_myes = sum(map(len, exact_index.values()))
    print(f' {len(exact_index):,} exact keys, {total_myes:,} MYE entries')
    return exact_index, compact_index, models_by_make, year_range_index
def find_by_alias(nmake, nmodel, year, models_by_make):
    """Try specific model aliases and Mercedes class patterns.

    Args:
        nmake: normalized make.
        nmodel: normalized supplier model.
        year: model year the master entry must have.
        models_by_make: make -> [(normalized model, mye id, year, compact model)].

    Returns:
        The mye id of the first master model of that make and year whose name
        contains one of the aliases, or None. Aliases are tried in the order
        they are listed; a Mercedes class alias, when one applies, is tried
        last.
    """
    aliases = list(MODEL_ALIASES.get((nmake, nmodel), []))
    # Mercedes fallback
    if nmake == 'MERCEDES BENZ':
        cls = mercedes_class_alias(nmodel)
        if cls and cls not in aliases:
            aliases.append(cls)
    if not aliases:
        return None

    # The year filter does not depend on the alias, so apply it once.
    same_year = [
        (master_model, mye_id, master_compact)
        for master_model, mye_id, mye_year, master_compact in models_by_make.get(nmake, [])
        if mye_year == year
    ]
    if not same_year:
        return None

    for alias in aliases:
        # Master names are upper-cased by normalize_model(), so the alias has
        # to be normalized the same way. A mixed-case alias such as 'G Coupe'
        # would otherwise never match as text, and compact_alnum() would
        # shrink it to 'GC', which matches unrelated models.
        alias_norm = normalize_model(alias)
        if not alias_norm:
            continue
        alias_compact = compact_alnum(alias_norm)
        for master_model, mye_id, master_compact in same_year:
            if alias_norm in master_model:
                return mye_id
            # An empty compact alias is a substring of everything; skip it.
            if alias_compact and alias_compact in master_compact:
                return mye_id
    return None
def find_by_year_proximity(nmake, supplier_compact, year, year_range_index, max_diff=2):
    """If exact year missing, find closest year within ±max_diff for same model.

    Args:
        nmake: normalized make.
        supplier_compact: compact (A-Z/0-9 only) supplier model name.
        year: requested model year; None never matches.
        year_range_index: make -> compact model -> year -> [mye ids].
        max_diff: largest accepted distance, in years, from the requested year.

    Returns:
        The first mye id recorded for the closest year within max_diff, or
        None when there is no such year. When two years are equally close the
        earlier one is chosen, so the result does not depend on the order in
        which the index was filled.
    """
    if year is None:
        return None
    years = year_range_index.get(nmake, {}).get(supplier_compact)
    if not years:
        return None

    # (distance, year) tuples: min() picks the closest year and breaks ties
    # on the earlier year. Years with no ids are ignored.
    candidates = [
        (abs(candidate_year - year), candidate_year)
        for candidate_year, mye_ids in years.items()
        if mye_ids and abs(candidate_year - year) <= max_diff
    ]
    if not candidates:
        return None
    _, best_year = min(candidates)
    return years[best_year][0]
def find_mye_id(make, model, year, exact_index, compact_index, models_by_make, year_range_index):
    """Resolve a supplier (make, model, year) to a model_year_engine id.

    Strategies are tried in this order and the first hit wins:
      1. any spelling from build_model_variants() found in exact_index
      2. compact model name found in compact_index
      3. compact supplier name contained in a compact master name of the same
         make and year, or the other way round
      4. model aliases and Mercedes class patterns (find_by_alias)
      5. same compact model within ±3 years (find_by_year_proximity)

    Returns:
        The matched mye id, or None when the make, model or year is missing or
        no strategy matches.
    """
    nmake = normalize_make(make)
    nmodel = normalize_model(model)
    # The indexes built by build_mye_index() never hold a None year, so a row
    # without a year cannot match; return before the linear scan in step 3.
    if not nmake or not nmodel or year is None:
        return None

    # 1) Exact/near-exact on any variant. Longest spelling first, so the
    # outcome does not depend on set iteration order.
    variants = sorted(build_model_variants(nmodel), key=lambda v: (-len(v), v))
    for variant in variants:
        myes = exact_index.get((nmake, variant, year))
        if myes:
            return myes[0]

    supplier_compact = compact_alnum(strip_parentheses(nmodel))

    # 2) Compact match
    myes = compact_index.get((nmake, supplier_compact, year))
    if myes:
        return myes[0]

    # 3) Prefix/substring containment. Skipped when the supplier name has no
    # alphanumeric content: '' is a substring of every string and would match
    # the first model of the make.
    if supplier_compact:
        for _master_model, mye_id, mye_year, master_compact in models_by_make.get(nmake, []):
            if mye_year != year or not master_compact:
                continue
            if supplier_compact in master_compact or master_compact in supplier_compact:
                return mye_id

    # 4) Model aliases
    mye_id = find_by_alias(nmake, nmodel, year, models_by_make)
    if mye_id:
        return mye_id

    # 5) Year proximity ±3 years (same compact model)
    if supplier_compact:
        mye_id = find_by_year_proximity(nmake, supplier_compact, year, year_range_index, max_diff=3)
        if mye_id:
            return mye_id
    return None
def main():
    """Command-line entry point.

    Reads `[--dry-run] <supplier_name|--all>` from sys.argv, matches every
    supplier_catalog_compat row that has no model_year_engine_id yet, prints a
    summary with up to ten sample matches and ten sample misses, and, unless
    --dry-run was given, writes the matches back and commits.

    The cursor and connection are closed on every exit path, including
    exceptions, so a failed run does not leave a session open.
    """
    args = sys.argv[1:]
    dry_run = '--dry-run' in args
    if dry_run:
        args.remove('--dry-run')
    if len(args) < 1:
        print('Usage: match_supplier_compat_to_mye.py [--dry-run] <supplier_name|--all>')
        sys.exit(1)
    supplier_arg = args[0]
    suppliers = None if supplier_arg == '--all' else [supplier_arg]
    if dry_run:
        print('=' * 60)
        print('DRY RUN MODE — no changes will be made')
        print('=' * 60)

    conn = connect()
    try:
        cur = conn.cursor()
        try:
            exact_index, compact_index, models_by_make, year_range_index = build_mye_index(cur)
            if suppliers:
                cur.execute('''
SELECT scc.id, scc.make, scc.model, scc.year
FROM supplier_catalog_compat scc
JOIN supplier_catalog sc ON sc.id = scc.catalog_id
WHERE sc.supplier_name = ANY(%s) AND scc.model_year_engine_id IS NULL
''', (suppliers,))
            else:
                cur.execute('''
SELECT scc.id, scc.make, scc.model, scc.year
FROM supplier_catalog_compat scc
WHERE scc.model_year_engine_id IS NULL
''')
            rows = cur.fetchall()
            print(f'\nMatching {len(rows):,} compat rows...')

            sample_matches = []
            sample_unmatched = []
            updates = []
            for scc_id, make, model, year in rows:
                mye_id = find_mye_id(make, model, year, exact_index, compact_index, models_by_make, year_range_index)
                if mye_id:
                    updates.append((mye_id, scc_id))
                    if len(sample_matches) < 10:
                        sample_matches.append((make, model, year, mye_id))
                elif len(sample_unmatched) < 10:
                    sample_unmatched.append((make, model, year))

            # Every row ends up either in updates or unmatched.
            matched = len(updates)
            unmatched = len(rows) - matched
            print(f'Matched: {matched:,}')
            print(f'Unmatched: {unmatched:,}')
            if sample_matches:
                print('\nSample matches:')
                for make, model, year, mye_id in sample_matches:
                    print(f' {make} {model} {year} -> mye_id={mye_id}')
            if sample_unmatched:
                print('\nSample unmatched:')
                for make, model, year in sample_unmatched:
                    print(f' {make} {model} {year}')

            if dry_run:
                print('\n' + '=' * 60)
                print('DRY RUN complete. Run without --dry-run to apply.')
                print('=' * 60)
                return
            if not updates:
                return

            print(f'\nApplying {len(updates):,} updates...')
            cur.executemany('''
UPDATE supplier_catalog_compat
SET model_year_engine_id = %s, source = 'matched_fuzzy'
WHERE id = %s
''', updates)
            conn.commit()
            print('Updates committed.')
        finally:
            cur.close()
    finally:
        # Closing without a commit discards any uncommitted work.
        conn.close()
if __name__ == '__main__':
main()