feat(catalog): supplier catalog cleanup, fuzzy matching, and navigation fixes
- Cleaned 137+ fake engine-displacement models from supplier imports (v3/v4 scripts: Chevrolet, Ford, Chrysler, Dodge, Jeep, Nissan, etc.) - Removed 1,251+ corrupted models (INT. prefixes, year-suffix, torque specs, empty names, trailing-year variants) - Migrated supplier tables to master DB (supplier_catalog, supplier_catalog_compat, supplier_catalog_interchange) - Fixed _get_mye_ids_with_parts() to query supplier_catalog_compat from master DB so supplier-only vehicles appear for all tenants - Added fuzzy model matcher with parenthesis stripping, noise suffix removal, compact matching, prefix/substring fallback, model aliases, and ±3 year proximity - Matched compat rows: KEEP GREEN +14,152, KNADIAN +3,021, VAZLO +127,500, LUK +477, RAYBESTOS +1,743 - Added KNADIAN catalog importer with year-range expansion and future-year filtering - Added VAZLO catalog importer with position parsing and SKU-in-model cleanup - Added Keep Green, LUK, Yokomitsu, Raybestos catalog importers - Cache clearing after cleanups (_classify_cache_*, nexus:mye_ids:*, nexus:brand_mye_counts:*) Final match rates: - KEEP GREEN: 90.3% - VAZLO: 93.6% - YOKOMITSU: 100.0% - KNADIAN: 57.4% - LUK: 51.0% - RAYBESTOS: 55.9%
This commit is contained in:
369
scripts/match_supplier_compat_to_mye.py
Executable file
369
scripts/match_supplier_compat_to_mye.py
Executable file
@@ -0,0 +1,369 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Match supplier_catalog_compat rows to model_year_engine ids by fuzzy (make, model, year).
|
||||
|
||||
Supports exact match, parenthesis-stripped match, whitespace/dash normalization,
|
||||
prefix/substring fallback, model aliases, and year proximity (±3 years).
|
||||
|
||||
Usage:
|
||||
python scripts/match_supplier_compat_to_mye.py [--dry-run] <supplier_name|--all>
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
|
||||
import psycopg2
|
||||
|
||||
# Connection string for the master database. The MASTER_DB_URL environment
# variable takes precedence over the local default.
MASTER_DB_URL = os.getenv('MASTER_DB_URL', 'postgresql://postgres@localhost/nexus_autoparts')
|
||||
|
||||
# Maps an upper-cased make spelling to the canonical make used for matching.
# Declared as canonical -> accepted spellings and flattened into the lookup
# dict; the flattened key order follows the declaration order below.
MAKE_ALIASES = {
    spelling: canonical
    for canonical, spellings in (
        ('VW', ('VOLKSWAGEN', 'VOLKWAGEN')),
        ('MERCEDES BENZ', ('MERCEDES BENZ', 'MERCEDES-BENZ')),
        ('BMW', ('BMW MOTORRAD',)),
    )
    for spelling in spellings
}
|
||||
|
||||
# Upper-case tokens treated as noise in a model name (body style, market,
# trim level, drivetrain). Grouped by theme and merged into a single set.
NOISE_SUFFIXES = set().union(
    # Body styles
    {'SEDAN', 'SALOON', 'COUPE', 'HATCHBACK', 'HATCH', 'WAGON', 'ESTATE'},
    {'SUV', 'VAN', 'PICK', 'UP', 'PICKUP', 'CABRIOLET', 'CONVERTIBLE'},
    # Markets / regions
    {'LATINO', 'BRASIL', 'MEXICO', 'USA', 'EUROPA', 'EUROPE', 'NACIO'},
    # Trim levels
    {'LIMITED', 'LTD', 'XLT', 'LE', 'SE', 'XLE', 'SPORT', 'LX', 'EX'},
    # Drivetrains
    {'4X2', '4X4', '4WD', 'AWD', 'FWD', 'RWD', '2WD'},
)
|
||||
|
||||
# Specific model aliases: (make, supplier_model) -> list of possible master model substrings.
#
# Keys are the normalized (upper-case) make and supplier model. Alias values
# must be upper-case as well: master model names are upper-cased before the
# substring comparison, and compact_alnum() drops every character outside
# A-Z/0-9, so a mixed-case alias such as 'G Coupe' would never match the
# readable form and would collapse to just 'G' in the compact form.
MODEL_ALIASES = {
    ('INFINITI', 'JX35'): ['JX SUV'],
    ('INFINITI', 'G35'): ['G COUPE', 'G SALOON', 'G37'],
    ('INFINITI', 'G37'): ['G COUPE', 'G SALOON', 'G37'],
    ('HONDA', 'CRX'): ['CRX'],
    ('MAZDA', 'PROTEGE'): ['PROTEGE'],
    ('MAZDA', 'PROTEGE5'): ['PROTEGE'],
    ('KIA', 'SPECTRA'): ['SPECTRA', 'SEPHIA'],
    ('KIA', 'FORTE5'): ['FORTE'],
    ('CHEVROLET', 'OPTRA'): ['OPTRA', 'LACETTI'],
    ('CHEVROLET', 'AGILE'): ['AGILE'],
    ('FIAT', 'SIENA'): ['SIENA'],
    ('PONTIAC', 'G4'): ['G4', 'PURSUIT'],
    ('FORD', 'FIVE HUNDRED'): ['FIVE HUNDRED', '500', 'TAURUS'],
    ('FORD', 'POLICE INTERCEPTOR UTILITY'): ['POLICE INTERCEPTOR UTILITY', 'EXPLORER'],
    ('FORD', 'POLICE INTERCEPTOR SEDAN'): ['POLICE INTERCEPTOR SEDAN', 'TAURUS'],
    ('SCION', 'XA'): ['XA'],
    ('SAAB', '9-2X'): ['9-2X'],
    ('BUICK', 'LACROSSE'): ['LACROSSE'],
    ('DODGE', 'CALIBER'): ['CALIBER'],
    ('SUZUKI', 'EQUATOR'): ['EQUATOR'],
    ('CHRYSLER', 'LEBARON K'): ['LEBARON'],
    ('MERCEDES BENZ', 'A170'): ['A-CLASS'],
    ('MERCEDES BENZ', 'A210'): ['A-CLASS'],
}
|
||||
|
||||
# Mercedes model-code patterns, e.g. C350E -> C-CLASS, SL600 -> SL.
# Each entry is (regex, marker). The marker 'CLASS' means "captured letters
# plus '-CLASS'", 'LETTERS' means "captured letters only", and any other
# marker is used verbatim as the master model substring.
MERCEDES_CLASS_PATTERNS = [
    # Families listed in master as "<letters>-CLASS" (C-CLASS, E-CLASS, ...).
    (r'^(A|B|C|E|G|GL|GLA|GLB|GLC|GLE|GLK|GLS|M|R|S|V|X)\d', 'CLASS'),
    # Families listed in master by their letters alone (SL, SLK, CLS, CL, CLK).
    (r'^(SL|SLK|CLS|CL|CLK)\d', 'LETTERS'),
    # Older digit-first codes mapped straight to a fixed master name.
    (r'^(260E|300E|320E|400E|500E)$', 'E-CLASS'),
    (r'^(300SL|500SL)$', 'SL'),
    (r'^(400SEL|500SEL|600SEL)$', 'S-CLASS'),
]
|
||||
|
||||
|
||||
def normalize_make(make):
    """Return the canonical upper-case make name, or '' for a falsy input.

    The value is stringified, trimmed and upper-cased, then mapped through
    MAKE_ALIASES; spellings without an alias are returned as cleaned.
    """
    if make:
        cleaned = str(make).strip().upper()
        try:
            return MAKE_ALIASES[cleaned]
        except KeyError:
            return cleaned
    return ''
|
||||
|
||||
|
||||
def normalize_model(model):
    """Return the model name upper-cased with whitespace runs collapsed.

    Falsy inputs (None, '', 0) yield an empty string.
    """
    if model:
        tokens = str(model).upper().split()
        return ' '.join(tokens)
    return ''
|
||||
|
||||
|
||||
def strip_parentheses(text):
    """Remove every '(...)' group, plus the whitespace before it, then trim."""
    paren_group = re.compile(r'\s*\([^)]*\)')
    without_groups = paren_group.sub('', text)
    return without_groups.strip()
|
||||
|
||||
|
||||
def strip_noise_suffixes(text):
    """Keep the whitespace-separated tokens that precede the first noise token.

    Everything from the first token found in NOISE_SUFFIXES onward is
    dropped; if the very first token is noise the result is ''.
    """
    tokens = text.split()
    # Index of the first noise token, or len(tokens) when there is none.
    cut = next(
        (pos for pos, token in enumerate(tokens) if token in NOISE_SUFFIXES),
        len(tokens),
    )
    return ' '.join(tokens[:cut])
|
||||
|
||||
|
||||
def compact_alnum(text):
    """Return *text* with every character outside A-Z and 0-9 removed.

    Lowercase letters are removed too, so callers should upper-case first.
    """
    kept_runs = re.split(r'[^A-Z0-9]', text)
    return ''.join(kept_runs)
|
||||
|
||||
|
||||
def build_model_variants(model_name):
    """Return the set of lookup spellings derived from a model name.

    The set holds the normalized name, the name without parenthesised
    groups, that name cut at the first noise token, and the compact
    (A-Z/0-9 only) form of each. Empty spellings are left out; a name that
    normalizes to '' yields an empty set.
    """
    base = normalize_model(model_name)
    if not base:
        return set()

    without_parens = strip_parentheses(base)
    without_noise = strip_noise_suffixes(without_parens)
    spellings = (
        base,
        without_parens,
        without_noise,
        compact_alnum(without_noise),
        compact_alnum(without_parens),
        compact_alnum(base),
    )
    return {spelling for spelling in spellings if spelling}
|
||||
|
||||
|
||||
def mercedes_class_alias(model):
    """Map a Mercedes model code to a master model substring, or None.

    The normalized model is tested against MERCEDES_CLASS_PATTERNS in order
    and the first matching pattern decides the result: '<letters>-CLASS' for
    the 'CLASS' marker, the captured letters for 'LETTERS', otherwise the
    marker itself.
    """
    name = normalize_model(model)
    for pattern, marker in MERCEDES_CLASS_PATTERNS:
        hit = re.match(pattern, name)
        if hit is None:
            continue
        if marker == 'CLASS':
            return hit.group(1) + '-CLASS'
        return hit.group(1) if marker == 'LETTERS' else marker
    return None
|
||||
|
||||
|
||||
def connect():
    """Open and return a new psycopg2 connection to MASTER_DB_URL."""
    connection = psycopg2.connect(MASTER_DB_URL)
    return connection
|
||||
|
||||
|
||||
def build_mye_index(cur):
    """Load all model_year_engine rows and build the matching indexes.

    Rows are read in ascending ``id_mye`` order so that every id list, and
    therefore every "first id" picked by the matchers, is the same on each
    run (a dry run and the following real run agree).

    Args:
        cur: An open DB-API cursor on the master database.

    Returns:
        A 4-tuple ``(exact_index, compact_index, models_by_make,
        year_range_index)``:

        * exact_index: ``(make, model_variant, year) -> [mye_id, ...]``
        * compact_index: ``(make, compact_model, year) -> [mye_id, ...]``
        * models_by_make: ``make -> [(model, mye_id, year, compact_model), ...]``
        * year_range_index: ``make -> compact_model -> year -> [mye_id, ...]``
    """
    print('Building MYE index...')
    cur.execute('''
        SELECT b.name_brand, m.name_model, y.year_car, mye.id_mye
        FROM model_year_engine mye
        JOIN models m ON m.id_model = mye.model_id
        JOIN brands b ON b.id_brand = m.brand_id
        JOIN years y ON y.id_year = mye.year_id
        ORDER BY mye.id_mye
    ''')

    exact_index = defaultdict(list)
    compact_index = defaultdict(list)
    models_by_make = defaultdict(list)
    # For year proximity: make -> compact_model -> {year: [mye_ids]}
    year_range_index = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

    for make, model, year, mye_id in cur.fetchall():
        nmake = normalize_make(make)
        if not nmake or not model or year is None:
            continue

        for variant in build_model_variants(model):
            exact_index[(nmake, variant, year)].append(mye_id)

        # Normalize once; the compact form and the per-make list both use it.
        nmodel = normalize_model(model)
        compact = compact_alnum(strip_parentheses(nmodel))
        if compact:
            compact_index[(nmake, compact, year)].append(mye_id)
            year_range_index[nmake][compact][year].append(mye_id)

        models_by_make[nmake].append((nmodel, mye_id, year, compact))

    # An id is counted once per variant key it is stored under.
    total_myes = sum(len(ids) for ids in exact_index.values())
    print(f'  {len(exact_index):,} exact keys, {total_myes:,} MYE entries')
    return exact_index, compact_index, models_by_make, year_range_index
|
||||
|
||||
|
||||
def find_by_alias(nmake, nmodel, year, models_by_make):
    """Resolve a supplier model through MODEL_ALIASES / Mercedes class patterns.

    Args:
        nmake: Normalized make.
        nmodel: Normalized supplier model.
        year: Model year; only master entries of exactly this year qualify.
        models_by_make: ``make -> [(model, mye_id, year, compact_model), ...]``.

    Returns:
        The mye id of the first master model (aliases tried in order) whose
        name contains the alias, either verbatim or in compact form, or None.
    """
    aliases = list(MODEL_ALIASES.get((nmake, nmodel), []))
    # Mercedes fallback: derive the class name from the model code.
    if nmake == 'MERCEDES BENZ':
        cls = mercedes_class_alias(nmodel)
        if cls and cls not in aliases:
            aliases.append(cls)

    if not aliases:
        return None

    # The year filter does not depend on the alias, so apply it once.
    candidates = [entry for entry in models_by_make.get(nmake, []) if entry[2] == year]
    if not candidates:
        return None

    for alias in aliases:
        # Master names are upper-cased and compact_alnum() drops lowercase
        # letters, so the alias must be normalized the same way; otherwise
        # 'G Coupe' never matches verbatim and collapses to just 'G'.
        alias_norm = normalize_model(alias)
        if not alias_norm:
            continue
        alias_compact = compact_alnum(alias_norm)
        for master_model, mye_id, _mye_year, master_compact in candidates:
            if alias_norm in master_model:
                return mye_id
            # An empty compact alias would be a substring of everything.
            if alias_compact and alias_compact in master_compact:
                return mye_id
    return None
|
||||
|
||||
|
||||
def find_by_year_proximity(nmake, supplier_compact, year, year_range_index, max_diff=2):
    """Return an mye id from the closest indexed year within ±max_diff, or None.

    Only years indexed under the same make and compact model are considered.
    When several years are equally close, the one that comes first in the
    index's iteration order is used; the first id stored for it is returned.
    """
    by_year = year_range_index.get(nmake, {}).get(supplier_compact)
    if not by_year:
        return None

    in_window = [candidate for candidate in by_year if abs(candidate - year) <= max_diff]
    if not in_window:
        return None

    # min() keeps the first of equally distant years.
    nearest = min(in_window, key=lambda candidate: abs(candidate - year))
    return by_year[nearest][0]
|
||||
|
||||
|
||||
def find_mye_id(make, model, year, exact_index, compact_index, models_by_make, year_range_index):
    """Find a model_year_engine id for a supplier (make, model, year) triple.

    Strategies are tried in order and the first hit wins:

    1. exact lookup on every spelling variant of the model,
    2. compact (A-Z/0-9 only) lookup,
    3. compact substring containment in either direction,
    4. model aliases / Mercedes class patterns,
    5. nearest indexed year within ±3 years for the same compact model.

    Args:
        make: Raw supplier make.
        model: Raw supplier model.
        year: Supplier model year; None never matches.
        exact_index, compact_index, models_by_make, year_range_index:
            The structures returned by ``build_mye_index``.

    Returns:
        The matched mye id, or None when no strategy matches.
    """
    nmake = normalize_make(make)
    nmodel = normalize_model(model)
    # Every index is keyed by a concrete year, so a missing year cannot match.
    if not nmake or not nmodel or year is None:
        return None

    # 1) Exact/near-exact on any variant. Set iteration order is not stable
    #    between runs, so try the longest (most specific) spelling first to
    #    keep the outcome reproducible.
    variants = sorted(build_model_variants(nmodel), key=lambda v: (-len(v), v))
    for variant in variants:
        myes = exact_index.get((nmake, variant, year))
        if myes:
            return myes[0]

    supplier_compact = compact_alnum(strip_parentheses(nmodel))

    # An empty compact name (e.g. the model was only "(...)") is a substring
    # of every string and would match the first model of the make, so the
    # compact-based strategies are skipped for it.
    if supplier_compact:
        # 2) Compact match
        myes = compact_index.get((nmake, supplier_compact, year))
        if myes:
            return myes[0]

        # 3) Prefix/substring containment
        for _master_model, mye_id, mye_year, master_compact in models_by_make.get(nmake, []):
            if mye_year != year or not master_compact:
                continue
            if supplier_compact in master_compact or master_compact in supplier_compact:
                return mye_id

    # 4) Model aliases
    mye_id = find_by_alias(nmake, nmodel, year, models_by_make)
    if mye_id:
        return mye_id

    # 5) Year proximity ±3 years (same compact model)
    if supplier_compact:
        mye_id = find_by_year_proximity(nmake, supplier_compact, year, year_range_index, max_diff=3)
        if mye_id:
            return mye_id

    return None
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: match unmatched compat rows and store the result.

    Usage: ``match_supplier_compat_to_mye.py [--dry-run] <supplier_name|--all>``

    Reads the supplier_catalog_compat rows whose model_year_engine_id is NULL
    (for one supplier, or for all with ``--all``), matches each against the
    MYE index, prints counts and samples, and unless ``--dry-run`` is given
    writes the matched ids back and commits. Exits with status 1 when no
    supplier argument is supplied.

    The cursor and connection are closed on every path, including errors;
    closing without a commit discards any partial update.
    """
    args = sys.argv[1:]
    dry_run = '--dry-run' in args
    if dry_run:
        args.remove('--dry-run')

    if not args:
        print('Usage: match_supplier_compat_to_mye.py [--dry-run] <supplier_name|--all>')
        sys.exit(1)

    supplier_arg = args[0]
    suppliers = None if supplier_arg == '--all' else [supplier_arg]

    if dry_run:
        print('=' * 60)
        print('DRY RUN MODE — no changes will be made')
        print('=' * 60)

    conn = connect()
    try:
        cur = conn.cursor()
        try:
            exact_index, compact_index, models_by_make, year_range_index = build_mye_index(cur)

            if suppliers:
                cur.execute('''
                    SELECT scc.id, scc.make, scc.model, scc.year
                    FROM supplier_catalog_compat scc
                    JOIN supplier_catalog sc ON sc.id = scc.catalog_id
                    WHERE sc.supplier_name = ANY(%s) AND scc.model_year_engine_id IS NULL
                ''', (suppliers,))
            else:
                cur.execute('''
                    SELECT scc.id, scc.make, scc.model, scc.year
                    FROM supplier_catalog_compat scc
                    WHERE scc.model_year_engine_id IS NULL
                ''')

            rows = cur.fetchall()
            print(f'\nMatching {len(rows):,} compat rows...')

            matched = 0
            unmatched = 0
            sample_matches = []
            sample_unmatched = []
            updates = []

            for scc_id, make, model, year in rows:
                mye_id = find_mye_id(
                    make, model, year,
                    exact_index, compact_index, models_by_make, year_range_index,
                )
                if mye_id:
                    updates.append((mye_id, scc_id))
                    matched += 1
                    # Keep only a handful of examples for the report.
                    if len(sample_matches) < 10:
                        sample_matches.append((make, model, year, mye_id))
                else:
                    unmatched += 1
                    if len(sample_unmatched) < 10:
                        sample_unmatched.append((make, model, year))

            print(f'Matched: {matched:,}')
            print(f'Unmatched: {unmatched:,}')

            if sample_matches:
                print('\nSample matches:')
                for make, model, year, mye_id in sample_matches:
                    print(f'  {make} {model} {year} -> mye_id={mye_id}')

            if sample_unmatched:
                print('\nSample unmatched:')
                for make, model, year in sample_unmatched:
                    print(f'  {make} {model} {year}')

            if dry_run:
                print('\n' + '=' * 60)
                print('DRY RUN complete. Run without --dry-run to apply.')
                print('=' * 60)
                return
            if not updates:
                return

            print(f'\nApplying {len(updates):,} updates...')
            cur.executemany('''
                UPDATE supplier_catalog_compat
                SET model_year_engine_id = %s, source = 'matched_fuzzy'
                WHERE id = %s
            ''', updates)
            conn.commit()
            print('Updates committed.')
        finally:
            cur.close()
    finally:
        conn.close()
|
||||
|
||||
|
||||
# Run the matcher only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user