- Cleaned 137+ fake engine-displacement models from supplier imports (v3/v4 scripts: Chevrolet, Ford, Chrysler, Dodge, Jeep, Nissan, etc.) - Removed 1,251+ corrupted models (INT. prefixes, year-suffix, torque specs, empty names, trailing-year variants) - Migrated supplier tables to master DB (supplier_catalog, supplier_catalog_compat, supplier_catalog_interchange) - Fixed _get_mye_ids_with_parts() to query supplier_catalog_compat from master DB so supplier-only vehicles appear for all tenants - Added fuzzy model matcher with parenthesis stripping, noise suffix removal, compact matching, prefix/substring fallback, model aliases, and ±3 year proximity - Matched compat rows: KEEP GREEN +14,152, KNADIAN +3,021, VAZLO +127,500, LUK +477, RAYBESTOS +1,743 - Added KNADIAN catalog importer with year-range expansion and future-year filtering - Added VAZLO catalog importer with position parsing and SKU-in-model cleanup - Added Keep Green, LUK, Yokomitsu, Raybestos catalog importers - Cache clearing after cleanups (_classify_cache_*, nexus:mye_ids:*, nexus:brand_mye_counts:*) Final match rates: - KEEP GREEN: 90.3% - VAZLO: 93.6% - YOKOMITSU: 100.0% - KNADIAN: 57.4% - LUK: 51.0% - RAYBESTOS: 55.9%
370 lines
11 KiB
Python
Executable File
370 lines
11 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Match supplier_catalog_compat rows to model_year_engine ids by fuzzy (make, model, year).
|
|
|
|
Supports exact match, parenthesis-stripped match, whitespace/dash normalization,
|
|
prefix/substring fallback, model aliases, and year proximity (±3 years).
|
|
|
|
Usage:
|
|
python scripts/match_supplier_compat_to_mye.py [--dry-run] <supplier_name|--all>
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import sys
|
|
from collections import defaultdict
|
|
|
|
import psycopg2
|
|
|
|
# Connection URL passed to psycopg2.connect() by connect(). Read from the
# MASTER_DB_URL environment variable, falling back to the literal default below.
MASTER_DB_URL = os.environ.get('MASTER_DB_URL', 'postgresql://postgres@localhost/nexus_autoparts')
|
|
|
|
# Maps an upper-cased make spelling to the canonical make used for matching.
# normalize_make() looks keys up after strip()/upper(); makes that are not
# listed here pass through unchanged.
MAKE_ALIASES = {
    'VOLKSWAGEN': 'VW',
    'VOLKWAGEN': 'VW',  # presumably a recurring misspelling in supplier data
    'MERCEDES BENZ': 'MERCEDES BENZ',  # identity entry; documents the canonical form
    'MERCEDES-BENZ': 'MERCEDES BENZ',
    'BMW MOTORRAD': 'BMW',
}
|
|
|
|
# Upper-case tokens that strip_noise_suffixes() treats as the start of
# non-essential trailing text: the model name is cut at the first token found
# in this set. The grouping below (body style, market, trim, drivetrain) is a
# reading aid only; membership is all that matters at runtime.
NOISE_SUFFIXES = {
    # body styles
    'SEDAN', 'SALOON', 'COUPE', 'HATCHBACK', 'HATCH', 'WAGON', 'ESTATE',
    'SUV', 'VAN', 'PICK', 'UP', 'PICKUP', 'CABRIOLET', 'CONVERTIBLE',
    # market / region tags
    'LATINO', 'BRASIL', 'MEXICO', 'USA', 'EUROPA', 'EUROPE', 'NACIO',
    # trim levels
    'LIMITED', 'LTD', 'XLT', 'LE', 'SE', 'XLE', 'SPORT', 'LX', 'EX',
    # drivetrain
    '4X2', '4X4', '4WD', 'AWD', 'FWD', 'RWD', '2WD',
}
|
|
|
|
# Specific model aliases: (make, supplier_model) -> list of possible master model substrings.
#
# Keys are matched against normalize_make()/normalize_model() output, so they
# are upper-case. The alias substrings are compared against upper-cased master
# model names (and their compact_alnum() forms), so they MUST be upper-case as
# well: a mixed-case alias can never be a substring of an upper-cased name.
MODEL_ALIASES = {
    ('INFINITI', 'JX35'): ['JX SUV'],
    ('INFINITI', 'G35'): ['G COUPE', 'G SALOON', 'G37'],
    ('INFINITI', 'G37'): ['G COUPE', 'G SALOON', 'G37'],
    ('HONDA', 'CRX'): ['CRX'],
    ('MAZDA', 'PROTEGE'): ['PROTEGE'],
    ('MAZDA', 'PROTEGE5'): ['PROTEGE'],
    ('KIA', 'SPECTRA'): ['SPECTRA', 'SEPHIA'],
    ('KIA', 'FORTE5'): ['FORTE'],
    ('CHEVROLET', 'OPTRA'): ['OPTRA', 'LACETTI'],
    ('CHEVROLET', 'AGILE'): ['AGILE'],
    ('FIAT', 'SIENA'): ['SIENA'],
    ('PONTIAC', 'G4'): ['G4', 'PURSUIT'],
    ('FORD', 'FIVE HUNDRED'): ['FIVE HUNDRED', '500', 'TAURUS'],
    ('FORD', 'POLICE INTERCEPTOR UTILITY'): ['POLICE INTERCEPTOR UTILITY', 'EXPLORER'],
    ('FORD', 'POLICE INTERCEPTOR SEDAN'): ['POLICE INTERCEPTOR SEDAN', 'TAURUS'],
    ('SCION', 'XA'): ['XA'],
    ('SAAB', '9-2X'): ['9-2X'],
    ('BUICK', 'LACROSSE'): ['LACROSSE'],
    ('DODGE', 'CALIBER'): ['CALIBER'],
    ('SUZUKI', 'EQUATOR'): ['EQUATOR'],
    ('CHRYSLER', 'LEBARON K'): ['LEBARON'],
    ('MERCEDES BENZ', 'A170'): ['A-CLASS'],
    ('MERCEDES BENZ', 'A210'): ['A-CLASS'],
}
|
|
|
|
# Regex-based class extraction for Mercedes: e.g. C350E -> C-Class, SL600 -> SL
#
# Each entry is (pattern, replacement). mercedes_class_alias() tries the
# patterns in order with re.match() against the normalized model and stops at
# the first hit. Two replacement values are markers rather than literal names:
#   'CLASS'   -> return captured group 1 followed by '-CLASS'
#   'LETTERS' -> return captured group 1 unchanged
# Any other replacement string is returned as-is.
MERCEDES_CLASS_PATTERNS = [
    # These Mercedes classes use "X-CLASS" in master (C-CLASS, E-CLASS, S-CLASS, etc.)
    (r'^(A|B|C|E|G|GL|GLA|GLB|GLC|GLE|GLK|GLS|M|R|S|V|X)\d', 'CLASS'),
    # These use just the letters (SL, SLK, CLS, CL, CLK) without -CLASS
    (r'^(SL|SLK|CLS|CL|CLK)\d', 'LETTERS'),
    # Digit-first designations that must match in full and map to a fixed name
    (r'^(260E|300E|320E|400E|500E)$', 'E-CLASS'),
    (r'^(300SL|500SL)$', 'SL'),
    (r'^(400SEL|500SEL|600SEL)$', 'S-CLASS'),
]
|
|
|
|
|
|
def normalize_make(make):
    """Return the canonical upper-case make name, or '' for a falsy input.

    The value is converted to ``str``, stripped and upper-cased, then looked
    up in MAKE_ALIASES; makes without an alias are returned as normalized.
    """
    if make:
        key = str(make).strip().upper()
        return MAKE_ALIASES.get(key, key)
    return ''
|
|
|
|
|
|
def normalize_model(model):
    """Upper-case a model name and collapse whitespace runs to single spaces.

    Falsy input (``None``, ``''``, ``0``) yields ``''``; any other value is
    converted with ``str()`` first.
    """
    if model:
        tokens = str(model).upper().split()
        return ' '.join(tokens)
    return ''
|
|
|
|
|
|
def strip_parentheses(text):
    """Remove every ``(...)`` group, plus the whitespace before it, from *text*.

    The result is stripped of leading/trailing whitespace. An opening
    parenthesis with no closing one is left untouched.
    """
    without_groups = re.sub(r'\s*\([^)]*\)', '', text)
    return without_groups.strip()
|
|
|
|
|
|
def strip_noise_suffixes(text):
    """Truncate *text* at the first whitespace-separated token in NOISE_SUFFIXES.

    Everything from that token onward is dropped, so a name whose first token
    is a noise word yields ``''``. Tokens are compared exactly, so *text* is
    expected to be upper-cased already.
    """
    tokens = text.split()
    # Index of the first noise token, or len(tokens) when there is none.
    cut = next(
        (pos for pos, token in enumerate(tokens) if token in NOISE_SUFFIXES),
        len(tokens),
    )
    return ' '.join(tokens[:cut])
|
|
|
|
|
|
def compact_alnum(text):
    """Return *text* upper-cased with every character outside A-Z / 0-9 removed.

    The input is upper-cased before filtering so that mixed-case strings keep
    their letters ('G Coupe' -> 'GCOUPE'). Without that step lowercase letters
    were silently discarded ('G Coupe' -> 'G'), which made compact comparisons
    against short remnants match far too broadly.
    """
    return re.sub(r'[^A-Z0-9]', '', text.upper())
|
|
|
|
|
|
def build_model_variants(model_name):
    """Return the set of normalized spellings under which a model may be indexed.

    The variants are: the normalized name, the name without parenthesized
    groups, that name cut at the first noise token, and the alphanumeric-only
    (compact) form of each of those three. Empty strings are never included;
    an empty/falsy model yields an empty set.
    """
    base = normalize_model(model_name)
    if not base:
        return set()

    no_paren = strip_parentheses(base)
    no_noise = strip_noise_suffixes(no_paren)
    candidates = (
        base,
        no_paren,
        no_noise,
        compact_alnum(no_noise),
        compact_alnum(no_paren),
        compact_alnum(base),
    )
    return {candidate for candidate in candidates if candidate}
|
|
|
|
|
|
def mercedes_class_alias(model):
    """Return a master model substring for Mercedes class-based models.

    The normalized model is tested against MERCEDES_CLASS_PATTERNS in order.
    For the first pattern that matches, the replacement marker decides the
    result: 'CLASS' gives ``<group 1>-CLASS``, 'LETTERS' gives group 1 alone,
    and any other value is returned verbatim. Returns ``None`` when no
    pattern matches.
    """
    name = normalize_model(model)
    for pattern, kind in MERCEDES_CLASS_PATTERNS:
        found = re.match(pattern, name)
        if found is None:
            continue
        if kind == 'CLASS':
            return found.group(1) + '-CLASS'
        if kind == 'LETTERS':
            return found.group(1)
        return kind
    return None
|
|
|
|
|
|
def connect():
    """Open and return a psycopg2 connection to the database at MASTER_DB_URL."""
    connection = psycopg2.connect(MASTER_DB_URL)
    return connection
|
|
|
|
|
|
def build_mye_index(cur):
    """Load every model_year_engine row and build the lookup structures.

    Runs one query through *cur* joining model_year_engine to models, brands
    and years, then returns a 4-tuple:

    * ``exact_index``: ``(make, model_variant, year) -> [mye_id, ...]`` for
      every variant produced by build_model_variants().
    * ``compact_index``: ``(make, compact_model, year) -> [mye_id, ...]``
      where compact_model is the parenthesis-stripped alphanumeric form.
    * ``models_by_make``: ``make -> [(normalized_model, mye_id, year,
      compact_model), ...]`` in row order.
    * ``year_range_index``: ``make -> compact_model -> year -> [mye_id, ...]``.

    Rows with an empty make, a falsy model or a NULL year are skipped.
    """
    print('Building MYE index...')
    cur.execute('''
        SELECT b.name_brand, m.name_model, y.year_car, mye.id_mye
        FROM model_year_engine mye
        JOIN models m ON m.id_model = mye.model_id
        JOIN brands b ON b.id_brand = m.brand_id
        JOIN years y ON y.id_year = mye.year_id
    ''')

    exact_index = defaultdict(list)
    compact_index = defaultdict(list)
    models_by_make = defaultdict(list)
    # For year proximity: make -> compact_model -> {year: [mye_ids]}
    year_range_index = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

    for brand_name, model_name, model_year, mye_id in cur.fetchall():
        nmake = normalize_make(brand_name)
        if not (nmake and model_name) or model_year is None:
            continue

        for variant in build_model_variants(model_name):
            exact_index[(nmake, variant, model_year)].append(mye_id)

        nmodel = normalize_model(model_name)
        compact = compact_alnum(strip_parentheses(nmodel))
        if compact:
            compact_index[(nmake, compact, model_year)].append(mye_id)
            year_range_index[nmake][compact][model_year].append(mye_id)

        models_by_make[nmake].append((nmodel, mye_id, model_year, compact))

    # Counts one entry per (variant, mye) pair, so a row contributes once per variant.
    total_myes = sum(map(len, exact_index.values()))
    print(f' {len(exact_index):,} exact keys, {total_myes:,} MYE entries')
    return exact_index, compact_index, models_by_make, year_range_index
|
|
|
|
|
|
def find_by_alias(nmake, nmodel, year, models_by_make):
    """Try specific model aliases and Mercedes class patterns.

    Args:
        nmake: Normalized (upper-case, canonical) make.
        nmodel: Normalized (upper-case) supplier model.
        year: Model year the master entry must have.
        models_by_make: ``make -> [(normalized_model, mye_id, year,
            compact_model), ...]`` as built by build_mye_index().

    Returns:
        The mye_id of the first master model of that make and year whose name
        (or compact name) contains one of the aliases, or ``None``.
    """
    aliases = list(MODEL_ALIASES.get((nmake, nmodel), []))
    # Mercedes fallback
    if nmake == 'MERCEDES BENZ':
        cls = mercedes_class_alias(nmodel)
        if cls and cls not in aliases:
            aliases.append(cls)

    if not aliases:
        return None

    # The year filter does not depend on the alias, so apply it once.
    same_year = [
        entry for entry in models_by_make.get(nmake, []) if entry[2] == year
    ]
    if not same_year:
        return None

    # Try to find a master model that contains any alias substring and matches year
    for alias in aliases:
        # Master names are upper-cased, so the alias has to be normalized the
        # same way before comparing. A mixed-case alias such as 'G Coupe' would
        # otherwise never match by name and, once compacted with an A-Z-only
        # filter, shrink to 'G' and match nearly every model.
        alias_norm = normalize_model(alias)
        if not alias_norm:
            continue
        alias_compact = compact_alnum(alias_norm)
        for master_model, mye_id, _mye_year, master_compact in same_year:
            if alias_norm in master_model:
                return mye_id
            # An empty compact alias is a substring of everything; skip it.
            if alias_compact and alias_compact in master_compact:
                return mye_id
    return None
|
|
|
|
|
|
def find_by_year_proximity(nmake, supplier_compact, year, year_range_index, max_diff=2):
    """If exact year missing, find closest year within ±max_diff for same model.

    Looks up ``year_range_index[nmake][supplier_compact]`` (a mapping of
    year -> list of mye ids) and returns the first mye id of the year closest
    to *year*, provided it is at most *max_diff* years away. When two years
    are equally close, the one that comes first in the mapping's iteration
    order wins. Returns ``None`` when the make/model is unknown or no year is
    close enough.
    """
    years = year_range_index.get(nmake, {}).get(supplier_compact)
    if not years:
        return None

    in_range = [
        (abs(candidate - year), candidate)
        for candidate in years
        if abs(candidate - year) <= max_diff
    ]
    if not in_range:
        return None

    # min() keeps the first of several equal distances, like a strict '<' scan.
    _distance, closest = min(in_range, key=lambda pair: pair[0])
    return years[closest][0]
|
|
|
|
|
|
def find_mye_id(make, model, year, exact_index, compact_index, models_by_make, year_range_index):
    """Resolve a supplier (make, model, year) to a model_year_engine id.

    Strategies are tried in order and the first hit wins:

    1. Exact lookup of every model variant in *exact_index*.
    2. Lookup of the parenthesis-stripped compact model in *compact_index*.
    3. Substring containment (either direction) between the compact supplier
       model and the compact master models of the same make and year.
    4. Model aliases / Mercedes class patterns (find_by_alias).
    5. Same compact model within ±3 years (find_by_year_proximity).

    Args:
        make, model, year: Raw supplier values.
        exact_index, compact_index, models_by_make, year_range_index: The
            structures returned by build_mye_index().

    Returns:
        A mye_id, or ``None`` when the make/model is empty, the year is
        ``None``, or no strategy finds a match.
    """
    nmake = normalize_make(make)
    nmodel = normalize_model(model)
    # Every strategy below needs a year: the indexes never hold a None year,
    # so bail out before doing lookups that cannot succeed.
    if not nmake or not nmodel or year is None:
        return None

    # 1) Exact/near-exact on any variant.
    # Variants come back as a set, whose iteration order for strings changes
    # between interpreter runs. Try them longest (most specific) first so the
    # same input always resolves to the same id.
    variants = sorted(build_model_variants(nmodel), key=lambda v: (-len(v), v))
    for v in variants:
        myes = exact_index.get((nmake, v, year))
        if myes:
            return myes[0]

    supplier_compact = compact_alnum(strip_parentheses(nmodel))

    # An empty compact form (model made only of parentheses/punctuation) must
    # not reach the containment test: '' is a substring of every string and
    # would match the first model of the make.
    if supplier_compact:
        # 2) Compact match
        myes = compact_index.get((nmake, supplier_compact, year))
        if myes:
            return myes[0]

        # 3) Prefix/substring containment
        for _master_model, mye_id, mye_year, master_compact in models_by_make.get(nmake, []):
            if mye_year != year or not master_compact:
                continue
            if supplier_compact in master_compact or master_compact in supplier_compact:
                return mye_id

    # 4) Model aliases
    mye_id = find_by_alias(nmake, nmodel, year, models_by_make)
    if mye_id:
        return mye_id

    # 5) Year proximity ±3 years (same compact model)
    if supplier_compact:
        mye_id = find_by_year_proximity(nmake, supplier_compact, year, year_range_index, max_diff=3)
        if mye_id:
            return mye_id

    return None
|
|
|
|
|
|
def main():
    """Command-line entry point: match unmatched compat rows and store the result.

    Usage: ``match_supplier_compat_to_mye.py [--dry-run] <supplier_name|--all>``

    Selects supplier_catalog_compat rows whose model_year_engine_id is NULL
    (for one supplier, or all with ``--all``), resolves each with
    find_mye_id(), prints counts and up to ten samples of each outcome, and,
    unless ``--dry-run`` is given, writes the matches back and commits.
    Exits with status 1 when no supplier argument is supplied. The cursor and
    connection are closed even if matching or the update raises.
    """
    raw_args = sys.argv[1:]
    dry_run = '--dry-run' in raw_args
    # Drop every occurrence so a repeated flag is not mistaken for the supplier.
    args = [arg for arg in raw_args if arg != '--dry-run']

    if len(args) < 1:
        print('Usage: match_supplier_compat_to_mye.py [--dry-run] <supplier_name|--all>')
        sys.exit(1)

    supplier_arg = args[0]
    suppliers = None if supplier_arg == '--all' else [supplier_arg]

    if dry_run:
        print('=' * 60)
        print('DRY RUN MODE — no changes will be made')
        print('=' * 60)

    conn = connect()
    try:
        cur = conn.cursor()
        try:
            exact_index, compact_index, models_by_make, year_range_index = build_mye_index(cur)

            if suppliers:
                cur.execute('''
                    SELECT scc.id, scc.make, scc.model, scc.year
                    FROM supplier_catalog_compat scc
                    JOIN supplier_catalog sc ON sc.id = scc.catalog_id
                    WHERE sc.supplier_name = ANY(%s) AND scc.model_year_engine_id IS NULL
                ''', (suppliers,))
            else:
                cur.execute('''
                    SELECT scc.id, scc.make, scc.model, scc.year
                    FROM supplier_catalog_compat scc
                    WHERE scc.model_year_engine_id IS NULL
                ''')

            rows = cur.fetchall()
            print(f'\nMatching {len(rows):,} compat rows...')

            matched = 0
            unmatched = 0
            sample_matches = []
            sample_unmatched = []
            updates = []

            for scc_id, make, model, year in rows:
                mye_id = find_mye_id(make, model, year, exact_index, compact_index, models_by_make, year_range_index)
                if mye_id:
                    updates.append((mye_id, scc_id))
                    matched += 1
                    if len(sample_matches) < 10:
                        sample_matches.append((make, model, year, mye_id))
                else:
                    unmatched += 1
                    if len(sample_unmatched) < 10:
                        sample_unmatched.append((make, model, year))

            print(f'Matched: {matched:,}')
            print(f'Unmatched: {unmatched:,}')

            if sample_matches:
                print('\nSample matches:')
                for make, model, year, mye_id in sample_matches:
                    print(f' {make} {model} {year} -> mye_id={mye_id}')

            if sample_unmatched:
                print('\nSample unmatched:')
                for make, model, year in sample_unmatched:
                    print(f' {make} {model} {year}')

            if dry_run:
                print('\n' + '=' * 60)
                print('DRY RUN complete. Run without --dry-run to apply.')
                print('=' * 60)
                return
            if not updates:
                return

            print(f'\nApplying {len(updates):,} updates...')
            cur.executemany('''
                UPDATE supplier_catalog_compat
                SET model_year_engine_id = %s, source = 'matched_fuzzy'
                WHERE id = %s
            ''', updates)
            conn.commit()
            print('Updates committed.')
        finally:
            cur.close()
    finally:
        # Closing without commit discards any uncommitted changes, so a
        # failure part-way through the updates leaves the table untouched.
        conn.close()
|
|
|
|
|
|
# Run the matcher only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|