feat(catalog): supplier catalog cleanup, fuzzy matching, and navigation fixes
- Cleaned 137+ fake engine-displacement models from supplier imports (v3/v4 scripts: Chevrolet, Ford, Chrysler, Dodge, Jeep, Nissan, etc.)
- Removed 1,251+ corrupted models (INT. prefixes, year-suffix, torque specs, empty names, trailing-year variants)
- Migrated supplier tables to master DB (supplier_catalog, supplier_catalog_compat, supplier_catalog_interchange)
- Fixed _get_mye_ids_with_parts() to query supplier_catalog_compat from master DB so supplier-only vehicles appear for all tenants
- Added fuzzy model matcher with parenthesis stripping, noise suffix removal, compact matching, prefix/substring fallback, model aliases, and ±3 year proximity
- Matched compat rows: KEEP GREEN +14,152, KNADIAN +3,021, VAZLO +127,500, LUK +477, RAYBESTOS +1,743
- Added KNADIAN catalog importer with year-range expansion and future-year filtering
- Added VAZLO catalog importer with position parsing and SKU-in-model cleanup
- Added Keep Green, LUK, Yokomitsu, Raybestos catalog importers
- Added cache clearing after cleanups (_classify_cache_*, nexus:mye_ids:*, nexus:brand_mye_counts:*)

Final match rates:
- KEEP GREEN: 90.3%
- VAZLO: 93.6%
- YOKOMITSU: 100.0%
- KNADIAN: 57.4%
- LUK: 51.0%
- RAYBESTOS: 55.9%
This commit is contained in:
240
scripts/clean_supplier_corrupted_models.py
Executable file
240
scripts/clean_supplier_corrupted_models.py
Executable file
@@ -0,0 +1,240 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Clean supplier-corrupted models from master DB.
|
||||
Handles trailing years, year ranges, engine specs, trim variants, etc.
|
||||
|
||||
Usage:
|
||||
python scripts/clean_supplier_corrupted_models.py [--execute]
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
|
||||
import psycopg2
|
||||
|
||||
# Connection URL for the master database. Taken from the MASTER_DB_URL
# environment variable, falling back to a local "nexus_autoparts" database.
MASTER_DB_URL = os.environ.get('MASTER_DB_URL', 'postgresql://postgres@localhost/nexus_autoparts')
|
||||
|
||||
|
||||
def connect():
    """Open and return a new psycopg2 connection to ``MASTER_DB_URL``."""
    connection = psycopg2.connect(MASTER_DB_URL)
    return connection
|
||||
|
||||
|
||||
def delete_model_and_myes(conn, model_id):
    """Delete a model together with every model_year_engine (MYE) row under it.

    For the model's MYE rows, in this order:

    1. ``supplier_catalog_compat`` rows pointing at them are detached
       (``model_year_engine_id`` set to NULL), so the supplier rows survive.
    2. ``vin_cache`` rows pointing at them are deleted.
    3. The MYE rows themselves are deleted.

    Finally the ``models`` row is deleted. Nothing is committed here; the
    caller owns the transaction.

    Args:
        conn: Open database connection (psycopg2 in this script).
        model_id: ``models.id_model`` of the model to remove.
    """
    cur = conn.cursor()
    try:
        cur.execute("SELECT id_mye FROM model_year_engine WHERE model_id = %s", (model_id,))
        mye_ids = [row[0] for row in cur.fetchall()]
        if mye_ids:
            # The id list is passed as a single parameter and compared with ANY(...).
            cur.execute("UPDATE supplier_catalog_compat SET model_year_engine_id = NULL WHERE model_year_engine_id = ANY(%s)", (mye_ids,))
            cur.execute("DELETE FROM vin_cache WHERE model_year_engine_id = ANY(%s)", (mye_ids,))
            cur.execute("DELETE FROM model_year_engine WHERE id_mye = ANY(%s)", (mye_ids,))
        cur.execute("DELETE FROM models WHERE id_model = %s", (model_id,))
    finally:
        # Release the cursor even when one of the statements raises.
        cur.close()
|
||||
|
||||
|
||||
def normalize_for_match(name):
    """Return *name* upper-cased with every whitespace run collapsed to one space.

    Leading and trailing whitespace is dropped. Any falsy input (``None``,
    ``''``, ``0``) yields ``''``. Hyphens are left untouched; callers that
    want hyphen/space variants derive them from this result.
    """
    if name:
        tokens = str(name).upper().split()
        return ' '.join(tokens)
    return ''
|
||||
|
||||
|
||||
def find_base_model(cur, brand_id, base_name):
    """Look up the model of brand *brand_id* whose name matches *base_name*.

    *base_name* is normalized with ``normalize_for_match`` and then tried
    three ways, stopping at the first hit:

    1. case-insensitive equality with ``name_model``;
    2. equality after turning spaces into hyphens on both sides;
    3. equality after turning hyphens into spaces on both sides.

    Args:
        cur: Open cursor used to run the lookups.
        brand_id: Brand the model must belong to.
        base_name: Candidate base model name.

    Returns:
        The ``(id_model, name_model)`` row of the first hit, otherwise the
        result of the last ``fetchone()`` (``None`` when nothing matched).
    """
    wanted = normalize_for_match(base_name)
    attempts = (
        ("""
            SELECT id_model, name_model FROM models
            WHERE brand_id = %s AND LOWER(name_model) = LOWER(%s)
            LIMIT 1
        """, wanted),
        ("""
            SELECT id_model, name_model FROM models
            WHERE brand_id = %s AND REPLACE(UPPER(name_model), ' ', '-') = %s
            LIMIT 1
        """, wanted.replace(' ', '-')),
        ("""
            SELECT id_model, name_model FROM models
            WHERE brand_id = %s AND REPLACE(UPPER(name_model), '-', ' ') = %s
            LIMIT 1
        """, wanted.replace('-', ' ')),
    )

    found = None
    for sql, candidate in attempts:
        cur.execute(sql, (brand_id, candidate))
        found = cur.fetchone()
        if found:
            break
    return found
|
||||
|
||||
|
||||
def merge_model_to_base(conn, model_id, base_id, base_name):
    """Fold a duplicate model into its base model, then delete the duplicate.

    Each model_year_engine (MYE) row of *model_id* is handled in one of two
    ways:

    * If the base model already has an MYE with the same year and engine
      (two NULL engines count as equal), ``supplier_catalog_compat`` rows are
      re-pointed at the base MYE, ``vin_cache`` rows for the duplicate MYE are
      deleted, and the duplicate MYE is deleted.
    * Otherwise the MYE row is re-parented to *base_id*.

    Finally the ``models`` row for *model_id* is deleted. Nothing is committed
    here; the caller owns the transaction.

    Args:
        conn: Open database connection (psycopg2 in this script).
        model_id: ``models.id_model`` of the duplicate to remove.
        base_id: ``models.id_model`` of the model that absorbs it.
        base_name: Not used by this function; kept so existing callers work.

    Returns:
        int: Number of MYE rows handled, counting both merged and re-parented
        rows.
    """
    cur = conn.cursor()
    try:
        cur.execute("SELECT id_mye, year_id, engine_id FROM model_year_engine WHERE model_id = %s", (model_id,))
        migrated = 0
        for mye_id, year_id, engine_id in cur.fetchall():
            # engine_id is bound twice so that a NULL engine matches a NULL
            # engine, which a plain "=" comparison would not do.
            cur.execute("""
                SELECT id_mye FROM model_year_engine
                WHERE model_id = %s AND year_id = %s
                AND (engine_id = %s OR (engine_id IS NULL AND %s IS NULL))
            """, (base_id, year_id, engine_id, engine_id))
            base_mye = cur.fetchone()
            if base_mye:
                base_mye_id = base_mye[0]
                cur.execute("UPDATE supplier_catalog_compat SET model_year_engine_id = %s WHERE model_year_engine_id = %s", (base_mye_id, mye_id))
                cur.execute("DELETE FROM vin_cache WHERE model_year_engine_id = %s", (mye_id,))
                cur.execute("DELETE FROM model_year_engine WHERE id_mye = %s", (mye_id,))
            else:
                cur.execute("UPDATE model_year_engine SET model_id = %s WHERE id_mye = %s", (base_id, mye_id))
            migrated += 1
        cur.execute("DELETE FROM models WHERE id_model = %s", (model_id,))
    finally:
        # Release the cursor even when one of the statements raises.
        cur.close()
    return migrated
|
||||
|
||||
|
||||
# Per-reason suffix patterns, tried in order. Group 1 is the base model name.
# The 'hasta_tas' patterns ignore case so that names such as "... TAS. 3",
# which a case-insensitive detection rule would flag, still yield a base name.
_BASE_NAME_RULES = {
    'trailing_year': (
        re.compile(r'^(.*?)\s+(19|20)\d{2}$'),
    ),
    'year_range_parens': (
        re.compile(r'^(.*?)\s+\d{2}-\d{2}\s*\('),
    ),
    'hasta_tas': (
        re.compile(r'^(.*?)(?:\s+\d+\.\d+L)?\s+\d{2}-\d{2}\s+Tas\.', re.IGNORECASE),
        re.compile(r'^(.*?)\s+hasta', re.IGNORECASE),
    ),
}


def extract_base_name(name, reason):
    """Strip the suffix that *reason* describes and return the base model name.

    Args:
        name: Raw model name.
        reason: Rule that flagged the name. Handled values are
            ``'trailing_year'`` ("CIVIC 2005"), ``'year_range_parens'``
            ("Jetta 99-05 (A4)") and ``'hasta_tas'`` ("Jetta 2.0L 99-05 Tas. 3",
            "Golf hasta 2004").

    Returns:
        The text preceding the suffix with surrounding whitespace removed, or
        ``None`` when *name* is empty, *reason* is not a handled value, or
        the name does not have the expected shape.
    """
    if not name:
        return None
    stripped = name.strip()
    for pattern in _BASE_NAME_RULES.get(reason, ()):
        match = pattern.search(stripped)
        if match:
            return match.group(1).strip()
    return None
|
||||
|
||||
|
||||
def _brand_name_lookup(cur):
    """Build a memoised ``brand_id -> name_brand`` lookup that queries via *cur*.

    Each brand is read from ``brands`` at most once. A brand id with no row
    maps to ``None`` instead of raising.
    """
    cache = {}

    def lookup(brand_id):
        if brand_id not in cache:
            cur.execute('SELECT name_brand FROM brands WHERE id_brand=%s', (brand_id,))
            row = cur.fetchone()
            cache[brand_id] = row[0] if row else None
        return cache[brand_id]

    return lookup


def _flag_suspicious_models(models, brand_name):
    """Return ``(brand_id, name, model_id, reason)`` for names matching a rule.

    Rules are tried in order and the first one that applies wins. A rule
    applies when its regex matches the name and its optional extra check
    (called with ``(name, brand_id)``) returns true.
    """
    rules = (
        # Brand name 'MCLAREN' is exempt from the trailing-year rule.
        ('trailing_year', re.compile(r' (19|20)\d{2}$'),
         lambda name, bid: brand_name(bid) != 'MCLAREN'),
        ('year_range_parens', re.compile(r'[A-Za-z]+ \d{2}-\d{2} \('), None),
        ('engine_spec',
         re.compile(r',?\s*\(\d+ HP\)|DOHC|SOHC|Valv\.|Turbo L4|L4,\s*\(', re.IGNORECASE),
         None),
        ('hasta_tas', re.compile(r'hasta|Tas\.', re.IGNORECASE), None),
        ('engine_only', re.compile(r'^\d+\.\d+L$', re.IGNORECASE), None),
        # Names ending in a parenthesised code of 3+ characters from
        # [A-Z0-9_] are spared by the engine-config rule.
        ('engine_config',
         re.compile(r'^\d+\.\d+L\s+(?:L\d|V\d|R\s|Turbo|TDI|GSI)', re.IGNORECASE),
         lambda name, bid: not re.search(r'\([A-Z0-9_]{3,}\)$', name)),
    )

    suspicious = []
    for mid, name, bid in models:
        if not name:
            continue
        for reason, pattern, still_applies in rules:
            if not pattern.search(name):
                continue
            if still_applies is None or still_applies(name, bid):
                suspicious.append((bid, name, mid, reason))
                break
    return suspicious


def _find_trim_variants(cur, models, flagged_ids):
    """Return merge entries for names like "500 POP" whose base model exists.

    Models whose id is in *flagged_ids* are skipped. Each entry is
    ``(brand_id, name, model_id, 'trim_variant', base_id, base_name)``.
    """
    trim_variants = ['LOUNGE', 'POP', 'SPORT', 'ADVENTURE', 'FIRE', 'GT', 'GTV', 'STD', 'SE', 'LE', 'XLE', 'LIMITED', 'LX', 'EX', 'SX']
    # NOTE(review): "\s*" also accepts a trim glued to the name with no space
    # (e.g. "TLX" gives base "T") - confirm that this is intended.
    trim_pattern = re.compile(r'^(\S+?)\s*(' + '|'.join(trim_variants) + r')$')

    matches = []
    for mid, name, bid in models:
        if not name or mid in flagged_ids:
            continue
        m = trim_pattern.match(name.upper())
        if not m:
            continue
        base_model = find_base_model(cur, bid, m.group(1))
        if base_model:
            matches.append((bid, name, mid, 'trim_variant', base_model[0], base_model[1]))
    return matches


def _plan_actions(cur, suspicious, trim_matches):
    """Split flagged models into ``(to_merge, to_delete)``.

    Engine-only reasons are always deleted. Other reasons are merged when a
    base name can be extracted and a matching base model exists, and deleted
    otherwise. Trim variants are appended to the merge list.
    """
    to_merge = []
    to_delete = []
    for bid, name, mid, reason in suspicious:
        base = None
        if reason not in ('engine_spec', 'engine_only', 'engine_config'):
            base_name = extract_base_name(name, reason)
            if base_name:
                base = find_base_model(cur, bid, base_name)
        if base:
            to_merge.append((bid, name, mid, reason, base[0], base[1]))
        else:
            to_delete.append((bid, name, mid, reason))
    to_merge.extend(trim_matches)
    return to_merge, to_delete


def _apply_plan(conn, to_merge, to_delete, brand_name):
    """Run the merges, then the deletes, committing after every model."""
    # NOTE(review): nothing here checks whether a merge target is itself
    # scheduled for merge or deletion - confirm that cannot happen in the data.
    print("\nApplying merges...")
    for bid, name, mid, reason, base_id, base_name in to_merge:
        bname = brand_name(bid)
        migrated = merge_model_to_base(conn, mid, base_id, base_name)
        print(f" [{bname}] '{name}' -> '{base_name}' ({migrated} MYEs migrated)")
        conn.commit()

    print("\nApplying deletes...")
    for bid, name, mid, reason in to_delete:
        bname = brand_name(bid)
        delete_model_and_myes(conn, mid)
        print(f" [{bname}] '{name}' deleted")
        conn.commit()


def main():
    """Find corrupted model names in the master DB and merge or delete them.

    Without ``--execute`` on the command line this is a dry run: the plan is
    printed and nothing is modified. With ``--execute`` each planned merge and
    delete is applied and committed individually.
    """
    from contextlib import closing

    dry_run = '--execute' not in sys.argv
    if dry_run:
        print("=" * 60)
        print("DRY RUN MODE — no changes will be made")
        print("Run with --execute to apply changes")
        print("=" * 60)

    # closing() guarantees close() on every exit path, including errors.
    # psycopg2's own connection context manager only ends the transaction
    # and leaves the connection open.
    with closing(connect()) as conn, closing(conn.cursor()) as cur:
        cur.execute('SELECT id_model, name_model, brand_id FROM models')
        models = cur.fetchall()
        brand_name = _brand_name_lookup(cur)

        suspicious = _flag_suspicious_models(models, brand_name)
        flagged_ids = {entry[2] for entry in suspicious}
        trim_matches = _find_trim_variants(cur, models, flagged_ids)

        print(f"\nFound {len(suspicious)} suspicious models by pattern")
        print(f"Found {len(trim_matches)} trim variant models")

        to_merge, to_delete = _plan_actions(cur, suspicious, trim_matches)

        print(f"\nTo merge: {len(to_merge)}")
        for bid, name, mid, reason, base_id, base_name in to_merge:
            print(f" [{bid}] '{name}' -> '{base_name}' (reason={reason})")

        print(f"\nTo delete: {len(to_delete)}")
        for bid, name, mid, reason in to_delete:
            print(f" [{bid}] '{name}' reason={reason}")

        if dry_run:
            print("\n" + "=" * 60)
            print("DRY RUN complete. Run with --execute to apply.")
            print("=" * 60)
            return

        _apply_plan(conn, to_merge, to_delete, brand_name)
        print(f"\nDone. Merged {len(to_merge)}, deleted {len(to_delete)}.")
|
||||
|
||||
|
||||
# Run the cleanup only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user