#!/usr/bin/env python3
"""
Clean supplier-corrupted models from master DB.

Handles trailing years, year ranges, engine specs, trim variants, etc.
Runs as a dry run (report only) unless ``--execute`` is passed.

Usage:
    python scripts/clean_supplier_corrupted_models.py [--execute]
"""

import os
import re
import sys

MASTER_DB_URL = os.environ.get('MASTER_DB_URL', 'postgresql://postgres@localhost/nexus_autoparts')

# reason -> (detection regex, optional extra predicate).
# The predicate for 'trailing_year' receives the BRAND name; every other
# predicate receives the MODEL name. Dict order is the order in which the
# patterns are tried, and the first accepted reason wins.
SUSPICIOUS_PATTERNS = {
    # McLaren is exempted from the trailing-year rule; compared
    # case-insensitively so 'McLaren' and 'MCLAREN' are treated alike.
    # NOTE(review): presumably its model names legitimately end in a
    # year-like number -- confirm with the data owner.
    'trailing_year': (
        re.compile(r' (19|20)\d{2}$'),
        lambda b: (b or '').upper() != 'MCLAREN',
    ),
    'year_range_parens': (re.compile(r'[A-Za-z]+ \d{2}-\d{2} \('), None),
    'engine_spec': (
        re.compile(r',?\s*\(\d+ HP\)|DOHC|SOHC|Valv\.|Turbo L4|L4,\s*\(', re.IGNORECASE),
        None,
    ),
    # \b keeps names that merely contain the letters "hasta" (e.g. "SHASTA")
    # from being flagged -- they would yield no base name and be deleted.
    'hasta_tas': (re.compile(r'\bhasta|Tas\.', re.IGNORECASE), None),
    'engine_only': (re.compile(r'^\d+\.\d+L$', re.IGNORECASE), None),
    'engine_config': (
        re.compile(r'^\d+\.\d+L\s+(?:L\d|V\d|R\s|Turbo|TDI|GSI)', re.IGNORECASE),
        lambda n: not re.search(r'\([A-Z0-9_]{3,}\)$', n),
    ),
}

# Reasons for which no base name is extracted: such models are always deleted.
DELETE_ONLY_REASONS = ('engine_spec', 'engine_only', 'engine_config')

# Trim variant detection: "500 POP", "FIESTA SE", etc.
TRIM_VARIANTS = ['LOUNGE', 'POP', 'SPORT', 'ADVENTURE', 'FIRE', 'GT', 'GTV', 'STD',
                 'SE', 'LE', 'XLE', 'LIMITED', 'LX', 'EX', 'SX']
TRIM_PATTERN = re.compile(r'^(\S+?)\s*(' + '|'.join(TRIM_VARIANTS) + r')$')


def connect():
    """Open a new psycopg2 connection to ``MASTER_DB_URL``."""
    # Imported here rather than at module level so the pure name-parsing
    # helpers in this file can be imported without the PostgreSQL driver.
    import psycopg2

    return psycopg2.connect(MASTER_DB_URL)


def delete_model_and_myes(conn, model_id):
    """Delete a model together with its model_year_engine (MYE) rows.

    For every MYE of the model, supplier_catalog_compat references are set
    to NULL and vin_cache rows are deleted before the MYE rows and finally
    the model row are removed. Does not commit; the caller owns the
    transaction.
    """
    with conn.cursor() as cur:
        cur.execute("SELECT id_mye FROM model_year_engine WHERE model_id = %s", (model_id,))
        mye_ids = [row[0] for row in cur.fetchall()]
        if mye_ids:
            cur.execute(
                "UPDATE supplier_catalog_compat SET model_year_engine_id = NULL "
                "WHERE model_year_engine_id = ANY(%s)",
                (mye_ids,),
            )
            cur.execute("DELETE FROM vin_cache WHERE model_year_engine_id = ANY(%s)", (mye_ids,))
            cur.execute("DELETE FROM model_year_engine WHERE id_mye = ANY(%s)", (mye_ids,))
        cur.execute("DELETE FROM models WHERE id_model = %s", (model_id,))


def normalize_for_match(name):
    """Normalize a model name for matching.

    Uppercases the name and collapses every run of whitespace to a single
    space (leading/trailing whitespace is dropped). Returns ``''`` for a
    falsy input. Hyphen/space variants are handled by ``find_base_model``,
    not here.
    """
    if not name:
        return ''
    return ' '.join(str(name).upper().split())


def find_base_model(cur, brand_id, base_name):
    """Find a base model in the same brand by normalized name match.

    Tries, in order: a case-insensitive exact match, a match with spaces
    replaced by hyphens, and a match with hyphens replaced by spaces.

    Returns the first ``(id_model, name_model)`` row found, or ``None``.
    """
    normalized = normalize_for_match(base_name)

    # Try exact
    cur.execute("""
        SELECT id_model, name_model FROM models
        WHERE brand_id = %s AND LOWER(name_model) = LOWER(%s)
        LIMIT 1
    """, (brand_id, normalized))
    row = cur.fetchone()
    if row:
        return row

    # Try with spaces replaced by hyphens
    hyphenated = normalized.replace(' ', '-')
    cur.execute("""
        SELECT id_model, name_model FROM models
        WHERE brand_id = %s AND REPLACE(UPPER(name_model), ' ', '-') = %s
        LIMIT 1
    """, (brand_id, hyphenated))
    row = cur.fetchone()
    if row:
        return row

    # Try with hyphens replaced by spaces
    spaced = normalized.replace('-', ' ')
    cur.execute("""
        SELECT id_model, name_model FROM models
        WHERE brand_id = %s AND REPLACE(UPPER(name_model), '-', ' ') = %s
        LIMIT 1
    """, (brand_id, spaced))
    return cur.fetchone()


def merge_model_to_base(conn, model_id, base_id, base_name):
    """Fold every MYE row of ``model_id`` into ``base_id``, then delete the model.

    For each MYE of the source model:
      * if the base model already has an MYE with the same year and engine,
        supplier_catalog_compat rows are repointed to it and the source
        MYE (and its vin_cache rows) is deleted;
      * otherwise the source MYE is reassigned to the base model.

    ``base_name`` is accepted for interface compatibility and is not used.
    Does not commit; the caller owns the transaction.

    Returns the number of MYE rows processed (0 if nothing was done).
    """
    # Merging a model into itself would delete its own MYEs and then the
    # model; refuse instead of destroying data.
    if model_id == base_id:
        return 0

    migrated = 0
    with conn.cursor() as cur:
        cur.execute(
            "SELECT id_mye, year_id, engine_id FROM model_year_engine WHERE model_id = %s",
            (model_id,),
        )
        for mye_id, year_id, engine_id in cur.fetchall():
            # engine_id may be NULL, hence the explicit NULL-equals-NULL arm.
            cur.execute("""
                SELECT id_mye FROM model_year_engine
                WHERE model_id = %s AND year_id = %s
                  AND (engine_id = %s OR (engine_id IS NULL AND %s IS NULL))
            """, (base_id, year_id, engine_id, engine_id))
            base_mye = cur.fetchone()
            if base_mye:
                base_mye_id = base_mye[0]
                cur.execute(
                    "UPDATE supplier_catalog_compat SET model_year_engine_id = %s "
                    "WHERE model_year_engine_id = %s",
                    (base_mye_id, mye_id),
                )
                cur.execute("DELETE FROM vin_cache WHERE model_year_engine_id = %s", (mye_id,))
                cur.execute("DELETE FROM model_year_engine WHERE id_mye = %s", (mye_id,))
            else:
                cur.execute(
                    "UPDATE model_year_engine SET model_id = %s WHERE id_mye = %s",
                    (base_id, mye_id),
                )
            migrated += 1
        cur.execute("DELETE FROM models WHERE id_model = %s", (model_id,))
    return migrated


def extract_base_name(name, reason):
    """Strip the supplier-added suffix from ``name`` for the given ``reason``.

    Supported reasons are ``'trailing_year'``, ``'year_range_parens'`` and
    ``'hasta_tas'``. Returns the stripped base name, or ``None`` when the
    reason is not supported or the name does not have the expected shape.
    """
    n = name.strip()
    m = None
    if reason == 'trailing_year':
        m = re.search(r'^(.*?)\s+(19|20)\d{2}$', n)
    elif reason == 'year_range_parens':
        m = re.search(r'^(.*?)\s+\d{2}-\d{2}\s*\(', n)
    elif reason == 'hasta_tas':
        # Case-insensitive, matching the detection pattern: "… 99-05 TAS."
        # must extract the same base as "… 99-05 Tas.".
        m = re.search(r'^(.*?)(?:\s+\d+\.\d+L)?\s+\d{2}-\d{2}\s+Tas\.', n, re.IGNORECASE)
        if not m:
            m = re.search(r'^(.*?)\s+hasta', n, re.IGNORECASE)
    return m.group(1).strip() if m else None


def classify_model_name(name, get_brand_name):
    """Return the first suspicious-pattern reason that applies to ``name``.

    ``get_brand_name`` is a zero-argument callable returning the model's
    brand name (or ``None``); it is only invoked when the trailing-year
    pattern matches, since that is the only rule that depends on the brand.
    Returns ``None`` when no pattern applies.
    """
    for reason, (pattern, extra_check) in SUSPICIOUS_PATTERNS.items():
        if not pattern.search(name):
            continue
        if extra_check is None:
            return reason
        subject = get_brand_name() if reason == 'trailing_year' else name
        if extra_check(subject):
            return reason
    return None


def _make_brand_lookup(cur):
    """Return a cached ``brand_id -> name_brand`` lookup (``None`` if no such brand)."""
    cache = {}

    def lookup(brand_id):
        if brand_id not in cache:
            cur.execute('SELECT name_brand FROM brands WHERE id_brand=%s', (brand_id,))
            row = cur.fetchone()
            cache[brand_id] = row[0] if row else None
        return cache[brand_id]

    return lookup


def _find_suspicious_models(models, brand_name):
    """Return ``(brand_id, name, model_id, reason)`` for every pattern-flagged model."""
    suspicious = []
    for mid, name, bid in models:
        if not name:
            continue
        reason = classify_model_name(name, lambda bid=bid: brand_name(bid))
        if reason:
            suspicious.append((bid, name, mid, reason))
    return suspicious


def _find_trim_variants(cur, models, flagged_ids):
    """Return merge tuples for unflagged models that are a base model plus a trim suffix."""
    trim_matches = []
    for mid, name, bid in models:
        if not name or mid in flagged_ids:
            continue
        m = TRIM_PATTERN.match(name.upper())
        if not m:
            continue
        base_model = find_base_model(cur, bid, m.group(1))
        if base_model:
            trim_matches.append((bid, name, mid, 'trim_variant', base_model[0], base_model[1]))
    return trim_matches


def _plan_actions(cur, suspicious, trim_matches):
    """Split flagged models into ``(to_merge, to_delete)`` lists."""
    to_merge = []
    to_delete = []
    for bid, name, mid, reason in suspicious:
        if reason not in DELETE_ONLY_REASONS:
            base_name = extract_base_name(name, reason)
            base = find_base_model(cur, bid, base_name) if base_name else None
            if base:
                to_merge.append((bid, name, mid, reason, base[0], base[1]))
                continue
        # No usable base model: the corrupted entry is removed outright.
        to_delete.append((bid, name, mid, reason))
    # Add trim matches to merge list
    to_merge.extend(trim_matches)
    return to_merge, to_delete


def _apply_actions(conn, brand_name, to_merge, to_delete):
    """Execute the planned merges, commit, then the planned deletes, and commit."""
    print("\nApplying merges...")
    for bid, name, mid, _reason, base_id, base_name in to_merge:
        bname = brand_name(bid) or f"brand_id={bid}"
        migrated = merge_model_to_base(conn, mid, base_id, base_name)
        print(f" [{bname}] '{name}' -> '{base_name}' ({migrated} MYEs migrated)")
    conn.commit()

    print("\nApplying deletes...")
    for bid, name, mid, _reason in to_delete:
        bname = brand_name(bid) or f"brand_id={bid}"
        delete_model_and_myes(conn, mid)
        print(f" [{bname}] '{name}' deleted")
    conn.commit()


def main():
    """Report (and with ``--execute``, apply) the cleanup of corrupted models."""
    dry_run = '--execute' not in sys.argv
    if dry_run:
        print("=" * 60)
        print("DRY RUN MODE — no changes will be made")
        print("Run with --execute to apply changes")
        print("=" * 60)

    conn = connect()
    try:
        with conn.cursor() as cur:
            cur.execute('SELECT id_model, name_model, brand_id FROM models')
            models = cur.fetchall()

            brand_name = _make_brand_lookup(cur)
            suspicious = _find_suspicious_models(models, brand_name)
            flagged_ids = {mid for _bid, _name, mid, _reason in suspicious}
            trim_matches = _find_trim_variants(cur, models, flagged_ids)

            print(f"\nFound {len(suspicious)} suspicious models by pattern")
            print(f"Found {len(trim_matches)} trim variant models")

            to_merge, to_delete = _plan_actions(cur, suspicious, trim_matches)

            print(f"\nTo merge: {len(to_merge)}")
            for bid, name, mid, reason, base_id, base_name in to_merge:
                print(f" [{bid}] '{name}' -> '{base_name}' (reason={reason})")

            print(f"\nTo delete: {len(to_delete)}")
            for bid, name, mid, reason in to_delete:
                print(f" [{bid}] '{name}' reason={reason}")

            if dry_run:
                print("\n" + "=" * 60)
                print("DRY RUN complete. Run with --execute to apply.")
                print("=" * 60)
                return

            _apply_actions(conn, brand_name, to_merge, to_delete)
            print(f"\nDone. Merged {len(to_merge)}, deleted {len(to_delete)}.")
    finally:
        # Always release the connection, including when a step above raises.
        conn.close()


if __name__ == '__main__':
    main()