Files
Autoparts-DB/scripts/clean_supplier_corrupted_models.py
consultoria-as ea29cc31c0 feat(catalog): supplier catalog cleanup, fuzzy matching, and navigation fixes
- Cleaned 137+ fake engine-displacement models from supplier imports
  (v3/v4 scripts: Chevrolet, Ford, Chrysler, Dodge, Jeep, Nissan, etc.)
- Removed 1,251+ corrupted models (INT. prefixes, year-suffix, torque specs,
  empty names, trailing-year variants)
- Migrated supplier tables to master DB (supplier_catalog,
  supplier_catalog_compat, supplier_catalog_interchange)
- Fixed _get_mye_ids_with_parts() to query supplier_catalog_compat from
  master DB so supplier-only vehicles appear for all tenants
- Added fuzzy model matcher with parenthesis stripping, noise suffix removal,
  compact matching, prefix/substring fallback, model aliases, and ±3 year
  proximity
- Matched compat rows: KEEP GREEN +14,152, KNADIAN +3,021, VAZLO +127,500,
  LUK +477, RAYBESTOS +1,743
- Added KNADIAN catalog importer with year-range expansion and future-year
  filtering
- Added VAZLO catalog importer with position parsing and SKU-in-model cleanup
- Added Keep Green, LUK, Yokomitsu, Raybestos catalog importers
- Cache clearing after cleanups (_classify_cache_*, nexus:mye_ids:*,
  nexus:brand_mye_counts:*)

Final match rates:
- KEEP GREEN: 90.3%
- VAZLO: 93.6%
- YOKOMITSU: 100.0%
- KNADIAN: 57.4%
- LUK: 51.0%
- RAYBESTOS: 55.9%
2026-06-09 07:47:42 +00:00

241 lines
8.6 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Clean supplier-corrupted models from master DB.

Handles trailing years, year ranges, engine specs, trim variants, etc.
Runs as a dry run (report only) unless --execute is passed.

Usage:
    python scripts/clean_supplier_corrupted_models.py [--execute]
"""
import os
import re
import sys
import psycopg2
# Connection URL for the master database. Read from the MASTER_DB_URL
# environment variable, falling back to a local nexus_autoparts database.
MASTER_DB_URL = os.environ.get('MASTER_DB_URL', 'postgresql://postgres@localhost/nexus_autoparts')
def connect():
    """Open and return a new psycopg2 connection to ``MASTER_DB_URL``."""
    connection = psycopg2.connect(MASTER_DB_URL)
    return connection
def delete_model_and_myes(conn, model_id):
    """Delete a model and every model_year_engine (MYE) row attached to it.

    Before the MYE rows are removed, ``supplier_catalog_compat`` rows that
    reference them are detached (``model_year_engine_id`` set to NULL) and the
    matching ``vin_cache`` rows are deleted.

    Nothing is committed here; the caller owns the transaction.

    Args:
        conn: Open DB-API connection (psycopg2 in this script).
        model_id: ``models.id_model`` of the model to remove.
    """
    cur = conn.cursor()
    try:
        cur.execute("SELECT id_mye FROM model_year_engine WHERE model_id = %s", (model_id,))
        mye_ids = [row[0] for row in cur.fetchall()]
        # Skip the dependent-row statements entirely when the model has no MYEs.
        if mye_ids:
            cur.execute("UPDATE supplier_catalog_compat SET model_year_engine_id = NULL WHERE model_year_engine_id = ANY(%s)", (mye_ids,))
            cur.execute("DELETE FROM vin_cache WHERE model_year_engine_id = ANY(%s)", (mye_ids,))
            cur.execute("DELETE FROM model_year_engine WHERE id_mye = ANY(%s)", (mye_ids,))
        cur.execute("DELETE FROM models WHERE id_model = %s", (model_id,))
    finally:
        # Release the cursor even when one of the statements raises.
        cur.close()
def normalize_for_match(name):
    """Return *name* upper-cased with every whitespace run collapsed to one space.

    Leading and trailing whitespace is dropped. Hyphens are left untouched.
    Any falsy input (``None``, ``''``, ``0``) yields an empty string;
    non-string input is converted with ``str()`` first.
    """
    if name:
        words = str(name).upper().split()
        return ' '.join(words)
    return ''
def find_base_model(cur, brand_id, base_name):
    """Look up a model of *brand_id* whose name matches *base_name*.

    The name is normalized with ``normalize_for_match`` and three lookups are
    tried in order, stopping at the first one that returns a row:

    1. case-insensitive equality,
    2. equality after turning spaces into hyphens on both sides,
    3. equality after turning hyphens into spaces on both sides.

    Args:
        cur: Open DB-API cursor.
        brand_id: ``models.brand_id`` to restrict the search to.
        base_name: Candidate base model name.

    Returns:
        The ``(id_model, name_model)`` row of the first match, otherwise the
        result of the last ``cur.fetchone()`` call.
    """
    wanted = normalize_for_match(base_name)
    select_head = "\nSELECT id_model, name_model FROM models\n"
    select_tail = "\nLIMIT 1\n"
    # (WHERE clause, value compared against) for each attempt, in priority order.
    attempts = (
        ("WHERE brand_id = %s AND LOWER(name_model) = LOWER(%s)",
         wanted),
        ("WHERE brand_id = %s AND REPLACE(UPPER(name_model), ' ', '-') = %s",
         wanted.replace(' ', '-')),
        ("WHERE brand_id = %s AND REPLACE(UPPER(name_model), '-', ' ') = %s",
         wanted.replace('-', ' ')),
    )
    found = None
    for where_clause, candidate in attempts:
        cur.execute(select_head + where_clause + select_tail, (brand_id, candidate))
        found = cur.fetchone()
        if found:
            break
    return found
def merge_model_to_base(conn, model_id, base_id, base_name):
    """Fold a corrupted model into its base model and delete the corrupted one.

    Each model_year_engine (MYE) row of ``model_id`` is handled as follows:

    * If the base model already has an MYE with the same year and engine
      (NULL engines compare equal), ``supplier_catalog_compat`` rows are
      repointed to that base MYE, ``vin_cache`` rows for the old MYE are
      deleted, and the old MYE is deleted.
    * Otherwise the MYE row itself is re-parented onto the base model.

    Finally the corrupted model row is deleted. Nothing is committed here; the
    caller owns the transaction.

    Args:
        conn: Open DB-API connection (psycopg2 in this script).
        model_id: ``models.id_model`` of the model being merged away.
        base_id: ``models.id_model`` of the model that receives the data.
        base_name: Unused; kept so existing callers keep working.

    Returns:
        Number of MYE rows re-parented onto the base model.

    Raises:
        ValueError: If ``model_id`` equals ``base_id``. Merging a model into
            itself would delete all of its MYE rows and then the model.
    """
    if model_id == base_id:
        raise ValueError(f"cannot merge model {model_id} into itself")

    cur = conn.cursor()
    try:
        cur.execute("SELECT id_mye, year_id, engine_id FROM model_year_engine WHERE model_id = %s", (model_id,))
        myes = cur.fetchall()
        migrated = 0
        for mye_id, year_id, engine_id in myes:
            # engine_id is passed twice so a NULL engine matches a NULL engine.
            cur.execute("""
                SELECT id_mye FROM model_year_engine
                WHERE model_id = %s AND year_id = %s
                AND (engine_id = %s OR (engine_id IS NULL AND %s IS NULL))
            """, (base_id, year_id, engine_id, engine_id))
            base_mye = cur.fetchone()
            if base_mye:
                base_mye_id = base_mye[0]
                cur.execute("UPDATE supplier_catalog_compat SET model_year_engine_id = %s WHERE model_year_engine_id = %s", (base_mye_id, mye_id))
                cur.execute("DELETE FROM vin_cache WHERE model_year_engine_id = %s", (mye_id,))
                cur.execute("DELETE FROM model_year_engine WHERE id_mye = %s", (mye_id,))
            else:
                cur.execute("UPDATE model_year_engine SET model_id = %s WHERE id_mye = %s", (base_id, mye_id))
                # NOTE(review): counts re-parented rows only; confirm that
                # collapsed duplicates are not meant to be counted as well.
                migrated += 1
        cur.execute("DELETE FROM models WHERE id_model = %s", (model_id,))
    finally:
        # Release the cursor even when one of the statements raises.
        cur.close()
    return migrated
def extract_base_name(name, reason):
    """Return the base model name hidden inside a corrupted model name.

    Args:
        name: Corrupted model name; surrounding whitespace is ignored.
        reason: Which corruption pattern flagged the name:

            * ``'trailing_year'`` -- text before a trailing 19xx/20xx year.
            * ``'year_range_parens'`` -- text before ``NN-NN (``.
            * ``'hasta_tas'`` -- text before ``[<d.d>L] NN-NN Tas.`` when that
              shape is present, otherwise text before the word ``hasta``.

    Returns:
        The stripped base name, or ``None`` when *reason* is not one of the
        values above or the name does not have the expected shape.
    """
    n = name.strip()
    if reason == 'trailing_year':
        m = re.search(r'^(.*?)\s+(19|20)\d{2}$', n)
    elif reason == 'year_range_parens':
        m = re.search(r'^(.*?)\s+\d{2}-\d{2}\s*\(', n)
    elif reason == 'hasta_tas':
        # Names are flagged for this reason case-insensitively, so extraction
        # must be case-insensitive too; otherwise "... TAS." names yield no
        # base and end up deleted instead of merged.
        m = (
            re.search(r'^(.*?)(?:\s+\d+\.\d+L)?\s+\d{2}-\d{2}\s+Tas\.', n, re.IGNORECASE)
            or re.search(r'^(.*?)\s+hasta', n, re.IGNORECASE)
        )
    else:
        m = None
    return m.group(1).strip() if m else None
def _load_brand_names(cur):
    """Return a ``{id_brand: name_brand}`` dict for every row in ``brands``."""
    cur.execute('SELECT id_brand, name_brand FROM brands')
    return dict(cur.fetchall())


def _flag_suspicious(models, brand_names):
    """Return ``(brand_id, name, model_id, reason)`` for names matching a corruption pattern.

    Each model is reported at most once, under the first pattern (in
    declaration order) that matches and passes its extra check.
    """
    # reason -> (regex, optional extra predicate). The trailing_year predicate
    # receives the brand name; every other predicate receives the model name.
    patterns = {
        # McLaren is exempt from the trailing-year rule; compare the brand
        # case-insensitively so the exemption does not depend on stored casing.
        'trailing_year': (re.compile(r' (19|20)\d{2}$'), lambda b: (b or '').upper() != 'MCLAREN'),
        'year_range_parens': (re.compile(r'[A-Za-z]+ \d{2}-\d{2} \('), None),
        'engine_spec': (re.compile(r',?\s*\(\d+ HP\)|DOHC|SOHC|Valv\.|Turbo L4|L4,\s*\(', re.IGNORECASE), None),
        'hasta_tas': (re.compile(r'hasta|Tas\.', re.IGNORECASE), None),
        'engine_only': (re.compile(r'^\d+\.\d+L$', re.IGNORECASE), None),
        'engine_config': (re.compile(r'^\d+\.\d+L\s+(?:L\d|V\d|R\s|Turbo|TDI|GSI)', re.IGNORECASE),
                          lambda n: not re.search(r'\([A-Z0-9_]{3,}\)$', n)),
    }
    suspicious = []
    for mid, name, bid in models:
        if not name:
            continue
        for reason, (pat, extra_check) in patterns.items():
            if not pat.search(name):
                continue
            if extra_check is not None:
                subject = brand_names.get(bid) if reason == 'trailing_year' else name
                if not extra_check(subject):
                    continue
            suspicious.append((bid, name, mid, reason))
            break
    return suspicious


def _find_trim_variants(cur, models, flagged_ids):
    """Return merge entries for names like "500 POP" whose base model exists.

    Models whose id is in *flagged_ids* are skipped. Each entry is
    ``(brand_id, name, model_id, 'trim_variant', base_id, base_name)``.
    """
    trim_variants = ['LOUNGE', 'POP', 'SPORT', 'ADVENTURE', 'FIRE', 'GT', 'GTV', 'STD', 'SE', 'LE', 'XLE', 'LIMITED', 'LX', 'EX', 'SX']
    trim_pattern = re.compile(r'^(\S+?)\s*(' + '|'.join(trim_variants) + r')$')
    trim_matches = []
    for mid, name, bid in models:
        if not name or mid in flagged_ids:
            continue
        m = trim_pattern.match(name.upper())
        if not m:
            continue
        base_model = find_base_model(cur, bid, m.group(1))
        if base_model:
            trim_matches.append((bid, name, mid, 'trim_variant', base_model[0], base_model[1]))
    return trim_matches


def _plan_actions(cur, suspicious):
    """Split flagged models into ``(to_merge, to_delete)`` lists.

    Engine-only names are always deleted. Other names are merged when a base
    name can be extracted and that base model exists, otherwise deleted.
    """
    to_merge = []
    to_delete = []
    for bid, name, mid, reason in suspicious:
        if reason not in ('engine_spec', 'engine_only', 'engine_config'):
            base_name = extract_base_name(name, reason)
            base = find_base_model(cur, bid, base_name) if base_name else None
            if base:
                to_merge.append((bid, name, mid, reason, base[0], base[1]))
                continue
        to_delete.append((bid, name, mid, reason))
    return to_merge, to_delete


def _apply_changes(conn, to_merge, to_delete, brand_names):
    """Run the planned merges, commit, then run the planned deletes and commit."""
    print("\nApplying merges...")
    for bid, name, mid, reason, base_id, base_name in to_merge:
        migrated = merge_model_to_base(conn, mid, base_id, base_name)
        print(f" [{brand_names.get(bid)}] '{name}' -> '{base_name}' ({migrated} MYEs migrated)")
    conn.commit()
    print("\nApplying deletes...")
    for bid, name, mid, reason in to_delete:
        delete_model_and_myes(conn, mid)
        print(f" [{brand_names.get(bid)}] '{name}' deleted")
    conn.commit()


def main():
    """Report, and with ``--execute`` apply, the cleanup of corrupted models.

    Reads every model, flags corrupted names, decides per model whether to
    merge it into a base model or delete it, and prints the plan. Without
    ``--execute`` on the command line nothing is modified.
    """
    dry_run = '--execute' not in sys.argv
    if dry_run:
        print("=" * 60)
        print("DRY RUN MODE — no changes will be made")
        print("Run with --execute to apply changes")
        print("=" * 60)
    conn = connect()
    try:
        cur = conn.cursor()
        try:
            cur.execute('SELECT id_model, name_model, brand_id FROM models')
            models = cur.fetchall()
            # Load brand names once instead of issuing one query per row.
            brand_names = _load_brand_names(cur)

            suspicious = _flag_suspicious(models, brand_names)
            flagged_ids = {mid for _, _, mid, _ in suspicious}
            trim_matches = _find_trim_variants(cur, models, flagged_ids)
            print(f"\nFound {len(suspicious)} suspicious models by pattern")
            print(f"Found {len(trim_matches)} trim variant models")

            to_merge, to_delete = _plan_actions(cur, suspicious)
            # Trim variants go last so pattern-based merges are applied first.
            to_merge.extend(trim_matches)

            print(f"\nTo merge: {len(to_merge)}")
            for bid, name, mid, reason, base_id, base_name in to_merge:
                print(f" [{bid}] '{name}' -> '{base_name}' (reason={reason})")
            print(f"\nTo delete: {len(to_delete)}")
            for bid, name, mid, reason in to_delete:
                print(f" [{bid}] '{name}' reason={reason}")

            if dry_run:
                print("\n" + "=" * 60)
                print("DRY RUN complete. Run with --execute to apply.")
                print("=" * 60)
                return

            _apply_changes(conn, to_merge, to_delete, brand_names)
            print(f"\nDone. Merged {len(to_merge)}, deleted {len(to_delete)}.")
        finally:
            cur.close()
    finally:
        # Closing without a commit discards any half-applied step on error.
        conn.close()
# Run the cleanup only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()