- Cleaned 137+ fake engine-displacement models from supplier imports (v3/v4 scripts: Chevrolet, Ford, Chrysler, Dodge, Jeep, Nissan, etc.) - Removed 1,251+ corrupted models (INT. prefixes, year-suffix, torque specs, empty names, trailing-year variants) - Migrated supplier tables to master DB (supplier_catalog, supplier_catalog_compat, supplier_catalog_interchange) - Fixed _get_mye_ids_with_parts() to query supplier_catalog_compat from master DB so supplier-only vehicles appear for all tenants - Added fuzzy model matcher with parenthesis stripping, noise suffix removal, compact matching, prefix/substring fallback, model aliases, and ±3 year proximity - Matched compat rows: KEEP GREEN +14,152, KNADIAN +3,021, VAZLO +127,500, LUK +477, RAYBESTOS +1,743 - Added KNADIAN catalog importer with year-range expansion and future-year filtering - Added VAZLO catalog importer with position parsing and SKU-in-model cleanup - Added Keep Green, LUK, Yokomitsu, Raybestos catalog importers - Cache clearing after cleanups (_classify_cache_*, nexus:mye_ids:*, nexus:brand_mye_counts:*) Final match rates: - KEEP GREEN: 90.3% - VAZLO: 93.6% - YOKOMITSU: 100.0% - KNADIAN: 57.4% - LUK: 51.0% - RAYBESTOS: 55.9%
241 lines
8.6 KiB
Python
Executable File
241 lines
8.6 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
Clean supplier-corrupted models from master DB.
Handles trailing years, year ranges, engine specs, trim variants, etc.

Without --execute the script runs as a dry run and only prints the plan.

Usage:
    python scripts/clean_supplier_corrupted_models.py [--execute]
"""
|
|
|
|
import os
|
|
import re
|
|
import sys
|
|
|
|
import psycopg2
|
|
|
|
# Connection string for the master DB; the MASTER_DB_URL environment variable
# takes precedence over the local default.
MASTER_DB_URL = os.getenv('MASTER_DB_URL', 'postgresql://postgres@localhost/nexus_autoparts')
|
|
|
|
|
|
def connect():
    """Open and return a new psycopg2 connection using MASTER_DB_URL."""
    dsn = MASTER_DB_URL
    return psycopg2.connect(dsn)
|
|
|
|
|
|
def delete_model_and_myes(conn, model_id):
    """Delete a model and every model_year_engine (MYE) row that belongs to it.

    For the model's MYEs, in order:
      1. supplier_catalog_compat rows pointing at them are kept but unlinked
         (model_year_engine_id set to NULL),
      2. vin_cache rows pointing at them are deleted,
      3. the MYE rows themselves are deleted.
    Finally the models row is deleted.

    Nothing is committed here; the caller owns the transaction.

    Args:
        conn: open database connection (created by psycopg2 in this script).
        model_id: models.id_model of the model to remove.
    """
    # The context manager closes the cursor even when a statement raises.
    with conn.cursor() as cur:
        cur.execute("SELECT id_mye FROM model_year_engine WHERE model_id = %s", (model_id,))
        mye_ids = [row[0] for row in cur.fetchall()]
        if mye_ids:
            cur.execute("UPDATE supplier_catalog_compat SET model_year_engine_id = NULL WHERE model_year_engine_id = ANY(%s)", (mye_ids,))
            cur.execute("DELETE FROM vin_cache WHERE model_year_engine_id = ANY(%s)", (mye_ids,))
            cur.execute("DELETE FROM model_year_engine WHERE id_mye = ANY(%s)", (mye_ids,))
        # The model row goes away whether or not it had any MYEs.
        cur.execute("DELETE FROM models WHERE id_model = %s", (model_id,))
|
|
|
|
|
|
def normalize_for_match(name):
    """Return ``name`` upper-cased with whitespace runs collapsed to one space.

    Leading and trailing whitespace is dropped. Hyphens are left untouched.
    Any falsy input (None, empty string, ...) yields an empty string.
    """
    if name:
        tokens = str(name).upper().split()
        return ' '.join(tokens)
    return ''
|
|
|
|
|
|
def find_base_model(cur, brand_id, base_name):
    """Find a model of the given brand whose name matches ``base_name``.

    The name is normalized with normalize_for_match() and then looked up in
    three progressively looser ways, returning the first hit:
      1. case-insensitive equality,
      2. equality after turning spaces into hyphens on both sides,
      3. equality after turning hyphens into spaces on both sides.

    Args:
        cur: open database cursor; it is executed on but not closed.
        brand_id: models.brand_id to search within.
        base_name: candidate base model name.

    Returns:
        An ``(id_model, name_model)`` row, or None when nothing matches or
        when ``base_name`` is empty.
    """
    # An empty name must never match: callers merge data into the result.
    normalized = normalize_for_match(base_name) if base_name else ''
    if not normalized:
        return None

    lookups = (
        ("LOWER(name_model) = LOWER(%s)", normalized),
        ("REPLACE(UPPER(name_model), ' ', '-') = %s", normalized.replace(' ', '-')),
        ("REPLACE(UPPER(name_model), '-', ' ') = %s", normalized.replace('-', ' ')),
    )
    for condition, candidate in lookups:
        # ORDER BY makes LIMIT 1 deterministic when several models match, so
        # a dry run and a later --execute run pick the same merge target.
        # `condition` is one of the constant fragments above, never user input.
        cur.execute(
            "SELECT id_model, name_model FROM models"
            " WHERE brand_id = %s AND " + condition +
            " ORDER BY id_model LIMIT 1",
            (brand_id, candidate),
        )
        row = cur.fetchone()
        if row:
            return row
    return None
|
|
|
|
|
|
def merge_model_to_base(conn, model_id, base_id, base_name):
    """Fold every model_year_engine (MYE) row of one model into a base model.

    For each MYE of ``model_id``:
      * if the base model already has a MYE with the same year and engine
        (NULL engines match each other), supplier_catalog_compat rows are
        re-pointed to that existing MYE, vin_cache rows for the duplicate
        are deleted, and the duplicate MYE is deleted;
      * otherwise the MYE row is re-parented to the base model.
    Afterwards the source models row is deleted.

    Nothing is committed here; the caller owns the transaction.

    Args:
        conn: open database connection (created by psycopg2 in this script).
        model_id: models.id_model of the model being merged away.
        base_id: models.id_model of the model that receives the data.
        base_name: name of the base model; unused here, kept for callers.

    Returns:
        Number of MYE rows re-parented to the base model. Merging a model
        into itself is a no-op that returns 0.
    """
    # Merging a model into itself would match every MYE as its own duplicate,
    # delete it, and then delete the model, so refuse to do anything.
    if model_id == base_id:
        return 0

    migrated = 0
    # The context manager closes the cursor even when a statement raises.
    with conn.cursor() as cur:
        cur.execute("SELECT id_mye, year_id, engine_id FROM model_year_engine WHERE model_id = %s", (model_id,))
        for mye_id, year_id, engine_id in cur.fetchall():
            cur.execute("""
                SELECT id_mye FROM model_year_engine
                WHERE model_id = %s AND year_id = %s
                AND (engine_id = %s OR (engine_id IS NULL AND %s IS NULL))
            """, (base_id, year_id, engine_id, engine_id))
            base_mye = cur.fetchone()
            if base_mye:
                base_mye_id = base_mye[0]
                cur.execute("UPDATE supplier_catalog_compat SET model_year_engine_id = %s WHERE model_year_engine_id = %s", (base_mye_id, mye_id))
                cur.execute("DELETE FROM vin_cache WHERE model_year_engine_id = %s", (mye_id,))
                cur.execute("DELETE FROM model_year_engine WHERE id_mye = %s", (mye_id,))
            else:
                cur.execute("UPDATE model_year_engine SET model_id = %s WHERE id_mye = %s", (base_id, mye_id))
                # NOTE(review): counts only re-parented rows; the reviewed
                # copy had lost its indentation, so confirm this placement.
                migrated += 1
        cur.execute("DELETE FROM models WHERE id_model = %s", (model_id,))
    return migrated
|
|
|
|
|
|
def extract_base_name(name, reason):
    """Strip the supplier-added suffix from a model name.

    Args:
        name: model name as stored, e.g. ``"CIVIC 2010"``.
        reason: which corruption pattern flagged the name. Handled values:
            ``'trailing_year'``      "<base> 19xx" or "<base> 20xx"
            ``'year_range_parens'``  "<base> NN-NN (..."
            ``'hasta_tas'``          "<base> [N.NL] NN-NN Tas." or
                                     "<base> hasta ..."

    Returns:
        The base part of the name, or None when ``name`` is empty, the
        reason is not one of the above, or the name does not fit the
        expected shape.
    """
    if not name:
        return None
    n = name.strip()

    if reason == 'trailing_year':
        candidates = [(r'^(.*?)\s+(19|20)\d{2}$', 0)]
    elif reason == 'year_range_parens':
        candidates = [(r'^(.*?)\s+\d{2}-\d{2}\s*\(', 0)]
    elif reason == 'hasta_tas':
        # Both forms are matched case-insensitively because the detection
        # pattern for this reason is case-insensitive too; otherwise a name
        # such as "... 99-05 TAS." is flagged but never yields a base name.
        candidates = [
            (r'^(.*?)(?:\s+\d+\.\d+L)?\s+\d{2}-\d{2}\s+Tas\.', re.IGNORECASE),
            (r'^(.*?)\s+hasta', re.IGNORECASE),
        ]
    else:
        candidates = []

    for pattern, flags in candidates:
        m = re.search(pattern, n, flags)
        if m:
            # Never hand back an empty base name.
            return m.group(1).strip() or None
    return None
|
|
|
|
|
|
def main():
    """Detect supplier-corrupted models, print a plan, and optionally apply it.

    Runs as a dry run unless ``--execute`` is present in ``sys.argv``.

    Steps:
      1. Load all models and flag names matching one of the corruption
         patterns (trailing year, year range, engine spec, ...).
      2. Flag "<base><trim>" names whose base exists in the same brand.
      3. Split flagged models into merges (a base model was found) and
         deletes (engine-only names, or no base model found).
      4. Print the plan; with ``--execute`` apply the merges, commit, apply
         the deletes, and commit again.

    The cursor and connection are closed on every exit path, including
    errors; closing without a commit discards the uncommitted work.
    """
    dry_run = '--execute' not in sys.argv
    if dry_run:
        print("=" * 60)
        print("DRY RUN MODE — no changes will be made")
        print("Run with --execute to apply changes")
        print("=" * 60)

    conn = connect()
    try:
        with conn.cursor() as cur:
            brand_names = {}

            def brand_name(brand_id):
                """Return the brand's name (one query per brand); None if missing."""
                if brand_id not in brand_names:
                    cur.execute('SELECT name_brand FROM brands WHERE id_brand=%s', (brand_id,))
                    row = cur.fetchone()
                    brand_names[brand_id] = row[0] if row else None
                return brand_names[brand_id]

            cur.execute('SELECT id_model, name_model, brand_id FROM models')
            models = cur.fetchall()

            # reason -> (detection regex, optional extra check(name, brand_id)).
            # A model is flagged with the first reason whose regex matches and
            # whose extra check (if any) passes.
            patterns = {
                # McLaren models are exempt from the trailing-year rule.
                'trailing_year': (re.compile(r' (19|20)\d{2}$'),
                                  lambda name, bid: brand_name(bid) != 'MCLAREN'),
                'year_range_parens': (re.compile(r'[A-Za-z]+ \d{2}-\d{2} \('), None),
                'engine_spec': (re.compile(r',?\s*\(\d+ HP\)|DOHC|SOHC|Valv\.|Turbo L4|L4,\s*\(', re.IGNORECASE), None),
                'hasta_tas': (re.compile(r'hasta|Tas\.', re.IGNORECASE), None),
                'engine_only': (re.compile(r'^\d+\.\d+L$', re.IGNORECASE), None),
                # Engine-config names ending in a "(CODE)" suffix are left alone.
                'engine_config': (re.compile(r'^\d+\.\d+L\s+(?:L\d|V\d|R\s|Turbo|TDI|GSI)', re.IGNORECASE),
                                  lambda name, bid: not re.search(r'\([A-Z0-9_]{3,}\)$', name)),
            }

            suspicious = []
            for mid, name, bid in models:
                if not name:
                    continue
                for reason, (pat, extra_check) in patterns.items():
                    if not pat.search(name):
                        continue
                    # NOTE(review): a failed extra check falls through to the
                    # next pattern; the reviewed copy had lost its indentation,
                    # so confirm the break belongs inside this branch.
                    if extra_check is None or extra_check(name, bid):
                        suspicious.append((bid, name, mid, reason))
                        break

            # Trim variant detection: "500 POP", "FIESTA SE", etc.
            # NOTE(review): `\s*` also accepts a suffix with no space before it
            # (e.g. "RSX" parses as "R" + "SX"); such a name is only merged when
            # that base model really exists in the same brand.
            trim_variants = ['LOUNGE', 'POP', 'SPORT', 'ADVENTURE', 'FIRE', 'GT', 'GTV', 'STD', 'SE', 'LE', 'XLE', 'LIMITED', 'LX', 'EX', 'SX']
            trim_pattern = re.compile(r'^(\S+?)\s*(' + '|'.join(trim_variants) + r')$')

            # Set lookup instead of rescanning `suspicious` for every model.
            flagged_ids = {mid for _bid, _name, mid, _reason in suspicious}
            trim_matches = []
            for mid, name, bid in models:
                if not name or mid in flagged_ids:
                    continue
                m = trim_pattern.match(name.upper())
                if m:
                    base_model = find_base_model(cur, bid, m.group(1))
                    if base_model:
                        trim_matches.append((bid, name, mid, 'trim_variant', base_model[0], base_model[1]))

            print(f"\nFound {len(suspicious)} suspicious models by pattern")
            print(f"Found {len(trim_matches)} trim variant models")

            to_merge = []
            to_delete = []
            for bid, name, mid, reason in suspicious:
                base = None
                # Engine-only names carry no model name to merge into.
                if reason not in ('engine_spec', 'engine_only', 'engine_config'):
                    base_name = extract_base_name(name, reason)
                    if base_name:
                        base = find_base_model(cur, bid, base_name)
                if base:
                    to_merge.append((bid, name, mid, reason, base[0], base[1]))
                else:
                    to_delete.append((bid, name, mid, reason))

            # Add trim matches to merge list
            to_merge.extend(trim_matches)

            print(f"\nTo merge: {len(to_merge)}")
            for bid, name, _mid, reason, _base_id, base_name in to_merge:
                print(f" [{bid}] '{name}' -> '{base_name}' (reason={reason})")

            print(f"\nTo delete: {len(to_delete)}")
            for bid, name, _mid, reason in to_delete:
                print(f" [{bid}] '{name}' reason={reason}")

            if dry_run:
                print("\n" + "=" * 60)
                print("DRY RUN complete. Run with --execute to apply.")
                print("=" * 60)
                return

            print("\nApplying merges...")
            for bid, name, mid, _reason, base_id, base_name in to_merge:
                bname = brand_name(bid)
                migrated = merge_model_to_base(conn, mid, base_id, base_name)
                print(f" [{bname}] '{name}' -> '{base_name}' ({migrated} MYEs migrated)")
            conn.commit()

            print("\nApplying deletes...")
            for bid, name, mid, _reason in to_delete:
                bname = brand_name(bid)
                delete_model_and_myes(conn, mid)
                print(f" [{bname}] '{name}' deleted")
            conn.commit()

            print(f"\nDone. Merged {len(to_merge)}, deleted {len(to_delete)}.")
    finally:
        conn.close()
|
|
|
|
|
|
# Run the cleanup only when this file is executed as a script.
if __name__ == "__main__":
    main()
|