Files
Autoparts-DB/scripts/clean_fake_models.py
consultoria-as ea29cc31c0 feat(catalog): supplier catalog cleanup, fuzzy matching, and navigation fixes
- Cleaned 137+ fake engine-displacement models from supplier imports
  (v3/v4 scripts: Chevrolet, Ford, Chrysler, Dodge, Jeep, Nissan, etc.)
- Removed 1,251+ corrupted models (INT. prefixes, year-suffix, torque specs,
  empty names, trailing-year variants)
- Migrated supplier tables to master DB (supplier_catalog,
  supplier_catalog_compat, supplier_catalog_interchange)
- Fixed _get_mye_ids_with_parts() to query supplier_catalog_compat from
  master DB so supplier-only vehicles appear for all tenants
- Added fuzzy model matcher with parenthesis stripping, noise suffix removal,
  compact matching, prefix/substring fallback, model aliases, and ±3 year
  proximity
- Matched compat rows: KEEP GREEN +14,152, KNADIAN +3,021, VAZLO +127,500,
  LUK +477, RAYBESTOS +1,743
- Added KNADIAN catalog importer with year-range expansion and future-year
  filtering
- Added VAZLO catalog importer with position parsing and SKU-in-model cleanup
- Added Keep Green, LUK, Yokomitsu, Raybestos catalog importers
- Cache clearing after cleanups (_classify_cache_*, nexus:mye_ids:*,
  nexus:brand_mye_counts:*)

Final match rates:
- KEEP GREEN: 90.3%
- VAZLO: 93.6%
- YOKOMITSU: 100.0%
- KNADIAN: 57.4%
- LUK: 51.0%
- RAYBESTOS: 55.9%
2026-06-09 07:47:42 +00:00

275 lines
11 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Clean fake/corrupted models from master DB caused by supplier catalog imports.
Handles:
- Models ending in ' INT.' -> map to base model
- Empty-name models -> delete or merge
- Year-range models (09-15, etc.) -> delete
- Torque-spec models ((60 Nm+90°), etc.) -> delete
"""
import sys
import re
import psycopg2
from collections import defaultdict
# Default libpq connection string for the master database.
# NOTE(review): this embeds a plaintext password in source control; prefer
# supplying the DSN through the environment (see get_connection) and rotating
# this credential.
MASTER_DSN = "host=localhost dbname=nexus_autoparts user=postgres password=1123517"


def get_connection(dsn=None):
    """Open a new psycopg2 connection to the master database.

    Args:
        dsn: Optional libpq connection string. When omitted (or empty), the
            ``NEXUS_MASTER_DSN`` environment variable is used if it is set,
            otherwise the module-level ``MASTER_DSN`` default.

    Returns:
        A new psycopg2 connection. The caller is responsible for closing it.
    """
    import os  # local import so this block does not depend on file-level edits

    return psycopg2.connect(dsn or os.environ.get("NEXUS_MASTER_DSN") or MASTER_DSN)
def delete_model_and_myes(conn, model_id, dry_run=True):
    """Delete all MYEs (model_year_engine rows) for a model, then the model itself.

    The function never commits; the caller decides when to commit or roll back.

    Args:
        conn: Open DB-API connection to the master database.
        model_id: ``models.id_model`` of the model to remove.
        dry_run: When true (the default), only report what would be deleted;
            the single SELECT is the only statement executed.
    """
    # Report honestly: "Would delete" only when nothing is actually changed.
    verb = "Would delete" if dry_run else "Deleting"
    cur = conn.cursor()
    try:
        cur.execute("SELECT id_mye FROM model_year_engine WHERE model_id = %s", (model_id,))
        mye_ids = [row[0] for row in cur.fetchall()]
        if mye_ids:
            print(f" {verb} {len(mye_ids)} MYEs for model {model_id}")
            if not dry_run:
                # supplier_catalog_compat has no FK, just update to null
                cur.execute(
                    "UPDATE supplier_catalog_compat SET model_year_engine_id = NULL "
                    "WHERE model_year_engine_id = ANY(%s)",
                    (mye_ids,),
                )
                cur.execute("DELETE FROM vin_cache WHERE model_year_engine_id = ANY(%s)", (mye_ids,))
                cur.execute("DELETE FROM model_year_engine WHERE id_mye = ANY(%s)", (mye_ids,))
        else:
            print(f" No MYEs for model {model_id}")
        print(f" {verb} model {model_id}")
        if not dry_run:
            cur.execute("DELETE FROM models WHERE id_model = %s", (model_id,))
    finally:
        # Release the cursor even when one of the statements raises.
        cur.close()
# Suffix that marks a supplier-imported duplicate of a base model.
_INT_SUFFIX = " INT."


def _merge_int_myes(cur, int_model_id, base_id, dry_run):
    """Fold the MYEs of one 'X INT.' model into its base model; return how many were handled."""
    cur.execute("""
        SELECT id_mye, year_id, engine_id FROM model_year_engine
        WHERE model_id = %s
    """, (int_model_id,))
    handled = 0
    for mye_id, year_id, engine_id in cur.fetchall():
        # Find equivalent MYE (same year and engine, NULL engine included) in base model
        cur.execute("""
            SELECT id_mye FROM model_year_engine
            WHERE model_id = %s AND year_id = %s
              AND (engine_id = %s OR (engine_id IS NULL AND %s IS NULL))
        """, (base_id, year_id, engine_id, engine_id))
        twin = cur.fetchone()
        handled += 1
        if dry_run:
            # Dry run only counts; every mutating statement below is skipped.
            continue
        if twin:
            # Base already has this year/engine: repoint supplier rows, drop the duplicate.
            cur.execute("""
                UPDATE supplier_catalog_compat
                SET model_year_engine_id = %s
                WHERE model_year_engine_id = %s
            """, (twin[0], mye_id))
            cur.execute("DELETE FROM vin_cache WHERE model_year_engine_id = %s", (mye_id,))
            cur.execute("DELETE FROM model_year_engine WHERE id_mye = %s", (mye_id,))
        else:
            # No equivalent: move the MYE itself under the base model.
            cur.execute(
                "UPDATE model_year_engine SET model_id = %s WHERE id_mye = %s",
                (base_id, mye_id),
            )
    return handled


def merge_int_models(conn, dry_run=True):
    """Merge 'X INT.' models into their base equivalents.

    For each model whose name ends in ' INT.':
      * if a model with the base name (case-insensitive) exists in the same
        brand, its MYEs are merged into that model and the INT. model is
        deleted;
      * otherwise the INT. model is renamed to the base name.
    Names matched by the SQL filter that do not end in the exact ' INT.'
    suffix (or that would leave an empty base name) are skipped untouched.

    Args:
        conn: Open DB-API connection to the master database.
        dry_run: When true (the default), only SELECTs are executed and the
            planned actions are printed. Otherwise changes are committed
            once per model.

    Returns:
        Tuple ``(merged, renamed, skipped)`` of model counts.
    """
    cur = conn.cursor()
    try:
        cur.execute("""
            SELECT m.id_model, m.name_model, m.brand_id, b.name_brand
            FROM models m
            JOIN brands b ON b.id_brand = m.brand_id
            WHERE m.name_model LIKE '%INT.'
            ORDER BY m.brand_id, m.name_model
        """)
        int_models = cur.fetchall()
        print(f"Found {len(int_models)} INT. models to process")
        merged = renamed = skipped = 0
        for model_id, name_model, brand_id, brand_name in int_models:
            # The LIKE filter also matches names such as 'SPRINT.'; only strip
            # the suffix when it is really the ' INT.' marker.
            if name_model.endswith(_INT_SUFFIX):
                base_name = name_model[:-len(_INT_SUFFIX)].strip()
            else:
                base_name = ""
            if not base_name:
                print(f"[{brand_name}] {name_model!r} -> SKIP (not a '<base> INT.' name)")
                skipped += 1
                continue

            # Find base model (case-insensitive)
            cur.execute("""
                SELECT id_model, name_model FROM models
                WHERE brand_id = %s AND LOWER(name_model) = LOWER(%s)
                LIMIT 1
            """, (brand_id, base_name))
            base = cur.fetchone()
            if not base:
                # No base exists: rename this model to base name
                print(f"[{brand_name}] {name_model} -> RENAME to '{base_name}' (no base found)")
                if not dry_run:
                    cur.execute(
                        "UPDATE models SET name_model = %s WHERE id_model = %s",
                        (base_name, model_id),
                    )
                    conn.commit()
                renamed += 1
                continue

            base_id, base_name_exact = base
            print(f"[{brand_name}] {name_model} -> {base_name_exact} (id={base_id})")
            mye_migrated = _merge_int_myes(cur, model_id, base_id, dry_run)
            if dry_run:
                print(f" Would migrate {mye_migrated} MYEs and delete model")
            else:
                # The INT model has no MYEs left at this point.
                cur.execute("DELETE FROM models WHERE id_model = %s", (model_id,))
                conn.commit()
                print(f" Migrated {mye_migrated} MYEs, deleted model")
            merged += 1
        print(f"\nINT. summary: merged={merged}, renamed={renamed}, skipped={skipped}")
    finally:
        cur.close()
    return merged, renamed, skipped
def clean_empty_models(conn, dry_run=True):
    """Delete models whose name is NULL or blank, clearing their MYEs first.

    For every MYE of such a model the function prints up to three candidate
    models of the same brand that already carry the same year/engine combo
    (the empty model itself and other blank-named models are excluded, so the
    list only shows real remap targets). The candidates are informational:
    supplier_catalog_compat rows are detached (set to NULL) for manual
    remapping, never repointed automatically.

    Args:
        conn: Open DB-API connection to the master database.
        dry_run: When true (the default), only SELECTs are executed.
            Otherwise each model is cleaned and committed as one unit.

    Returns:
        Number of empty-name models deleted (or that would be deleted in a
        dry run).
    """
    cur = conn.cursor()
    try:
        cur.execute("""
            SELECT m.id_model, m.name_model, m.brand_id, b.name_brand,
                   (SELECT COUNT(*) FROM model_year_engine mye WHERE mye.model_id = m.id_model) as mye_count
            FROM models m
            JOIN brands b ON b.id_brand = m.brand_id
            WHERE m.name_model IS NULL OR TRIM(m.name_model) = ''
            ORDER BY mye_count DESC
        """)
        empty_models = cur.fetchall()
        print(f"\nFound {len(empty_models)} empty-name models")
        deleted = 0
        for model_id, _name_model, brand_id, brand_name, mye_count in empty_models:
            print(f"[{brand_name}] empty model id={model_id}, MYEs={mye_count}")
            if mye_count == 0:
                print(" -> Safe to delete (no MYEs)")
            else:
                cur.execute("""
                    SELECT mye.id_mye, mye.year_id, mye.engine_id, y.year_car, e.name_engine
                    FROM model_year_engine mye
                    LEFT JOIN years y ON y.id_year = mye.year_id
                    LEFT JOIN engines e ON e.id_engine = mye.engine_id
                    WHERE mye.model_id = %s
                """, (model_id,))
                myes = cur.fetchall()
                print(f" -> Has {len(myes)} MYEs. Details:")
                for mye_id, yid, eid, yname, ename in myes:
                    print(f" MYE {mye_id}: year={yname}, engine={ename}")
                    # Check if there's a real model in same brand with this
                    # year+engine combo. The empty model itself must be
                    # excluded, otherwise its own MYE always matches.
                    cur.execute("""
                        SELECT m2.id_model, m2.name_model FROM model_year_engine mye2
                        JOIN models m2 ON m2.id_model = mye2.model_id
                        WHERE m2.brand_id = %s AND m2.id_model <> %s
                          AND TRIM(COALESCE(m2.name_model, '')) <> ''
                          AND mye2.year_id = %s
                          AND (mye2.engine_id = %s OR (mye2.engine_id IS NULL AND %s IS NULL))
                        LIMIT 3
                    """, (brand_id, model_id, yid, eid, eid))
                    candidates = cur.fetchall()
                    print(f" Candidates: {candidates}")
                    if not dry_run:
                        # Always clear the MYE: the model is deleted below and
                        # must not leave model_year_engine rows behind.
                        cur.execute("""
                            UPDATE supplier_catalog_compat SET model_year_engine_id = NULL
                            WHERE model_year_engine_id = %s
                        """, (mye_id,))
                        cur.execute("DELETE FROM vin_cache WHERE model_year_engine_id = %s", (mye_id,))
                        cur.execute("DELETE FROM model_year_engine WHERE id_mye = %s", (mye_id,))
                        print(f" -> Cleared MYE {mye_id} (moved to NULL, manual remap needed)")
            if not dry_run:
                cur.execute("DELETE FROM models WHERE id_model = %s", (model_id,))
                # One commit per model keeps MYE cleanup and model delete atomic.
                conn.commit()
            deleted += 1
        print(f"Empty models processed: {deleted}")
    finally:
        cur.close()
    return deleted
def clean_year_range_models(conn, dry_run=True):
    """Remove models whose whole name is a year range such as '09-15' or '2009-2015'.

    Every match is printed with its brand and MYE count. Outside dry-run mode
    each match is removed via delete_model_and_myes and committed on its own.

    Returns the number of matching models (counted in dry-run mode as well).
    """
    range_pattern = re.compile(r'^(\d{2}-\d{2}|\d{4}-\d{4})$')
    cursor = conn.cursor()
    cursor.execute("SELECT id_model, name_model, brand_id FROM models")
    matches = [
        (model_id, raw_name.strip(), brand_id)
        for model_id, raw_name, brand_id in cursor.fetchall()
        if raw_name and range_pattern.match(raw_name.strip())
    ]
    print(f"\nFound {len(matches)} year-range models")

    deleted = 0
    for deleted, (model_id, label, brand_id) in enumerate(matches, start=1):
        cursor.execute("SELECT COUNT(*) FROM model_year_engine WHERE model_id = %s", (model_id,))
        mye_total = cursor.fetchone()[0]
        cursor.execute("SELECT name_brand FROM brands WHERE id_brand = %s", (brand_id,))
        brand_label = cursor.fetchone()[0]
        print(f"[{brand_label}] '{label}' id={model_id}, MYEs={mye_total}")
        if dry_run:
            continue
        delete_model_and_myes(conn, model_id, dry_run=False)
        conn.commit()

    print(f"Year-range models deleted: {deleted}")
    cursor.close()
    return deleted
# A standalone "Nm" unit token in any letter case: it must not be glued to a
# letter on either side ([^\W\d_] is "any Unicode letter"), so "(60 Nm+90)"
# and "25NM" match while ordinary words such as "SUNMASTER" do not.
_TORQUE_UNIT_RE = re.compile(r'(?<![^\W\d_])nm(?![^\W\d_])', re.IGNORECASE)


def clean_torque_models(conn, dry_run=True):
    """Delete models whose name contains a torque spec such as '(60 Nm+90°)'.

    A name qualifies only when it contains "Nm" as a standalone unit token;
    names that merely contain the letters "nm" inside a word are left alone.

    Args:
        conn: Open DB-API connection to the master database.
        dry_run: When true (the default), matches are only listed. Otherwise
            each match is removed via delete_model_and_myes and committed
            individually.

    Returns:
        Number of torque-spec models deleted (or that would be deleted in a
        dry run).
    """
    cur = conn.cursor()
    try:
        cur.execute("SELECT id_model, name_model, brand_id FROM models")
        torque_models = [
            (mid, name, bid)
            for mid, name, bid in cur.fetchall()
            if name and _TORQUE_UNIT_RE.search(name)
        ]
        print(f"\nFound {len(torque_models)} torque-spec models")
        deleted = 0
        for mid, name, bid in torque_models:
            cur.execute("SELECT COUNT(*) FROM model_year_engine WHERE model_id = %s", (mid,))
            count = cur.fetchone()[0]
            cur.execute("SELECT name_brand FROM brands WHERE id_brand = %s", (bid,))
            brand_row = cur.fetchone()
            # A model may point at a brand id with no brands row; still report it.
            bname = brand_row[0] if brand_row else None
            print(f"[{bname}] '{name}' id={mid}, MYEs={count}")
            if not dry_run:
                delete_model_and_myes(conn, mid, dry_run=False)
                conn.commit()
            deleted += 1
        print(f"Torque-spec models deleted: {deleted}")
    finally:
        cur.close()
    return deleted
def main():
    """Run every cleanup pass in order against the master database.

    The script stays in dry-run mode unless '--execute' is present in
    sys.argv. The connection is always closed, even if a pass raises.
    """
    rule = "=" * 60
    dry_run = '--execute' not in sys.argv
    if dry_run:
        print(rule)
        print("DRY RUN MODE — no changes will be made")
        print("Run with --execute to apply changes")
        print(rule)

    # Passes run in this fixed order, all on the same connection.
    passes = (
        merge_int_models,
        clean_empty_models,
        clean_year_range_models,
        clean_torque_models,
    )
    conn = get_connection()
    try:
        for cleanup in passes:
            cleanup(conn, dry_run=dry_run)
    finally:
        conn.close()

    if dry_run:
        print("\n" + rule)
        print("DRY RUN complete. Run with --execute to apply.")
        print(rule)


if __name__ == '__main__':
    main()