feat(catalog): supplier catalog cleanup, fuzzy matching, and navigation fixes

- Cleaned 137+ fake engine-displacement models from supplier imports
  (v3/v4 scripts: Chevrolet, Ford, Chrysler, Dodge, Jeep, Nissan, etc.)
- Removed 1,251+ corrupted models (INT. prefixes, year-suffix, torque specs,
  empty names, trailing-year variants)
- Migrated supplier tables to master DB (supplier_catalog,
  supplier_catalog_compat, supplier_catalog_interchange)
- Fixed _get_mye_ids_with_parts() to query supplier_catalog_compat from
  master DB so supplier-only vehicles appear for all tenants
- Added fuzzy model matcher with parenthesis stripping, noise suffix removal,
  compact matching, prefix/substring fallback, model aliases, and ±3 year
  proximity
- Matched compat rows: KEEP GREEN +14,152, KNADIAN +3,021, VAZLO +127,500,
  LUK +477, RAYBESTOS +1,743
- Added KNADIAN catalog importer with year-range expansion and future-year
  filtering
- Added VAZLO catalog importer with position parsing and SKU-in-model cleanup
- Added Keep Green, LUK, Yokomitsu, Raybestos catalog importers
- Cache clearing after cleanups (_classify_cache_*, nexus:mye_ids:*,
  nexus:brand_mye_counts:*)

Final match rates:
- KEEP GREEN: 90.3%
- VAZLO: 93.6%
- YOKOMITSU: 100.0%
- KNADIAN: 57.4%
- LUK: 51.0%
- RAYBESTOS: 55.9%
This commit is contained in:
2026-06-09 07:47:42 +00:00
parent 5ea667b80e
commit ea29cc31c0
53 changed files with 7727 additions and 548 deletions

View File

@@ -0,0 +1,393 @@
#!/usr/bin/env python3
"""
Import Yokomitsu catalog from Excel into supplier_catalog tables.
Usage:
python scripts/import_yokomitsu_catalog.py
"""
import os
import re
import sys
from datetime import datetime
import psycopg2
from openpyxl import load_workbook
# Connection strings. Each one can be overridden through the environment
# variable of the same name; the literal is only the fallback.
MASTER_DB_URL = os.environ.get(
    'MASTER_DB_URL',
    'postgresql://postgres@localhost/nexus_autoparts',
)
TENANT_DB_URL = os.environ.get(
    'TENANT_DB_URL',
    'postgresql://postgres@localhost/tenant_refaccionaria_rached',
)

# Source workbook, located in ../data relative to this script's directory.
EXCEL_PATH = os.path.join(
    os.path.dirname(__file__),
    '..',
    'data',
    'YOKOMITSU_CATALOGOS_COMPLETOS_TODOS.xlsx',
)

# Written as supplier_name for every catalog item this script upserts.
SUPPLIER_NAME = 'YOKOMITSU'
TENANT_ID = 31
def connect_master():
    """Open and return a new psycopg2 connection using MASTER_DB_URL."""
    dsn = MASTER_DB_URL
    return psycopg2.connect(dsn)
def connect_tenant():
    """Open and return a new psycopg2 connection using TENANT_DB_URL."""
    dsn = TENANT_DB_URL
    return psycopg2.connect(dsn)
def parse_year(token):
    """Parse a 2-digit or 4-digit year string.

    A range such as ``'08-13'`` is reduced to its first year before parsing.
    Two-digit values below 50 map to 20xx and the others to 19xx; four-digit
    values are accepted only when they fall within 1900-2050.

    Args:
        token: Candidate year text. ``None`` is tolerated and non-string
            values are stringified first.

    Returns:
        The year as an ``int``, or ``None`` when ``token`` is not a
        recognizable year.
    """
    if token is None:
        return None
    # Handle ranges like 08-13 or 08-15 -> use first year
    text = str(token).strip().split('-', 1)[0].strip()
    # Require exactly two or four ASCII digits. str.isdigit() would also
    # accept characters such as superscripts that int() rejects, and a
    # single digit (e.g. the "3" of "Mazda 3") is a model name, not a year.
    if not re.fullmatch(r'[0-9]{2}|[0-9]{4}', text):
        return None
    n = int(text)
    if len(text) == 2:
        return 2000 + n if n < 50 else 1900 + n
    return n if 1900 <= n <= 2050 else None
def parse_vehicle(vehicle_raw):
    """Split a free-text vehicle description into make, model, year and engine.

    Example inputs:
        'Chevrolet AVEO 1.5L 18'
        'Audi A4 1.8L/2.0L 09'
        'Dodge GRAND CHEROKEE 2/4WD 3.0L/3.7L/4.7L 08'
        'Volkswagen JETTA A4/CLASICO 1.8L/2.0L 06 V'
        'NISSAN 720 1988'
        'Dodge CARAVAN/VOYAGER 00'
        'ER 08-15 10' (garbage/unknown)

    The first token is taken as the make. The year is read from the last
    token or, failing that, from the second-to-last token, in which case the
    trailing token is discarded. An engine designation is split off the end
    of what remains, and the rest is the model.

    Args:
        vehicle_raw: Raw vehicle text; may be empty or ``None``.

    Returns:
        dict with keys ``make``, ``model``, ``year``, ``engine`` and
        ``vehicle_raw``. Fields that could not be determined are ``None``
        (``model`` is ``''`` when a year was found but nothing follows the
        make).
    """
    if not vehicle_raw:
        return {'make': None, 'model': None, 'year': None, 'engine': None, 'vehicle_raw': vehicle_raw}

    s = str(vehicle_raw).strip()
    # Remove trailing 'V' (variant marker)
    s = re.sub(r'\s+V$', '', s)
    tokens = s.split()
    if len(tokens) < 2:
        return {'make': None, 'model': None, 'year': None, 'engine': None, 'vehicle_raw': s}

    # Last token is usually the year.
    year = parse_year(tokens[-1])
    tokens_without_year = tokens[:-1]
    if year is None and len(tokens) >= 3:
        # Otherwise the year may sit second-to-last, followed by a suffix
        # token. Keep the year that was found and drop both trailing tokens.
        year = parse_year(tokens[-2])
        tokens_without_year = tokens[:-2]

    if year is None:
        # No year found; keep everything after the make as the model.
        return {'make': tokens[0], 'model': ' '.join(tokens[1:]),
                'year': None, 'engine': None, 'vehicle_raw': s}

    # tokens_without_year always holds at least the make at this point.
    make = tokens_without_year[0]
    remaining = ' '.join(tokens_without_year[1:])

    # Heuristic: look for engine tokens at the END of the remaining string.
    # Common patterns: "1.5L", "1.8L/2.0L", "2/4WD", "3.0L/3.7L/4.7L", "1.9L DIESEL"
    engine = None
    model = remaining
    engine_match = re.search(r'(\d+(?:\.\d+)?\s*L(?:/\d+(?:\.\d+)?\s*L)*|\d+/\d+WD|\d+\.\d+L\s+DIESEL|\d+\.\d+L\s+TURBO)$', remaining, re.IGNORECASE)
    if engine_match:
        engine = engine_match.group(1)
        model = remaining[:engine_match.start()].strip()
    else:
        # Simpler fallback: a final word with a digit plus 'L' or 'WD'.
        parts = remaining.split()
        if parts and re.search(r'\d', parts[-1]) and ('L' in parts[-1].upper() or 'WD' in parts[-1].upper()):
            engine = parts[-1]
            model = ' '.join(parts[:-1])

    return {
        'make': make,
        'model': model,
        'year': year,
        'engine': engine,
        'vehicle_raw': s,
    }
def build_brand_cache(cur):
    """Read every row of ``brands`` and map upper-cased brand name -> id_brand."""
    cur.execute("SELECT id_brand, name_brand FROM brands")
    brand_ids = {}
    for brand_id, brand_name in cur.fetchall():
        # Names differing only by case collapse to one key; the last row wins.
        brand_ids[brand_name.upper()] = brand_id
    return brand_ids
def build_model_cache(cur):
    """Read every row of ``models`` and group them by brand.

    Returns a dict mapping brand_id -> list of (id_model, name_model) tuples,
    in the order the rows were fetched.
    """
    cur.execute("SELECT id_model, brand_id, name_model FROM models")
    by_brand = {}
    for model_id, brand_id, model_name in cur.fetchall():
        if brand_id not in by_brand:
            by_brand[brand_id] = []
        by_brand[brand_id].append((model_id, model_name))
    return by_brand
def build_year_cache(cur):
    """Read every row of ``years`` and map year_car -> id_year."""
    cur.execute("SELECT id_year, year_car FROM years")
    year_ids = {}
    for year_id, year_value in cur.fetchall():
        year_ids[year_value] = year_id
    return year_ids
def build_mye_cache(cur):
    """Read every row of ``model_year_engine`` and group ids by model and year.

    Returns a dict mapping (model_id, year_id) -> list of id_mye values, in
    the order the rows were fetched.
    """
    cur.execute("SELECT id_mye, model_id, year_id FROM model_year_engine")
    by_model_year = {}
    for row in cur.fetchall():
        mye_id, model_id, year_id = row
        key = (model_id, year_id)
        if key in by_model_year:
            by_model_year[key].append(mye_id)
        else:
            by_model_year[key] = [mye_id]
    return by_model_year
def fuzzy_match_vehicle(parsed, brand_cache, model_cache, year_cache, mye_cache):
    """Resolve a parsed vehicle to model_year_engine ids.

    Matching is case-insensitive and substring based:
      1. Brand: exact upper-cased name, else the first cached brand whose
         name contains the make or is contained in it.
      2. Model: every model of that brand whose name contains the whole
         model text; if none does, every model containing at least one word
         (3+ characters) of the model text.
      3. Year: exact lookup in ``year_cache``.

    Args:
        parsed: dict with 'make', 'model' and 'year' keys.
        brand_cache: upper-cased brand name -> brand id.
        model_cache: brand id -> list of (model id, model name).
        year_cache: year -> year id.
        mye_cache: (model id, year id) -> list of MYE ids.

    Returns:
        List of MYE ids (may be empty).
    """
    make = parsed.get('make')
    model_keyword = parsed.get('model')
    year = parsed.get('year')
    if not (make and model_keyword and year):
        return []

    # Brand: exact name first, then containment in either direction.
    make_upper = make.upper()
    brand_id = brand_cache.get(make_upper)
    if not brand_id:
        brand_id = next(
            (
                candidate_id
                for candidate_name, candidate_id in brand_cache.items()
                if make_upper in candidate_name or candidate_name in make_upper
            ),
            brand_id,
        )
    if not brand_id:
        return []

    # Model: whole-string containment first.
    brand_models = [
        (model_id, model_name.upper())
        for model_id, model_name in model_cache.get(brand_id, [])
    ]
    needle = model_keyword.upper()
    matched_model_ids = [
        model_id for model_id, upper_name in brand_models if needle in upper_name
    ]
    if not matched_model_ids:
        # Fall back to any single word of the model text (3+ characters).
        words = [word for word in needle.split() if len(word) >= 3]
        matched_model_ids = [
            model_id
            for model_id, upper_name in brand_models
            if any(word in upper_name for word in words)
        ]
    if not matched_model_ids:
        return []

    year_id = year_cache.get(year)
    if not year_id:
        return []

    # Collect MYEs for all matched model+year combos, in model order.
    return [
        mye_id
        for model_id in matched_model_ids
        for mye_id in mye_cache.get((model_id, year_id), [])
    ]
def extract_interchanges(row):
    """Extract (brand, part_number) pairs from the interchange columns.

    Cells 2..13 of ``row`` are read as six consecutive (brand, part number)
    pairs: MARCA.1=2, INTERCAMBIO=3, MARCA.2=4, INTERCAMBIO.1=5, ... up to
    MARCA.6=12, INTERCAMBIO.5=13.

    Args:
        row: Sequence of cell values for one spreadsheet row. Rows shorter
            than 14 cells (or ``None``) yield only the complete pairs that
            are present, instead of raising.

    Returns:
        List of (brand, part_number) string tuples, whitespace-stripped.
        A pair is skipped when either side is empty.
    """
    if not row:
        return []
    cells = row[2:14]
    interchanges = []
    # zip() stops at the shorter slice, so an unpaired trailing brand cell
    # on a short row is ignored.
    for brand, pn in zip(cells[0::2], cells[1::2]):
        if not brand or not pn:
            continue
        brand = str(brand).strip()
        pn = str(pn).strip()
        # Re-check after stripping: whitespace-only cells are empty too.
        if brand and pn:
            interchanges.append((brand, pn))
    return interchanges
def _cell_text(row, index):
    """Return cell ``index`` of ``row`` as stripped text, or '' if absent or empty."""
    if index >= len(row):
        return ''
    value = row[index]
    return str(value).strip() if value else ''


def main():
    """Import every sheet of the Yokomitsu workbook into the master database.

    For each data row (the first row of a sheet is treated as the header)
    the SKU is read from cell 1, the item name from cell 14 and the vehicle
    text from cell 15. The item is upserted into ``supplier_catalog`` with
    the sheet name as its category, the vehicle text is parsed and matched
    to MYE ids for ``supplier_catalog_compat``, and the interchange columns
    are written to ``supplier_catalog_interchange``. Work is committed once
    per sheet and a summary is printed at the end.

    Exits with status 1 when the workbook file does not exist. Any other
    error propagates after the cursor, connection and workbook are closed;
    sheets already committed stay committed.
    """
    print(f"[{datetime.now().isoformat()}] Starting import...")
    if not os.path.exists(EXCEL_PATH):
        print(f"ERROR: Excel not found at {EXCEL_PATH}")
        sys.exit(1)
    print(f"Loading {EXCEL_PATH}...")
    wb = load_workbook(EXCEL_PATH, read_only=True, data_only=True)

    master_conn = None
    master_cur = None
    try:
        master_conn = connect_master()
        master_cur = master_conn.cursor()

        print("Building caches...")
        brand_cache = build_brand_cache(master_cur)
        model_cache = build_model_cache(master_cur)
        year_cache = build_year_cache(master_cur)
        mye_cache = build_mye_cache(master_cur)
        print(f" Brands: {len(brand_cache)}, Models: {sum(len(v) for v in model_cache.values())}, Years: {len(year_cache)}, MYE combos: {len(mye_cache)}")

        # Prepare UPSERT statements. The placeholder count must equal the
        # number of listed columns (four) and of parameters passed below.
        upsert_catalog_sql = """
            INSERT INTO supplier_catalog (supplier_name, sku, name, category)
            VALUES (%s, %s, %s, %s)
            ON CONFLICT (supplier_name, sku, category) DO UPDATE SET
                name = EXCLUDED.name,
                category = EXCLUDED.category
            RETURNING id
        """
        # NOTE(review): the conflict target omits model_year_engine_id, so
        # when one vehicle matches several MYEs only the first row can be
        # inserted if that is the table's actual unique key -- confirm
        # against the schema.
        insert_compat_sql = """
            INSERT INTO supplier_catalog_compat
                (catalog_id, make, model, year, engine, model_year_engine_id, source)
            VALUES (%s, %s, %s, %s, %s, %s, %s)
            ON CONFLICT (catalog_id, make, model, year, engine) DO NOTHING
        """
        insert_interchange_sql = """
            INSERT INTO supplier_catalog_interchange (catalog_id, brand, part_number)
            VALUES (%s, %s, %s)
            ON CONFLICT DO NOTHING
        """

        # Track stats
        stats = {
            'sheets': 0,
            'rows': 0,
            'catalog_items': 0,
            'compat_rows': 0,
            'interchange_rows': 0,
            'vehicles_parsed': 0,
            'vehicles_matched': 0,
            'mye_matches': 0,
        }

        # Process each sheet
        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]
            rows = list(ws.iter_rows(values_only=True))
            if not rows:
                continue
            data_rows = rows[1:]  # rows[0] is the header row
            stats['sheets'] += 1
            print(f"\nProcessing sheet '{sheet_name}' with {len(data_rows)} rows...")

            for idx, row in enumerate(data_rows):
                if idx % 1000 == 0 and idx > 0:
                    print(f" ...{idx} rows processed")

                # Skip empty rows. _cell_text also tolerates rows that are
                # too short to hold the name or vehicle columns.
                if not row:
                    continue
                sku = _cell_text(row, 1)
                name = _cell_text(row, 14)
                vehicle_raw = _cell_text(row, 15)
                if not sku or not name:
                    continue
                stats['rows'] += 1

                # Upsert catalog item
                master_cur.execute(upsert_catalog_sql, (SUPPLIER_NAME, sku, name, sheet_name))
                catalog_id = master_cur.fetchone()[0]
                stats['catalog_items'] += 1

                # Without vehicle text there is nothing to record; skipping
                # avoids inserting a compat row whose fields are all NULL.
                if vehicle_raw:
                    parsed = parse_vehicle(vehicle_raw)
                    stats['vehicles_parsed'] += 1
                    mye_ids = fuzzy_match_vehicle(parsed, brand_cache, model_cache, year_cache, mye_cache)
                    if mye_ids:
                        stats['vehicles_matched'] += 1
                        stats['mye_matches'] += len(mye_ids)
                        # One row per matched MYE.
                        targets = [(mye_id, 'fuzzy_match') for mye_id in mye_ids]
                    else:
                        # No MYE match: insert with text only
                        targets = [(None, 'import_text')]
                    for mye_id, source in targets:
                        master_cur.execute(insert_compat_sql, (
                            catalog_id,
                            parsed['make'],
                            parsed['model'],
                            parsed['year'],
                            parsed['engine'],
                            mye_id,
                            source,
                        ))
                        # rowcount is 0 when ON CONFLICT DO NOTHING skipped
                        # the row, so only real inserts are counted.
                        stats['compat_rows'] += max(master_cur.rowcount, 0)

                # Insert interchanges
                for brand, pn in extract_interchanges(row):
                    master_cur.execute(insert_interchange_sql, (catalog_id, brand, pn))
                    stats['interchange_rows'] += max(master_cur.rowcount, 0)

            # Commit per sheet
            master_conn.commit()
            print(f" Sheet '{sheet_name}' committed.")

        # Final stats
        print(f"\n{'='*60}")
        print("IMPORT COMPLETE")
        print(f"{'='*60}")
        print(f"Sheets processed: {stats['sheets']}")
        print(f"Total rows read: {stats['rows']}")
        print(f"Catalog items: {stats['catalog_items']}")
        print(f"Compat rows: {stats['compat_rows']}")
        print(f"Interchange rows: {stats['interchange_rows']}")
        print(f"Vehicles parsed: {stats['vehicles_parsed']}")
        print(f"Vehicles with MYE: {stats['vehicles_matched']}")
        print(f"Total MYE matches: {stats['mye_matches']}")
    finally:
        # Closing the connection without a commit discards the uncommitted
        # work of a sheet that failed part-way through.
        if master_cur is not None:
            master_cur.close()
        if master_conn is not None:
            master_conn.close()
        # A read-only workbook keeps its file handle open until closed.
        wb.close()
# Run the import only when this file is executed as a script.
if __name__ == "__main__":
    main()