feat(catalog): supplier catalog cleanup, fuzzy matching, and navigation fixes
- Cleaned 137+ fake engine-displacement models from supplier imports (v3/v4 scripts: Chevrolet, Ford, Chrysler, Dodge, Jeep, Nissan, etc.)
- Removed 1,251+ corrupted models (INT. prefixes, year-suffix, torque specs, empty names, trailing-year variants)
- Migrated supplier tables to master DB (supplier_catalog, supplier_catalog_compat, supplier_catalog_interchange)
- Fixed _get_mye_ids_with_parts() to query supplier_catalog_compat from master DB so supplier-only vehicles appear for all tenants
- Added fuzzy model matcher with parenthesis stripping, noise suffix removal, compact matching, prefix/substring fallback, model aliases, and ±3 year proximity
- Matched compat rows: KEEP GREEN +14,152, KNADIAN +3,021, VAZLO +127,500, LUK +477, RAYBESTOS +1,743
- Added KNADIAN catalog importer with year-range expansion and future-year filtering
- Added VAZLO catalog importer with position parsing and SKU-in-model cleanup
- Added Keep Green, LUK, Yokomitsu, Raybestos catalog importers
- Cache clearing after cleanups (_classify_cache_*, nexus:mye_ids:*, nexus:brand_mye_counts:*)

Final match rates:
- KEEP GREEN: 90.3%
- VAZLO: 93.6%
- YOKOMITSU: 100.0%
- KNADIAN: 57.4%
- LUK: 51.0%
- RAYBESTOS: 55.9%
This commit is contained in:
393
scripts/import_yokomitsu_catalog.py
Executable file
393
scripts/import_yokomitsu_catalog.py
Executable file
@@ -0,0 +1,393 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Import Yokomitsu catalog from Excel into supplier_catalog tables.
|
||||
|
||||
Usage:
|
||||
python scripts/import_yokomitsu_catalog.py
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from datetime import datetime
|
||||
|
||||
import psycopg2
|
||||
from openpyxl import load_workbook
|
||||
|
||||
# Connection strings; each one can be overridden through the environment.
MASTER_DB_URL = os.environ.get(
    'MASTER_DB_URL',
    'postgresql://postgres@localhost/nexus_autoparts',
)
TENANT_DB_URL = os.environ.get(
    'TENANT_DB_URL',
    'postgresql://postgres@localhost/tenant_refaccionaria_rached',
)

# Source workbook, resolved relative to the directory holding this script.
EXCEL_PATH = os.path.join(
    os.path.dirname(__file__),
    '..',
    'data',
    'YOKOMITSU_CATALOGOS_COMPLETOS_TODOS.xlsx',
)

# Value written to supplier_catalog.supplier_name for every imported item.
SUPPLIER_NAME = 'YOKOMITSU'
TENANT_ID = 31
|
||||
|
||||
|
||||
def connect_master():
    """Open and return a new psycopg2 connection to ``MASTER_DB_URL``."""
    connection = psycopg2.connect(MASTER_DB_URL)
    return connection
|
||||
|
||||
|
||||
def connect_tenant():
    """Open and return a new psycopg2 connection to ``TENANT_DB_URL``."""
    connection = psycopg2.connect(TENANT_DB_URL)
    return connection
|
||||
|
||||
|
||||
def parse_year(token):
    """Parse a 2-digit or 4-digit year string.

    Surrounding whitespace is ignored. A range such as ``'08-13'`` resolves
    to its first year. Two-digit values below 50 map to 20xx and the rest to
    19xx; four-digit values are accepted only within 1900-2050.

    Args:
        token: Text that may hold a year (e.g. ``'18'``, ``'1988'``,
            ``'08-15'``).

    Returns:
        The year as an int, or None when the token is not a year.
    """
    text = token.strip()
    if not text:
        return None

    # Handle ranges like 08-13 or 08-15 -> use first year.
    text = text.split('-')[0].strip()

    # Require exactly two or four decimal digits. Shorter or odd-length digit
    # runs are far more likely to be model names ("3", "720") than years, and
    # \d (unlike str.isdigit) rejects characters such as superscripts that
    # int() cannot convert.
    if not re.fullmatch(r'\d{2}|\d{4}', text):
        return None

    n = int(text)
    if len(text) == 2:
        return 2000 + n if n < 50 else 1900 + n
    return n if 1900 <= n <= 2050 else None
|
||||
|
||||
|
||||
def parse_vehicle(vehicle_raw):
    """
    Split a free-text catalog vehicle string into its components.

    Examples of the shapes this handles:
        'Chevrolet AVEO 1.5L 18'
        'Audi A4 1.8L/2.0L 09'
        'Dodge GRAND CHEROKEE 2/4WD 3.0L/3.7L/4.7L 08'
        'Volkswagen JETTA A4/CLASICO 1.8L/2.0L 06 V'
        'NISSAN 720 1988'
        'Dodge CARAVAN/VOYAGER 00'
        'ER 08-15 10'  (garbage/unknown)

    The first token is taken as the make. The year is read from the last
    token or, failing that, from the second-to-last one; in the latter case
    the trailing token is treated as a suffix and dropped. An engine
    description is peeled off the end of what remains, and the rest is the
    model.

    Returns a dict with keys make, model, year, engine, vehicle_raw. Every
    field except vehicle_raw may be None when it cannot be determined.
    """
    if not vehicle_raw:
        return {'make': None, 'model': None, 'year': None, 'engine': None, 'vehicle_raw': vehicle_raw}

    s = str(vehicle_raw).strip()
    # Remove trailing 'V' (variant marker).
    s = re.sub(r'\s+V$', '', s)

    tokens = s.split()
    if len(tokens) < 2:
        return {'make': None, 'model': None, 'year': None, 'engine': None, 'vehicle_raw': s}

    # Last token is usually the year.
    year = parse_year(tokens[-1])
    tokens_without_year = tokens[:-1]
    if year is None and len(tokens) >= 3:
        # Try second-to-last if the last token doesn't look like a year.
        year = parse_year(tokens[-2])
        if year is not None:
            # Keep the year just parsed (re-parsing after removing the token
            # would look at the wrong token and lose it) and drop both the
            # year token and the trailing suffix.
            tokens_without_year = tokens[:-2]

    if year is None:
        # No year found; keep the raw text and return a best-effort split.
        return {'make': tokens[0], 'model': ' '.join(tokens[1:]),
                'year': None, 'engine': None, 'vehicle_raw': s}

    make = tokens_without_year[0] if tokens_without_year else None
    remaining = ' '.join(tokens_without_year[1:]) if len(tokens_without_year) > 1 else ''

    # Heuristic: look for engine tokens at the END of the remaining string.
    # Common patterns: "1.5L", "1.8L/2.0L", "2/4WD", "3.0L/3.7L/4.7L", "1.9L DIESEL"
    engine = None
    model = remaining

    engine_match = re.search(r'(\d+(?:\.\d+)?\s*L(?:/\d+(?:\.\d+)?\s*L)*|\d+/\d+WD|\d+\.\d+L\s+DIESEL|\d+\.\d+L\s+TURBO)$', remaining, re.IGNORECASE)
    if engine_match:
        engine = engine_match.group(1)
        model = remaining[:engine_match.start()].strip()
    else:
        # Simpler fallback: a final token containing a digit plus 'L' or 'WD'.
        parts = remaining.split()
        if parts and re.search(r'\d', parts[-1]):
            last_upper = parts[-1].upper()
            if 'L' in last_upper or 'WD' in last_upper:
                engine = parts[-1]
                model = ' '.join(parts[:-1])

    return {
        'make': make,
        'model': model,
        'year': year,
        'engine': engine,
        'vehicle_raw': s,
    }
|
||||
|
||||
|
||||
def build_brand_cache(cur):
    """Load every row of ``brands`` and return {UPPER-CASED name: id_brand}."""
    cur.execute("SELECT id_brand, name_brand FROM brands")
    lookup = {}
    for brand_id, brand_name in cur.fetchall():
        # A later row with the same upper-cased name replaces the earlier one.
        lookup[brand_name.upper()] = brand_id
    return lookup
|
||||
|
||||
|
||||
def build_model_cache(cur):
    """Load every row of ``models`` grouped by brand.

    Returns {brand_id: [(id_model, name_model), ...]} so that all models of
    one brand can be scanned without touching the others.
    """
    cur.execute("SELECT id_model, brand_id, name_model FROM models")
    by_brand = {}
    for model_id, brand_id, model_name in cur.fetchall():
        if brand_id not in by_brand:
            by_brand[brand_id] = []
        by_brand[brand_id].append((model_id, model_name))
    return by_brand
|
||||
|
||||
|
||||
def build_year_cache(cur):
    """Load every row of ``years`` and return {year_car: id_year}."""
    cur.execute("SELECT id_year, year_car FROM years")
    return dict((year_value, year_id) for year_id, year_value in cur.fetchall())
|
||||
|
||||
|
||||
def build_mye_cache(cur):
    """Load every row of ``model_year_engine`` grouped by model and year.

    Returns {(model_id, year_id): [id_mye, ...]}.
    """
    cur.execute("SELECT id_mye, model_id, year_id FROM model_year_engine")
    grouped = {}
    for mye_id, model_id, year_id in cur.fetchall():
        key = (model_id, year_id)
        bucket = grouped.get(key)
        if bucket is None:
            bucket = grouped[key] = []
        bucket.append(mye_id)
    return grouped
|
||||
|
||||
|
||||
def fuzzy_match_vehicle(parsed, brand_cache, model_cache, year_cache, mye_cache):
    """
    Resolve a parsed vehicle to model_year_engine ids.

    Matching steps:
      1. Brand: exact upper-cased name, else the first cached brand whose
         name contains the make or is contained in it.
      2. Model: models of that brand whose name contains the whole parsed
         model text; if none, models containing any word of it that is at
         least three characters long.
      3. Year: exact lookup in ``year_cache``.

    Returns the list of mye ids for every matched (model, year) pair; the
    list is empty whenever any step fails.
    """
    make = parsed.get('make')
    model_keyword = parsed.get('model')
    year = parsed.get('year')
    if not (make and model_keyword and year):
        return []

    # Step 1: brand.
    make_up = make.upper()
    brand_id = brand_cache.get(make_up)
    if not brand_id:
        brand_id = next(
            (bid for name, bid in brand_cache.items()
             if make_up in name or name in make_up),
            None,
        )
    if not brand_id:
        return []

    # Step 2: models, whole-text substring first.
    candidates = model_cache.get(brand_id, [])
    needle = model_keyword.upper()
    hits = [mid for mid, mname in candidates if needle in mname.upper()]

    if not hits:
        # Fall back to matching on any individual word of the model text.
        words = [w for w in needle.split() if len(w) >= 3]
        for mid, mname in candidates:
            upper_name = mname.upper()
            for word in words:
                if word in upper_name:
                    hits.append(mid)
                    break

    if not hits:
        return []

    # Step 3: year.
    year_id = year_cache.get(year)
    if not year_id:
        return []

    return [mye for mid in hits for mye in mye_cache.get((mid, year_id), [])]
|
||||
|
||||
|
||||
def extract_interchanges(row):
    """Collect the (brand, part_number) interchange pairs of one sheet row.

    Brand cells sit at indexes 2, 4, ..., 12 and each part number in the cell
    right after its brand (3, 5, ..., 13). A pair is kept only when both
    cells are truthy and still non-empty after ``str()`` and ``strip()``.
    """
    raw_pairs = [(row[col], row[col + 1]) for col in range(2, 14, 2)]

    result = []
    for raw_brand, raw_pn in raw_pairs:
        if not (raw_brand and raw_pn):
            continue
        brand_text = str(raw_brand).strip()
        pn_text = str(raw_pn).strip()
        if brand_text and pn_text:
            result.append((brand_text, pn_text))
    return result
|
||||
|
||||
|
||||
def main():
    """Import every sheet of the Yokomitsu workbook into the master DB.

    For each data row (the first row of a sheet is treated as a header) the
    SKU is read from column index 1, the item name from index 14 and the
    vehicle text from index 15. The row is upserted into ``supplier_catalog``
    with the sheet name as its category, the vehicle text is parsed and
    matched against the master model/year/engine tables, and one
    ``supplier_catalog_compat`` row is written per matched MYE id (or a
    single text-only row when nothing matched). Interchange pairs go to
    ``supplier_catalog_interchange``. Work is committed once per sheet.

    Exits with status 1 when the workbook file does not exist.
    """
    print(f"[{datetime.now().isoformat()}] Starting import...")

    if not os.path.exists(EXCEL_PATH):
        print(f"ERROR: Excel not found at {EXCEL_PATH}")
        sys.exit(1)

    print(f"Loading {EXCEL_PATH}...")
    wb = load_workbook(EXCEL_PATH, read_only=True, data_only=True)

    # Four placeholders, one per listed column and per parameter passed below.
    upsert_catalog_sql = """
        INSERT INTO supplier_catalog (supplier_name, sku, name, category)
        VALUES (%s, %s, %s, %s)
        ON CONFLICT (supplier_name, sku, category) DO UPDATE SET
            name = EXCLUDED.name,
            category = EXCLUDED.category
        RETURNING id
    """

    # NOTE(review): the conflict target does not include model_year_engine_id,
    # so when one vehicle matches several MYE ids only the first insert may
    # actually land (depends on the table's unique constraint) - confirm.
    insert_compat_sql = """
        INSERT INTO supplier_catalog_compat
            (catalog_id, make, model, year, engine, model_year_engine_id, source)
        VALUES (%s, %s, %s, %s, %s, %s, %s)
        ON CONFLICT (catalog_id, make, model, year, engine) DO NOTHING
    """

    insert_interchange_sql = """
        INSERT INTO supplier_catalog_interchange (catalog_id, brand, part_number)
        VALUES (%s, %s, %s)
        ON CONFLICT DO NOTHING
    """

    # Track stats
    stats = {
        'sheets': 0,
        'rows': 0,
        'catalog_items': 0,
        'compat_rows': 0,
        'interchange_rows': 0,
        'vehicles_parsed': 0,
        'vehicles_matched': 0,
        'mye_matches': 0,
    }

    master_conn = None
    master_cur = None
    try:
        # A single connection and cursor; opening them twice leaked the first.
        master_conn = connect_master()
        master_cur = master_conn.cursor()

        print("Building caches...")
        brand_cache = build_brand_cache(master_cur)
        model_cache = build_model_cache(master_cur)
        year_cache = build_year_cache(master_cur)
        mye_cache = build_mye_cache(master_cur)
        print(f"  Brands: {len(brand_cache)}, Models: {sum(len(v) for v in model_cache.values())}, Years: {len(year_cache)}, MYE combos: {len(mye_cache)}")

        # Process each sheet
        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]
            rows = list(ws.iter_rows(values_only=True))
            if not rows:
                continue
            data_rows = rows[1:]  # rows[0] is the header row
            stats['sheets'] += 1
            print(f"\nProcessing sheet '{sheet_name}' with {len(data_rows)} rows...")

            short_rows = 0
            for idx, row in enumerate(data_rows):
                if idx % 1000 == 0 and idx > 0:
                    print(f"  ...{idx} rows processed")

                if not row:
                    continue
                # Name and vehicle live at indexes 14 and 15; a narrower row
                # cannot be read, so skip it instead of raising IndexError.
                if len(row) < 16:
                    short_rows += 1
                    continue
                # Skip rows without a SKU.
                if not row[1]:
                    continue

                sku = str(row[1]).strip()
                name = str(row[14]).strip() if row[14] else ''
                vehicle_raw = str(row[15]).strip() if row[15] else ''

                if not sku or not name:
                    continue

                stats['rows'] += 1

                # Upsert catalog item
                master_cur.execute(upsert_catalog_sql, (SUPPLIER_NAME, sku, name, sheet_name))
                catalog_id = master_cur.fetchone()[0]
                stats['catalog_items'] += 1

                # Parse vehicle
                parsed = parse_vehicle(vehicle_raw)
                stats['vehicles_parsed'] += 1

                mye_ids = fuzzy_match_vehicle(parsed, brand_cache, model_cache, year_cache, mye_cache)
                if mye_ids:
                    stats['vehicles_matched'] += 1
                    stats['mye_matches'] += len(mye_ids)

                # One compat row per matched MYE id, or a single text-only
                # row (NULL MYE id) when nothing matched.
                source = 'fuzzy_match' if mye_ids else 'import_text'
                for mye_id in (mye_ids or [None]):
                    master_cur.execute(insert_compat_sql, (
                        catalog_id,
                        parsed['make'],
                        parsed['model'],
                        parsed['year'],
                        parsed['engine'],
                        mye_id,
                        source,
                    ))
                    stats['compat_rows'] += 1

                # Insert interchanges
                for brand, pn in extract_interchanges(row):
                    master_cur.execute(insert_interchange_sql, (catalog_id, brand, pn))
                    stats['interchange_rows'] += 1

            # Commit per sheet
            master_conn.commit()
            if short_rows:
                print(f"  WARNING: skipped {short_rows} rows with fewer than 16 columns")
            print(f"  Sheet '{sheet_name}' committed.")
    finally:
        # Closing without commit discards the uncommitted work of a sheet that
        # failed part-way; sheets committed earlier stay in place.
        if master_cur is not None:
            master_cur.close()
        if master_conn is not None:
            master_conn.close()
        # Read-only workbooks keep the file open until closed explicitly.
        wb.close()

    # Final stats
    print(f"\n{'='*60}")
    print("IMPORT COMPLETE")
    print(f"{'='*60}")
    print(f"Sheets processed:     {stats['sheets']}")
    print(f"Total rows read:      {stats['rows']}")
    print(f"Catalog items:        {stats['catalog_items']}")
    print(f"Compat rows:          {stats['compat_rows']}")
    print(f"Interchange rows:     {stats['interchange_rows']}")
    print(f"Vehicles parsed:      {stats['vehicles_parsed']}")
    print(f"Vehicles with MYE:    {stats['vehicles_matched']}")
    print(f"Total MYE matches:    {stats['mye_matches']}")
|
||||
|
||||
|
||||
# Run the import only when this file is executed as a script.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user