feat(catalog): supplier catalog cleanup, fuzzy matching, and navigation fixes
- Cleaned 137+ fake engine-displacement models from supplier imports (v3/v4 scripts: Chevrolet, Ford, Chrysler, Dodge, Jeep, Nissan, etc.)
- Removed 1,251+ corrupted models (INT. prefixes, year-suffix, torque specs, empty names, trailing-year variants)
- Migrated supplier tables to master DB (supplier_catalog, supplier_catalog_compat, supplier_catalog_interchange)
- Fixed _get_mye_ids_with_parts() to query supplier_catalog_compat from master DB so supplier-only vehicles appear for all tenants
- Added fuzzy model matcher with parenthesis stripping, noise suffix removal, compact matching, prefix/substring fallback, model aliases, and ±3 year proximity
- Matched compat rows: KEEP GREEN +14,152, KNADIAN +3,021, VAZLO +127,500, LUK +477, RAYBESTOS +1,743
- Added KNADIAN catalog importer with year-range expansion and future-year filtering
- Added VAZLO catalog importer with position parsing and SKU-in-model cleanup
- Added Keep Green, LUK, Yokomitsu, Raybestos catalog importers
- Added cache clearing after cleanups (_classify_cache_*, nexus:mye_ids:*, nexus:brand_mye_counts:*)

Final match rates:
- KEEP GREEN: 90.3%
- VAZLO: 93.6%
- YOKOMITSU: 100.0%
- KNADIAN: 57.4%
- LUK: 51.0%
- RAYBESTOS: 55.9%
This commit is contained in:
240
scripts/import_keepgreen_catalog.py
Normal file
240
scripts/import_keepgreen_catalog.py
Normal file
@@ -0,0 +1,240 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Import Keep Green (KG) catalog from Excel into supplier_catalog tables.
|
||||
|
||||
Usage:
|
||||
python scripts/import_keepgreen_catalog.py
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
|
||||
import psycopg2
|
||||
from openpyxl import load_workbook
|
||||
|
||||
# Connection string for the master database; the MASTER_DB_URL environment
# variable overrides the local default.
MASTER_DB_URL = os.environ.get('MASTER_DB_URL', 'postgresql://postgres@localhost/nexus_autoparts')

# Workbook to import, resolved relative to this script's directory.
EXCEL_PATH = os.path.join(os.path.dirname(__file__), '..', 'data', 'KG (1).xlsx')

# Supplier name written to supplier_catalog for every imported item.
SUPPLIER_NAME = 'KEEP GREEN'

# Makes spelled as two words, keyed by their (first, second) upper-cased words
# and mapped to the canonical make name.
MULTI_WORD_MAKES = {
    tuple(canonical.split()): canonical
    for canonical in (
        'MERCEDES BENZ',
        'LAND ROVER',
        'ALFA ROMEO',
        'AMERICAN MOTORS',
        'ROLLS ROYCE',
        'ASTON MARTIN',
        'GREAT WALL',
    )
}
|
||||
|
||||
|
||||
def connect_master():
    """Open and return a new psycopg2 connection using MASTER_DB_URL."""
    dsn = MASTER_DB_URL
    return psycopg2.connect(dsn)
|
||||
|
||||
|
||||
def normalize_name(name):
    """Collapse every run of whitespace (newlines included) into one space.

    Falsy input (``None``, ``''``, ``0``) yields ``''``; any other value is
    converted with ``str()`` first. Leading and trailing whitespace is removed.
    """
    if name:
        # str.split() with no argument splits on any whitespace, so newlines
        # need no separate handling.
        words = str(name).split()
        return ' '.join(words)
    return ''
|
||||
|
||||
|
||||
def parse_make(carro):
    """Extract the vehicle make from CARRO_PERTENECIENTE text.

    The make is taken to be the first word of the text, or the first two
    words when they form one of the known two-word makes in
    ``MULTI_WORD_MAKES`` (also accepted when written with a hyphen, e.g.
    ``"Mercedes-Benz"``).

    The result is always upper case. Two-word makes were already returned in
    canonical upper case; single-word makes now match them instead of leaking
    the spreadsheet's original casing.

    Args:
        carro: Cell value describing the vehicle; may be ``None`` or empty.

    Returns:
        The upper-cased make, or ``None`` when no text is available.
    """
    if not carro:
        return None
    words = str(carro).strip().upper().split()
    if not words:
        return None

    # A hyphenated two-word make arrives as a single token.
    hyphen_key = tuple(words[0].split('-'))
    if len(hyphen_key) == 2 and hyphen_key in MULTI_WORD_MAKES:
        return MULTI_WORD_MAKES[hyphen_key]

    if len(words) >= 2:
        canonical = MULTI_WORD_MAKES.get((words[0], words[1]))
        if canonical is not None:
            return canonical

    return words[0]
|
||||
|
||||
|
||||
def extract_interchanges(row):
    """Collect (brand, part_number) pairs from a row's interchange columns.

    Six brand/part-number column pairs are read, at indexes (5, 6), (7, 8),
    ... (15, 16). A pair is kept only when both cells are present, truthy,
    and non-empty after ``str()`` and stripping.
    """
    def cell_text(col):
        # Missing columns and falsy cells both count as empty.
        value = row[col] if col < len(row) else None
        return str(value).strip() if value else ''

    pairs = []
    for brand_col in range(5, 17, 2):
        brand = cell_text(brand_col)
        part_number = cell_text(brand_col + 1)
        if brand and part_number:
            pairs.append((brand, part_number))
    return pairs
|
||||
|
||||
|
||||
def _to_four_digit_year(value):
    """Expand a 2-digit year with a pivot at 70 (70-99 -> 19xx, 00-69 -> 20xx)."""
    if value >= 100:
        return value
    return 1900 + value if value >= 70 else 2000 + value


def expand_year(year_val):
    """Return the list of integer years described by a year cell value.

    Supported inputs (after ``str()`` and stripping):

    * a single year: ``1998``, ``'1998'``, ``1998.0``
    * a dash or slash range with 2- or 4-digit endpoints: ``'1998-1999'``,
      ``'98-99'``, ``'1998/2001'``; reversed endpoints are swapped
    * a whitespace- or comma-separated list of 4-digit years: ``'1998 1999'``;
      only the listed years are returned, sorted and de-duplicated

    Args:
        year_val: Raw cell value (int, float, str, or ``None``).

    Returns:
        A list of ints. ``[None]`` is returned when the value is missing,
        unparseable, falls outside 1900-2100, or is a range spanning more
        than 100 years, so callers can still iterate and store a NULL year.
    """
    min_year, max_year = 1900, 2100

    if year_val is None:
        return [None]
    s = str(year_val).strip()
    if not s:
        return [None]

    # Single 4-digit year
    if re.fullmatch(r'(19|20)\d{2}', s):
        return [int(s)]

    # Range with dash or slash: 1998-1999, 98-99, 1998/1999.
    # 3-digit endpoints are rejected: they can only be typos.
    m = re.fullmatch(r'(\d{2}|\d{4})\s*[-/]\s*(\d{2}|\d{4})', s)
    if m:
        start, end = sorted(_to_four_digit_year(int(g)) for g in m.groups())
        # Apply the same bounds as the single-year path, plus a length cap.
        if start < min_year or end > max_year or end - start > 100:
            return [None]
        return list(range(start, end + 1))

    # Explicit list of years: '1998 1999' or '1998, 1999'
    tokens = s.replace(',', ' ').split()
    if len(tokens) > 1 and all(re.fullmatch(r'(19|20)\d{2}', t) for t in tokens):
        return sorted({int(t) for t in tokens})

    # Plain number, e.g. '1998.0' from a float cell
    try:
        y = int(float(s))
    except (ValueError, OverflowError):
        # OverflowError covers 'inf' / 'Infinity', which float() accepts.
        return [None]
    return [y] if min_year <= y <= max_year else [None]
|
||||
|
||||
|
||||
def main():
    """Import every sheet of the Keep Green workbook into the master DB.

    For each sheet the first row is skipped and the remaining rows are read
    positionally: 0 = make, 1 = model, 2 = engine, 3 = year, 4 = SKU,
    5-16 = interchange brand/part-number pairs, 17 = item name,
    18 = CARRO_PERTENECIENTE text. Each row upserts one ``supplier_catalog``
    item (category = sheet name), one ``supplier_catalog_compat`` row per
    expanded year, and its interchange rows. Every sheet is committed on its
    own, and a summary of counters is printed at the end.

    Exits with status 1 when the workbook file does not exist. Any other
    error propagates after the workbook, cursor and connection are closed;
    the uncommitted work of the sheet in progress is discarded.
    """
    print(f"[{datetime.now().isoformat()}] Starting Keep Green import...")

    if not os.path.exists(EXCEL_PATH):
        print(f"ERROR: Excel not found at {EXCEL_PATH}")
        sys.exit(1)

    print(f"Loading {EXCEL_PATH}...")
    wb = load_workbook(EXCEL_PATH, read_only=True, data_only=True)

    upsert_catalog_sql = """
        INSERT INTO supplier_catalog (supplier_name, sku, name, category, is_active)
        VALUES (%s, %s, %s, %s, true)
        ON CONFLICT (supplier_name, sku, category) DO UPDATE SET
            name = EXCLUDED.name,
            category = EXCLUDED.category,
            is_active = true
        RETURNING id
    """

    # NOTE(review): year and engine may be NULL here. Whether re-imports are
    # de-duplicated for such rows depends on how the unique index treats
    # NULLs -- confirm against the schema.
    insert_compat_sql = """
        INSERT INTO supplier_catalog_compat
            (catalog_id, make, model, year, engine, model_year_engine_id, source)
        VALUES (%s, %s, %s, %s, %s, NULL, %s)
        ON CONFLICT (catalog_id, make, model, year, engine) DO NOTHING
    """

    insert_interchange_sql = """
        INSERT INTO supplier_catalog_interchange (catalog_id, brand, part_number)
        VALUES (%s, %s, %s)
        ON CONFLICT DO NOTHING
    """

    stats = defaultdict(int)
    master_conn = None
    master_cur = None

    try:
        master_conn = connect_master()
        master_cur = master_conn.cursor()

        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]
            rows = list(ws.iter_rows(values_only=True))
            if not rows:
                continue
            data_rows = rows[1:]  # first row is not imported
            stats['sheets'] += 1
            print(f"\nProcessing sheet '{sheet_name}' with {len(data_rows)} rows...")

            # SKU -> catalog id for this sheet, so each item is upserted once.
            catalog_id_cache = {}

            for idx, row in enumerate(data_rows):
                if idx % 2000 == 0 and idx > 0:
                    print(f"  ...{idx} rows processed")

                if not row or len(row) < 5 or not row[4]:
                    stats['skipped_no_sku'] += 1
                    continue

                make = str(row[0]).strip().upper() if row[0] else ''
                model = str(row[1]).strip() if row[1] else ''
                engine = normalize_name(row[2]) if row[2] else None
                year_raw = row[3]
                sku = str(row[4]).strip()
                name = normalize_name(row[17]) if len(row) > 17 and row[17] else sheet_name
                carro = str(row[18]).strip() if len(row) > 18 and row[18] else ''

                if not sku:
                    stats['skipped_no_sku'] += 1
                    continue
                if not make or not model:
                    stats['skipped_no_vehicle'] += 1
                    continue

                stats['rows'] += 1

                # The make parsed from CARRO_PERTENECIENTE wins; the MARCA
                # column is only the fallback when that text has no make.
                # NOTE(review): the previous comment stated the opposite
                # priority -- confirm which source is meant to win.
                parsed_make = parse_make(carro) or make

                # Upsert catalog item (keyed by sku; category = sheet name)
                cache_key = sku
                catalog_id = catalog_id_cache.get(cache_key)
                if catalog_id is None:
                    master_cur.execute(upsert_catalog_sql, (SUPPLIER_NAME, sku, name, sheet_name))
                    row_result = master_cur.fetchone()
                    catalog_id = row_result[0] if row_result else None
                    catalog_id_cache[cache_key] = catalog_id
                    stats['catalog_items'] += 1

                if catalog_id is None:
                    stats['skipped_no_catalog'] += 1
                    continue

                # Expand years and insert compat rows
                years = expand_year(year_raw)
                for year in years:
                    master_cur.execute(insert_compat_sql, (
                        catalog_id,
                        parsed_make,
                        model,
                        year,
                        engine or None,  # whitespace-only engine text becomes NULL
                        'import_text',
                    ))
                    stats['compat_rows'] += 1

                # Insert interchanges
                interchanges = extract_interchanges(row)
                for brand, pn in interchanges:
                    master_cur.execute(insert_interchange_sql, (catalog_id, brand, pn))
                    stats['interchange_rows'] += 1

            master_conn.commit()
            print(f"  Sheet '{sheet_name}' committed.")
    finally:
        # Closing the connection without a commit discards pending changes,
        # so a failure mid-sheet leaves earlier committed sheets intact.
        if master_cur is not None:
            master_cur.close()
        if master_conn is not None:
            master_conn.close()
        # Read-only workbooks keep the file open until closed explicitly.
        wb.close()

    print(f"\n{'='*60}")
    print("IMPORT COMPLETE")
    print(f"{'='*60}")
    for k, v in sorted(stats.items()):
        print(f"{k:25s}: {v}")


if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user