Files
Autoparts-DB/scripts/import_vazlo_catalog.py
consultoria-as ea29cc31c0 feat(catalog): supplier catalog cleanup, fuzzy matching, and navigation fixes
- Cleaned 137+ fake engine-displacement models from supplier imports
  (v3/v4 scripts: Chevrolet, Ford, Chrysler, Dodge, Jeep, Nissan, etc.)
- Removed 1,251+ corrupted models (INT. prefixes, year-suffix, torque specs,
  empty names, trailing-year variants)
- Migrated supplier tables to master DB (supplier_catalog,
  supplier_catalog_compat, supplier_catalog_interchange)
- Fixed _get_mye_ids_with_parts() to query supplier_catalog_compat from
  master DB so supplier-only vehicles appear for all tenants
- Added fuzzy model matcher with parenthesis stripping, noise suffix removal,
  compact matching, prefix/substring fallback, model aliases, and ±3 year
  proximity
- Matched compat rows: KEEP GREEN +14,152, KNADIAN +3,021, VAZLO +127,500,
  LUK +477, RAYBESTOS +1,743
- Added KNADIAN catalog importer with year-range expansion and future-year
  filtering
- Added VAZLO catalog importer with position parsing and SKU-in-model cleanup
- Added Keep Green, LUK, Yokomitsu, Raybestos catalog importers
- Cache clearing after cleanups (_classify_cache_*, nexus:mye_ids:*,
  nexus:brand_mye_counts:*)

Final match rates:
- KEEP GREEN: 90.3%
- VAZLO: 93.6%
- YOKOMITSU: 100.0%
- KNADIAN: 57.4%
- LUK: 51.0%
- RAYBESTOS: 55.9%
2026-06-09 07:47:42 +00:00

286 lines
8.6 KiB
Python

#!/usr/bin/env python3
"""
Import VAZLO catalog from Excel into supplier_catalog tables.
Usage:
python scripts/import_vazlo_catalog.py
"""
import os
import re
import sys
from collections import defaultdict
from datetime import datetime
import psycopg2
from openpyxl import load_workbook
# DB connections
# Both URLs are read from the environment, falling back to local defaults.
MASTER_DB_URL = os.environ.get('MASTER_DB_URL', 'postgresql://postgres@localhost/nexus_autoparts')
TENANT_DB_URL = os.environ.get('TENANT_DB_URL', 'postgresql://postgres@localhost/tenant_refaccionaria_rached')
# Source workbook, resolved relative to the directory containing this script.
EXCEL_PATH = os.path.join(os.path.dirname(__file__), '..', 'data', 'VAZLO (1).xlsx')
# Value written to supplier_catalog.supplier_name for every imported item.
SUPPLIER_NAME = 'VAZLO'
# NOTE(review): not referenced by any code visible in this file — confirm it is still needed.
TENANT_ID = 31
# Tokens that parse_carro() peels off the END of a vehicle string and reports
# as the position. Tokens are compared upper-cased; abbreviations are listed
# both with and without the trailing dot.
POS_KEYWORDS = {
'DEL.', 'TRAS.', 'FRONT.', 'EXT.', 'IZQ.', 'DER.', 'RUEDA', 'CAJA',
'INF.', 'SUP.', 'TRANS.', 'STD', 'AWD', '2/4WD', '4WD', 'FWD', 'RWD',
'4X4', 'TURBO', 'GASOLINA', 'DIESEL',
'DEL', 'TRAS', 'FRONT', 'EXT', 'IZQ', 'DER', 'INF', 'SUP', 'TRANS',
}
# Makes spelled as two words. Key: the first two upper-cased tokens of the
# vehicle string; value: the make name parse_carro() returns for them.
MULTI_WORD_MAKES = {
('MERCEDES', 'BENZ'): 'MERCEDES BENZ',
('LAND', 'ROVER'): 'LAND ROVER',
('ALFA', 'ROMEO'): 'ALFA ROMEO',
('AMERICAN', 'MOTORS'): 'AMERICAN MOTORS',
('ROLLS', 'ROYCE'): 'ROLLS ROYCE',
('ASTON', 'MARTIN'): 'ASTON MARTIN',
('GREAT', 'WALL'): 'GREAT WALL',
}
def connect_master():
    """Open and return a new psycopg2 connection to MASTER_DB_URL."""
    master_connection = psycopg2.connect(MASTER_DB_URL)
    return master_connection
def connect_tenant():
    """Open and return a new psycopg2 connection to TENANT_DB_URL."""
    tenant_connection = psycopg2.connect(TENANT_DB_URL)
    return tenant_connection
def collect_all_skus(wb):
    """Pre-scan the workbook and return every distinct SKU it contains.

    The SKU is read from the second column (index 1) of each row, starting
    at row 2 of every sheet so the header row is skipped. parse_carro() uses
    the resulting set to detect a SKU that was typed into a model name.

    Args:
        wb: Workbook-like object exposing ``sheetnames`` and ``wb[name]``;
            each sheet must provide ``iter_rows(min_row=..., values_only=True)``.

    Returns:
        set[str]: Stripped, non-empty SKU strings.
    """
    skus = set()
    for sheet_name in wb.sheetnames:
        for row in wb[sheet_name].iter_rows(min_row=2, values_only=True):
            # Blank or truncated rows may not reach the SKU column; skip them
            # instead of raising IndexError on row[1].
            if not row or len(row) < 2 or not row[1]:
                continue
            sku = str(row[1]).strip()
            if sku:
                skus.add(sku)
    return skus
def parse_carro(carro, all_skus):
    """Split a CARRO_PERTENECIENTE vehicle string into its components.

    Examples of accepted input:
        'ACURA TL DEL. 2015'
        'BMW X1 SDRIVE 20IA TRAS. 2018'
        'ACURA TL FRONT. DER. 2004'
        'AUDI 4000S CAJA 1980'
        'MERCEDES BENZ C350 E --'
        'ACURA TLX 3429'   (3429 is a SKU that was typed into the model)

    Tokens are consumed in this order: trailing 4-digit year (19xx/20xx),
    trailing '--' no-year marker, leading make (one word, or two when the
    pair is in MULTI_WORD_MAKES), trailing position keywords (POS_KEYWORDS),
    and finally a trailing 3-4 digit token that is a known SKU.

    Args:
        carro: Raw cell value; any falsy value yields an all-None result.
        all_skus: Container of known SKU strings (membership-tested only).

    Returns:
        dict with keys ``make``, ``model``, ``year``, ``position``, ``raw``.
        For empty input every field except ``raw`` is None. Otherwise
        ``year`` is an int or None and ``position`` is '' when no position
        keyword was found.
    """
    if not carro:
        return {'make': None, 'model': None, 'year': None, 'position': None, 'raw': carro}
    s = str(carro).strip()
    parts = s.split()
    if not parts:
        return {'make': None, 'model': None, 'year': None, 'position': None, 'raw': s}

    # Year: a trailing 19xx / 20xx token.
    year = None
    if re.fullmatch(r'(19|20)\d{2}', parts[-1]):
        year = int(parts[-1])
        parts = parts[:-1]

    # Trailing '--' marks "no year" in the source sheet.
    if parts and parts[-1] == '--':
        parts = parts[:-1]

    # Make: first token, or the first two when they form a known two-word make.
    make = parts[0] if parts else ''
    two_word_make = None
    if len(parts) >= 2:
        two_word_make = MULTI_WORD_MAKES.get((parts[0].upper(), parts[1].upper()))
    if two_word_make is not None:
        make = two_word_make
        parts = parts[2:]
    else:
        parts = parts[1:]

    # Position: the run of position keywords at the end of what is left.
    split_at = len(parts)
    while split_at and parts[split_at - 1].upper() in POS_KEYWORDS:
        split_at -= 1
    model_parts = parts[:split_at]
    position_parts = parts[split_at:]

    # Drop a trailing SKU that leaked into the model ("TLX 3429" -> "TLX").
    # Only do so when another model token remains: a purely numeric model
    # such as "206" or "626" that happens to equal a SKU must not be erased.
    if (
        len(model_parts) > 1
        and re.fullmatch(r'\d{3,4}', model_parts[-1])
        and model_parts[-1] in all_skus
    ):
        model_parts = model_parts[:-1]

    return {
        'make': make,
        'model': ' '.join(model_parts),
        'year': year,
        'position': ' '.join(position_parts),
        'raw': s,
    }
def extract_interchanges(row):
    """Return the (brand, part_number) pairs found in the interchange columns.

    The 11 pairs occupy columns (2, 3), (4, 5), ... (22, 23): brand first,
    part number second. A pair is kept only when both cells are present and
    non-empty after stripping; columns beyond the end of the row are treated
    as missing.
    """
    width = len(row)
    pairs = []
    for brand_idx in range(2, 24, 2):
        pn_idx = brand_idx + 1
        if brand_idx >= width or not row[brand_idx]:
            continue
        brand = str(row[brand_idx]).strip()
        if pn_idx < width and row[pn_idx]:
            part_number = str(row[pn_idx]).strip()
        else:
            part_number = ''
        if brand and part_number:
            pairs.append((brand, part_number))
    return pairs
def normalize_name(name):
    """Return the piece name on one line with whitespace runs collapsed.

    Newlines count as whitespace, so multi-line cells become a single
    space-separated string. Falsy input yields ''.
    """
    if name:
        # split() with no argument breaks on any whitespace, newlines included.
        words = str(name).split()
        return ' '.join(words)
    return ''
# One catalog item per (supplier, sku, category); re-imports refresh the name.
_UPSERT_CATALOG_SQL = """
INSERT INTO supplier_catalog (supplier_name, sku, name, category, is_active)
VALUES (%s, %s, %s, %s, true)
ON CONFLICT (supplier_name, sku, category) DO UPDATE SET
name = EXCLUDED.name,
category = EXCLUDED.category,
is_active = true
RETURNING id
"""
# NOTE(review): year and engine may be NULL here; if the conflict target is a
# plain unique index, PostgreSQL treats NULLs as distinct and a re-import could
# duplicate such rows — confirm against the table definition.
_INSERT_COMPAT_SQL = """
INSERT INTO supplier_catalog_compat
(catalog_id, make, model, year, engine, model_year_engine_id, source)
VALUES (%s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (catalog_id, make, model, year, engine) DO NOTHING
"""
_INSERT_INTERCHANGE_SQL = """
INSERT INTO supplier_catalog_interchange (catalog_id, brand, part_number)
VALUES (%s, %s, %s)
ON CONFLICT DO NOTHING
"""


def _cell(row, idx):
    """Return row[idx], or None when the row is too short to have that column."""
    return row[idx] if idx < len(row) else None


def _import_sheet(cur, sheet_name, data_rows, all_skus, stats):
    """Write one sheet's data rows through ``cur`` and update ``stats`` in place."""
    # Cache catalog_id per (sku, sheet_name) to avoid repeated upserts
    catalog_id_cache = {}
    for idx, row in enumerate(data_rows):
        if idx % 2000 == 0 and idx > 0:
            print(f" ...{idx} rows processed")
        # Column 1 is the SKU; blank or truncated rows cannot be imported.
        if not row or len(row) < 2 or not row[1]:
            stats['skipped_no_sku'] += 1
            continue
        sku = str(row[1]).strip()
        if not sku:
            stats['skipped_no_sku'] += 1
            continue
        # Column 24 is the piece name, column 25 the vehicle text.
        name = normalize_name(_cell(row, 24))
        carro_cell = _cell(row, 25)
        carro_raw = str(carro_cell).strip() if carro_cell else ''
        stats['rows'] += 1

        # Upsert catalog item (keyed by sku + category)
        cache_key = (sku, sheet_name)
        catalog_id = catalog_id_cache.get(cache_key)
        if catalog_id is None:
            cur.execute(_UPSERT_CATALOG_SQL, (SUPPLIER_NAME, sku, name, sheet_name))
            catalog_id = cur.fetchone()[0]
            catalog_id_cache[cache_key] = catalog_id
            stats['catalog_items'] += 1

        # Insert compatibility (text-only, no MYE matching during import).
        # Rows whose vehicle text yields no make would only add an all-NULL
        # compat row, so they are counted and skipped instead.
        parsed = parse_carro(carro_raw, all_skus)
        if parsed['make']:
            stats['vehicles_parsed'] += 1
            cur.execute(_INSERT_COMPAT_SQL, (
                catalog_id,
                parsed['make'],
                parsed['model'],
                parsed['year'],
                parsed['position'] or None,  # position is stored in the engine column
                None,
                'import_text',
            ))
            stats['compat_rows'] += 1
        else:
            stats['skipped_no_carro'] += 1

        # Interchanges belong to the catalog item, so they are stored even
        # when the row has no usable vehicle text.
        for brand, pn in extract_interchanges(row):
            cur.execute(_INSERT_INTERCHANGE_SQL, (catalog_id, brand, pn))
            stats['interchange_rows'] += 1


def main():
    """Import the VAZLO Excel workbook into the master supplier_catalog tables.

    For every data row the SKU is upserted into supplier_catalog (one item
    per SKU and sheet, with the sheet name stored as the category), the
    vehicle text is parsed into supplier_catalog_compat, and the interchange
    pairs go into supplier_catalog_interchange. Work is committed once per
    sheet, and a summary of counters is printed at the end.

    Exits with status 1 when the workbook file does not exist. The workbook
    and the database connection are closed even if the import fails.
    """
    print(f"[{datetime.now().isoformat()}] Starting VAZLO import...")
    if not os.path.exists(EXCEL_PATH):
        print(f"ERROR: Excel not found at {EXCEL_PATH}")
        sys.exit(1)
    print(f"Loading {EXCEL_PATH}...")
    wb = load_workbook(EXCEL_PATH, read_only=True, data_only=True)
    master_conn = None
    try:
        # Pre-scan SKUs for SKU-in-model detection
        print("Pre-scanning SKUs...")
        all_skus = collect_all_skus(wb)
        print(f" Found {len(all_skus)} unique SKUs")

        # A single connection is enough; opening two leaked the first one.
        master_conn = connect_master()
        master_cur = master_conn.cursor()
        stats = {
            'sheets': 0,
            'rows': 0,
            'catalog_items': 0,
            'compat_rows': 0,
            'interchange_rows': 0,
            'vehicles_parsed': 0,
            'skipped_no_sku': 0,
            'skipped_no_carro': 0,
        }
        for sheet_name in wb.sheetnames:
            rows = list(wb[sheet_name].iter_rows(values_only=True))
            if not rows:
                continue
            data_rows = rows[1:]  # first row is the header
            stats['sheets'] += 1
            print(f"\nProcessing sheet '{sheet_name}' with {len(data_rows)} rows...")
            _import_sheet(master_cur, sheet_name, data_rows, all_skus, stats)
            # Commit per sheet
            master_conn.commit()
            print(f" Sheet '{sheet_name}' committed.")

        print(f"\n{'='*60}")
        print("IMPORT COMPLETE")
        print(f"{'='*60}")
        print(f"Sheets processed: {stats['sheets']}")
        print(f"Total rows read: {stats['rows']}")
        print(f"Catalog items: {stats['catalog_items']}")
        print(f"Compat rows: {stats['compat_rows']}")
        print(f"Interchange rows: {stats['interchange_rows']}")
        print(f"Vehicles parsed: {stats['vehicles_parsed']}")
        print(f"Skipped (no SKU): {stats['skipped_no_sku']}")
        print(f"Skipped (no vehicle): {stats['skipped_no_carro']}")
        master_cur.close()
    finally:
        # Closing without commit discards the current sheet's uncommitted work.
        if master_conn is not None:
            master_conn.close()
        # Read-only workbooks keep the file open until closed explicitly.
        wb.close()
# Script entry point: run the import only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()