feat(catalog): supplier catalog cleanup, fuzzy matching, and navigation fixes
- Cleaned 137+ fake engine-displacement models from supplier imports (v3/v4 scripts: Chevrolet, Ford, Chrysler, Dodge, Jeep, Nissan, etc.)
- Removed 1,251+ corrupted models (INT. prefixes, year suffixes, torque specs, empty names, trailing-year variants)
- Migrated supplier tables to master DB (supplier_catalog, supplier_catalog_compat, supplier_catalog_interchange)
- Fixed _get_mye_ids_with_parts() to query supplier_catalog_compat from master DB so supplier-only vehicles appear for all tenants
- Added fuzzy model matcher with parenthesis stripping, noise suffix removal, compact matching, prefix/substring fallback, model aliases, and ±3 year proximity
- Matched compat rows: KEEP GREEN +14,152, KNADIAN +3,021, VAZLO +127,500, LUK +477, RAYBESTOS +1,743
- Added KNADIAN catalog importer with year-range expansion and future-year filtering
- Added VAZLO catalog importer with position parsing and SKU-in-model cleanup
- Added Keep Green, LUK, Yokomitsu, Raybestos catalog importers
- Cache clearing after cleanups (_classify_cache_*, nexus:mye_ids:*, nexus:brand_mye_counts:*)

Final match rates:
- KEEP GREEN: 90.3%
- VAZLO: 93.6%
- YOKOMITSU: 100.0%
- KNADIAN: 57.4%
- LUK: 51.0%
- RAYBESTOS: 55.9%
This commit is contained in:
303
scripts/import_raybestos_catalog.py
Normal file
303
scripts/import_raybestos_catalog.py
Normal file
@@ -0,0 +1,303 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Import Raybestos catalog from Excel into supplier_catalog tables.
|
||||
|
||||
Usage:
|
||||
python scripts/import_raybestos_catalog.py
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from collections import Counter
|
||||
from datetime import datetime
|
||||
|
||||
import psycopg2
|
||||
from openpyxl import load_workbook
|
||||
|
||||
# Connection strings; each one can be overridden through the environment.
MASTER_DB_URL = os.environ.get(
    'MASTER_DB_URL',
    'postgresql://postgres@localhost/nexus_autoparts',
)
TENANT_DB_URL = os.environ.get(
    'TENANT_DB_URL',
    'postgresql://postgres@localhost/tenant_refaccionaria_rached',
)

# Source workbook, resolved relative to the directory holding this script.
EXCEL_PATH = os.path.join(os.path.dirname(__file__), '..', 'data', 'RAYBESTOS.xlsx')
SUPPLIER_NAME = 'RAYBESTOS'
TENANT_ID = 31

# Upper-case first words that may start a vehicle make. For two-word makes
# only the first word is listed (e.g. 'ALFA', 'LAND'); 'VOLSWAGEN' and 'VW'
# are accepted spellings that extract_make() maps to Volkswagen.
KNOWN_MAKES = {
    'ACURA', 'ALFA', 'AMERICAN', 'ASTON', 'AUDI',
    'BMW', 'BUICK',
    'CADILLAC', 'CHEVROLET', 'CHRYSLER', 'CITROEN',
    'DAEWOO', 'DODGE',
    'FIAT', 'FORD',
    'GMC', 'GREAT',
    'HONDA', 'HYUNDAI',
    'INFINITI', 'ISUZU',
    'JAGUAR', 'JEEP',
    'KIA',
    'LAMBORGHINI', 'LAND', 'LEXUS', 'LINCOLN',
    'MAZDA', 'MERCEDES', 'MERCURY', 'MINI', 'MITSUBISHI',
    'NISSAN',
    'PEUGEOT', 'PONTIAC', 'PORSCHE',
    'RENAULT', 'ROLLS',
    'SATURN', 'SCION', 'SEAT', 'SKODA', 'SMART', 'SUBARU', 'SUZUKI',
    'TESLA', 'TOYOTA',
    'VOLKSWAGEN', 'VOLSWAGEN', 'VOLVO', 'VW',
}

# Tokens in the vehicle text that describe a part position rather than a model.
POS_KEYWORDS = {
    'DELANTERA', 'TRASERA', 'TAS', 'DEL', 'TRAS',
    'FRONT', 'REAR', 'LAT', 'IZQ', 'DER',
}

# Market/origin tokens that are dropped from the model name.
NOTE_KEYWORDS = {
    'LATIN', 'AMERICA', 'NACIONAL', 'USA', 'EUROPA', 'IMPORTADO',
}
|
||||
|
||||
|
||||
def connect_master():
    """Open and return a new psycopg2 connection using MASTER_DB_URL."""
    dsn = MASTER_DB_URL
    return psycopg2.connect(dsn)
|
||||
|
||||
|
||||
def connect_tenant():
    """Open and return a new psycopg2 connection using TENANT_DB_URL."""
    dsn = TENANT_DB_URL
    return psycopg2.connect(dsn)
|
||||
|
||||
|
||||
def normalize_name(name):
    """Return *name* as single-spaced text.

    Falsy input (None, '', 0) yields ''. Any other value is converted with
    str(), newlines are treated as spaces, runs of whitespace are collapsed
    to one space, and leading/trailing whitespace is removed.
    """
    if name:
        flattened = str(name).replace('\n', ' ')
        words = flattened.split()
        return ' '.join(words)
    return ''
|
||||
|
||||
|
||||
def parse_abbr_year(token):
    """Expand an abbreviated year token into a four-digit year.

    Values 0-49 are read as 2000-2049 and values 50-99 as 1950-1999.

    Args:
        token: Text made only of decimal digits, e.g. '05' or '98'.

    Returns:
        The four-digit year as an int, or None when *token* is empty, is not
        a string, contains anything other than decimal digits, or is 100 or
        greater.
    """
    if not isinstance(token, str) or not token:
        return None
    # isdecimal() rather than isdigit(): isdigit() also accepts characters
    # such as superscript digits ('²') that int() refuses to parse, which
    # would raise ValueError instead of returning None.
    if not token.isdecimal():
        return None
    value = int(token)
    if value >= 100:
        return None
    century = 2000 if value < 50 else 1900
    return century + value
|
||||
|
||||
|
||||
# Second word that completes each two-word make, keyed by its first word.
_COMPOUND_MAKE_SECOND_WORD = {
    'ALFA': 'ROMEO',
    'MERCEDES': 'BENZ',
    'ROLLS': 'ROYCE',
    'LAND': 'ROVER',
    'GREAT': 'WALL',
    'AMERICAN': 'MOTORS',
    'ASTON': 'MARTIN',
}

# Misspellings / abbreviations mapped to the canonical make.
_MAKE_ALIASES = {
    'VOLSWAGEN': 'VOLKSWAGEN',
    'VW': 'VOLKSWAGEN',
}


def extract_make(parts):
    """Identify the vehicle make at the start of a token list.

    Matching is case-insensitive and the returned make is always upper-case,
    so the same make is never produced under two spellings (previously the
    aliases returned 'Volkswagen' while a literal 'VOLKSWAGEN' token was
    returned as typed, and single-word makes kept the input's casing).

    Args:
        parts: Whitespace-split tokens of the vehicle description.

    Returns:
        (make, make_len): the upper-case make and the number of leading
        tokens it consumed (1 or 2), or (None, 0) when *parts* is empty or
        its first token is not in KNOWN_MAKES.
    """
    if not parts:
        return None, 0

    first = parts[0].upper()
    if first not in KNOWN_MAKES:
        return None, 0

    # Two-word makes: consume the second token only when it really is the
    # expected continuation; otherwise fall through to the one-word result.
    second = _COMPOUND_MAKE_SECOND_WORD.get(first)
    if second is not None and len(parts) >= 2 and parts[1].upper() == second:
        return f'{first} {second}', 2

    return _MAKE_ALIASES.get(first, first), 1
|
||||
|
||||
|
||||
# Token patterns, compiled once because the parser runs for every sheet row.
_FULL_YEAR_RE = re.compile(r'^(19|20)\d{2}$')
_YEAR_RANGE_RE = re.compile(r'^(\d{2})-(\d{2})$')
_SHORT_YEAR_RE = re.compile(r'^\d{2}$')

# Canonical spelling for front/rear position keywords. Keywords not listed
# here (LAT, IZQ, DER) keep the title-cased form the importer has always
# emitted for them.
_POSITION_CANONICAL = {
    'DELANTERA': 'DELANTERA',
    'DEL': 'DELANTERA',
    'FRONT': 'DELANTERA',
    'TRASERA': 'TRASERA',
    'TRAS': 'TRASERA',
    'TAS': 'TRASERA',
    'REAR': 'TRASERA',
}


def parse_raybestos(carro, last_make):
    """Parse one free-text vehicle description from the Raybestos sheet.

    The text is split on whitespace and processed in this order:
      1. A trailing four-digit year (19xx / 20xx) is taken as the year.
      2. A leading known make is extracted; when there is none, *last_make*
         (the make of a previous row) is used instead.
      3. If no year was found yet, the first 'NN-NN' range (its end year is
         used) or bare two-digit token is taken as an abbreviated year.
      4. Position keywords set the position (the last one wins), note
         keywords are dropped, and the remaining tokens form the model.

    Front/rear positions are always returned as 'DELANTERA' / 'TRASERA';
    previously the full keywords produced 'Delantera' / 'Trasera' while
    their abbreviations produced the upper-case form.

    Args:
        carro: Raw vehicle cell value; falsy or blank means "no vehicle".
        last_make: Make carried over from earlier rows, or None.

    Returns:
        (make, model, position, year, last_make). For empty input the first
        four items are None and *last_make* is passed through unchanged.
        Otherwise *last_make* is updated when the text names a make itself;
        *model* may be '' and *position* / *year* may be None.
    """
    if not carro:
        return None, None, None, None, last_make
    parts = str(carro).split()
    if not parts:
        return None, None, None, None, last_make

    # Extract 4-digit year from end
    year = None
    if _FULL_YEAR_RE.match(parts[-1]):
        year = int(parts[-1])
        parts = parts[:-1]

    # Extract make, falling back to the carried-over one
    make, make_len = extract_make(parts)
    if make:
        last_make = make
        remaining = parts[make_len:]
    else:
        make = last_make if last_make else None
        remaining = list(parts)

    # Extract abbreviated year or year range from remaining
    if year is None:
        for i, token in enumerate(remaining):
            range_match = _YEAR_RANGE_RE.match(token)
            if range_match:
                # Year range like 17-18, 90-05: use the end year
                year = parse_abbr_year(range_match.group(2))
                del remaining[i]
                break
            if _SHORT_YEAR_RE.match(token):
                short_year = parse_abbr_year(token)
                if short_year:
                    year = short_year
                    del remaining[i]
                    break

    # Extract position keywords and notes
    position = None
    model_tokens = []
    for token in remaining:
        upper = token.upper()
        if upper in POS_KEYWORDS:
            position = _POSITION_CANONICAL.get(upper, upper.title())
        elif upper in NOTE_KEYWORDS:
            continue  # market/origin notes are not part of the model
        else:
            model_tokens.append(token)

    model = ' '.join(model_tokens)
    return make, model, position, year, last_make
|
||||
|
||||
|
||||
def extract_interchanges(row):
    """Collect (brand, part_number) pairs from the interchange columns.

    Brand/part-number pairs are read from columns (2, 3) and (4, 5) of
    *row*. A pair is kept only when both cells are present and non-blank
    after stripping; values are converted with str().
    """
    width = len(row)
    pairs = []
    for marca_col, inter_col in ((2, 3), (4, 5)):
        if marca_col >= width or not row[marca_col]:
            continue
        brand = str(row[marca_col]).strip()
        if inter_col < width and row[inter_col]:
            pn = str(row[inter_col]).strip()
        else:
            pn = ''
        if brand and pn:
            pairs.append((brand, pn))
    return pairs
|
||||
|
||||
|
||||
def _cell_text(row, index):
    """Return row[index] as stripped text, or '' if the cell is missing or falsy."""
    if not row or index >= len(row) or not row[index]:
        return ''
    return str(row[index]).strip()


def _best_name_per_sku(ws):
    """Map each SKU (column 1) to its most frequent name (column 6).

    On a tie the name that was seen first is kept.
    """
    counts = Counter()
    for row in ws.iter_rows(min_row=2, values_only=True):
        sku = _cell_text(row, 1)
        name = normalize_name(row[6]) if row and len(row) > 6 else ''
        if sku and name:
            counts[(sku, name)] += 1

    best = {}
    for (sku, name), count in counts.items():
        if sku not in best or count > best[sku][1]:
            best[sku] = (name, count)
    return {sku: name for sku, (name, _count) in best.items()}


def main():
    """Import the Raybestos 'Freno_de_disco' sheet into the master DB.

    Steps:
      1. Open the workbook at EXCEL_PATH and its 'Freno_de_disco' sheet.
      2. Pre-scan the sheet to choose the most frequent name per SKU.
      3. For every data row: upsert the catalog item (once per SKU), insert
         a compatibility row parsed from the vehicle text in column 7, and
         insert the interchange pairs from columns 2-5.
      4. Commit once at the end and print a summary.

    Exits with status 1 when the workbook or the sheet is missing. Any other
    error propagates; nothing is committed in that case and the connection,
    cursor and workbook are still closed.
    """
    print(f"[{datetime.now().isoformat()}] Starting Raybestos import...")

    if not os.path.exists(EXCEL_PATH):
        print(f"ERROR: Excel not found at {EXCEL_PATH}")
        sys.exit(1)

    # The sheet name doubles as the catalog category.
    category = 'Freno_de_disco'

    # Four columns, four placeholders (a fifth placeholder used to make every
    # execute fail because only four parameters are supplied).
    upsert_catalog_sql = """
        INSERT INTO supplier_catalog (supplier_name, sku, name, category)
        VALUES (%s, %s, %s, %s)
        ON CONFLICT (supplier_name, sku, category) DO UPDATE SET
            name = EXCLUDED.name,
            category = EXCLUDED.category
        RETURNING id
    """

    insert_compat_sql = """
        INSERT INTO supplier_catalog_compat
            (catalog_id, make, model, year, engine, model_year_engine_id, source)
        VALUES (%s, %s, %s, %s, %s, %s, %s)
        ON CONFLICT (catalog_id, make, model, year, engine) DO NOTHING
    """

    insert_interchange_sql = """
        INSERT INTO supplier_catalog_interchange (catalog_id, brand, part_number)
        VALUES (%s, %s, %s)
        ON CONFLICT DO NOTHING
    """

    stats = {
        'rows': 0,
        'catalog_items': 0,
        'compat_rows': 0,
        'interchange_rows': 0,
        'vehicles_parsed': 0,
        'forward_filled_make': 0,
    }

    print(f"Loading {EXCEL_PATH}...")
    wb = load_workbook(EXCEL_PATH, read_only=True, data_only=True)
    master_conn = None
    master_cur = None
    try:
        if category not in wb.sheetnames:
            print(f"ERROR: Sheet '{category}' not found in {EXCEL_PATH}")
            sys.exit(1)
        ws = wb[category]

        # A single connection; it used to be opened twice, leaking the first.
        master_conn = connect_master()
        master_cur = master_conn.cursor()

        # Pre-scan: determine most common name per SKU
        print("Pre-scanning SKUs...")
        sku_best_name = _best_name_per_sku(ws)
        print(f"  Found {len(sku_best_name)} unique SKUs")

        catalog_id_cache = {}
        last_make = None

        for idx, row in enumerate(ws.iter_rows(min_row=2, values_only=True)):
            if idx % 1000 == 0 and idx > 0:
                print(f"  ...{idx} rows processed")

            sku = _cell_text(row, 1)
            name = sku_best_name.get(sku, '')
            if not sku or not name:
                continue
            carro_raw = _cell_text(row, 7)

            stats['rows'] += 1

            # Upsert the catalog item only the first time its SKU is seen.
            cache_key = (sku, category)
            catalog_id = catalog_id_cache.get(cache_key)
            if catalog_id is None:
                master_cur.execute(
                    upsert_catalog_sql,
                    (SUPPLIER_NAME, sku, name, category),
                )
                catalog_id = master_cur.fetchone()[0]
                catalog_id_cache[cache_key] = catalog_id
                stats['catalog_items'] += 1

            make, model, position, year, last_make = parse_raybestos(
                carro_raw, last_make
            )
            if make and carro_raw and not extract_make(carro_raw.split())[0]:
                stats['forward_filled_make'] += 1

            # Only rows that yielded both a make and a model describe a
            # vehicle; skipping the rest avoids compat rows with a NULL or
            # empty make/model.
            if make and model:
                stats['vehicles_parsed'] += 1
                master_cur.execute(insert_compat_sql, (
                    catalog_id,
                    make,
                    model,
                    year,
                    position,  # this importer stores the position in `engine`
                    None,
                    'import_text',
                ))
                stats['compat_rows'] += 1

            for brand, pn in extract_interchanges(row):
                master_cur.execute(insert_interchange_sql, (catalog_id, brand, pn))
                stats['interchange_rows'] += 1

        master_conn.commit()

        print(f"\n{'='*60}")
        print("IMPORT COMPLETE")
        print(f"{'='*60}")
        print(f"Total rows read: {stats['rows']}")
        print(f"Catalog items: {stats['catalog_items']}")
        print(f"Compat rows: {stats['compat_rows']}")
        print(f"Interchange rows: {stats['interchange_rows']}")
        print(f"Vehicles parsed: {stats['vehicles_parsed']}")
        print(f"Forward-filled makes: {stats['forward_filled_make']}")
    finally:
        # Closing the connection without a commit discards pending changes,
        # so a failed run leaves the tables untouched.
        if master_cur is not None:
            master_cur.close()
        if master_conn is not None:
            master_conn.close()
        # Read-only workbooks keep the file open until closed explicitly.
        wb.close()
|
||||
|
||||
|
||||
# Script entry point: run the import only when executed directly.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user