- Cleaned 137+ fake engine-displacement models from supplier imports (v3/v4 scripts: Chevrolet, Ford, Chrysler, Dodge, Jeep, Nissan, etc.)
- Removed 1,251+ corrupted models (INT. prefixes, year-suffix, torque specs, empty names, trailing-year variants)
- Migrated supplier tables to master DB (supplier_catalog, supplier_catalog_compat, supplier_catalog_interchange)
- Fixed _get_mye_ids_with_parts() to query supplier_catalog_compat from master DB so supplier-only vehicles appear for all tenants
- Added fuzzy model matcher with parenthesis stripping, noise suffix removal, compact matching, prefix/substring fallback, model aliases, and ±3 year proximity
- Matched compat rows: KEEP GREEN +14,152, KNADIAN +3,021, VAZLO +127,500, LUK +477, RAYBESTOS +1,743
- Added KNADIAN catalog importer with year-range expansion and future-year filtering
- Added VAZLO catalog importer with position parsing and SKU-in-model cleanup
- Added Keep Green, LUK, Yokomitsu, Raybestos catalog importers
- Cache clearing after cleanups (_classify_cache_*, nexus:mye_ids:*, nexus:brand_mye_counts:*)

Final match rates:
- KEEP GREEN: 90.3%
- VAZLO: 93.6%
- YOKOMITSU: 100.0%
- KNADIAN: 57.4%
- LUK: 51.0%
- RAYBESTOS: 55.9%
394 lines
12 KiB
Python
Executable File
394 lines
12 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Import Yokomitsu catalog from Excel into supplier_catalog tables.
|
|
|
|
Usage:
|
|
python scripts/import_yokomitsu_catalog.py
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import sys
|
|
from datetime import datetime
|
|
|
|
import psycopg2
|
|
from openpyxl import load_workbook
|
|
|
|
# DB connections
# Each URL is read from the environment; the literal is the fallback used when
# the variable is unset.
MASTER_DB_URL = os.environ.get('MASTER_DB_URL', 'postgresql://postgres@localhost/nexus_autoparts')
# NOTE(review): TENANT_DB_URL is only used by connect_tenant(), which nothing in
# the visible part of this script calls - confirm whether it is still needed.
TENANT_DB_URL = os.environ.get('TENANT_DB_URL', 'postgresql://postgres@localhost/tenant_refaccionaria_rached')

# Workbook to import, resolved relative to this script's directory (../data/).
EXCEL_PATH = os.path.join(os.path.dirname(__file__), '..', 'data', 'YOKOMITSU_CATALOGOS_COMPLETOS_TODOS.xlsx')
# Value written to supplier_catalog.supplier_name for every imported item.
SUPPLIER_NAME = 'YOKOMITSU'
# NOTE(review): TENANT_ID is not referenced anywhere in the visible code;
# presumably the tenant this catalog was first imported for - confirm.
TENANT_ID = 31
|
|
|
|
|
|
def connect_master():
    """Open a new psycopg2 connection to the master database.

    The target is taken from the module-level MASTER_DB_URL. The caller owns
    the returned connection and is responsible for closing it.
    """
    connection = psycopg2.connect(MASTER_DB_URL)
    return connection
|
|
|
|
|
|
def connect_tenant():
    """Open a new psycopg2 connection to the tenant database.

    The target is taken from the module-level TENANT_DB_URL. The caller owns
    the returned connection and is responsible for closing it.
    """
    connection = psycopg2.connect(TENANT_DB_URL)
    return connection
|
|
|
|
|
|
def parse_year(token):
    """Parse a 2-digit or 4-digit year string.

    Args:
        token: Text such as ``"08"``, ``"1988"`` or a range like ``"08-15"``.
            For a range only the first year is used. ``None`` and non-string
            values are tolerated (the value is converted with ``str``).

    Returns:
        The 4-digit year as an ``int``, or ``None`` when the token is not a
        year. Two-digit values below 50 map to 20xx, the rest to 19xx;
        four-digit values are accepted only within 1900-2050.
    """
    if token is None:
        return None

    text = str(token).strip()
    if '-' in text:
        # Ranges such as "08-13" are represented by their first year.
        text = text.split('-')[0].strip()

    # isdecimal() (unlike isdigit()) rejects characters such as superscript
    # digits that int() cannot convert, so int() below can never raise.
    if not text.isdecimal():
        return None

    # Only 2- and 4-digit tokens are years; this keeps bare model numbers
    # ("3", "6", "008") from being mistaken for a year.
    if len(text) == 2:
        short_year = int(text)
        return 2000 + short_year if short_year < 50 else 1900 + short_year
    if len(text) == 4:
        full_year = int(text)
        return full_year if 1900 <= full_year <= 2050 else None
    return None
|
|
|
|
|
|
def parse_vehicle(vehicle_raw):
    """Split a free-text vehicle description into make, model, year and engine.

    Example inputs:
        'Chevrolet AVEO 1.5L 18'
        'Audi A4 1.8L/2.0L 09'
        'Dodge GRAND CHEROKEE 2/4WD 3.0L/3.7L/4.7L 08'
        'Volkswagen JETTA A4/CLASICO 1.8L/2.0L 06 V'
        'NISSAN 720 1988'
        'Dodge CARAVAN/VOYAGER 00'
        'ER 08-15 10'  (garbage/unknown)

    The first token is taken as the make. The year is read from the last
    token or, failing that, from the second-to-last one (in which case the
    trailing token is treated as a suffix and dropped). An engine spec at the
    end of what remains is split off; the rest is the model.

    Args:
        vehicle_raw: The raw cell value; any falsy value yields an all-None
            result.

    Returns:
        dict with keys ``make``, ``model``, ``year``, ``engine`` and
        ``vehicle_raw`` (the cleaned input string, or the original falsy
        value).
    """
    if not vehicle_raw:
        return {'make': None, 'model': None, 'year': None, 'engine': None, 'vehicle_raw': vehicle_raw}

    s = str(vehicle_raw).strip()
    # Remove trailing 'V' (variant marker)
    s = re.sub(r'\s+V$', '', s)

    tokens = s.split()
    if len(tokens) < 2:
        return {'make': None, 'model': None, 'year': None, 'engine': None, 'vehicle_raw': s}

    # Last token is usually the year.
    year = parse_year(tokens[-1])
    tokens_without_year = tokens[:-1]
    if year is None and len(tokens) >= 3:
        # Fall back to the second-to-last token. The year found here must be
        # kept as-is: re-parsing after removing the token would read the
        # token *before* the year and discard the match.
        year = parse_year(tokens[-2])
        if year is not None:
            tokens_without_year = tokens[:-2]

    if year is None:
        # No year found; keep raw and return a best-effort make/model split.
        return {'make': tokens[0], 'model': ' '.join(tokens[1:]),
                'year': None, 'engine': None, 'vehicle_raw': s}

    make = tokens_without_year[0] if tokens_without_year else None
    remaining = ' '.join(tokens_without_year[1:]) if len(tokens_without_year) > 1 else ''

    # Look for an engine spec at the END of the remaining string.
    # Common patterns: "1.5L", "1.8L/2.0L", "2/4WD", "3.0L/3.7L/4.7L", "1.9L DIESEL"
    engine = None
    model = remaining
    engine_match = re.search(r'(\d+(?:\.\d+)?\s*L(?:/\d+(?:\.\d+)?\s*L)*|\d+/\d+WD|\d+\.\d+L\s+DIESEL|\d+\.\d+L\s+TURBO)$', remaining, re.IGNORECASE)
    if engine_match:
        engine = engine_match.group(1)
        model = remaining[:engine_match.start()].strip()
    else:
        # Simpler fallback: a last word containing a digit plus 'L' or 'WD'.
        parts = remaining.split()
        if parts and re.search(r'\d', parts[-1]) and ('L' in parts[-1].upper() or 'WD' in parts[-1].upper()):
            engine = parts[-1]
            model = ' '.join(parts[:-1])

    return {
        'make': make,
        'model': model,
        'year': year,
        'engine': engine,
        'vehicle_raw': s,
    }
|
|
|
|
|
|
def build_brand_cache(cur):
    """Load every brand from the ``brands`` table.

    Args:
        cur: An open DB cursor (needs ``execute`` and ``fetchall``).

    Returns:
        dict mapping the upper-cased brand name to its ``id_brand``. When two
        names collide after upper-casing, the later row wins.
    """
    cur.execute("SELECT id_brand, name_brand FROM brands")
    brand_ids = {}
    for record in cur.fetchall():
        brand_ids[record[1].upper()] = record[0]
    return brand_ids
|
|
|
|
|
|
def build_model_cache(cur):
    """Load every model from the ``models`` table, grouped by brand.

    Args:
        cur: An open DB cursor (needs ``execute`` and ``fetchall``).

    Returns:
        dict mapping ``brand_id`` to a list of ``(id_model, name_model)``
        tuples, in the order the rows were returned.
    """
    cur.execute("SELECT id_model, brand_id, name_model FROM models")
    models_by_brand = {}
    for model_id, brand_id, model_name in cur.fetchall():
        # Group per brand so a lookup only scans that brand's models.
        if brand_id not in models_by_brand:
            models_by_brand[brand_id] = []
        models_by_brand[brand_id].append((model_id, model_name))
    return models_by_brand
|
|
|
|
|
|
def build_year_cache(cur):
    """Load every row of the ``years`` table.

    Args:
        cur: An open DB cursor (needs ``execute`` and ``fetchall``).

    Returns:
        dict mapping ``year_car`` to ``id_year``.
    """
    cur.execute("SELECT id_year, year_car FROM years")
    year_ids = {}
    for record in cur.fetchall():
        year_ids[record[1]] = record[0]
    return year_ids
|
|
|
|
|
|
def build_mye_cache(cur):
    """Load every row of ``model_year_engine``, grouped by model and year.

    Args:
        cur: An open DB cursor (needs ``execute`` and ``fetchall``).

    Returns:
        dict mapping ``(model_id, year_id)`` to the list of ``id_mye`` values
        for that pair, in the order the rows were returned.
    """
    cur.execute("SELECT id_mye, model_id, year_id FROM model_year_engine")
    mye_by_model_year = {}
    for record in cur.fetchall():
        mye_id, model_id, year_id = record
        key = (model_id, year_id)
        if key not in mye_by_model_year:
            mye_by_model_year[key] = []
        mye_by_model_year[key].append(mye_id)
    return mye_by_model_year
|
|
|
|
|
|
def fuzzy_match_vehicle(parsed, brand_cache, model_cache, year_cache, mye_cache):
    """Resolve a parsed vehicle to model-year-engine (MYE) ids.

    Matching steps:
      1. Brand: exact upper-cased lookup, else the first cached brand whose
         name contains, or is contained in, the make.
      2. Model: every model of that brand whose name contains the whole model
         text; if none, every model containing any word of it that is at
         least three characters long.
      3. Year: exact lookup in ``year_cache``.

    Args:
        parsed: dict with ``make``, ``model`` and ``year`` keys.
        brand_cache: upper-cased brand name -> brand id.
        model_cache: brand id -> list of ``(model_id, model_name)``.
        year_cache: year -> year id.
        mye_cache: ``(model_id, year_id)`` -> list of MYE ids.

    Returns:
        List of MYE ids for all matched models in that year (may be empty).
    """
    make = parsed.get('make')
    model_keyword = parsed.get('model')
    year = parsed.get('year')
    if not (make and model_keyword and year):
        return []

    # Brand: exact hit first, then the first substring match either way.
    make_upper = make.upper()
    brand_id = brand_cache.get(make_upper)
    if not brand_id:
        brand_id = next(
            (bid for name, bid in brand_cache.items()
             if make_upper in name or name in make_upper),
            None,
        )
    if not brand_id:
        return []

    # Models: the whole model text as a substring first.
    keyword = model_keyword.upper()
    candidates = [(mid, mname.upper()) for mid, mname in model_cache.get(brand_id, [])]
    matched_model_ids = [mid for mid, upper_name in candidates if keyword in upper_name]
    if not matched_model_ids:
        # Fall back to any sufficiently long word of the model text.
        words = [word for word in keyword.split() if len(word) >= 3]
        matched_model_ids = [
            mid for mid, upper_name in candidates
            if any(word in upper_name for word in words)
        ]
    if not matched_model_ids:
        return []

    year_id = year_cache.get(year)
    if not year_id:
        return []

    # Gather the MYE ids of every matched model for that year.
    return [
        mye_id
        for mid in matched_model_ids
        for mye_id in mye_cache.get((mid, year_id), [])
    ]
|
|
|
|
|
|
def extract_interchanges(row):
    """Collect the (brand, part_number) interchange pairs from a sheet row.

    The pairs sit in six adjacent column pairs: indexes (2, 3), (4, 5), ...
    (12, 13), i.e. MARCA.1/INTERCAMBIO through MARCA.6/INTERCAMBIO.5.

    Args:
        row: Indexable row of cell values with at least 14 entries.

    Returns:
        List of ``(brand, part_number)`` string tuples, stripped of
        surrounding whitespace. A pair is skipped when either side is empty.
    """
    raw_pairs = [(row[col], row[col + 1]) for col in range(2, 13, 2)]

    found = []
    for raw_brand, raw_pn in raw_pairs:
        if not (raw_brand and raw_pn):
            continue
        brand_text = str(raw_brand).strip()
        pn_text = str(raw_pn).strip()
        # Whitespace-only cells are truthy before stripping; re-check after.
        if brand_text and pn_text:
            found.append((brand_text, pn_text))
    return found
|
|
|
|
|
|
def main():
    """Import every sheet of the Yokomitsu workbook into the master DB.

    For each data row (the first row of a sheet is skipped as the header):
      * upsert a ``supplier_catalog`` item, using the sheet name as category;
      * parse the vehicle text and try to match it to MYE ids;
      * insert one ``supplier_catalog_compat`` row per matched MYE id, or a
        single text-only row when nothing matched;
      * insert the row's interchange part numbers.

    Work is committed once per sheet. Summary statistics are printed at the
    end. Exits with status 1 when the workbook file does not exist. The DB
    connection, its cursor and the workbook are always closed, including on
    failure; uncommitted work for the sheet in progress is then discarded.
    """
    print(f"[{datetime.now().isoformat()}] Starting import...")

    if not os.path.exists(EXCEL_PATH):
        print(f"ERROR: Excel not found at {EXCEL_PATH}")
        sys.exit(1)

    print(f"Loading {EXCEL_PATH}...")
    wb = load_workbook(EXCEL_PATH, read_only=True, data_only=True)

    # Prepare UPSERT statements.
    # The placeholder count must equal the column count (four).
    upsert_catalog_sql = """
        INSERT INTO supplier_catalog (supplier_name, sku, name, category)
        VALUES (%s, %s, %s, %s)
        ON CONFLICT (supplier_name, sku, category) DO UPDATE SET
            name = EXCLUDED.name,
            category = EXCLUDED.category
        RETURNING id
    """

    # NOTE(review): the conflict target does not include model_year_engine_id,
    # so if that unique constraint treats the text columns as equal, only the
    # first MYE id of a vehicle is stored on conflict - confirm against the
    # schema. stats['compat_rows'] counts attempted inserts either way.
    insert_compat_sql = """
        INSERT INTO supplier_catalog_compat
            (catalog_id, make, model, year, engine, model_year_engine_id, source)
        VALUES (%s, %s, %s, %s, %s, %s, %s)
        ON CONFLICT (catalog_id, make, model, year, engine) DO NOTHING
    """

    insert_interchange_sql = """
        INSERT INTO supplier_catalog_interchange (catalog_id, brand, part_number)
        VALUES (%s, %s, %s)
        ON CONFLICT DO NOTHING
    """

    def _cell(row, index):
        """Return row[index], or None when the row is too short."""
        return row[index] if len(row) > index else None

    # Track stats
    stats = {
        'sheets': 0,
        'rows': 0,
        'catalog_items': 0,
        'compat_rows': 0,
        'interchange_rows': 0,
        'vehicles_parsed': 0,
        'vehicles_matched': 0,
        'mye_matches': 0,
    }

    master_conn = None
    master_cur = None
    try:
        # Open exactly one connection and one cursor.
        master_conn = connect_master()
        master_cur = master_conn.cursor()

        print("Building caches...")
        brand_cache = build_brand_cache(master_cur)
        model_cache = build_model_cache(master_cur)
        year_cache = build_year_cache(master_cur)
        mye_cache = build_mye_cache(master_cur)
        print(f" Brands: {len(brand_cache)}, Models: {sum(len(v) for v in model_cache.values())}, Years: {len(year_cache)}, MYE combos: {len(mye_cache)}")

        # Process each sheet
        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]
            rows = list(ws.iter_rows(values_only=True))
            if not rows:
                continue
            data_rows = rows[1:]  # rows[0] is the header row
            stats['sheets'] += 1
            print(f"\nProcessing sheet '{sheet_name}' with {len(data_rows)} rows...")

            for idx, row in enumerate(data_rows):
                if idx % 1000 == 0 and idx > 0:
                    print(f" ...{idx} rows processed")

                # Skip empty rows (and rows too short to carry a SKU).
                if not row or not _cell(row, 1):
                    continue

                sku = str(row[1]).strip()
                raw_name = _cell(row, 14)
                raw_vehicle = _cell(row, 15)
                name = str(raw_name).strip() if raw_name else ''
                vehicle_raw = str(raw_vehicle).strip() if raw_vehicle else ''

                if not sku or not name:
                    continue

                stats['rows'] += 1

                # Upsert catalog item
                master_cur.execute(upsert_catalog_sql, (SUPPLIER_NAME, sku, name, sheet_name))
                catalog_id = master_cur.fetchone()[0]
                stats['catalog_items'] += 1

                # Parse vehicle
                parsed = parse_vehicle(vehicle_raw)
                stats['vehicles_parsed'] += 1

                mye_ids = fuzzy_match_vehicle(parsed, brand_cache, model_cache, year_cache, mye_cache)
                if mye_ids:
                    stats['vehicles_matched'] += 1
                    stats['mye_matches'] += len(mye_ids)

                # One compat row per matched MYE id, or a single text-only
                # row (NULL MYE id) when nothing matched.
                if mye_ids:
                    compat_targets = [(mye_id, 'fuzzy_match') for mye_id in mye_ids]
                else:
                    compat_targets = [(None, 'import_text')]
                for mye_id, source in compat_targets:
                    master_cur.execute(insert_compat_sql, (
                        catalog_id,
                        parsed['make'],
                        parsed['model'],
                        parsed['year'],
                        parsed['engine'],
                        mye_id,
                        source,
                    ))
                    stats['compat_rows'] += 1

                # Insert interchanges
                for brand, pn in extract_interchanges(row):
                    master_cur.execute(insert_interchange_sql, (catalog_id, brand, pn))
                    stats['interchange_rows'] += 1

            # Commit per sheet
            master_conn.commit()
            print(f" Sheet '{sheet_name}' committed.")

        # Final stats
        print(f"\n{'='*60}")
        print("IMPORT COMPLETE")
        print(f"{'='*60}")
        print(f"Sheets processed: {stats['sheets']}")
        print(f"Total rows read: {stats['rows']}")
        print(f"Catalog items: {stats['catalog_items']}")
        print(f"Compat rows: {stats['compat_rows']}")
        print(f"Interchange rows: {stats['interchange_rows']}")
        print(f"Vehicles parsed: {stats['vehicles_parsed']}")
        print(f"Vehicles with MYE: {stats['vehicles_matched']}")
        print(f"Total MYE matches: {stats['mye_matches']}")
    finally:
        # Release resources on success and on failure alike.
        if master_cur is not None:
            master_cur.close()
        if master_conn is not None:
            master_conn.close()
        # A read-only workbook keeps its file open until closed explicitly.
        wb.close()
|
|
|
|
|
|
# Run the import only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|