- import_tecdoc.py: 2-phase TecDoc download + import (brands, models, vehicles) - import_live.py: real-time streaming importer for part details - run_all_brands.sh: automated sequential brand processing pipeline Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
415 lines · 15 KiB · Python
#!/usr/bin/env python3
"""
Import vehicle data from TecDoc (Apify) into Nexus Autoparts PostgreSQL.

Two-phase approach:
  Phase 1: Download all data from TecDoc API to local JSON files
  Phase 2: Import JSON files into PostgreSQL

Usage:
  python3 scripts/import_tecdoc.py download                 # Phase 1: fetch from API
  python3 scripts/import_tecdoc.py download --brand TOYOTA  # Single brand
  python3 scripts/import_tecdoc.py import                   # Phase 2: load into DB
  python3 scripts/import_tecdoc.py status                   # Check progress
"""
|
||
|
||
import os
|
||
import sys
|
||
import json
|
||
import time
|
||
import argparse
|
||
import requests
|
||
import psycopg2
|
||
from datetime import datetime
|
||
from pathlib import Path
|
||
|
||
# --- Config ---
|
||
APIFY_TOKEN = os.environ.get("APIFY_TOKEN", "apify_api_l5SrcwYyanAO45AFxrEpviUcuVRIFK2yPdc5")
|
||
APIFY_ACTOR = "making-data-meaningful~tecdoc"
|
||
APIFY_URL = f"https://api.apify.com/v2/acts/{APIFY_ACTOR}/run-sync-get-dataset-items"
|
||
DB_URL = os.environ.get("DATABASE_URL", "postgresql://nexus:nexus_autoparts_2026@localhost/nexus_autoparts")
|
||
|
||
TYPE_ID = 1 # Passenger cars
|
||
LANG_ID = 4 # English
|
||
COUNTRY_ID = 153 # Mexico
|
||
|
||
DATA_DIR = Path("/home/Autopartes/data/tecdoc")
|
||
APIFY_DELAY = 1.0 # seconds between API calls
|
||
|
||
|
||
def apify_call(input_data, retries=3):
|
||
"""Call Apify actor and return result."""
|
||
for attempt in range(retries):
|
||
try:
|
||
resp = requests.post(
|
||
APIFY_URL, params={"token": APIFY_TOKEN},
|
||
headers={"Content-Type": "application/json"},
|
||
json=input_data, timeout=120
|
||
)
|
||
if resp.status_code in (200, 201):
|
||
data = resp.json()
|
||
return data[0] if isinstance(data, list) and data else data
|
||
elif resp.status_code == 429:
|
||
wait = 15 * (attempt + 1)
|
||
print(f" Rate limited, waiting {wait}s...", flush=True)
|
||
time.sleep(wait)
|
||
else:
|
||
print(f" HTTP {resp.status_code}: {resp.text[:100]}", flush=True)
|
||
time.sleep(5)
|
||
except Exception as e:
|
||
print(f" Error: {e}", flush=True)
|
||
time.sleep(5)
|
||
return None
|
||
|
||
|
||
# ──────────────── Phase 1: Download ────────────────
|
||
|
||
def download(brand_filter=None):
|
||
"""Download all TecDoc data to local JSON files."""
|
||
DATA_DIR.mkdir(parents=True, exist_ok=True)
|
||
|
||
# Step 1: Manufacturers
|
||
mfr_file = DATA_DIR / "manufacturers.json"
|
||
if mfr_file.exists():
|
||
manufacturers = json.loads(mfr_file.read_text())
|
||
print(f"Loaded {len(manufacturers)} cached manufacturers", flush=True)
|
||
else:
|
||
print("Fetching manufacturers...", flush=True)
|
||
result = apify_call({"endpoint_manufacturerIdsByTypeId": True, "manufacturer_typeId_2": TYPE_ID})
|
||
manufacturers = result["manufacturers"]
|
||
mfr_file.write_text(json.dumps(manufacturers, indent=1))
|
||
print(f" Saved {len(manufacturers)} manufacturers", flush=True)
|
||
|
||
if brand_filter:
|
||
manufacturers = [m for m in manufacturers if brand_filter.upper() in m["manufacturerName"].upper()]
|
||
print(f"Filtered to {len(manufacturers)} matching '{brand_filter}'", flush=True)
|
||
|
||
# Step 2: Models for each manufacturer
|
||
models_dir = DATA_DIR / "models"
|
||
models_dir.mkdir(exist_ok=True)
|
||
|
||
for i, mfr in enumerate(manufacturers):
|
||
mfr_id = mfr["manufacturerId"]
|
||
mfr_name = mfr["manufacturerName"]
|
||
model_file = models_dir / f"{mfr_id}.json"
|
||
|
||
if model_file.exists():
|
||
continue # Skip already downloaded
|
||
|
||
print(f"[{i+1}/{len(manufacturers)}] {mfr_name} (id={mfr_id})", flush=True)
|
||
time.sleep(APIFY_DELAY)
|
||
|
||
result = apify_call({
|
||
"endpoint_modelsByTypeManufacturer": True,
|
||
"models_typeId_1": TYPE_ID,
|
||
"models_manufacturerId_1": mfr_id,
|
||
"models_langId_1": LANG_ID,
|
||
"models_countryFilterId_1": COUNTRY_ID
|
||
})
|
||
|
||
models = result.get("models", []) if result else []
|
||
model_file.write_text(json.dumps(models, indent=1))
|
||
print(f" {len(models)} models", flush=True)
|
||
|
||
# Step 3: Vehicle types for each model
|
||
vehicles_dir = DATA_DIR / "vehicles"
|
||
vehicles_dir.mkdir(exist_ok=True)
|
||
|
||
# Iterate all model files
|
||
total_models = 0
|
||
processed = 0
|
||
|
||
for model_file in sorted(models_dir.glob("*.json")):
|
||
mfr_id = model_file.stem
|
||
models = json.loads(model_file.read_text())
|
||
total_models += len(models)
|
||
|
||
for model in models:
|
||
td_model_id = model["modelId"]
|
||
vehicle_file = vehicles_dir / f"{td_model_id}.json"
|
||
|
||
if vehicle_file.exists():
|
||
processed += 1
|
||
continue
|
||
|
||
print(f" [{processed+1}/{total_models}] Model {model['modelName']} (id={td_model_id})", flush=True)
|
||
time.sleep(APIFY_DELAY)
|
||
|
||
result = apify_call({
|
||
"endpoint_vehicleEngineTypesByModel": True,
|
||
"vehicle_typeId_3": TYPE_ID,
|
||
"vehicle_modelId_3": td_model_id,
|
||
"vehicle_langId_3": LANG_ID,
|
||
"vehicle_countryFilterId_3": COUNTRY_ID
|
||
})
|
||
|
||
vehicles = result.get("modelTypes", []) if result else []
|
||
vehicle_file.write_text(json.dumps(vehicles, indent=1))
|
||
processed += 1
|
||
|
||
print(f"\nDownload complete! {processed} model vehicle files.", flush=True)
|
||
|
||
|
||
# ──────────────── Phase 2: Import ────────────────
|
||
|
||
def parse_fuel_id(fuel_str):
|
||
if not fuel_str:
|
||
return None
|
||
f = fuel_str.lower()
|
||
if "diesel" in f:
|
||
return 1
|
||
if "electric" in f and "petrol" not in f and "gas" not in f:
|
||
return 2
|
||
return 3
|
||
|
||
|
||
def parse_body_id(model_name):
|
||
if not model_name:
|
||
return None
|
||
mapping = {
|
||
"Saloon": 1, "Sedan": 1, "Coupe": 2, "Coupé": 2,
|
||
"Hatchback": 3, "SUV": 4, "Off-Road": 4, "Crossover": 5,
|
||
"Truck": 6, "Van": 7, "Box Body": 7, "MPV": 8,
|
||
"Estate": 9, "Wagon": 9, "Kombi": 9,
|
||
"Convertible": 10, "Cabrio": 10, "Cabriolet": 10,
|
||
"Pick-up": 11, "Pickup": 11,
|
||
"Platform": 12, "Chassis": 12, "Bus": 13, "Roadster": 15,
|
||
}
|
||
for key, val in mapping.items():
|
||
if key in model_name:
|
||
return val
|
||
return None
|
||
|
||
|
||
def do_import():
|
||
"""Import downloaded JSON data into PostgreSQL."""
|
||
if not DATA_DIR.exists():
|
||
print("No data directory found. Run 'download' first.")
|
||
return
|
||
|
||
mfr_file = DATA_DIR / "manufacturers.json"
|
||
if not mfr_file.exists():
|
||
print("No manufacturers.json found. Run 'download' first.")
|
||
return
|
||
|
||
manufacturers = json.loads(mfr_file.read_text())
|
||
models_dir = DATA_DIR / "models"
|
||
vehicles_dir = DATA_DIR / "vehicles"
|
||
|
||
conn = psycopg2.connect(DB_URL)
|
||
cur = conn.cursor()
|
||
|
||
# Ensure years exist (1950–2027)
|
||
cur.execute("SELECT id_year, year_car FROM years")
|
||
year_cache = {r[1]: r[0] for r in cur.fetchall()}
|
||
for y in range(1950, 2028):
|
||
if y not in year_cache:
|
||
cur.execute("INSERT INTO years (year_car) VALUES (%s) RETURNING id_year", (y,))
|
||
year_cache[y] = cur.fetchone()[0]
|
||
conn.commit()
|
||
|
||
# Caches
|
||
brand_cache = {}
|
||
model_cache = {}
|
||
engine_cache = {}
|
||
mye_set = set()
|
||
|
||
stats = {"brands": 0, "models": 0, "engines": 0, "mye": 0, "skipped": 0}
|
||
current_year = datetime.now().year
|
||
|
||
for mfr in manufacturers:
|
||
mfr_id = mfr["manufacturerId"]
|
||
brand_name = mfr["manufacturerName"]
|
||
|
||
# Skip regional duplicates
|
||
if "(" in brand_name:
|
||
stats["skipped"] += 1
|
||
continue
|
||
|
||
model_file = models_dir / f"{mfr_id}.json"
|
||
if not model_file.exists():
|
||
continue
|
||
|
||
models = json.loads(model_file.read_text())
|
||
if not models:
|
||
continue
|
||
|
||
# Insert brand
|
||
if brand_name not in brand_cache:
|
||
cur.execute(
|
||
"INSERT INTO brands (name_brand) VALUES (%s) ON CONFLICT (name_brand) DO UPDATE SET name_brand=EXCLUDED.name_brand RETURNING id_brand",
|
||
(brand_name,))
|
||
brand_cache[brand_name] = cur.fetchone()[0]
|
||
stats["brands"] += 1
|
||
|
||
brand_id = brand_cache[brand_name]
|
||
|
||
for model in models:
|
||
model_name = model.get("modelName")
|
||
if not model_name:
|
||
continue
|
||
td_model_id = model["modelId"]
|
||
year_from = model.get("modelYearFrom", "")[:4] if model.get("modelYearFrom") else None
|
||
year_to = model.get("modelYearTo", "")[:4] if model.get("modelYearTo") else None
|
||
body_id = parse_body_id(model_name)
|
||
|
||
# Insert model
|
||
model_key = (brand_id, model_name)
|
||
if model_key not in model_cache:
|
||
cur.execute(
|
||
"""INSERT INTO models (brand_id, name_model, id_body, production_start_year, production_end_year)
|
||
VALUES (%s, %s, %s, %s, %s) RETURNING id_model""",
|
||
(brand_id, model_name, body_id,
|
||
int(year_from) if year_from else None,
|
||
int(year_to) if year_to else None))
|
||
model_cache[model_key] = cur.fetchone()[0]
|
||
stats["models"] += 1
|
||
|
||
model_db_id = model_cache[model_key]
|
||
|
||
# Load vehicles
|
||
vehicle_file = vehicles_dir / f"{td_model_id}.json"
|
||
if not vehicle_file.exists():
|
||
continue
|
||
|
||
vehicles = json.loads(vehicle_file.read_text())
|
||
if not vehicles:
|
||
continue
|
||
|
||
# Dedup by vehicleId
|
||
seen_v = {}
|
||
for v in vehicles:
|
||
vid = v["vehicleId"]
|
||
if vid not in seen_v:
|
||
seen_v[vid] = v
|
||
seen_v[vid]["_codes"] = [v.get("engineCodes", "")]
|
||
else:
|
||
c = v.get("engineCodes", "")
|
||
if c and c not in seen_v[vid]["_codes"]:
|
||
seen_v[vid]["_codes"].append(c)
|
||
|
||
for v in seen_v.values():
|
||
cap_lt = float(v["capacityLt"]) if v.get("capacityLt") else 0
|
||
cylinders = v.get("numberOfCylinders")
|
||
fuel = v.get("fuelType", "")
|
||
power_ps = float(v["powerPs"]) if v.get("powerPs") else 0
|
||
power_hp = int(power_ps * 0.9863) if power_ps else None
|
||
displacement = float(v["capacityTech"]) if v.get("capacityTech") else None
|
||
codes = ", ".join(v["_codes"])
|
||
fuel_id = parse_fuel_id(fuel)
|
||
|
||
# Build engine name
|
||
fl = fuel.lower() if fuel else ""
|
||
if "electric" in fl and "petrol" not in fl and cap_lt == 0:
|
||
eng_name = f"Electric {power_hp}hp" if power_hp else "Electric"
|
||
else:
|
||
eng_name = f"{cap_lt:.1f}L"
|
||
if cylinders:
|
||
eng_name += f" {cylinders}cyl"
|
||
if "diesel" in fl:
|
||
eng_name += " Diesel"
|
||
elif "electric" in fl:
|
||
eng_name += " Hybrid"
|
||
if power_hp:
|
||
eng_name += f" {power_hp}hp"
|
||
|
||
engine_key = (eng_name, displacement, cylinders, fuel_id, power_hp, codes)
|
||
if engine_key not in engine_cache:
|
||
cur.execute(
|
||
"""INSERT INTO engines (name_engine, displacement_cc, cylinders, id_fuel, power_hp, engine_code)
|
||
VALUES (%s, %s, %s, %s, %s, %s) RETURNING id_engine""",
|
||
(eng_name, displacement, cylinders, fuel_id, power_hp, codes))
|
||
engine_cache[engine_key] = cur.fetchone()[0]
|
||
stats["engines"] += 1
|
||
|
||
engine_db_id = engine_cache[engine_key]
|
||
|
||
start_str = v.get("constructionIntervalStart")
|
||
end_str = v.get("constructionIntervalEnd")
|
||
if not start_str:
|
||
continue
|
||
|
||
start_year = max(int(start_str[:4]), 1950)
|
||
end_year = min(int(end_str[:4]) if end_str else current_year, current_year + 1)
|
||
trim = v.get("typeEngineName", "")
|
||
|
||
for year in range(start_year, end_year + 1):
|
||
yid = year_cache.get(year)
|
||
if not yid:
|
||
continue
|
||
mye_key = (model_db_id, yid, engine_db_id, trim)
|
||
if mye_key in mye_set:
|
||
continue
|
||
mye_set.add(mye_key)
|
||
cur.execute(
|
||
"""INSERT INTO model_year_engine (model_id, year_id, engine_id, trim_level)
|
||
VALUES (%s, %s, %s, %s) ON CONFLICT DO NOTHING""",
|
||
(model_db_id, yid, engine_db_id, trim))
|
||
stats["mye"] += 1
|
||
|
||
# Commit per brand
|
||
conn.commit()
|
||
|
||
conn.commit()
|
||
cur.close()
|
||
conn.close()
|
||
|
||
print(f"\n{'='*60}", flush=True)
|
||
print(f"IMPORT COMPLETE", flush=True)
|
||
print(f" Brands: {stats['brands']} ({stats['skipped']} regional skipped)", flush=True)
|
||
print(f" Models: {stats['models']}", flush=True)
|
||
print(f" Engines: {stats['engines']}", flush=True)
|
||
print(f" MYE: {stats['mye']}", flush=True)
|
||
print(f"{'='*60}", flush=True)
|
||
|
||
|
||
# ──────────────── Status ────────────────
|
||
|
||
def status():
|
||
"""Show download progress."""
|
||
if not DATA_DIR.exists():
|
||
print("No data directory yet.")
|
||
return
|
||
|
||
mfr_file = DATA_DIR / "manufacturers.json"
|
||
if not mfr_file.exists():
|
||
print("Manufacturers not downloaded yet.")
|
||
return
|
||
|
||
manufacturers = json.loads(mfr_file.read_text())
|
||
models_dir = DATA_DIR / "models"
|
||
vehicles_dir = DATA_DIR / "vehicles"
|
||
|
||
model_files = list(models_dir.glob("*.json")) if models_dir.exists() else []
|
||
vehicle_files = list(vehicles_dir.glob("*.json")) if vehicles_dir.exists() else []
|
||
|
||
total_models = 0
|
||
for f in model_files:
|
||
total_models += len(json.loads(f.read_text()))
|
||
|
||
print(f"Manufacturers: {len(manufacturers)} total")
|
||
print(f"Model files: {len(model_files)} / {len(manufacturers)} brands downloaded")
|
||
print(f"Total models: {total_models}")
|
||
print(f"Vehicle files: {len(vehicle_files)} / {total_models} models downloaded")
|
||
|
||
if total_models > 0:
|
||
pct = len(vehicle_files) / total_models * 100
|
||
print(f"Progress: {pct:.1f}%")
|
||
remaining = total_models - len(vehicle_files)
|
||
est_minutes = remaining * APIFY_DELAY / 60 + remaining * 3 / 60 # delay + avg API time
|
||
print(f"Est. remaining: ~{est_minutes:.0f} minutes ({remaining} API calls)")
|
||
|
||
|
||
if __name__ == "__main__":
|
||
parser = argparse.ArgumentParser(description="TecDoc vehicle data import")
|
||
parser.add_argument("command", choices=["download", "import", "status"])
|
||
parser.add_argument("--brand", help="Filter by brand name")
|
||
args = parser.parse_args()
|
||
|
||
if args.command == "download":
|
||
download(brand_filter=args.brand)
|
||
elif args.command == "import":
|
||
do_import()
|
||
elif args.command == "status":
|
||
status()
|