Files
Autoparts-DB/scripts/import_tecdoc.py
consultoria-as d269bc1ffb feat: add TecDoc import pipeline scripts
- import_tecdoc.py: 2-phase TecDoc download + import (brands, models, vehicles)
- import_live.py: real-time streaming importer for part details
- run_all_brands.sh: automated sequential brand processing pipeline

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-18 22:25:48 +00:00

415 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Import vehicle data from TecDoc (Apify) into Nexus Autoparts PostgreSQL.
Two-phase approach:
Phase 1: Download all data from TecDoc API to local JSON files
Phase 2: Import JSON files into PostgreSQL
Usage:
python3 scripts/import_tecdoc.py download # Phase 1: fetch from API
python3 scripts/import_tecdoc.py download --brand TOYOTA # Single brand
python3 scripts/import_tecdoc.py import # Phase 2: load into DB
python3 scripts/import_tecdoc.py status # Check progress
"""
import os
import sys
import json
import time
import argparse
import requests
import psycopg2
from datetime import datetime
from pathlib import Path
# --- Config ---
# SECURITY NOTE(review): the fallback values below appear to embed a live
# Apify API token and a DB password in source control — rotate both and
# require the environment variables instead; confirm before release.
APIFY_TOKEN = os.environ.get("APIFY_TOKEN", "apify_api_l5SrcwYyanAO45AFxrEpviUcuVRIFK2yPdc5")
# Apify actor that exposes the TecDoc catalogue.
APIFY_ACTOR = "making-data-meaningful~tecdoc"
# run-sync-get-dataset-items: runs the actor and returns its dataset in one HTTP call.
APIFY_URL = f"https://api.apify.com/v2/acts/{APIFY_ACTOR}/run-sync-get-dataset-items"
DB_URL = os.environ.get("DATABASE_URL", "postgresql://nexus:nexus_autoparts_2026@localhost/nexus_autoparts")
TYPE_ID = 1 # Passenger cars
LANG_ID = 4 # English
COUNTRY_ID = 153 # Mexico
DATA_DIR = Path("/home/Autopartes/data/tecdoc")  # root of the local JSON cache
APIFY_DELAY = 1.0 # seconds between API calls
def apify_call(input_data, retries=3):
    """Invoke the Apify TecDoc actor synchronously and return one result.

    Posts *input_data* to the run-sync endpoint. On success returns the
    first dataset item when the payload is a non-empty list, otherwise the
    raw JSON payload. Retries on rate limits (HTTP 429, linear backoff),
    other HTTP errors, and request exceptions; returns None once every
    attempt has failed.
    """
    for attempt in range(retries):
        try:
            response = requests.post(
                APIFY_URL,
                params={"token": APIFY_TOKEN},
                headers={"Content-Type": "application/json"},
                json=input_data,
                timeout=120,
            )
            code = response.status_code
            if code in (200, 201):
                payload = response.json()
                if isinstance(payload, list) and payload:
                    return payload[0]
                return payload
            if code == 429:
                # Linear backoff: 15s, 30s, 45s, ...
                backoff = 15 * (attempt + 1)
                print(f" Rate limited, waiting {backoff}s...", flush=True)
                time.sleep(backoff)
            else:
                print(f" HTTP {code}: {response.text[:100]}", flush=True)
                time.sleep(5)
        except Exception as exc:
            print(f" Error: {exc}", flush=True)
            time.sleep(5)
    return None
# ──────────────── Phase 1: Download ────────────────
def download(brand_filter=None):
    """Download all TecDoc data to local JSON files (Phase 1).

    Three steps: manufacturers, then models per manufacturer, then vehicle
    engine types per model. Every API response is cached as a JSON file
    under DATA_DIR, so a re-run resumes at the first missing file.

    Args:
        brand_filter: Optional case-insensitive substring; when given, only
            manufacturers whose name contains it are processed.
    """
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    # Step 1: Manufacturers (single API call, cached once).
    mfr_file = DATA_DIR / "manufacturers.json"
    if mfr_file.exists():
        manufacturers = json.loads(mfr_file.read_text())
        print(f"Loaded {len(manufacturers)} cached manufacturers", flush=True)
    else:
        print("Fetching manufacturers...", flush=True)
        result = apify_call({"endpoint_manufacturerIdsByTypeId": True, "manufacturer_typeId_2": TYPE_ID})
        # BUGFIX: apify_call returns None when every retry fails; the old
        # code crashed with a TypeError on result["manufacturers"]. Abort
        # cleanly so no partial cache file is written.
        if not result or "manufacturers" not in result:
            print("Failed to fetch manufacturers; aborting download.", flush=True)
            return
        manufacturers = result["manufacturers"]
        mfr_file.write_text(json.dumps(manufacturers, indent=1))
        print(f" Saved {len(manufacturers)} manufacturers", flush=True)
    if brand_filter:
        manufacturers = [m for m in manufacturers if brand_filter.upper() in m["manufacturerName"].upper()]
        print(f"Filtered to {len(manufacturers)} matching '{brand_filter}'", flush=True)
    # Step 2: Models for each manufacturer.
    models_dir = DATA_DIR / "models"
    models_dir.mkdir(exist_ok=True)
    for i, mfr in enumerate(manufacturers):
        mfr_id = mfr["manufacturerId"]
        mfr_name = mfr["manufacturerName"]
        model_file = models_dir / f"{mfr_id}.json"
        if model_file.exists():
            continue  # Skip already downloaded
        print(f"[{i+1}/{len(manufacturers)}] {mfr_name} (id={mfr_id})", flush=True)
        time.sleep(APIFY_DELAY)
        result = apify_call({
            "endpoint_modelsByTypeManufacturer": True,
            "models_typeId_1": TYPE_ID,
            "models_manufacturerId_1": mfr_id,
            "models_langId_1": LANG_ID,
            "models_countryFilterId_1": COUNTRY_ID
        })
        # NOTE: a failed call caches an empty list; delete the file to retry.
        models = result.get("models", []) if result else []
        model_file.write_text(json.dumps(models, indent=1))
        print(f" {len(models)} models", flush=True)
    # Step 3: Vehicle engine types for each model across all model files.
    vehicles_dir = DATA_DIR / "vehicles"
    vehicles_dir.mkdir(exist_ok=True)
    total_models = 0
    processed = 0
    for model_file in sorted(models_dir.glob("*.json")):
        models = json.loads(model_file.read_text())
        total_models += len(models)
        for model in models:
            td_model_id = model["modelId"]
            vehicle_file = vehicles_dir / f"{td_model_id}.json"
            if vehicle_file.exists():
                processed += 1
                continue
            print(f" [{processed+1}/{total_models}] Model {model['modelName']} (id={td_model_id})", flush=True)
            time.sleep(APIFY_DELAY)
            result = apify_call({
                "endpoint_vehicleEngineTypesByModel": True,
                "vehicle_typeId_3": TYPE_ID,
                "vehicle_modelId_3": td_model_id,
                "vehicle_langId_3": LANG_ID,
                "vehicle_countryFilterId_3": COUNTRY_ID
            })
            vehicles = result.get("modelTypes", []) if result else []
            vehicle_file.write_text(json.dumps(vehicles, indent=1))
            processed += 1
    print(f"\nDownload complete! {processed} model vehicle files.", flush=True)
# ──────────────── Phase 2: Import ────────────────
def parse_fuel_id(fuel_str):
    """Map a free-form TecDoc fuel description to an internal fuel id.

    Returns 1 for diesel, 2 for pure electric (no petrol/gas component),
    3 for everything else, and None for an empty/missing description.
    """
    if not fuel_str:
        return None
    lowered = fuel_str.lower()
    if "diesel" in lowered:
        return 1
    pure_electric = (
        "electric" in lowered
        and "petrol" not in lowered
        and "gas" not in lowered
    )
    return 2 if pure_electric else 3
def parse_body_id(model_name):
    """Infer an internal body-style id from a model name by substring match.

    Keywords are checked in order and the first one contained in
    *model_name* wins (case-sensitive). Returns None when the name is
    empty or no keyword matches.
    """
    if not model_name:
        return None
    keyword_ids = (
        ("Saloon", 1), ("Sedan", 1), ("Coupe", 2), ("Coupé", 2),
        ("Hatchback", 3), ("SUV", 4), ("Off-Road", 4), ("Crossover", 5),
        ("Truck", 6), ("Van", 7), ("Box Body", 7), ("MPV", 8),
        ("Estate", 9), ("Wagon", 9), ("Kombi", 9),
        ("Convertible", 10), ("Cabrio", 10), ("Cabriolet", 10),
        ("Pick-up", 11), ("Pickup", 11),
        ("Platform", 12), ("Chassis", 12), ("Bus", 13), ("Roadster", 15),
    )
    return next((body_id for keyword, body_id in keyword_ids if keyword in model_name), None)
def do_import():
    """Import downloaded JSON data into PostgreSQL.

    Phase 2: reads the manufacturer/model/vehicle JSON files written by
    download() and inserts brands, models, engines, and the
    model/year/engine link rows. Commits once per brand, so an interrupted
    run loses at most the current brand's work.
    """
    if not DATA_DIR.exists():
        print("No data directory found. Run 'download' first.")
        return
    mfr_file = DATA_DIR / "manufacturers.json"
    if not mfr_file.exists():
        print("No manufacturers.json found. Run 'download' first.")
        return
    manufacturers = json.loads(mfr_file.read_text())
    models_dir = DATA_DIR / "models"
    vehicles_dir = DATA_DIR / "vehicles"
    conn = psycopg2.connect(DB_URL)
    cur = conn.cursor()
    # Ensure years exist (1950-2027); year_cache maps year -> id_year.
    cur.execute("SELECT id_year, year_car FROM years")
    year_cache = {r[1]: r[0] for r in cur.fetchall()}
    for y in range(1950, 2028):
        if y not in year_cache:
            cur.execute("INSERT INTO years (year_car) VALUES (%s) RETURNING id_year", (y,))
            year_cache[y] = cur.fetchone()[0]
    conn.commit()
    # In-memory caches so each distinct row is INSERTed at most once per run.
    brand_cache = {}   # brand name -> id_brand
    model_cache = {}   # (brand_id, model name) -> id_model
    engine_cache = {}  # engine attribute tuple -> id_engine
    mye_set = set()    # (model, year, engine, trim) combos already written
    stats = {"brands": 0, "models": 0, "engines": 0, "mye": 0, "skipped": 0}
    current_year = datetime.now().year
    for mfr in manufacturers:
        mfr_id = mfr["manufacturerId"]
        brand_name = mfr["manufacturerName"]
        # Skip regional duplicates (names containing parentheses,
        # e.g. regional variants of a brand).
        if "(" in brand_name:
            stats["skipped"] += 1
            continue
        model_file = models_dir / f"{mfr_id}.json"
        if not model_file.exists():
            continue
        models = json.loads(model_file.read_text())
        if not models:
            continue
        # Insert brand; the upsert makes reruns return the existing row's id.
        if brand_name not in brand_cache:
            cur.execute(
                "INSERT INTO brands (name_brand) VALUES (%s) ON CONFLICT (name_brand) DO UPDATE SET name_brand=EXCLUDED.name_brand RETURNING id_brand",
                (brand_name,))
            brand_cache[brand_name] = cur.fetchone()[0]
            stats["brands"] += 1
        brand_id = brand_cache[brand_name]
        for model in models:
            model_name = model.get("modelName")
            if not model_name:
                continue
            td_model_id = model["modelId"]
            # Production interval: keep only the 4-digit year prefix.
            year_from = model.get("modelYearFrom", "")[:4] if model.get("modelYearFrom") else None
            year_to = model.get("modelYearTo", "")[:4] if model.get("modelYearTo") else None
            body_id = parse_body_id(model_name)
            # Insert model
            model_key = (brand_id, model_name)
            if model_key not in model_cache:
                cur.execute(
                    """INSERT INTO models (brand_id, name_model, id_body, production_start_year, production_end_year)
                    VALUES (%s, %s, %s, %s, %s) RETURNING id_model""",
                    (brand_id, model_name, body_id,
                     int(year_from) if year_from else None,
                     int(year_to) if year_to else None))
                model_cache[model_key] = cur.fetchone()[0]
                stats["models"] += 1
            model_db_id = model_cache[model_key]
            # Load the vehicle variants downloaded for this TecDoc model id.
            vehicle_file = vehicles_dir / f"{td_model_id}.json"
            if not vehicle_file.exists():
                continue
            vehicles = json.loads(vehicle_file.read_text())
            if not vehicles:
                continue
            # Dedup by vehicleId, accumulating every distinct engineCodes
            # value for the same vehicle into a "_codes" list.
            seen_v = {}
            for v in vehicles:
                vid = v["vehicleId"]
                if vid not in seen_v:
                    seen_v[vid] = v
                    seen_v[vid]["_codes"] = [v.get("engineCodes", "")]
                else:
                    c = v.get("engineCodes", "")
                    if c and c not in seen_v[vid]["_codes"]:
                        seen_v[vid]["_codes"].append(c)
            for v in seen_v.values():
                cap_lt = float(v["capacityLt"]) if v.get("capacityLt") else 0
                cylinders = v.get("numberOfCylinders")
                fuel = v.get("fuelType", "")
                power_ps = float(v["powerPs"]) if v.get("powerPs") else 0
                # 0.9863 converts metric horsepower (PS) to mechanical hp.
                power_hp = int(power_ps * 0.9863) if power_ps else None
                displacement = float(v["capacityTech"]) if v.get("capacityTech") else None
                codes = ", ".join(v["_codes"])
                fuel_id = parse_fuel_id(fuel)
                # Build a display name, e.g. "2.0L 4cyl Diesel 150hp".
                fl = fuel.lower() if fuel else ""
                if "electric" in fl and "petrol" not in fl and cap_lt == 0:
                    eng_name = f"Electric {power_hp}hp" if power_hp else "Electric"
                else:
                    eng_name = f"{cap_lt:.1f}L"
                    if cylinders:
                        eng_name += f" {cylinders}cyl"
                    if "diesel" in fl:
                        eng_name += " Diesel"
                    elif "electric" in fl:
                        # Electric fuel type with non-zero capacity or a
                        # petrol component is treated as a hybrid.
                        eng_name += " Hybrid"
                    if power_hp:
                        eng_name += f" {power_hp}hp"
                engine_key = (eng_name, displacement, cylinders, fuel_id, power_hp, codes)
                if engine_key not in engine_cache:
                    cur.execute(
                        """INSERT INTO engines (name_engine, displacement_cc, cylinders, id_fuel, power_hp, engine_code)
                        VALUES (%s, %s, %s, %s, %s, %s) RETURNING id_engine""",
                        (eng_name, displacement, cylinders, fuel_id, power_hp, codes))
                    engine_cache[engine_key] = cur.fetchone()[0]
                    stats["engines"] += 1
                engine_db_id = engine_cache[engine_key]
                start_str = v.get("constructionIntervalStart")
                end_str = v.get("constructionIntervalEnd")
                if not start_str:
                    continue
                # Clamp the construction interval to [1950, current_year + 1];
                # an open-ended interval runs through the current year.
                start_year = max(int(start_str[:4]), 1950)
                end_year = min(int(end_str[:4]) if end_str else current_year, current_year + 1)
                trim = v.get("typeEngineName", "")
                # One model_year_engine row per production year.
                for year in range(start_year, end_year + 1):
                    yid = year_cache.get(year)
                    if not yid:
                        continue
                    mye_key = (model_db_id, yid, engine_db_id, trim)
                    if mye_key in mye_set:
                        continue
                    mye_set.add(mye_key)
                    cur.execute(
                        """INSERT INTO model_year_engine (model_id, year_id, engine_id, trim_level)
                        VALUES (%s, %s, %s, %s) ON CONFLICT DO NOTHING""",
                        (model_db_id, yid, engine_db_id, trim))
                    stats["mye"] += 1
        # Commit per brand
        conn.commit()
    conn.commit()
    cur.close()
    conn.close()
    print(f"\n{'='*60}", flush=True)
    print(f"IMPORT COMPLETE", flush=True)
    print(f" Brands: {stats['brands']} ({stats['skipped']} regional skipped)", flush=True)
    print(f" Models: {stats['models']}", flush=True)
    print(f" Engines: {stats['engines']}", flush=True)
    print(f" MYE: {stats['mye']}", flush=True)
    print(f"{'='*60}", flush=True)
# ──────────────── Status ────────────────
def status():
    """Print download progress for the locally cached TecDoc JSON files."""
    if not DATA_DIR.exists():
        print("No data directory yet.")
        return
    mfr_file = DATA_DIR / "manufacturers.json"
    if not mfr_file.exists():
        print("Manufacturers not downloaded yet.")
        return
    manufacturers = json.loads(mfr_file.read_text())
    models_dir = DATA_DIR / "models"
    vehicles_dir = DATA_DIR / "vehicles"
    model_files = list(models_dir.glob("*.json")) if models_dir.exists() else []
    vehicle_files = list(vehicles_dir.glob("*.json")) if vehicles_dir.exists() else []
    # Total model count = sum of entries across every cached model file.
    total_models = sum(len(json.loads(f.read_text())) for f in model_files)
    print(f"Manufacturers: {len(manufacturers)} total")
    print(f"Model files: {len(model_files)} / {len(manufacturers)} brands downloaded")
    print(f"Total models: {total_models}")
    print(f"Vehicle files: {len(vehicle_files)} / {total_models} models downloaded")
    if total_models > 0:
        pct = len(vehicle_files) / total_models * 100
        remaining = total_models - len(vehicle_files)
        # Per remaining call: APIFY_DELAY sleep plus ~3s average API latency.
        est_minutes = remaining * APIFY_DELAY / 60 + remaining * 3 / 60
        print(f"Progress: {pct:.1f}%")
        print(f"Est. remaining: ~{est_minutes:.0f} minutes ({remaining} API calls)")
if __name__ == "__main__":
    # CLI dispatch: one positional command plus an optional brand filter.
    arg_parser = argparse.ArgumentParser(description="TecDoc vehicle data import")
    arg_parser.add_argument("command", choices=["download", "import", "status"])
    arg_parser.add_argument("--brand", help="Filter by brand name")
    cli_args = arg_parser.parse_args()
    if cli_args.command == "download":
        download(brand_filter=cli_args.brand)
    elif cli_args.command == "import":
        do_import()
    else:
        # choices= guarantees the only remaining command is "status".
        status()