fix: performance improvements, shared UI, and cross-reference data quality

Backend (server.py): - Fix N+1 query in /api/diagrams/<id>/parts with batch cross-ref query - Add LIMIT safety nets to 15 endpoints (50-5000 per data type) - Add pagination to /api/vehicles, /api/model-year-engine, /api/vehicles/<id>/parts, /api/admin/export - Optimize search_vehicles() EXISTS subquery to JOIN - Restrict static route to /static/* subdir (security fix) - Add detailed=true support to /api/brands and /api/models Frontend: - Extract shared CSS into shared.css (variables, reset, buttons, forms, scrollbar) - Create shared nav.js component (logo + navigation links, auto-highlights) - Update all 4 HTML pages to use shared CSS and nav - Update JS to handle paginated API responses Data quality: - Fix cross-reference source field: map 72K records from catalog names to actual brands - Fix aftermarket_parts manufacturer_id: correct 8K records with wrong brand attribution - Delete 98MB backup file, orphan records, and garbage cross-references - Add import scripts for DAR, FRAM, WIX, MOOG, Cartek catalogs Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 03:09:22 +00:00
parent 3ea2de61e2
commit 7ecf1295a5
17 changed files with 6605 additions and 848 deletions
--- a/vehicle_database/scripts/extract_moog_diagrams.py
+++ b/vehicle_database/scripts/extract_moog_diagrams.py
@@ -0,0 +1,179 @@
+#!/usr/bin/env python3
+"""
+EXTRACTOR DE IMÁGENES DE DIAGRAMAS MOOG
+Extrae las ilustraciones de suspensión/dirección de los PDFs MOOG
+y las guarda como archivos de imagen mapeados a sus figure codes.
+"""
+
+import re
+import sys
+import io
+import hashlib
+from pathlib import Path
+
+import pypdf
+
+# Destination folder for the extracted images, resolved three directory
+# levels above this script and then into dashboard/static/diagrams/moog.
+OUTPUT_DIR = Path(__file__).parent.parent.parent / 'dashboard' / 'static' / 'diagrams' / 'moog'
+
+# Per-volume extraction settings, keyed by the volume id given on the command
+# line. 'start_page' and 'end_page' are used as zero-based indices into the
+# PDF's page list and are both inclusive; 'end_page' is clamped to the last
+# page of the document when the volume is processed. 'label' is only used in
+# progress output.
+VOLUMES = {
+    '1': {
+        'path': '/tmp/catalogs/suspension/moog_vol1_1989back.pdf',
+        'start_page': 3,
+        'end_page': 1037,
+        'label': 'Vol 1 (≤1989)',
+    },
+    '2': {
+        'path': '/tmp/catalogs/suspension/moog_vol2_1990_2005.pdf',
+        'start_page': 6,
+        'end_page': 1641,
+        'label': 'Vol 2 (1990-2005)',
+    },
+    '3': {
+        'path': '/tmp/catalogs/suspension/moog_vol3_2006up.pdf',
+        'start_page': 7,
+        'end_page': 1089,
+        'label': 'Vol 3 (2006+)',
+    },
+}
+
+# A figure code is one of the letters F, S or R followed by exactly three
+# digits, standing alone as a word (e.g. "F123"). The code is capture group 1.
+FIGURE_RE = re.compile(r'\b([FSR]\d{3})\b')
+
+
+def extract_figure_codes(text):
+    """Extract ordered unique figure codes from page text."""
+    codes = []
+    seen = set()
+    for m in FIGURE_RE.finditer(text):
+        code = m.group(1)
+        if code not in seen:
+            codes.append(code)
+            seen.add(code)
+    return codes
+
+
+def extract_volume(vol_key, already_extracted):
+    """Extract diagram images from one MOOG volume."""
+    vol = VOLUMES[vol_key]
+    print(f"\n--- Procesando {vol['label']} ---")
+    print(f"    PDF: {vol['path']}")
+
+    pdf = pypdf.PdfReader(vol['path'])
+    total_pages = len(pdf.pages)
+    end_page = min(vol['end_page'], total_pages - 1)
+
+    extracted = 0
+    skipped = 0
+    errors = 0
+
+    for page_idx in range(vol['start_page'], end_page + 1):
+        if page_idx % 100 == 0:
+            print(f"    Página {page_idx}/{end_page}... (extraídas: {extracted})")
+
+        try:
+            page = pdf.pages[page_idx]
+            text = page.extract_text() or ''
+
+            # Get figure codes from this page
+            fig_codes = extract_figure_codes(text)
+            if not fig_codes:
+                continue
+
+            # Filter out already-extracted codes
+            needed_codes = [c for c in fig_codes if c not in already_extracted]
+            if not needed_codes:
+                skipped += len(fig_codes)
+                continue
+
+            # Extract images from page
+            images = []
+            try:
+                for img_key in page.images:
+                    img_data = img_key.data
+                    # Filter by size - diagram images are >10KB typically
+                    if len(img_data) > 5000:
+                        images.append(img_data)
+            except Exception:
+                # Fallback: try to extract from xobjects directly
+                try:
+                    if '/XObject' in page['/Resources']:
+                        xobjects = page['/Resources']['/XObject'].get_object()
+                        for obj_name in sorted(xobjects.keys()):
+                            xobj = xobjects[obj_name].get_object()
+                            if xobj.get('/Subtype') == '/Image':
+                                w = int(xobj.get('/Width', 0))
+                                h = int(xobj.get('/Height', 0))
+                                if w > 200 and h > 100:
+                                    try:
+                                        img_data = xobj.get_data()
+                                        if len(img_data) > 5000:
+                                            images.append(img_data)
+                                    except Exception:
+                                        pass
+                except Exception:
+                    pass
+
+            if not images:
+                continue
+
+            # Match figure codes to images
+            # Strategy: if same number of large images and figure codes, match 1:1 in order
+            # If fewer images than codes, some codes share images (use first available)
+            # If more images than codes, filter further by size
+            for i, code in enumerate(needed_codes):
+                if i < len(images):
+                    img_data = images[i]
+                    # Determine file extension from magic bytes
+                    ext = 'jpg'
+                    if img_data[:4] == b'\x89PNG':
+                        ext = 'png'
+                    elif img_data[:4] == b'\x00\x00\x00\x0c':
+                        ext = 'jp2'
+
+                    out_path = OUTPUT_DIR / f"{code}.{ext}"
+                    out_path.write_bytes(img_data)
+                    already_extracted.add(code)
+                    extracted += 1
+
+        except Exception as e:
+            errors += 1
+            if errors <= 5:
+                print(f"    Error en página {page_idx}: {e}")
+
+    print(f"    Resultado: {extracted} extraídas, {skipped} ya existentes, {errors} errores")
+    return extracted
+
+
+def main():
+    volumes = sys.argv[1:] if len(sys.argv) > 1 else ['3', '2', '1']
+
+    print("=" * 70)
+    print("EXTRACTOR DE DIAGRAMAS MOOG")
+    print("=" * 70)
+
+    # Create output directory
+    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+    print(f"Directorio de salida: {OUTPUT_DIR}")
+
+    # Check what's already extracted
+    already_extracted = set()
+    for f in OUTPUT_DIR.iterdir():
+        if f.suffix in ('.jpg', '.png', '.jp2'):
+            already_extracted.add(f.stem)
+    print(f"Ya extraídas: {len(already_extracted)}")
+
+    total = 0
+    for vol_key in volumes:
+        if vol_key not in VOLUMES:
+            print(f"Volumen {vol_key} no reconocido, saltando...")
+            continue
+        count = extract_volume(vol_key, already_extracted)
+        total += count
+
+    print(f"\n{'=' * 70}")
+    print(f"EXTRACCIÓN COMPLETADA: {total} nuevas imágenes")
+    print(f"Total en directorio: {len(list(OUTPUT_DIR.iterdir()))}")
+    print(f"{'=' * 70}")
+
+
+if __name__ == '__main__':
+    main()