fix: performance improvements, shared UI, and cross-reference data quality
Backend (server.py):
- Fix N+1 query in /api/diagrams/<id>/parts with batch cross-ref query
- Add LIMIT safety nets to 15 endpoints (50-5000 per data type)
- Add pagination to /api/vehicles, /api/model-year-engine, /api/vehicles/<id>/parts, /api/admin/export
- Optimize search_vehicles() EXISTS subquery to JOIN
- Restrict static route to /static/* subdir (security fix)
- Add detailed=true support to /api/brands and /api/models

Frontend:
- Extract shared CSS into shared.css (variables, reset, buttons, forms, scrollbar)
- Create shared nav.js component (logo + navigation links, auto-highlights)
- Update all 4 HTML pages to use shared CSS and nav
- Update JS to handle paginated API responses

Data quality:
- Fix cross-reference source field: map 72K records from catalog names to actual brands
- Fix aftermarket_parts manufacturer_id: correct 8K records with wrong brand attribution
- Delete 98MB backup file, orphan records, and garbage cross-references
- Add import scripts for DAR, FRAM, WIX, MOOG, Cartek catalogs

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
179
vehicle_database/scripts/extract_moog_diagrams.py
Normal file
179
vehicle_database/scripts/extract_moog_diagrams.py
Normal file
@@ -0,0 +1,179 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
EXTRACTOR DE IMÁGENES DE DIAGRAMAS MOOG
|
||||
Extrae las ilustraciones de suspensión/dirección de los PDFs MOOG
|
||||
y las guarda como archivos de imagen mapeados a sus figure codes.
|
||||
"""
|
||||
|
||||
import re
|
||||
import sys
|
||||
import io
|
||||
import hashlib
|
||||
from pathlib import Path
|
||||
|
||||
import pypdf
|
||||
|
||||
# Destination directory for the extracted diagram images. It is built from
# the third parent of this file (script dir -> its parent -> its parent),
# followed by dashboard/static/diagrams/moog.
OUTPUT_DIR = Path(__file__).parent.parent.parent / 'dashboard' / 'static' / 'diagrams' / 'moog'
|
||||
|
||||
# Catalog volumes that can be processed, keyed by volume number as a string
# (the keys are compared against command-line arguments in main()).
#   path       - location of the volume's PDF file
#   start_page - first index into the PDF's page list that is scanned
#   end_page   - last index scanned, inclusive; extract_volume() clamps it
#                to the last page actually present in the PDF
#   label      - human-readable name printed in progress output
VOLUMES = {
    '1': {
        'path': '/tmp/catalogs/suspension/moog_vol1_1989back.pdf',
        'start_page': 3,
        'end_page': 1037,
        'label': 'Vol 1 (≤1989)',
    },
    '2': {
        'path': '/tmp/catalogs/suspension/moog_vol2_1990_2005.pdf',
        'start_page': 6,
        'end_page': 1641,
        'label': 'Vol 2 (1990-2005)',
    },
    '3': {
        'path': '/tmp/catalogs/suspension/moog_vol3_2006up.pdf',
        'start_page': 7,
        'end_page': 1089,
        'label': 'Vol 3 (2006+)',
    },
}
|
||||
|
||||
# Figure code pattern: one letter F, S, or R followed by exactly three
# digits, delimited by word boundaries (e.g. "F123"). Group 1 is the code.
FIGURE_RE = re.compile(r'\b([FSR]\d{3})\b')
|
||||
|
||||
|
||||
def extract_figure_codes(text):
    """Return the figure codes found in ``text``, without repeats.

    Codes are taken from group 1 of every ``FIGURE_RE`` match and are
    returned in the order in which each one first appears.
    """
    # dict keys keep insertion order, so fromkeys() drops later repeats
    # while preserving first-seen ordering.
    matches = FIGURE_RE.finditer(text)
    return list(dict.fromkeys(match.group(1) for match in matches))
|
||||
|
||||
|
||||
# Embedded images at or below this many bytes are ignored when collecting
# candidate diagram images from a page.
_MIN_IMAGE_BYTES = 5000


def _image_extension(img_data):
    """Choose an output file extension from the first four bytes of ``img_data``."""
    header = img_data[:4]
    if header == b'\x89PNG':
        return 'png'
    if header == b'\x00\x00\x00\x0c':
        # JPEG 2000 (.jp2) files begin with this 4-byte box length.
        return 'jp2'
    # Everything else is written with a .jpg extension.
    return 'jpg'


def _page_images(page):
    """Return the byte payloads of the large images embedded in ``page``.

    ``page.images`` is tried first. If that raises, the page's ``/XObject``
    resources are walked directly instead. Both paths are best-effort: any
    failure results in fewer (possibly zero) images, never an exception.
    """
    images = []
    try:
        for image in page.images:
            img_data = image.data
            if len(img_data) > _MIN_IMAGE_BYTES:
                images.append(img_data)
        return images
    except Exception:
        # Drop anything collected before the failure: the fallback below
        # revisits every image XObject, and keeping a partial list would
        # duplicate images and shift the code-to-image pairing.
        images = []

    try:
        if '/XObject' in page['/Resources']:
            xobjects = page['/Resources']['/XObject'].get_object()
            # Sorted by name so the resulting order is deterministic.
            for obj_name in sorted(xobjects.keys()):
                xobj = xobjects[obj_name].get_object()
                if xobj.get('/Subtype') != '/Image':
                    continue
                width = int(xobj.get('/Width', 0))
                height = int(xobj.get('/Height', 0))
                if not (width > 200 and height > 100):
                    continue
                try:
                    img_data = xobj.get_data()
                except Exception:
                    # One unreadable image must not discard the others.
                    continue
                if len(img_data) > _MIN_IMAGE_BYTES:
                    images.append(img_data)
    except Exception:
        # Best-effort fallback: return whatever was collected so far.
        pass
    return images


def extract_volume(vol_key, already_extracted):
    """Extract diagram images from one MOOG volume.

    For each page in the volume's configured range, the figure codes found
    in the page text are paired, in order, with the large images embedded
    in that page; each pair is written to ``OUTPUT_DIR`` as
    ``<code>.<ext>``. When a page has fewer images than codes, the extra
    codes are left without a file.

    Args:
        vol_key: Key into ``VOLUMES`` selecting the volume to process.
        already_extracted: Set of figure codes that already have an image.
            Codes in it are not extracted again, and every newly written
            code is added to it (the set is mutated in place).

    Returns:
        The number of image files written for this volume.

    Raises:
        KeyError: If ``vol_key`` is not a key of ``VOLUMES``.
        Exception: Whatever ``pypdf.PdfReader`` raises when the PDF cannot
            be opened; only per-page failures are caught and counted.
    """
    vol = VOLUMES[vol_key]
    print(f"\n--- Procesando {vol['label']} ---")
    print(f" PDF: {vol['path']}")

    pdf = pypdf.PdfReader(vol['path'])
    total_pages = len(pdf.pages)
    # Never index past the last page actually present in the file.
    end_page = min(vol['end_page'], total_pages - 1)

    extracted = 0
    skipped = 0
    errors = 0

    for page_idx in range(vol['start_page'], end_page + 1):
        if page_idx % 100 == 0:
            print(f" Página {page_idx}/{end_page}... (extraídas: {extracted})")

        try:
            page = pdf.pages[page_idx]
            text = page.extract_text() or ''

            fig_codes = extract_figure_codes(text)
            if not fig_codes:
                continue

            needed_codes = [c for c in fig_codes if c not in already_extracted]
            # Count every code skipped because it already has an image,
            # including on pages where only some of the codes are new.
            skipped += len(fig_codes) - len(needed_codes)
            if not needed_codes:
                continue

            images = _page_images(page)
            if not images:
                continue

            # Pair codes with images positionally; zip() stops at the
            # shorter list, so surplus codes or images are ignored.
            for code, img_data in zip(needed_codes, images):
                out_path = OUTPUT_DIR / f"{code}.{_image_extension(img_data)}"
                out_path.write_bytes(img_data)
                already_extracted.add(code)
                extracted += 1

        except Exception as e:
            errors += 1
            # Only the first few errors are printed to keep output readable.
            if errors <= 5:
                print(f" Error en página {page_idx}: {e}")

    print(f" Resultado: {extracted} extraídas, {skipped} ya existentes, {errors} errores")
    return extracted
|
||||
|
||||
|
||||
def main():
    """Run the extractor for the volumes named on the command line.

    Each command-line argument is treated as a ``VOLUMES`` key; with no
    arguments, volumes '3', '2' and '1' are processed in that order.
    Unrecognized keys are reported and skipped. Image files already in
    ``OUTPUT_DIR`` are detected up front so their codes are not extracted
    again.
    """
    requested = sys.argv[1:] or ['3', '2', '1']
    banner = "=" * 70

    print(banner)
    print("EXTRACTOR DE DIAGRAMAS MOOG")
    print(banner)

    # Make sure the destination exists before listing or writing into it.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    print(f"Directorio de salida: {OUTPUT_DIR}")

    # A file's stem is its figure code; only known image suffixes count.
    already_extracted = {
        entry.stem
        for entry in OUTPUT_DIR.iterdir()
        if entry.suffix in ('.jpg', '.png', '.jp2')
    }
    print(f"Ya extraídas: {len(already_extracted)}")

    new_images = 0
    for vol_key in requested:
        if vol_key in VOLUMES:
            new_images += extract_volume(vol_key, already_extracted)
        else:
            print(f"Volumen {vol_key} no reconocido, saltando...")

    print(f"\n{banner}")
    print(f"EXTRACCIÓN COMPLETADA: {new_images} nuevas imágenes")
    # Counts every entry in the directory, not only image files.
    print(f"Total en directorio: {sum(1 for _ in OUTPUT_DIR.iterdir())}")
    print(banner)
|
||||
|
||||
|
||||
# Run the extractor only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user