Backend (server.py): - Fix N+1 query in /api/diagrams/<id>/parts with batch cross-ref query - Add LIMIT safety nets to 15 endpoints (50-5000 per data type) - Add pagination to /api/vehicles, /api/model-year-engine, /api/vehicles/<id>/parts, /api/admin/export - Optimize search_vehicles() EXISTS subquery to JOIN - Restrict static route to /static/* subdir (security fix) - Add detailed=true support to /api/brands and /api/models Frontend: - Extract shared CSS into shared.css (variables, reset, buttons, forms, scrollbar) - Create shared nav.js component (logo + navigation links, auto-highlights) - Update all 4 HTML pages to use shared CSS and nav - Update JS to handle paginated API responses Data quality: - Fix cross-reference source field: map 72K records from catalog names to actual brands - Fix aftermarket_parts manufacturer_id: correct 8K records with wrong brand attribution - Delete 98MB backup file, orphan records, and garbage cross-references - Add import scripts for DAR, FRAM, WIX, MOOG, Cartek catalogs Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
180 lines
5.8 KiB
Python
180 lines
5.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
EXTRACTOR DE IMÁGENES DE DIAGRAMAS MOOG
|
|
Extrae las ilustraciones de suspensión/dirección de los PDFs MOOG
|
|
y las guarda como archivos de imagen mapeados a sus figure codes.
|
|
"""
|
|
|
|
import re
|
|
import sys
|
|
import io
|
|
import hashlib
|
|
from pathlib import Path
|
|
|
|
import pypdf
|
|
|
|
# Destination folder for the extracted diagram images: three directory levels
# above this script, then dashboard/static/diagrams/moog.
OUTPUT_DIR = Path(__file__).parent.parent.parent.joinpath(
    'dashboard', 'static', 'diagrams', 'moog'
)

# Catalog volumes keyed by the volume number given on the command line.
# 'start_page' and 'end_page' are used as 0-based, inclusive indexes into the
# PDF's page list; 'label' is only used in progress output.
VOLUMES = {
    '1': dict(
        path='/tmp/catalogs/suspension/moog_vol1_1989back.pdf',
        start_page=3,
        end_page=1037,
        label='Vol 1 (≤1989)',
    ),
    '2': dict(
        path='/tmp/catalogs/suspension/moog_vol2_1990_2005.pdf',
        start_page=6,
        end_page=1641,
        label='Vol 2 (1990-2005)',
    ),
    '3': dict(
        path='/tmp/catalogs/suspension/moog_vol3_2006up.pdf',
        start_page=7,
        end_page=1089,
        label='Vol 3 (2006+)',
    ),
}
|
|
|
|
# A figure code is one of the letters F, S or R followed by exactly three
# digits, standing alone as a word (e.g. "F123"); "F1234" and "AF123" do not match.
FIGURE_RE = re.compile(r'\b([FSR]\d{3})\b')


def extract_figure_codes(text):
    """Return the distinct figure codes found in *text*, in first-seen order.

    Args:
        text: Page text to scan. ``None`` or an empty string is treated as
            "no text" and yields an empty list instead of raising.

    Returns:
        A list of code strings such as ``['F123', 'S456']`` with duplicates
        removed; the position of each code is that of its first occurrence.
    """
    if not text:
        return []
    # dict preserves insertion order, so fromkeys() de-duplicates while
    # keeping the first occurrence of every code.
    return list(dict.fromkeys(FIGURE_RE.findall(text)))
|
|
|
|
|
|
def _image_extension(img_data):
    """Return the file extension implied by the first four bytes of *img_data*.

    Only the PNG and JPEG 2000 signatures are checked; every other payload is
    given the ``jpg`` extension.
    """
    magic = img_data[:4]
    if magic == b'\x89PNG':
        return 'png'
    if magic == b'\x00\x00\x00\x0c':
        return 'jp2'
    return 'jpg'


def _collect_page_images(page, min_bytes=5000):
    """Return the payloads of the large images on *page*, in discovery order.

    The page's ``images`` collection is tried first. If reading it fails, the
    image XObjects in the page resources are read directly instead.

    Args:
        page: A pypdf page object.
        min_bytes: Payloads of this size or smaller are discarded. Diagram
            images are typically over 10KB, so 5000 bytes is a conservative
            floor that drops small decorations.

    Returns:
        A list of ``bytes`` objects; empty when nothing usable was found.
    """
    images = []
    try:
        for image_file in page.images:
            payload = image_file.data
            if len(payload) > min_bytes:
                images.append(payload)
        return images
    except Exception:
        # Best effort: some pages cannot be read through page.images.
        # Discard any partial result so the fallback cannot add duplicates.
        images = []

    try:
        if '/XObject' in page['/Resources']:
            xobjects = page['/Resources']['/XObject'].get_object()
            # Sorted by resource name so the order is deterministic.
            for obj_name in sorted(xobjects.keys()):
                xobj = xobjects[obj_name].get_object()
                if xobj.get('/Subtype') != '/Image':
                    continue
                width = int(xobj.get('/Width', 0))
                height = int(xobj.get('/Height', 0))
                if not (width > 200 and height > 100):
                    continue
                try:
                    # NOTE(review): get_data() returns the stream after pypdf
                    # has applied its filters; for some encodings this may be
                    # raw pixel data rather than a complete image file - confirm.
                    payload = xobj.get_data()
                except Exception:
                    # One unreadable image must not lose the rest of the page.
                    continue
                if len(payload) > min_bytes:
                    images.append(payload)
    except Exception:
        # Deliberately best effort: keep whatever was collected so far.
        pass
    return images


def extract_volume(vol_key, already_extracted):
    """Extract diagram images from one MOOG volume.

    Each page in the volume's configured range is scanned for figure codes.
    The large images on the page are paired with those codes by position
    (first code with first image, and so on) and every code that has not been
    extracted yet is written to ``OUTPUT_DIR`` as ``<code>.<ext>``. Codes
    beyond the number of images found on the page are left for a later page.

    Args:
        vol_key: Key into ``VOLUMES`` selecting the PDF and its page range.
        already_extracted: Set of figure codes that already have an image.
            It is updated in place with every code written by this call.

    Returns:
        The number of images written by this call.

    Raises:
        KeyError: If *vol_key* is not a key of ``VOLUMES``.
        Exception: Whatever ``pypdf.PdfReader`` raises when the PDF cannot be
            opened; per-page failures are counted and reported, not raised.
    """
    vol = VOLUMES[vol_key]
    print(f"\n--- Procesando {vol['label']} ---")
    print(f" PDF: {vol['path']}")

    pdf = pypdf.PdfReader(vol['path'])
    total_pages = len(pdf.pages)
    # Never index past the last page, even if the configured range is longer.
    end_page = min(vol['end_page'], total_pages - 1)

    extracted = 0
    skipped = 0
    errors = 0

    for page_idx in range(vol['start_page'], end_page + 1):
        if page_idx % 100 == 0:
            print(f" Página {page_idx}/{end_page}... (extraídas: {extracted})")

        try:
            page = pdf.pages[page_idx]
            text = page.extract_text() or ''

            fig_codes = extract_figure_codes(text)
            if not fig_codes:
                continue

            # Count every code on the page that already has an image, and
            # skip the (expensive) image extraction when none are missing.
            pending = [c for c in fig_codes if c not in already_extracted]
            skipped += len(fig_codes) - len(pending)
            if not pending:
                continue

            images = _collect_page_images(page)
            if not images:
                continue

            # Pair against the FULL code list: pairing against only the
            # pending codes would shift every image by one slot whenever an
            # earlier code on the page had already been extracted.
            for code, img_data in zip(fig_codes, images):
                if code in already_extracted:
                    continue
                out_path = OUTPUT_DIR / f"{code}.{_image_extension(img_data)}"
                out_path.write_bytes(img_data)
                already_extracted.add(code)
                extracted += 1

        except Exception as e:
            errors += 1
            # Only the first few errors are printed to keep the log readable.
            if errors <= 5:
                print(f" Error en página {page_idx}: {e}")

    print(f" Resultado: {extracted} extraídas, {skipped} ya existentes, {errors} errores")
    return extracted
|
|
|
|
|
|
def main():
    """Run the extractor for the volumes named on the command line.

    With no arguments, volumes '3', '2' and '1' are processed in that order.
    Images already present in ``OUTPUT_DIR`` are detected up front, and the
    same set of known codes is shared across volumes, so a code written while
    processing one volume is not extracted again from a later one.
    """
    requested = sys.argv[1:] or ['3', '2', '1']
    banner = "=" * 70

    print(banner)
    print("EXTRACTOR DE DIAGRAMAS MOOG")
    print(banner)

    # Make sure the destination exists before scanning or writing to it.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    print(f"Directorio de salida: {OUTPUT_DIR}")

    # A figure code counts as done when a file named after it, with one of
    # the known image extensions, is already in the output directory.
    already_extracted = {
        entry.stem
        for entry in OUTPUT_DIR.iterdir()
        if entry.suffix in ('.jpg', '.png', '.jp2')
    }
    print(f"Ya extraídas: {len(already_extracted)}")

    total = 0
    for vol_key in requested:
        if vol_key in VOLUMES:
            total += extract_volume(vol_key, already_extracted)
        else:
            print(f"Volumen {vol_key} no reconocido, saltando...")

    print(f"\n{banner}")
    print(f"EXTRACCIÓN COMPLETADA: {total} nuevas imágenes")
    # Counts every directory entry, not only image files.
    print(f"Total en directorio: {sum(1 for _ in OUTPUT_DIR.iterdir())}")
    print(banner)


if __name__ == '__main__':
    main()
|