#!/usr/bin/env python3
"""
MOOG DIAGRAM IMAGE EXTRACTOR

Extracts the suspension/steering illustrations from the MOOG PDF catalogs
and saves them as image files named after their figure codes.
"""

import re
import sys
import io
import hashlib
from pathlib import Path

import pypdf

OUTPUT_DIR = Path(__file__).parent.parent.parent / 'dashboard' / 'static' / 'diagrams' / 'moog'

# Catalog volumes keyed by the identifier accepted on the command line.
# 'start_page' / 'end_page' are 0-based indexes into ``PdfReader.pages`` and
# are both inclusive.
VOLUMES = {
    '1': {
        'path': '/tmp/catalogs/suspension/moog_vol1_1989back.pdf',
        'start_page': 3,
        'end_page': 1037,
        'label': 'Vol 1 (≤1989)',
    },
    '2': {
        'path': '/tmp/catalogs/suspension/moog_vol2_1990_2005.pdf',
        'start_page': 6,
        'end_page': 1641,
        'label': 'Vol 2 (1990-2005)',
    },
    '3': {
        'path': '/tmp/catalogs/suspension/moog_vol3_2006up.pdf',
        'start_page': 7,
        'end_page': 1089,
        'label': 'Vol 3 (2006+)',
    },
}

# A figure code is one of the letters F, S or R followed by exactly 3 digits.
FIGURE_RE = re.compile(r'\b([FSR]\d{3})\b')

# Image payloads of this many bytes or fewer are ignored (strictly-greater
# comparison); small images are assumed not to be diagrams.
MIN_IMAGE_BYTES = 5000

# Minimum pixel dimensions (exclusive) for an image XObject to be considered
# in the fallback extraction path.
MIN_XOBJECT_WIDTH = 200
MIN_XOBJECT_HEIGHT = 100

# File suffixes this script writes and recognises as already-extracted output.
IMAGE_SUFFIXES = ('.jpg', '.png', '.jp2')

# Only the first few per-page errors of a volume are printed.
MAX_REPORTED_ERRORS = 5


def extract_figure_codes(text):
    """Return the figure codes found in *text*, unique, in first-seen order."""
    # dict preserves insertion order, so fromkeys() de-duplicates while
    # keeping the order of first appearance.
    return list(dict.fromkeys(m.group(1) for m in FIGURE_RE.finditer(text)))


def _guess_extension(img_data):
    """Pick a file extension from the first four bytes of *img_data*.

    Returns 'png' or 'jp2' when the corresponding signature prefix matches,
    and 'jpg' for everything else.
    """
    header = img_data[:4]
    if header == b'\x89PNG':
        return 'png'
    if header == b'\x00\x00\x00\x0c':
        return 'jp2'
    return 'jpg'


def _collect_xobject_images(page):
    """Fallback: read large image XObjects straight from the page resources.

    Objects are visited in sorted name order. Returns a list of byte strings,
    possibly empty. Never raises: any failure ends the scan and whatever was
    collected so far is returned.
    """
    images = []
    try:
        resources = page['/Resources']
        if '/XObject' not in resources:
            return images
        xobjects = resources['/XObject'].get_object()
        for obj_name in sorted(xobjects.keys()):
            xobj = xobjects[obj_name].get_object()
            if xobj.get('/Subtype') != '/Image':
                continue
            width = int(xobj.get('/Width', 0))
            height = int(xobj.get('/Height', 0))
            if width <= MIN_XOBJECT_WIDTH or height <= MIN_XOBJECT_HEIGHT:
                continue
            try:
                # NOTE(review): get_data() presumably returns the decoded
                # stream, which for some filters may be raw pixels rather than
                # a complete image file -- confirm before trusting the
                # extension chosen for these payloads.
                img_data = xobj.get_data()
                if len(img_data) > MIN_IMAGE_BYTES:
                    images.append(img_data)
            except Exception:
                # Deliberate best effort: one unreadable stream must not
                # prevent the remaining images on the page from being read.
                continue
    except Exception:
        # Deliberate best effort: a malformed resource dictionary simply
        # yields whatever was collected up to this point.
        pass
    return images


def _collect_page_images(page):
    """Return the payloads of the large images on *page*, in page order.

    Tries ``page.images`` first. If that raises at any point, the partial
    result is discarded and the XObject fallback is used instead, so an image
    can never be listed twice (once from each path), which would shift the
    figure-code-to-image pairing.
    """
    try:
        # Building the list inside the try means a failure half-way through
        # leaves nothing behind.
        return [
            data
            for data in (image.data for image in page.images)
            if len(data) > MIN_IMAGE_BYTES
        ]
    except Exception:
        return _collect_xobject_images(page)


def extract_volume(vol_key, already_extracted):
    """Extract diagram images from one MOOG volume.

    Args:
        vol_key: Key into ``VOLUMES`` selecting the PDF to process.
        already_extracted: Set of figure codes that already have an image.
            It is updated in place as new images are written.

    Returns:
        The number of image files written for this volume.

    Raises:
        KeyError: If *vol_key* is not a key of ``VOLUMES``.
        Exception: Whatever ``pypdf.PdfReader`` raises when the PDF cannot be
            opened; per-page failures are counted and reported, not raised.
    """
    vol = VOLUMES[vol_key]
    print(f"\n--- Procesando {vol['label']} ---")
    print(f" PDF: {vol['path']}")

    pdf = pypdf.PdfReader(vol['path'])
    total_pages = len(pdf.pages)
    # Clamp to the last valid page index in case the PDF is shorter than the
    # configured range.
    end_page = min(vol['end_page'], total_pages - 1)

    extracted = 0
    skipped = 0
    errors = 0

    for page_idx in range(vol['start_page'], end_page + 1):
        if page_idx % 100 == 0:
            print(f" Página {page_idx}/{end_page}... (extraídas: {extracted})")

        try:
            page = pdf.pages[page_idx]
            text = page.extract_text() or ''

            fig_codes = extract_figure_codes(text)
            if not fig_codes:
                continue

            needed_codes = [c for c in fig_codes if c not in already_extracted]
            if not needed_codes:
                skipped += len(fig_codes)
                continue

            images = _collect_page_images(page)
            if not images:
                continue

            # Pair codes with images positionally; zip() stops at the shorter
            # list, so surplus codes or surplus images are simply left over.
            # NOTE(review): the pairing uses the *filtered* code list, so when
            # an earlier code on this page was already extracted, the next
            # code is paired with the first image -- confirm this is intended.
            for code, img_data in zip(needed_codes, images):
                ext = _guess_extension(img_data)
                out_path = OUTPUT_DIR / f"{code}.{ext}"
                out_path.write_bytes(img_data)
                already_extracted.add(code)
                extracted += 1

        except Exception as e:
            # Top-level boundary for one page: count the failure, report the
            # first few, and keep going with the next page.
            errors += 1
            if errors <= MAX_REPORTED_ERRORS:
                print(f" Error en página {page_idx}: {e}")

    print(f" Resultado: {extracted} extraídas, {skipped} ya existentes, {errors} errores")
    return extracted


def main():
    """Run the extraction for the volumes named on the command line.

    With no arguments, volumes '3', '2' and '1' are processed in that order.
    Unknown volume keys are reported and skipped.
    """
    volumes = sys.argv[1:] or ['3', '2', '1']

    print("=" * 70)
    print("EXTRACTOR DE DIAGRAMAS MOOG")
    print("=" * 70)

    # Create output directory
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    print(f"Directorio de salida: {OUTPUT_DIR}")

    # Figure codes that already have an image file are not extracted again.
    already_extracted = {
        f.stem for f in OUTPUT_DIR.iterdir() if f.suffix in IMAGE_SUFFIXES
    }
    print(f"Ya extraídas: {len(already_extracted)}")

    total = 0
    for vol_key in volumes:
        if vol_key not in VOLUMES:
            print(f"Volumen {vol_key} no reconocido, saltando...")
            continue
        total += extract_volume(vol_key, already_extracted)

    print(f"\n{'=' * 70}")
    print(f"EXTRACCIÓN COMPLETADA: {total} nuevas imágenes")
    print(f"Total en directorio: {len(list(OUTPUT_DIR.iterdir()))}")
    print(f"{'=' * 70}")


if __name__ == '__main__':
    main()