import { type Page, type Response } from 'playwright';

/** Prefix that marks a base64-encoded PDF data URI. */
const PDF_DATA_URI_PREFIX = 'data:application/pdf;base64,';

/**
 * Matches a base64 PDF data URI embedded in markup. Whitespace is accepted
 * inside the payload because serialized HTML may wrap long attribute values.
 * Hoisted to module scope so it is not rebuilt on every poll iteration.
 */
const PDF_DATA_URI_PATTERN = /data:application\/pdf;base64,([A-Za-z0-9+/=\s]+)/;

/** Tuning knobs for {@link extractOpinionPdf}; all fields are optional. */
export interface ExtractOpinionPdfOptions {
  /** Total time budget in milliseconds before giving up. Defaults to 120 000. */
  timeoutMs?: number;
  /** Delay between polling rounds in milliseconds. Defaults to 3 000. */
  pollIntervalMs?: number;
}

/**
 * Waits for a PDF to show up on `page` and returns its bytes.
 *
 * A network listener captures any response that looks like a PDF while the
 * page is polled with four DOM/URL strategies:
 *   1. `<embed>` / `<object>` elements whose `src`/`data` is a PDF data URI,
 *   2. a PDF data URI anywhere in the serialized page HTML,
 *   3. any frame whose URL or serialized HTML contains a PDF data URI,
 *   4. the page URL itself being a PDF data URI.
 *
 * @param page - Playwright page that is expected to display the PDF.
 * @param options - Optional timeout / poll interval overrides.
 * @returns The PDF bytes from whichever source produced them first.
 * @throws Error with message `PDF not found after <n>s` once the time budget
 *   is exhausted without any strategy succeeding.
 */
export async function extractOpinionPdf(
  page: Page,
  options: ExtractOpinionPdfOptions = {},
): Promise<Buffer> {
  const { timeoutMs = 120_000, pollIntervalMs = 3_000 } = options;

  console.log('[SAT Opinion Scraper] Waiting for PDF to appear...');

  // Held in an object (not a `let`) so the write inside the listener is
  // visible to TypeScript's control-flow analysis in the polling loop.
  const intercepted: { pdf: Buffer | null } = { pdf: null };

  const onResponse = async (response: Response): Promise<void> => {
    try {
      const contentType = (response.headers()['content-type'] ?? '').toLowerCase();
      if (contentType.includes('application/pdf') || response.url().endsWith('.pdf')) {
        const body = await response.body();
        // Tiny bodies are ignored: they cannot be a real document.
        if (body.length > 100) {
          intercepted.pdf = body;
          console.log(`[SAT Opinion Scraper] PDF intercepted via network: ${body.length} bytes`);
        }
      }
    } catch {
      /* response body may not be available (redirects, closed page, ...) */
    }
  };

  page.on('response', onResponse);

  try {
    const startTime = Date.now();

    while (Date.now() - startTime < timeoutMs) {
      if (intercepted.pdf) return intercepted.pdf;

      // Strategy 1: <embed> or <object> with PDF data URI.
      // Evaluation can fail mid-navigation; treat that as "nothing found yet".
      const embedData = await page
        .evaluate(() => {
          for (const el of document.querySelectorAll('embed, object')) {
            const src = el.getAttribute('src') || el.getAttribute('data') || '';
            if (src.startsWith('data:application/pdf;base64,')) return src;
          }
          return null;
        })
        .catch(() => null);
      if (embedData) {
        console.log('[SAT Opinion Scraper] PDF found via <embed>/<object>');
        return decodeDataUri(embedData);
      }

      // Strategy 2: Scan full HTML for base64 PDF.
      const html = await page.content().catch(() => '');
      const fromPage = findPdfInMarkup(html);
      if (fromPage) {
        console.log('[SAT Opinion Scraper] PDF found via page content scan');
        return fromPage;
      }

      // Strategy 3: Check iframes (both their URL and their own markup).
      for (const frame of page.frames()) {
        try {
          const frameUrl = frame.url();
          if (frameUrl.startsWith(PDF_DATA_URI_PREFIX)) {
            console.log('[SAT Opinion Scraper] PDF found via iframe URL');
            return decodeDataUri(frameUrl);
          }
          const fromFrame = findPdfInMarkup(await frame.content());
          if (fromFrame) {
            console.log('[SAT Opinion Scraper] PDF found via iframe content');
            return fromFrame;
          }
        } catch {
          /* cross-origin or detached frame: skip it and keep looking */
        }
      }

      // Strategy 4: Page URL itself.
      const pageUrl = page.url();
      if (pageUrl.startsWith(PDF_DATA_URI_PREFIX)) {
        console.log('[SAT Opinion Scraper] PDF found via page URL');
        return decodeDataUri(pageUrl);
      }

      console.log(`[SAT Opinion Scraper] PDF not found, retrying in ${pollIntervalMs / 1000}s...`);
      await page.waitForTimeout(pollIntervalMs);
    }

    // A response may have been captured during the final sleep, after the
    // last in-loop check; do not discard it just because time ran out.
    if (intercepted.pdf) return intercepted.pdf;

    throw new Error(`PDF not found after ${timeoutMs / 1000}s`);
  } finally {
    // Always detach, otherwise every call leaves a listener behind that keeps
    // downloading response bodies for the lifetime of the page.
    page.off('response', onResponse);
  }
}

/** Returns the decoded first PDF data URI found in `markup`, or `null` if there is none. */
function findPdfInMarkup(markup: string): Buffer | null {
  const match = PDF_DATA_URI_PATTERN.exec(markup);
  // match[0] is the whole data URI, prefix included.
  return match ? decodeDataUri(match[0]) : null;
}

/**
 * Decodes a `data:application/pdf;base64,...` URI into raw bytes.
 *
 * @param dataUri - Full data URI; whitespace inside the payload is ignored.
 * @returns The decoded bytes.
 * @throws Error if `dataUri` does not start with the PDF base64 prefix.
 */
function decodeDataUri(dataUri: string): Buffer {
  if (!dataUri.startsWith(PDF_DATA_URI_PREFIX)) {
    throw new Error('Expected a data:application/pdf;base64 URI');
  }
  const base64 = dataUri.slice(PDF_DATA_URI_PREFIX.length).replace(/\s/g, '');
  return Buffer.from(base64, 'base64');
}