85 lines
3.1 KiB
TypeScript
85 lines
3.1 KiB
TypeScript
import { type Page } from 'playwright';
|
|
|
|
/**
 * Polls a Playwright page until a PDF document can be recovered from it.
 *
 * While polling, a `response` listener captures any network response that
 * looks like a PDF (by `content-type` header or a `.pdf` URL). On every poll
 * the page is also probed, in order, for a base64 `data:application/pdf` URI in:
 *   1. the `src`/`data` attribute of an `<embed>` or `<object>` element,
 *   2. the serialized HTML of the page,
 *   3. the URL or serialized HTML of each frame,
 *   4. the URL of the page itself.
 *
 * The `response` listener is always detached before this function settles,
 * so repeated calls on the same page do not accumulate listeners.
 *
 * @param page - Page on which the PDF is expected to appear.
 * @returns The raw bytes of the first PDF found.
 * @throws Error if no PDF has been found once the 120 s timeout elapses.
 */
export async function extractOpinionPdf(page: Page): Promise<Buffer> {
  const TIMEOUT = 120_000;
  const POLL_INTERVAL = 3_000;
  const DATA_URI_PREFIX = 'data:application/pdf;base64,';
  const DATA_URI_PATTERN = /data:application\/pdf;base64,([A-Za-z0-9+/=\s]+)/;

  console.log('[SAT Opinion Scraper] Waiting for PDF to appear...');

  // Held in an object (not a `let`) because it is written from the listener
  // closure; this keeps TypeScript's control-flow narrowing honest.
  const captured: { pdf: Buffer | null } = { pdf: null };

  // True when the URL ends in ".pdf", also when a query string or fragment
  // follows the path (e.g. "/opinion.pdf?token=abc").
  const looksLikePdfUrl = (url: string): boolean => {
    if (url.endsWith('.pdf')) return true;
    try {
      return new URL(url).pathname.toLowerCase().endsWith('.pdf');
    } catch {
      return false;
    }
  };

  const onResponse = async (response: {
    headers(): { [key: string]: string };
    url(): string;
    body(): Promise<Buffer>;
  }): Promise<void> => {
    try {
      const contentType = response.headers()['content-type'] || '';
      if (contentType.includes('application/pdf') || looksLikePdfUrl(response.url())) {
        const body = await response.body();
        // Tiny bodies are treated as placeholders/errors rather than real PDFs.
        if (body.length > 100) {
          captured.pdf = body;
          console.log(`[SAT Opinion Scraper] PDF intercepted via network: ${body.length} bytes`);
        }
      }
    } catch { /* response body may not be available */ }
  };

  page.on('response', onResponse);

  try {
    const startTime = Date.now();

    while (Date.now() - startTime < TIMEOUT) {
      if (captured.pdf) return captured.pdf;

      // Strategy 1: <embed> or <object> with PDF data URI
      // The callback runs in the browser, so the prefix is passed as an argument.
      const embedData = await page.evaluate((prefix) => {
        for (const el of document.querySelectorAll('embed, object')) {
          const src = el.getAttribute('src') || el.getAttribute('data') || '';
          if (src.startsWith(prefix)) return src;
        }
        return null;
      }, DATA_URI_PREFIX).catch(() => null);

      if (embedData) {
        console.log('[SAT Opinion Scraper] PDF found via <embed>/<object>');
        return decodeDataUri(embedData);
      }

      // Strategy 2: Scan full HTML for base64 PDF
      const html = await page.content().catch(() => '');
      const match = html.match(DATA_URI_PATTERN);
      if (match) {
        console.log('[SAT Opinion Scraper] PDF found via page content scan');
        // match[0] is the whole data URI, prefix included.
        return decodeDataUri(match[0]);
      }

      // Strategy 3: Check iframes
      for (const frame of page.frames()) {
        try {
          const frameUrl = frame.url();
          if (frameUrl.startsWith(DATA_URI_PREFIX)) {
            console.log('[SAT Opinion Scraper] PDF found via iframe URL');
            return decodeDataUri(frameUrl);
          }
          const frameHtml = await frame.content();
          const frameMatch = frameHtml.match(DATA_URI_PATTERN);
          if (frameMatch) {
            console.log('[SAT Opinion Scraper] PDF found via iframe content');
            return decodeDataUri(frameMatch[0]);
          }
        } catch { /* cross-origin frame */ }
      }

      // Strategy 4: Page URL itself
      const pageUrl = page.url();
      if (pageUrl.startsWith(DATA_URI_PREFIX)) {
        console.log('[SAT Opinion Scraper] PDF found via page URL');
        return decodeDataUri(pageUrl);
      }

      console.log(`[SAT Opinion Scraper] PDF not found, retrying in ${POLL_INTERVAL / 1000}s...`);
      await page.waitForTimeout(POLL_INTERVAL);
    }

    // A response may have been intercepted during the final wait; honour it
    // instead of reporting a spurious timeout.
    if (captured.pdf) return captured.pdf;

    throw new Error(`PDF not found after ${TIMEOUT / 1000}s`);
  } finally {
    // Always detach the listener so it does not outlive this call.
    page.off('response', onResponse);
  }
}
|
|
|
|
/**
 * Decodes the base64 payload of a `data:application/pdf;base64,` URI.
 *
 * The input is assumed to start with exactly that prefix: the first
 * `prefix.length` characters are dropped without being inspected. Any
 * whitespace in the remaining payload is removed before decoding.
 *
 * @param dataUri - Data URI carrying a base64-encoded PDF.
 * @returns The decoded bytes.
 */
function decodeDataUri(dataUri: string): Buffer {
  const headerLength = 'data:application/pdf;base64,'.length;
  const payload = dataUri.slice(headerLength);
  const compactPayload = payload.split(/\s/).join('');
  return Buffer.from(compactPayload, 'base64');
}
|