Initial commit - Horux Despachos NL
This commit is contained in:
84
apps/api/src/services/sat/sat-opinion-scraper.ts
Normal file
84
apps/api/src/services/sat/sat-opinion-scraper.ts
Normal file
@@ -0,0 +1,84 @@
|
||||
import { type Page } from 'playwright';
|
||||
|
||||
/**
 * Waits for the SAT opinion PDF to show up on `page` and returns its bytes.
 *
 * Two mechanisms run side by side until one of them yields a PDF or the
 * deadline passes:
 *  - a `response` listener that captures any network response whose
 *    content type contains `application/pdf` or whose URL ends in `.pdf`, and
 *  - a polling loop that looks for a base64 `data:application/pdf` URI in
 *    `<embed>`/`<object>` elements, the page HTML, every frame, and the page
 *    URL itself.
 *
 * The `response` listener is always detached before this function settles,
 * so repeated calls on the same page do not accumulate handlers.
 *
 * @param page - Playwright page on which the PDF is expected to appear.
 * @returns The bytes of the first PDF found by any strategy.
 * @throws Error if no PDF is found within 120 seconds. Errors raised by
 *   `page.waitForTimeout` are not caught and propagate to the caller.
 */
export async function extractOpinionPdf(page: Page): Promise<Buffer> {
  const TIMEOUT = 120_000;
  const POLL_INTERVAL = 3_000;
  const DATA_URI_PREFIX = 'data:application/pdf;base64,';
  // Shared by the page-level and frame-level HTML scans.
  const DATA_URI_PATTERN = /data:application\/pdf;base64,([A-Za-z0-9+/=\s]+)/;

  // Derived from the already-imported `Page` type so no extra import is needed.
  type PageResponse = Awaited<ReturnType<Page['waitForResponse']>>;

  console.log('[SAT Opinion Scraper] Waiting for PDF to appear...');

  // Held in an object because the value is written from a callback; a plain
  // `let` would be narrowed to `null` by the compiler at the read sites below.
  const intercepted: { pdf: Buffer | null } = { pdf: null };

  const onResponse = async (response: PageResponse): Promise<void> => {
    try {
      // MIME types are case-insensitive, so compare in lower case.
      const contentType = (response.headers()['content-type'] ?? '').toLowerCase();
      if (contentType.includes('application/pdf') || response.url().endsWith('.pdf')) {
        const body = await response.body();
        // Ignore tiny bodies; they cannot be a real document.
        if (body.length > 100) {
          intercepted.pdf = body;
          console.log(`[SAT Opinion Scraper] PDF intercepted via network: ${body.length} bytes`);
        }
      }
    } catch {
      /* response body may not be available */
    }
  };

  page.on('response', onResponse);

  try {
    const startTime = Date.now();

    while (Date.now() - startTime < TIMEOUT) {
      if (intercepted.pdf) return intercepted.pdf;

      // Strategy 1: <embed> or <object> with PDF data URI.
      // The callback runs in the browser, so it cannot close over local
      // constants; the prefix literal is repeated on purpose.
      const embedData = await page
        .evaluate(() => {
          for (const el of document.querySelectorAll('embed, object')) {
            const src = el.getAttribute('src') || el.getAttribute('data') || '';
            if (src.startsWith('data:application/pdf;base64,')) return src;
          }
          return null;
        })
        .catch(() => null);

      if (embedData) {
        console.log('[SAT Opinion Scraper] PDF found via <embed>/<object>');
        return decodeDataUri(embedData);
      }

      // Strategy 2: Scan full HTML for base64 PDF
      const html = await page.content().catch(() => '');
      const match = html.match(DATA_URI_PATTERN);
      if (match) {
        console.log('[SAT Opinion Scraper] PDF found via page content scan');
        return decodeDataUri(`${DATA_URI_PREFIX}${match[1]}`);
      }

      // Strategy 3: Check iframes
      for (const frame of page.frames()) {
        try {
          const frameUrl = frame.url();
          if (frameUrl.startsWith(DATA_URI_PREFIX)) {
            console.log('[SAT Opinion Scraper] PDF found via iframe URL');
            return decodeDataUri(frameUrl);
          }

          const frameHtml = await frame.content();
          const frameMatch = frameHtml.match(DATA_URI_PATTERN);
          if (frameMatch) {
            console.log('[SAT Opinion Scraper] PDF found via iframe content');
            return decodeDataUri(`${DATA_URI_PREFIX}${frameMatch[1]}`);
          }
        } catch {
          /* cross-origin frame */
        }
      }

      // Strategy 4: Page URL itself
      const pageUrl = page.url();
      if (pageUrl.startsWith(DATA_URI_PREFIX)) {
        console.log('[SAT Opinion Scraper] PDF found via page URL');
        return decodeDataUri(pageUrl);
      }

      console.log(`[SAT Opinion Scraper] PDF not found, retrying in ${POLL_INTERVAL / 1000}s...`);
      await page.waitForTimeout(POLL_INTERVAL);
    }

    // A response may have been captured during the final wait, after the
    // last in-loop check; do not throw it away.
    if (intercepted.pdf) return intercepted.pdf;

    throw new Error(`PDF not found after ${TIMEOUT / 1000}s`);
  } finally {
    // Detach on every exit path so the page does not keep buffering PDF
    // bodies for a caller that has already finished.
    page.off('response', onResponse);
  }
}
|
||||
|
||||
/**
 * Converts a `data:application/pdf;base64,...` URI into the raw bytes it encodes.
 *
 * The fixed-length header is dropped by position (it is not validated), any
 * whitespace in the remaining payload is removed, and the result is
 * base64-decoded.
 *
 * @param dataUri - Data URI expected to begin with `data:application/pdf;base64,`.
 * @returns Buffer holding the decoded payload.
 */
function decodeDataUri(dataUri: string): Buffer {
  const headerLength = 'data:application/pdf;base64,'.length;
  const payload = dataUri.slice(headerLength);
  const compact = payload.replace(/\s/g, '');
  return Buffer.from(compact, 'base64');
}
|
||||
Reference in New Issue
Block a user