Initial commit - Horux Despachos NL

This commit is contained in:
2026-05-03 16:47:53 -06:00
commit b00b677c54
647 changed files with 133843 additions and 0 deletions

View File

@@ -0,0 +1,84 @@
import { type Page } from 'playwright';
/**
 * Waits for a PDF to become available on `page` and returns its raw bytes.
 *
 * While polling, a `response` listener captures any network response whose
 * `content-type` contains `application/pdf` or whose URL ends in `.pdf`
 * (bodies of 100 bytes or fewer are ignored). Every poll also looks for a
 * `data:application/pdf;base64,` URI in, in order:
 *   1. the `src`/`data` attribute of `<embed>` / `<object>` elements,
 *   2. the page's full HTML,
 *   3. each frame's URL and HTML,
 *   4. the page URL itself.
 *
 * Polls every 3 seconds for up to 120 seconds. The response listener is
 * always detached before this function settles, so repeated calls on the
 * same page do not accumulate handlers.
 *
 * @param page Playwright page that is expected to display or download the PDF.
 * @returns The PDF contents.
 * @throws Error if no PDF has been found once the timeout elapses.
 */
export async function extractOpinionPdf(page: Page): Promise<Buffer> {
  const TIMEOUT = 120_000;
  const POLL_INTERVAL = 3_000;
  const DATA_URI_PREFIX = 'data:application/pdf;base64,';
  const DATA_URI_PATTERN = /data:application\/pdf;base64,([A-Za-z0-9+/=\s]+)/;

  // Only the members of a Playwright response that the listener reads.
  type PdfCandidate = {
    headers(): { [key: string]: string };
    url(): string;
    body(): Promise<Buffer>;
  };

  console.log('[SAT Opinion Scraper] Waiting for PDF to appear...');

  // Kept in an object property (not a bare `let`) because the value is written
  // from the listener closure; a bare `let` initialised to null would be
  // narrowed to `null` by the compiler at every read in the loop below.
  const intercepted: { pdf: Buffer | null } = { pdf: null };

  const onResponse = async (response: PdfCandidate): Promise<void> => {
    try {
      const contentType = response.headers()['content-type'] || '';
      if (contentType.includes('application/pdf') || response.url().endsWith('.pdf')) {
        const body = await response.body();
        if (body.length > 100) {
          intercepted.pdf = body;
          console.log(`[SAT Opinion Scraper] PDF intercepted via network: ${body.length} bytes`);
        }
      }
    } catch {
      /* best effort: the response body may not be available */
    }
  };

  page.on('response', onResponse);
  try {
    const startTime = Date.now();
    while (Date.now() - startTime < TIMEOUT) {
      if (intercepted.pdf) return intercepted.pdf;

      // Strategy 1: <embed> or <object> with PDF data URI
      const embedData = await page
        .evaluate(() => {
          for (const el of document.querySelectorAll('embed, object')) {
            const src = el.getAttribute('src') || el.getAttribute('data') || '';
            if (src.startsWith('data:application/pdf;base64,')) return src;
          }
          return null;
        })
        .catch(() => null);
      if (embedData) {
        console.log('[SAT Opinion Scraper] PDF found via <embed>/<object>');
        return decodeDataUri(embedData);
      }

      // Strategy 2: Scan full HTML for base64 PDF
      const html = await page.content().catch(() => '');
      const match = html.match(DATA_URI_PATTERN);
      if (match) {
        console.log('[SAT Opinion Scraper] PDF found via page content scan');
        return decodeDataUri(`${DATA_URI_PREFIX}${match[1]}`);
      }

      // Strategy 3: Check iframes
      for (const frame of page.frames()) {
        try {
          const frameUrl = frame.url();
          if (frameUrl.startsWith(DATA_URI_PREFIX)) {
            console.log('[SAT Opinion Scraper] PDF found via iframe URL');
            return decodeDataUri(frameUrl);
          }
          const frameHtml = await frame.content();
          const frameMatch = frameHtml.match(DATA_URI_PATTERN);
          if (frameMatch) {
            console.log('[SAT Opinion Scraper] PDF found via iframe content');
            return decodeDataUri(`${DATA_URI_PREFIX}${frameMatch[1]}`);
          }
        } catch {
          /* frame content could not be read (e.g. cross-origin frame); try the next one */
        }
      }

      // Strategy 4: Page URL itself
      const pageUrl = page.url();
      if (pageUrl.startsWith(DATA_URI_PREFIX)) {
        console.log('[SAT Opinion Scraper] PDF found via page URL');
        return decodeDataUri(pageUrl);
      }

      console.log(`[SAT Opinion Scraper] PDF not found, retrying in ${POLL_INTERVAL / 1000}s...`);
      await page.waitForTimeout(POLL_INTERVAL);
    }

    // A response may have been captured during the final wait; do not discard it.
    if (intercepted.pdf) return intercepted.pdf;

    throw new Error(`PDF not found after ${TIMEOUT / 1000}s`);
  } finally {
    // Always detach, on success, timeout, or unexpected error, so the handler
    // does not outlive this call.
    page.off('response', onResponse);
  }
}
/**
 * Turns a `data:application/pdf;base64,...` URI into the bytes it encodes.
 *
 * The first `'data:application/pdf;base64,'.length` characters are dropped
 * without being inspected, every whitespace character is removed from the
 * remainder, and what is left is decoded as base64.
 *
 * @param dataUri Data URI expected to begin with `data:application/pdf;base64,`.
 * @returns The decoded bytes.
 */
function decodeDataUri(dataUri: string): Buffer {
  const headerLength = 'data:application/pdf;base64,'.length;
  const payload = dataUri.slice(headerLength);
  // The payload may be wrapped across lines or padded with spaces; strip all of it.
  const compact = payload.split(/\s/).join('');
  return Buffer.from(compact, 'base64');
}