// Listing metadata: 154 lines, 4.5 KiB, JavaScript
const { chromium } = require('playwright');
|
|
const fs = require('fs');
|
|
const path = require('path');
|
|
const TurndownService = require('turndown');
|
|
|
|
// Single TurndownService instance built with default settings; the `markdown`
// endpoint calls its `turndown()` method on the page HTML.
const turndownService = new TurndownService();
|
|
// Directory that generated files are written into. Taken from the OUTPUT_DIR
// environment variable; an unset or empty value falls back to /tmp/crawlapi.
const OUTPUT_DIR = process.env.OUTPUT_DIR || '/tmp/crawlapi';
|
|
|
|
/**
 * Decodes the JSON-encoded options argument.
 *
 * Never throws: input that is not valid JSON, or that decodes to anything
 * other than a plain (non-array) object, yields an empty options object so
 * the caller can read properties from the result unconditionally.
 *
 * @param {string|undefined} raw - Raw argument text, expected to hold a JSON object.
 * @returns {object} The decoded options, or `{}` when none could be decoded.
 */
function parseOptions(raw) {
  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch {
    return {};
  }

  // `null`, `42` or `[1]` are valid JSON but not an options bag; `null` in
  // particular would make every later property read throw.
  const isOptionsObject =
    parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
  return isOptionsObject ? parsed : {};
}
|
|
|
|
/**
 * Normalises the URL argument, which may arrive either as a bare string or as
 * a JSON string literal (wrapped in double quotes, possibly with escapes).
 *
 * A JSON string literal is decoded. Anything else — including text that is
 * valid JSON but not a string, such as `123` or `null` — is treated as the URL
 * text itself, with one leading and one trailing double quote removed if
 * present. The result is therefore always a string.
 *
 * @param {string} raw - Raw argument text.
 * @returns {string} The URL text.
 * @throws {TypeError} If `raw` is `null` or `undefined`.
 */
function parseUrl(raw) {
  if (raw == null) {
    throw new TypeError('Missing URL argument');
  }

  const text = String(raw);
  try {
    const decoded = JSON.parse(text);
    if (typeof decoded === 'string') {
      return decoded;
    }
  } catch {
    // Not JSON: fall through and use the text as given.
  }
  return text.replace(/^"|"$/g, '');
}
|
|
|
|
/**
 * Makes sure a directory exists, creating it and any missing parents.
 *
 * With `recursive: true`, `mkdirSync` does nothing when the directory is
 * already there, so no separate existence check is needed (and none is done,
 * which avoids a check-then-create race with other processes).
 *
 * @param {string} dir - Directory path to create.
 * @returns {void}
 * @throws {Error} If the path cannot be created, e.g. it exists as a regular
 *   file or permissions are insufficient.
 */
function ensureDir(dir) {
  fs.mkdirSync(dir, { recursive: true });
}
|
|
|
|
/**
 * Saves `data` under `filename` inside OUTPUT_DIR, creating that directory
 * first when needed.
 *
 * @param {string} filename - Name of the file, joined onto OUTPUT_DIR.
 * @param {string|Buffer} data - Contents handed to `fs.writeFileSync`.
 * @returns {string} The path the data was written to.
 */
function writeFile(filename, data) {
  const destination = path.join(OUTPUT_DIR, filename);

  ensureDir(OUTPUT_DIR);
  fs.writeFileSync(destination, data);

  return destination;
}
|
|
|
|
/**
 * Reads the `content` attribute of the page's first description meta tag.
 *
 * The match count is checked first: calling `getAttribute` on a locator that
 * matches nothing waits for an element until the action timeout expires, which
 * would stall every page that has no description tag. `first()` keeps pages
 * with several such tags from failing the locator's strictness check.
 *
 * @param {import('playwright').Page} page - Loaded page.
 * @returns {Promise<string|null>} The description, or `null` when unavailable.
 */
async function readMetaDescription(page) {
  const metaDescription = page.locator('meta[name="description"]');
  try {
    if ((await metaDescription.count()) === 0) {
      return null;
    }
    return await metaDescription.first().getAttribute('content');
  } catch {
    // The description is best-effort metadata; its absence must not fail the request.
    return null;
  }
}

/**
 * Takes a PNG screenshot of the page and stores it in the output directory.
 *
 * @param {import('playwright').Page} page - Loaded page.
 * @param {object} options - Request options; `full_page` selects a full-page capture.
 * @returns {Promise<string>} Path of the written PNG file.
 */
async function savePngScreenshot(page, options) {
  // Coerce to a real boolean so a truthy non-boolean option is not passed through as-is.
  const buffer = await page.screenshot({ type: 'png', fullPage: Boolean(options.full_page) });
  return writeFile(`${Date.now()}.png`, buffer);
}

/**
 * Endpoint name -> function building that endpoint's result from a loaded page.
 * Each handler receives `(page, options)` and resolves to a JSON-serialisable object.
 */
const ENDPOINT_HANDLERS = new Map([
  ['crawl', async (page) => {
    const html = await page.content();
    const title = await page.title();
    return { html, title, url: page.url() };
  }],

  ['content', async (page) => {
    const html = await page.content();
    return { html, url: page.url() };
  }],

  ['screenshot', async (page, options) => {
    const filepath = await savePngScreenshot(page, options);
    return { file_path: filepath, url: page.url() };
  }],

  ['pdf', async (page) => {
    const buffer = await page.pdf({ format: 'A4', printBackground: true });
    const filepath = writeFile(`${Date.now()}.pdf`, buffer);
    return { file_path: filepath, url: page.url() };
  }],

  ['markdown', async (page) => {
    const html = await page.content();
    const markdown = turndownService.turndown(html);
    return { markdown, url: page.url() };
  }],

  ['snapshot', async (page, options) => {
    const html = await page.content();
    const filepath = await savePngScreenshot(page, options);
    return { html, file_path: filepath, url: page.url() };
  }],

  ['scrape', async (page, options) => {
    const selectors = options.selectors || ['h1', 'h2', 'p', 'a'];
    const data = {};
    for (const selector of selectors) {
      // One call per selector instead of one per matched element.
      const texts = await page.locator(selector).allTextContents();
      // Drop elements with no text at all, then trim what remains.
      data[selector] = texts.filter(Boolean).map((text) => text.trim());
    }
    return { data, url: page.url() };
  }],

  ['json', async (page) => {
    const title = await page.title();
    const description = await readMetaDescription(page);
    const headings = await page.locator('h1, h2, h3').allTextContents();
    const links = await page
      .locator('a[href]')
      .evaluateAll((els) => els.map((el) => ({ href: el.href, text: el.textContent?.trim() })));
    // Only the first 50 links are reported by this endpoint.
    return { title, description, headings, links: links.slice(0, 50), url: page.url() };
  }],

  ['links', async (page) => {
    const links = await page.locator('a[href]').evaluateAll((els) =>
      els
        .map((el) => ({ href: el.href, text: el.textContent?.trim() || '' }))
        .filter((link) => link.href)
    );
    // Keyed by href to drop duplicates: first-seen order, last-seen text.
    const uniqueLinks = [...new Map(links.map((link) => [link.href, link])).values()];
    return { links: uniqueLinks, url: page.url() };
  }],
]);

/**
 * Command-line entry point.
 *
 * Reads `process.argv[2]` (endpoint name), `process.argv[3]` (URL) and
 * `process.argv[4]` (JSON options), loads the URL in headless Chromium and
 * prints the endpoint's result to stdout as a single JSON document.
 *
 * Options read here: `width`, `height`, `user_agent`, `timeout` (seconds,
 * default 30) and `wait_for` (selector to wait for after navigation); the
 * endpoint handlers read `full_page` and `selectors`.
 *
 * On any failure the error message goes to stderr and the exit code is set
 * to 1. `process.exitCode` is used rather than `process.exit()` so the
 * `finally` block still closes the browser and pending output is flushed.
 *
 * @returns {Promise<void>}
 */
async function run() {
  let browser;

  try {
    const endpoint = process.argv[2];
    const url = parseUrl(process.argv[3]);
    const options = parseOptions(process.argv[4]);

    // Reject unknown endpoints before paying for a browser launch and a navigation.
    const handler = ENDPOINT_HANDLERS.get(endpoint);
    if (!handler) {
      throw new Error(`Unknown endpoint: ${endpoint}`);
    }

    browser = await chromium.launch({ headless: true });
    const context = await browser.newContext({
      viewport: {
        width: options.width || 1440,
        height: options.height || 900,
      },
      userAgent: options.user_agent || undefined,
    });
    const page = await context.newPage();

    const timeout = (options.timeout || 30) * 1000;
    await page.goto(url, { waitUntil: 'networkidle', timeout });

    if (options.wait_for) {
      await page.waitForSelector(options.wait_for, { timeout });
    }

    const result = await handler(page, options);
    console.log(JSON.stringify(result));
  } catch (error) {
    console.error(error?.message ?? String(error));
    process.exitCode = 1;
  } finally {
    // `browser` is still undefined when the failure happened before launch.
    if (browser) {
      await browser.close();
    }
  }
}
|
|
|
|
// Start the script. The rejection handler covers any failure that escapes
// run() itself, so it is reported as a plain message with a non-zero exit
// code instead of surfacing as an unhandled promise rejection.
run().catch((error) => {
  console.error(error?.message ?? String(error));
  process.exitCode = 1;
});
|