Files
crawlapi/playwright/index.js
Developer 62994d4f3d
Some checks failed
CI / Test (push) Has been cancelled
Deploy / Deploy to Staging (push) Has been cancelled
CI / Build & Push (push) Has been cancelled
Deploy / Deploy to Production (push) Has been cancelled
Initial commit: Full Crawl API implementation
2026-04-29 07:03:48 +00:00

154 lines
4.5 KiB
JavaScript

const { chromium } = require('playwright');
const fs = require('fs');
const path = require('path');
const TurndownService = require('turndown');
// Shared HTML-to-Markdown converter, created with Turndown's default settings.
// The 'markdown' endpoint in run() passes the rendered page HTML through it.
const turndownService = new TurndownService();
// Directory that writeFile() stores generated files in. Taken from the
// OUTPUT_DIR environment variable; an unset or empty value falls back to
// '/tmp/crawlapi'.
const OUTPUT_DIR = process.env.OUTPUT_DIR || '/tmp/crawlapi';
/**
 * Parses the JSON-encoded options argument into a plain object.
 *
 * Anything that is not a JSON object — a missing argument, malformed JSON,
 * or valid JSON that decodes to null, an array or a primitive — yields an
 * empty object, so callers can always read option properties safely.
 *
 * @param {string | undefined} raw - Raw JSON text of the options argument.
 * @returns {object} The decoded options, or {} when none could be decoded.
 */
function parseOptions(raw) {
  try {
    const parsed = JSON.parse(raw);
    // JSON.parse succeeds on 'null', '[]', '42', ... — none of which is a
    // usable options bag (and null would crash on property access).
    const isPlainObject =
      parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
    return isPlainObject ? parsed : {};
  } catch {
    // Missing or malformed options are deliberately treated as "no options".
    return {};
  }
}
/**
 * Normalizes the URL argument into a plain string.
 *
 * The argument may arrive either as a JSON string literal (`"https://..."`)
 * or as bare text. A JSON string is decoded; anything else is returned as
 * the raw text with one leading and one trailing double quote removed.
 *
 * @param {string} raw - Raw URL argument.
 * @returns {string} The URL text.
 * @throws {TypeError} When no URL argument was supplied.
 */
function parseUrl(raw) {
  if (typeof raw !== 'string') {
    throw new TypeError('A URL argument is required');
  }
  try {
    const parsed = JSON.parse(raw);
    // Bare JSON literals such as 123, true or null decode to non-strings;
    // those are not URLs, so fall through and keep the raw text instead.
    if (typeof parsed === 'string') {
      return parsed;
    }
  } catch {
    // Not valid JSON: treat the argument as plain text below.
  }
  return raw.replace(/^"|"$/g, '');
}
/**
 * Makes sure a directory exists, creating any missing parent directories.
 *
 * With `recursive: true`, mkdirSync does nothing when the directory is
 * already there, so no separate existence check is needed (and no window
 * exists between checking and creating).
 *
 * @param {string} dir - Directory path to create if absent.
 * @throws {Error} When the path cannot be created, e.g. a regular file
 *   already occupies it.
 */
function ensureDir(dir) {
  fs.mkdirSync(dir, { recursive: true });
}
/**
 * Stores data as a file inside OUTPUT_DIR, creating the directory first
 * when necessary.
 *
 * @param {string} filename - File name, joined onto OUTPUT_DIR.
 * @param {string | Buffer} data - Contents to write.
 * @returns {string} Path of the file that was written.
 */
function writeFile(filename, data) {
  ensureDir(OUTPUT_DIR);
  const destination = path.join(OUTPUT_DIR, filename);
  fs.writeFileSync(destination, data);
  return destination;
}
/**
 * Takes a PNG screenshot of the page, stores it via writeFile() under a
 * timestamp-based name, and returns the stored file's path.
 *
 * @param {object} page - Loaded Playwright page.
 * @param {object} options - Parsed options; `full_page` selects a full-page capture.
 * @returns {Promise<string>} Path of the written PNG file.
 */
async function savePngScreenshot(page, options) {
  const buffer = await page.screenshot({
    type: 'png',
    // Coerced so that any truthy full_page value behaves the same way in
    // every endpoint that takes a screenshot.
    fullPage: Boolean(options.full_page),
  });
  return writeFile(`${Date.now()}.png`, buffer);
}

/**
 * Endpoint name -> handler building that endpoint's JSON-serializable result.
 * Every handler receives the already-loaded page and the parsed options.
 * A Map is used so lookups can never resolve to Object.prototype members.
 */
const ENDPOINT_HANDLERS = new Map([
  [
    'crawl',
    async (page) => {
      const html = await page.content();
      const title = await page.title();
      return { html, title, url: page.url() };
    },
  ],
  [
    'content',
    async (page) => {
      const html = await page.content();
      return { html, url: page.url() };
    },
  ],
  [
    'screenshot',
    async (page, options) => {
      const filePath = await savePngScreenshot(page, options);
      return { file_path: filePath, url: page.url() };
    },
  ],
  [
    'pdf',
    async (page) => {
      const buffer = await page.pdf({
        format: 'A4',
        printBackground: true,
      });
      const filePath = writeFile(`${Date.now()}.pdf`, buffer);
      return { file_path: filePath, url: page.url() };
    },
  ],
  [
    'markdown',
    async (page) => {
      const html = await page.content();
      const markdown = turndownService.turndown(html);
      return { markdown, url: page.url() };
    },
  ],
  [
    'snapshot',
    async (page, options) => {
      const html = await page.content();
      const filePath = await savePngScreenshot(page, options);
      return { html, file_path: filePath, url: page.url() };
    },
  ],
  [
    'scrape',
    async (page, options) => {
      const requested = options.selectors || ['h1', 'h2', 'p', 'a'];
      // A lone selector string would otherwise be iterated character by character.
      const selectors = Array.isArray(requested) ? requested : [requested];
      const data = {};
      for (const selector of selectors) {
        // One round trip per selector instead of one per matched element.
        const texts = await page.locator(selector).allTextContents();
        data[selector] = texts.filter(Boolean).map((text) => text.trim());
      }
      return { data, url: page.url() };
    },
  ],
  [
    'json',
    async (page) => {
      const title = await page.title();
      // evaluateAll does not wait for a match, so pages without a description
      // meta tag resolve to null immediately, and pages with several tags use
      // the first one instead of failing.
      const description = await page
        .locator('meta[name="description"]')
        .evaluateAll((els) => els[0]?.getAttribute('content') ?? null);
      const headings = await page.locator('h1, h2, h3').allTextContents();
      const links = await page
        .locator('a[href]')
        .evaluateAll((els) => els.map((el) => ({ href: el.href, text: el.textContent?.trim() })));
      // Only the first 50 links are reported by this endpoint.
      return { title, description, headings, links: links.slice(0, 50), url: page.url() };
    },
  ],
  [
    'links',
    async (page) => {
      const links = await page.locator('a[href]').evaluateAll((els) =>
        els
          .map((el) => ({
            href: el.href,
            text: el.textContent?.trim() || '',
          }))
          .filter((link) => link.href)
      );
      // De-duplicate by href; for repeated hrefs the last occurrence's text wins.
      const uniqueLinks = [...new Map(links.map((link) => [link.href, link])).values()];
      return { links: uniqueLinks, url: page.url() };
    },
  ],
]);

/**
 * Command-line entry point.
 *
 * Reads `<endpoint> <url> [optionsJson]` from process.argv, loads the URL in
 * headless Chromium, runs the handler for the endpoint and prints its result
 * as a single JSON line on stdout.
 *
 * Options read here: `width`, `height` (viewport, default 1440x900),
 * `user_agent`, `timeout` (seconds, default 30) and `wait_for` (selector to
 * wait for after navigation). Endpoint handlers read further options.
 *
 * On any failure the error message is written to stderr and the process exit
 * code is set to 1. The browser, once launched, is always closed.
 *
 * @returns {Promise<void>}
 */
async function run() {
  const endpoint = process.argv[2];
  let browser;
  try {
    // Reject unknown endpoints before paying for a browser launch and navigation.
    const handler = ENDPOINT_HANDLERS.get(endpoint);
    if (!handler) {
      throw new Error(`Unknown endpoint: ${endpoint}`);
    }

    const url = parseUrl(process.argv[3]);
    const options = parseOptions(process.argv[4]);

    browser = await chromium.launch({ headless: true });
    const context = await browser.newContext({
      viewport: {
        width: options.width || 1440,
        height: options.height || 900,
      },
      userAgent: options.user_agent || undefined,
    });
    const page = await context.newPage();

    // options.timeout is given in seconds; Playwright expects milliseconds.
    const timeout = (options.timeout || 30) * 1000;
    await page.goto(url, { waitUntil: 'networkidle', timeout });
    if (options.wait_for) {
      await page.waitForSelector(options.wait_for, { timeout });
    }

    const result = await handler(page, options);
    console.log(JSON.stringify(result));
  } catch (error) {
    console.error(error.message);
    // Set the exit code instead of calling process.exit(): exiting here would
    // skip the finally block and leave the browser unclosed.
    process.exitCode = 1;
  } finally {
    if (browser) {
      await browser.close();
    }
  }
}
// Start the script. A rejection escaping run() is reported the same way as
// any other failure — message on stderr, exit code 1 — instead of surfacing
// as an unhandled promise rejection.
run().catch((error) => {
  console.error(error?.message ?? error);
  process.exitCode = 1;
});