Initial commit: Full Crawl API implementation
This commit is contained in:
153
playwright/index.js
Normal file
153
playwright/index.js
Normal file
@@ -0,0 +1,153 @@
|
||||
const { chromium } = require('playwright');
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const TurndownService = require('turndown');
|
||||
|
||||
// Shared converter used to turn page HTML into Markdown; constructed with no
// options, so the Turndown library's defaults apply.
const turndownService = new TurndownService();
|
||||
// Directory that generated files are written into. Read from the OUTPUT_DIR
// environment variable; falls back to '/tmp/crawlapi' when it is unset or empty.
const OUTPUT_DIR = process.env.OUTPUT_DIR || '/tmp/crawlapi';
|
||||
|
||||
/**
 * Parse the JSON-encoded options argument given on the command line.
 *
 * Always returns a plain object so callers can read option properties without
 * null checks: a missing argument, malformed JSON, or valid JSON that is not
 * an object (`null`, numbers, strings, arrays) all yield `{}`.
 *
 * @param {string | undefined} raw - Raw JSON text of the options.
 * @returns {object} The parsed options, or an empty object.
 */
function parseOptions(raw) {
  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch {
    // Best effort: unparseable options are treated as "no options".
    return {};
  }

  // JSON.parse happily returns null, primitives, and arrays; none of those can
  // carry named options, and null would crash on the first property access.
  const isPlainObject =
    parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
  return isPlainObject ? parsed : {};
}
|
||||
|
||||
/**
 * Normalise the URL argument given on the command line.
 *
 * The argument may be a JSON string literal (`"https://example.com"`) or a
 * bare URL. A JSON string literal is decoded; anything else is returned as
 * text with one leading and one trailing double quote stripped if present.
 *
 * @param {string} raw - Raw URL argument.
 * @returns {string} The URL as a plain string.
 * @throws {TypeError} If `raw` is not a string (e.g. the argument is missing).
 */
function parseUrl(raw) {
  if (typeof raw !== 'string') {
    throw new TypeError('Missing URL argument');
  }

  try {
    const parsed = JSON.parse(raw);
    // Only a JSON string is a usable URL; literals such as 123 or null fall
    // through so the caller always receives a string.
    if (typeof parsed === 'string') {
      return parsed;
    }
  } catch {
    // Not JSON at all: treat the argument as a bare URL below.
  }

  return raw.replace(/^"|"$/g, '');
}
|
||||
|
||||
/**
 * Make sure a directory exists, creating it and any missing parents.
 *
 * With `recursive: true`, `fs.mkdirSync` does nothing when the directory is
 * already there, so no separate existence check is needed (and none can race
 * with another process creating the same directory).
 *
 * @param {string} dir - Path of the directory to create.
 * @returns {void}
 */
function ensureDir(dir) {
  fs.mkdirSync(dir, { recursive: true });
}
|
||||
|
||||
/**
 * Write data to a file inside OUTPUT_DIR, creating the directory if needed.
 *
 * @param {string} filename - File name, joined onto OUTPUT_DIR.
 * @param {string | Buffer} data - Contents to write.
 * @returns {string} Path of the file that was written.
 */
function writeFile(filename, data) {
  ensureDir(OUTPUT_DIR);
  const destination = path.join(OUTPUT_DIR, filename);
  fs.writeFileSync(destination, data);
  return destination;
}
|
||||
|
||||
/**
 * CLI entry point: load a page in headless Chromium and print the result of
 * one endpoint as a single line of JSON on stdout.
 *
 * Command-line arguments:
 *   argv[2]  endpoint - crawl | content | screenshot | pdf | markdown |
 *                       snapshot | scrape | json | links
 *   argv[3]  url      - page to load (see parseUrl)
 *   argv[4]  options  - JSON object (see parseOptions); keys read here:
 *                       width, height, user_agent, timeout (seconds),
 *                       wait_for, full_page, selectors
 *
 * On failure the error message is written to stderr and the process exit code
 * is set to 1. The browser is closed on both the success and failure paths.
 *
 * @returns {Promise<void>}
 */
async function run() {
  let browser;

  try {
    // Parsing and browser start-up live inside the try so that every failure
    // is reported the same way and the browser is always cleaned up.
    const endpoint = process.argv[2];
    const url = parseUrl(process.argv[3]);
    const options = parseOptions(process.argv[4]);

    browser = await chromium.launch({ headless: true });
    const context = await browser.newContext({
      viewport: {
        width: options.width || 1440,
        height: options.height || 900,
      },
      userAgent: options.user_agent || undefined,
    });
    const page = await context.newPage();

    // options.timeout is in seconds; Playwright expects milliseconds.
    const timeout = (options.timeout || 30) * 1000;
    await page.goto(url, { waitUntil: 'networkidle', timeout });

    if (options.wait_for) {
      await page.waitForSelector(options.wait_for, { timeout });
    }

    let result = {};

    switch (endpoint) {
      case 'crawl': {
        const html = await page.content();
        const title = await page.title();
        result = { html, title, url: page.url() };
        break;
      }
      case 'content': {
        const html = await page.content();
        result = { html, url: page.url() };
        break;
      }
      case 'screenshot': {
        const screenshotOptions = { type: 'png' };
        if (options.full_page) {
          screenshotOptions.fullPage = true;
        }
        const buffer = await page.screenshot(screenshotOptions);
        const filepath = writeFile(`${Date.now()}.png`, buffer);
        result = { file_path: filepath, url: page.url() };
        break;
      }
      case 'pdf': {
        const buffer = await page.pdf({
          format: 'A4',
          printBackground: true,
        });
        const filepath = writeFile(`${Date.now()}.pdf`, buffer);
        result = { file_path: filepath, url: page.url() };
        break;
      }
      case 'markdown': {
        const html = await page.content();
        const markdown = turndownService.turndown(html);
        result = { markdown, url: page.url() };
        break;
      }
      case 'snapshot': {
        const html = await page.content();
        const screenshotBuffer = await page.screenshot({
          type: 'png',
          fullPage: options.full_page || false,
        });
        const filepath = writeFile(`${Date.now()}.png`, screenshotBuffer);
        result = { html, file_path: filepath, url: page.url() };
        break;
      }
      case 'scrape': {
        const requested = options.selectors || ['h1', 'h2', 'p', 'a'];
        // Accept a single selector string as well as an array; iterating a
        // bare string would otherwise treat each character as a selector.
        const selectors = Array.isArray(requested) ? requested : [requested];
        const data = {};
        for (const selector of selectors) {
          // One round trip per selector instead of one per matched element.
          const texts = await page.locator(selector).allTextContents();
          data[selector] = texts.filter(Boolean).map((text) => text.trim());
        }
        result = { data, url: page.url() };
        break;
      }
      case 'json': {
        const title = await page.title();

        // Check that the tag exists first: calling getAttribute on a locator
        // that matches nothing waits for the element until the timeout.
        const metaDescription = page.locator('meta[name="description"]');
        const hasDescription = (await metaDescription.count()) > 0;
        // The description is best effort, so a lookup failure yields null.
        const description = hasDescription
          ? await metaDescription.first().getAttribute('content').catch(() => null)
          : null;

        const headings = await page.locator('h1, h2, h3').allTextContents();
        const links = await page.locator('a[href]').evaluateAll((els) =>
          els.map((el) => ({ href: el.href, text: el.textContent?.trim() }))
        );
        result = {
          title,
          description,
          headings,
          links: links.slice(0, 50),
          url: page.url(),
        };
        break;
      }
      case 'links': {
        const links = await page.locator('a[href]').evaluateAll((els) =>
          els
            .map((el) => ({
              href: el.href,
              text: el.textContent?.trim() || '',
            }))
            .filter((l) => l.href)
        );
        // Deduplicate by href; the Map keeps the last entry seen for each URL.
        const uniqueLinks = [...new Map(links.map((l) => [l.href, l])).values()];
        result = { links: uniqueLinks, url: page.url() };
        break;
      }
      default:
        throw new Error(`Unknown endpoint: ${endpoint}`);
    }

    console.log(JSON.stringify(result));
  } catch (error) {
    console.error(error.message);
    // Set the exit code instead of calling process.exit(): exiting here would
    // terminate the process before the finally block can close the browser.
    process.exitCode = 1;
  } finally {
    // browser is still undefined when start-up failed before launch finished.
    await browser?.close();
  }
}
|
||||
|
||||
// Start the CLI. Any rejection that escapes run() is reported here so the
// process ends with a readable message and a non-zero exit code rather than an
// unhandled promise rejection.
run().catch((error) => {
  console.error(error?.message ?? error);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user