Initial commit: Full Crawl API implementation
This commit is contained in:
153
playwright/index.js
Normal file
153
playwright/index.js
Normal file
@@ -0,0 +1,153 @@
|
||||
const { chromium } = require('playwright');
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const TurndownService = require('turndown');
|
||||
|
||||
const turndownService = new TurndownService();
|
||||
const OUTPUT_DIR = process.env.OUTPUT_DIR || '/tmp/crawlapi';
|
||||
|
||||
/**
 * Parse the JSON-encoded options argument passed on the command line.
 *
 * @param {string|undefined} raw - Raw JSON text (process.argv[4] in run()).
 * @returns {object} The parsed options object, or `{}` when `raw` is missing,
 *   is not valid JSON, or decodes to anything other than a plain object.
 */
function parseOptions(raw) {
  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch {
    return {};
  }
  // `null`, `42` or `[]` are valid JSON but would make later property reads
  // such as `options.width` throw or silently misbehave.
  if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
    return {};
  }
  return parsed;
}
|
||||
|
||||
/**
 * Decode the target URL argument, which may arrive either as a JSON string
 * literal (`"https://…"`) or as bare text.
 *
 * @param {string} raw - Raw URL argument (process.argv[3] in run()).
 * @returns {string} The URL with any JSON quoting removed.
 * @throws {TypeError} When `raw` is not a string (e.g. the argument is missing).
 */
function parseUrl(raw) {
  if (typeof raw !== 'string') {
    throw new TypeError('Missing URL argument');
  }
  try {
    const parsed = JSON.parse(raw);
    // Only accept JSON strings; `123` or `null` parse fine but are not URLs,
    // so they fall through and are returned as their literal text.
    if (typeof parsed === 'string') {
      return parsed;
    }
  } catch {
    // Not JSON at all: treat the argument as bare text below.
  }
  // Strip one stray leading and/or trailing double quote.
  return raw.replace(/^"|"$/g, '');
}
|
||||
|
||||
/**
 * Create `dir` (and any missing parents) if it does not exist yet.
 *
 * `mkdirSync` with `recursive: true` is already a no-op for an existing
 * directory, so no separate existence check is needed; dropping it also removes
 * the window between the check and the creation. If `dir` exists but is a
 * regular file, this now throws instead of deferring the failure to the
 * subsequent write.
 *
 * @param {string} dir - Directory path to create.
 * @returns {void}
 */
function ensureDir(dir) {
  fs.mkdirSync(dir, { recursive: true });
}
|
||||
|
||||
/**
 * Persist `data` under OUTPUT_DIR, creating the directory on demand.
 *
 * @param {string} filename - File name relative to OUTPUT_DIR.
 * @param {string|Buffer} data - Content handed to fs.writeFileSync.
 * @returns {string} The path of the written file.
 */
function writeFile(filename, data) {
  const target = path.join(OUTPUT_DIR, filename);
  ensureDir(OUTPUT_DIR);
  fs.writeFileSync(target, data);
  return target;
}
|
||||
|
||||
/**
 * CLI entry point: `node index.js <endpoint> <url> [optionsJson]`.
 *
 * Launches a headless Chromium, loads the URL, runs the requested endpoint
 * (`crawl`, `content`, `screenshot`, `pdf`, `markdown`, `snapshot`, `scrape`,
 * `json`, `links`) and prints the result as one JSON line on stdout.
 *
 * On failure the error message is written to stderr and the exit code is set
 * to 1. `process.exitCode` is used instead of `process.exit()` because
 * `process.exit()` terminates immediately and would skip the `finally` block
 * that closes the browser.
 *
 * @returns {Promise<void>}
 */
async function run() {
  const endpoint = process.argv[2];
  let browser;

  try {
    // Parsed inside the try so a bad argument is reported like any other
    // failure instead of surfacing as an unhandled rejection.
    const url = parseUrl(process.argv[3]);
    const options = parseOptions(process.argv[4]);

    browser = await chromium.launch({ headless: true });
    const context = await browser.newContext({
      viewport: {
        width: options.width || 1440,
        height: options.height || 900,
      },
      userAgent: options.user_agent || undefined,
    });
    const page = await context.newPage();

    // `options.timeout` is given in seconds; Playwright expects milliseconds.
    const timeout = (options.timeout || 30) * 1000;
    await page.goto(url, { waitUntil: 'networkidle', timeout });

    if (options.wait_for) {
      await page.waitForSelector(options.wait_for, { timeout });
    }

    let result = {};

    switch (endpoint) {
      case 'crawl': {
        const html = await page.content();
        const title = await page.title();
        result = { html, title, url: page.url() };
        break;
      }
      case 'content': {
        const html = await page.content();
        result = { html, url: page.url() };
        break;
      }
      case 'screenshot': {
        const screenshotOptions = { type: 'png' };
        if (options.full_page) {
          screenshotOptions.fullPage = true;
        }
        const buffer = await page.screenshot(screenshotOptions);
        const filepath = writeFile(`${Date.now()}.png`, buffer);
        result = { file_path: filepath, url: page.url() };
        break;
      }
      case 'pdf': {
        const buffer = await page.pdf({
          format: 'A4',
          printBackground: true,
        });
        const filepath = writeFile(`${Date.now()}.pdf`, buffer);
        result = { file_path: filepath, url: page.url() };
        break;
      }
      case 'markdown': {
        const html = await page.content();
        const markdown = turndownService.turndown(html);
        result = { markdown, url: page.url() };
        break;
      }
      case 'snapshot': {
        const html = await page.content();
        const screenshotBuffer = await page.screenshot({
          type: 'png',
          fullPage: options.full_page || false,
        });
        const filepath = writeFile(`${Date.now()}.png`, screenshotBuffer);
        result = { html, file_path: filepath, url: page.url() };
        break;
      }
      case 'scrape': {
        const selectors = options.selectors || ['h1', 'h2', 'p', 'a'];
        const data = {};
        for (const selector of selectors) {
          const elements = await page.locator(selector).all();
          const texts = [];
          for (const el of elements) {
            const text = await el.textContent();
            if (text) texts.push(text.trim());
          }
          data[selector] = texts;
        }
        result = { data, url: page.url() };
        break;
      }
      case 'json': {
        const title = await page.title();
        // Check for the tag first: getAttribute() on a locator that matches
        // nothing waits for the action timeout before rejecting, which stalls
        // every page that has no meta description.
        const metaDescription = page.locator('meta[name="description"]');
        const description = (await metaDescription.count()) > 0
          ? await metaDescription.first().getAttribute('content').catch(() => null)
          : null;
        const headings = await page.locator('h1, h2, h3').allTextContents();
        const links = await page.locator('a[href]').evaluateAll(els =>
          els.map(el => ({ href: el.href, text: el.textContent?.trim() }))
        );
        // Only the first 50 links are returned by this endpoint.
        result = { title, description, headings, links: links.slice(0, 50), url: page.url() };
        break;
      }
      case 'links': {
        const links = await page.locator('a[href]').evaluateAll(els =>
          els.map(el => ({
            href: el.href,
            text: el.textContent?.trim() || '',
          })).filter(l => l.href)
        );
        // De-duplicate by href; for repeated hrefs the last occurrence wins.
        result = { links: [...new Map(links.map(l => [l.href, l])).values()], url: page.url() };
        break;
      }
      default:
        throw new Error(`Unknown endpoint: ${endpoint}`);
    }

    console.log(JSON.stringify(result));
  } catch (error) {
    console.error(error.message);
    process.exitCode = 1;
  } finally {
    if (browser) {
      // Best-effort shutdown: a failure here must not mask the real outcome.
      await browser.close().catch(closeError => {
        console.error(`Failed to close browser: ${closeError.message}`);
      });
    }
  }
}

run();
|
||||
13
playwright/package.json
Normal file
13
playwright/package.json
Normal file
@@ -0,0 +1,13 @@
|
||||
{
|
||||
"name": "crawlapi-playwright",
|
||||
"version": "1.0.0",
|
||||
"description": "Playwright worker for Crawl API",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"start": "node index.js"
|
||||
},
|
||||
"dependencies": {
|
||||
"playwright": "^1.49.0",
|
||||
"turndown": "^7.2.0"
|
||||
}
|
||||
}
|
||||
412
playwright/pool.js
Normal file
412
playwright/pool.js
Normal file
@@ -0,0 +1,412 @@
|
||||
const { chromium } = require('playwright');
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const TurndownService = require('turndown');
|
||||
const { execSync } = require('child_process');
|
||||
|
||||
const turndownService = new TurndownService();
|
||||
const OUTPUT_DIR = process.env.OUTPUT_DIR || '/tmp/crawlapi';
|
||||
const COOKIE_DIR = process.env.COOKIE_DIR || '/tmp/crawlapi/cookies';
|
||||
const PROXY_URL = process.env.PROXY_URL || '';
|
||||
const CAPTCHA_API_KEY = process.env.CAPTCHA_API_KEY || '';
|
||||
|
||||
/**
 * Round-robin selector over the proxies listed in PROXY_URL
 * (a comma-separated list; blank entries are dropped).
 */
class ProxyRotator {
  constructor() {
    this.proxies = [];
    this.index = 0;
    this.loadProxies();
  }

  /** Populate `this.proxies` from PROXY_URL; leaves it untouched when unset. */
  loadProxies() {
    if (!PROXY_URL) return;

    const entries = [];
    for (const part of PROXY_URL.split(',')) {
      const candidate = part.trim();
      if (candidate) {
        entries.push(candidate);
      }
    }
    this.proxies = entries;
  }

  /**
   * @returns {string|null} The next proxy in rotation, or null when none are
   *   configured.
   */
  getNext() {
    const total = this.proxies.length;
    if (total === 0) {
      return null;
    }
    const current = this.proxies[this.index];
    this.index = (this.index + 1) % total;
    return current;
  }
}

const proxyRotator = new ProxyRotator();
|
||||
|
||||
/**
 * Fixed-size pool of headless Chromium instances. Each acquired page lives in
 * its own browser context so cookies and proxy settings are not shared.
 */
class BrowserPool {
  /**
   * @param {number} [maxBrowsers=5] - Browsers launched by init().
   * @param {number} [maxPagesPerBrowser=10] - Concurrent pages per browser.
   */
  constructor(maxBrowsers = 5, maxPagesPerBrowser = 10) {
    this.maxBrowsers = maxBrowsers;
    this.maxPagesPerBrowser = maxPagesPerBrowser;
    this.browsers = [];
    this.initialized = false;
    // In-flight init() promise, shared so concurrent callers launch only once.
    this.initPromise = null;
  }

  /**
   * Launch the pool's browsers. Safe to call repeatedly and concurrently.
   *
   * @returns {Promise<void>}
   */
  async init() {
    if (this.initialized) return;
    if (!this.initPromise) {
      this.initPromise = this.launchBrowsers().finally(() => {
        this.initPromise = null;
      });
    }
    await this.initPromise;
  }

  /**
   * Internal: launch `maxBrowsers` browsers. If any launch fails, the ones
   * already started are closed so a failed init does not leak processes.
   *
   * @returns {Promise<void>}
   */
  async launchBrowsers() {
    const launched = [];
    try {
      for (let i = 0; i < this.maxBrowsers; i++) {
        const browser = await chromium.launch({
          headless: true,
          args: [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage',
            '--disable-accelerated-2d-canvas',
            '--disable-gpu',
            '--window-size=1920,1080',
          ],
        });
        launched.push({ browser, pages: [], lock: false });
      }
    } catch (err) {
      await Promise.allSettled(launched.map(({ browser }) => browser.close()));
      throw err;
    }
    this.browsers.push(...launched);
    this.initialized = true;
    console.error(`Browser pool initialized with ${this.maxBrowsers} browsers`);
  }

  /**
   * Wait for a browser with spare capacity and open a page on it.
   *
   * @param {string|null} [proxy=null] - Proxy server for the new context.
   * @returns {Promise<{page: object, entry: object}>} The page and the pool
   *   entry that owns it; pass both to releasePage().
   * @throws {Error} When the pool holds no browsers, or when creating the
   *   context/page fails.
   */
  async acquirePage(proxy = null) {
    await this.init();
    if (this.browsers.length === 0) {
      // Without this guard the polling loop below would never terminate.
      throw new Error('Browser pool has no browsers');
    }
    for (;;) {
      const entry = this.browsers.find(
        candidate => !candidate.lock && candidate.pages.length < this.maxPagesPerBrowser
      );
      if (entry) {
        return this.openPage(entry, proxy);
      }
      // Every browser is busy or full: poll again shortly.
      await new Promise(resolve => setTimeout(resolve, 50));
    }
  }

  /**
   * Internal: create a context and page on `entry`. The entry is locked while
   * the page is being created so its page count cannot be raced past the limit.
   *
   * @param {object} entry - Pool entry (`{ browser, pages, lock }`).
   * @param {string|null} proxy - Proxy server, or null for a direct connection.
   * @returns {Promise<{page: object, entry: object}>}
   */
  async openPage(entry, proxy) {
    entry.lock = true;
    let context;
    try {
      const contextOptions = {
        viewport: { width: 1440, height: 900 },
      };
      if (proxy) {
        contextOptions.proxy = { server: proxy };
      }
      context = await entry.browser.newContext(contextOptions);
      const page = await context.newPage();
      entry.pages.push(page);
      return { page, entry };
    } catch (err) {
      // newPage() failed after the context was created: don't leak the context.
      if (context) {
        await context.close().catch(() => {});
      }
      throw err;
    } finally {
      entry.lock = false;
    }
  }

  /**
   * Return a page to the pool and dispose of its context.
   *
   * @param {object} page - Page obtained from acquirePage().
   * @param {object} entry - Entry returned alongside the page.
   * @returns {Promise<void>}
   */
  async releasePage(page, entry) {
    const idx = entry.pages.indexOf(page);
    if (idx > -1) {
      entry.pages.splice(idx, 1);
    }
    try {
      await page.context().close();
    } catch (e) {
      // Deliberately best-effort: the context may already be closed or the
      // browser gone, and release must never fail the caller.
    }
  }

  /**
   * Close every browser and reset the pool so init() can run again.
   * All browsers are attempted even if one fails to close.
   *
   * @returns {Promise<void>}
   * @throws The first close error, after every browser has been attempted.
   */
  async close() {
    const entries = this.browsers;
    this.browsers = [];
    this.initialized = false;
    const outcomes = await Promise.allSettled(entries.map(({ browser }) => browser.close()));
    const failure = outcomes.find(outcome => outcome.status === 'rejected');
    if (failure) {
      throw failure.reason;
    }
  }
}
|
||||
|
||||
// Shared pool sized from the environment. Values are parsed as base-10
// integers; unset, non-numeric or zero values fall back to the defaults
// (5 browsers, 10 pages each) and negative values are clamped to 1, since a
// pool with no capacity could never hand out a page.
const pool = new BrowserPool(
  Math.max(1, Number.parseInt(process.env.BROWSER_POOL_SIZE, 10) || 5),
  Math.max(1, Number.parseInt(process.env.MAX_PAGES_PER_BROWSER, 10) || 10)
);
|
||||
|
||||
/**
 * Parse the JSON-encoded options argument passed on the command line.
 *
 * @param {string|undefined} raw - Raw JSON text (process.argv[4] in run()).
 * @returns {object} The parsed options object, or `{}` when `raw` is missing,
 *   is not valid JSON, or decodes to anything other than a plain object.
 */
function parseOptions(raw) {
  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch {
    return {};
  }
  // `null`, `42` or `[]` are valid JSON but would make later property reads
  // such as `options.use_proxy` throw or silently misbehave.
  if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
    return {};
  }
  return parsed;
}
|
||||
|
||||
/**
 * Decode the target URL argument, which may arrive either as a JSON string
 * literal (`"https://…"`) or as bare text.
 *
 * @param {string} raw - Raw URL argument (process.argv[3] in run()).
 * @returns {string} The URL with any JSON quoting removed.
 * @throws {TypeError} When `raw` is not a string (e.g. the argument is missing).
 */
function parseUrl(raw) {
  if (typeof raw !== 'string') {
    throw new TypeError('Missing URL argument');
  }
  try {
    const parsed = JSON.parse(raw);
    // Only accept JSON strings; `123` or `null` parse fine but are not URLs,
    // so they fall through and are returned as their literal text.
    if (typeof parsed === 'string') {
      return parsed;
    }
  } catch {
    // Not JSON at all: treat the argument as bare text below.
  }
  // Strip one stray leading and/or trailing double quote.
  return raw.replace(/^"|"$/g, '');
}
|
||||
|
||||
/**
 * Create `dir` (and any missing parents) if it does not exist yet.
 *
 * `mkdirSync` with `recursive: true` is already a no-op for an existing
 * directory, so no separate existence check is needed; dropping it also removes
 * the window between the check and the creation. If `dir` exists but is a
 * regular file, this now throws instead of deferring the failure to the
 * subsequent write.
 *
 * @param {string} dir - Directory path to create.
 * @returns {void}
 */
function ensureDir(dir) {
  fs.mkdirSync(dir, { recursive: true });
}
|
||||
|
||||
/**
 * Persist `data` as `filename` inside `dir`, creating `dir` on demand.
 *
 * @param {string} dir - Destination directory.
 * @param {string} filename - File name relative to `dir`.
 * @param {string|Buffer} data - Content handed to fs.writeFileSync.
 * @returns {string} The path of the written file.
 */
function writeFile(dir, filename, data) {
  const target = path.join(dir, filename);
  ensureDir(dir);
  fs.writeFileSync(target, data);
  return target;
}
|
||||
|
||||
/**
 * Resolve the cookie-jar file for a session, creating COOKIE_DIR on demand.
 *
 * The session id comes from caller-supplied options, so it is percent-encoded
 * before being used as a file name: `encodeURIComponent` escapes `/` and `\`,
 * which keeps ids such as `../../etc/x` from escaping COOKIE_DIR, and distinct
 * ids still map to distinct files. Ids made only of letters, digits, `-`, `_`
 * and `.` are unchanged.
 *
 * @param {string|number|null|undefined} sessionId - Caller-chosen session id.
 * @returns {string|null} Path of the JSON cookie file, or null when no session
 *   id was given.
 * @throws {URIError} When the id contains a lone surrogate and cannot be encoded.
 */
function getCookiePath(sessionId) {
  if (!sessionId) return null;
  ensureDir(COOKIE_DIR);
  const safeName = encodeURIComponent(String(sessionId));
  return path.join(COOKIE_DIR, `${safeName}.json`);
}
|
||||
|
||||
/**
 * Restore previously saved cookies for `sessionId` into `context`.
 * Does nothing when there is no session id or no saved file; read, parse and
 * addCookies failures are logged to stderr and otherwise ignored.
 *
 * @param {object} context - Browser context receiving the cookies.
 * @param {string} sessionId - Session whose cookie file should be loaded.
 * @returns {Promise<void>}
 */
async function loadCookies(context, sessionId) {
  const cookieFile = getCookiePath(sessionId);
  if (!cookieFile || !fs.existsSync(cookieFile)) {
    return;
  }

  try {
    const serialized = fs.readFileSync(cookieFile, 'utf8');
    const jar = JSON.parse(serialized);
    await context.addCookies(jar);
  } catch (err) {
    console.error('Failed to load cookies:', err.message);
  }
}
|
||||
|
||||
/**
 * Write the cookies currently held by `context` to the session's cookie file.
 * Does nothing without a session id; failures are logged to stderr and
 * otherwise ignored.
 *
 * @param {object} context - Browser context whose cookies are saved.
 * @param {string} sessionId - Session the cookies belong to.
 * @returns {Promise<void>}
 */
async function saveCookies(context, sessionId) {
  const cookieFile = getCookiePath(sessionId);
  if (!cookieFile) {
    return;
  }

  try {
    const jar = await context.cookies();
    const serialized = JSON.stringify(jar);
    fs.writeFileSync(cookieFile, serialized);
  } catch (err) {
    console.error('Failed to save cookies:', err.message);
  }
}
|
||||
|
||||
/**
 * Register an init script on `page` that masks common automation signals
 * before any page script runs: `navigator.webdriver`, `navigator.plugins`,
 * `navigator.languages`, `window.chrome` and the notifications permission
 * query.
 *
 * @param {object} page - Page exposing `addInitScript`.
 * @returns {Promise<void>}
 */
async function applyStealth(page) {
  // Stealth patches to avoid detection
  await page.addInitScript(() => {
    Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
    Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
    Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
    window.chrome = { runtime: {} };

    const permissions = window.navigator.permissions;
    if (!permissions || typeof permissions.query !== 'function') return;

    // Keep the original bound to its owner: calling the detached method makes
    // every non-notification query fail with "Illegal invocation".
    const originalQuery = permissions.query.bind(permissions);
    permissions.query = (parameters) => (
      parameters.name === 'notifications'
        ? Promise.resolve({ state: Notification.permission })
        : originalQuery(parameters)
    );
  });
}
|
||||
|
||||
/**
 * Try to solve a reCAPTCHA on the current page through the CapSolver HTTP API.
 *
 * Looks for an element carrying `data-sitekey`, submits a solving task, polls
 * for the result (up to 60 polls, 5 s apart) and writes the returned token
 * into `#g-recaptcha-response`.
 *
 * @param {object} page - Page that has already navigated to `url`.
 * @param {string} url - Page URL reported to the solver as `websiteURL`.
 * @returns {Promise<boolean>} true when a token was obtained and injected;
 *   false when no API key is configured, no widget is present, the solver
 *   reports an error, polling runs out, or any step throws.
 */
async function solveCaptcha(page, url) {
  if (!CAPTCHA_API_KEY) return false;

  // POST a JSON body and decode the JSON response.
  const postJson = async (endpoint, body) => {
    const res = await fetch(endpoint, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(body)
    });
    return res.json();
  };

  try {
    // Check for reCAPTCHA. Count first: getAttribute() on a locator that
    // matches nothing waits for the action timeout before rejecting, which
    // would stall every captcha-free page.
    // NOTE(review): other widgets may also use `data-sitekey` — confirm this
    // selector is specific enough for the reCAPTCHA-only task below.
    const widget = page.locator('[data-sitekey]');
    if ((await widget.count()) === 0) return false;

    const recaptchaSiteKey = await widget.first().getAttribute('data-sitekey').catch(() => null);
    if (!recaptchaSiteKey) return false;

    console.error('Found reCAPTCHA, attempting to solve...');
    const taskData = {
      clientKey: CAPTCHA_API_KEY,
      task: {
        // NOTE(review): verify this task type name against the CapSolver docs.
        type: 'NoCaptchaTaskProxyless',
        websiteURL: url,
        websiteKey: recaptchaSiteKey,
      }
    };

    // Submit to 2captcha / CapSolver
    const data = await postJson('https://api.capsolver.com/createTask', taskData);
    if (data.errorId !== 0) return false;

    const taskId = data.taskId;
    // Poll for result
    for (let i = 0; i < 60; i++) {
      await new Promise(r => setTimeout(r, 5000));
      const resultData = await postJson('https://api.capsolver.com/getTaskResult', {
        clientKey: CAPTCHA_API_KEY,
        taskId
      });

      if (resultData.status === 'ready') {
        await page.locator('#g-recaptcha-response').evaluate((el, token) => {
          el.value = token;
        }, resultData.solution.gRecaptchaResponse);
        return true;
      }

      // Stop as soon as the solver reports a failure instead of polling for
      // the remaining minutes.
      if (resultData.errorId || resultData.status === 'failed') {
        return false;
      }
    }
  } catch (e) {
    console.error('Captcha solving failed:', e.message);
  }
  return false;
}
|
||||
|
||||
/**
 * Scroll to the bottom of the page repeatedly so lazily loaded content is
 * fetched. Stops when the document height stops growing or after 10 rounds.
 *
 * @param {object} page - Page to scroll.
 * @returns {Promise<void>}
 */
async function scrollToBottom(page) {
  let lastHeight = 0;
  for (let round = 0; round < 10; round++) {
    const currentHeight = await page.evaluate(() => document.body.scrollHeight);
    if (currentHeight === lastHeight) break;
    lastHeight = currentHeight;
    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
    await page.waitForTimeout(500);
  }
}

/**
 * Produce the response payload for one endpoint from an already loaded page.
 *
 * @param {object} page - Loaded page.
 * @param {string} endpoint - One of `crawl`, `content`, `screenshot`, `pdf`,
 *   `markdown`, `snapshot`, `scrape`, `json`, `links`.
 * @param {object} options - Parsed request options.
 * @returns {Promise<object>} Result object to serialise as JSON.
 * @throws {Error} For an unknown endpoint.
 */
async function buildResult(page, endpoint, options) {
  switch (endpoint) {
    case 'crawl': {
      const html = await page.content();
      const title = await page.title();
      return { html, title, url: page.url() };
    }
    case 'content': {
      const html = await page.content();
      return { html, url: page.url() };
    }
    case 'screenshot': {
      const opts = { type: 'png' };
      if (options.full_page) opts.fullPage = true;
      const buffer = await page.screenshot(opts);
      const filepath = writeFile(OUTPUT_DIR, `${Date.now()}.png`, buffer);
      return { file_path: filepath, url: page.url() };
    }
    case 'pdf': {
      const buffer = await page.pdf({ format: 'A4', printBackground: true });
      const filepath = writeFile(OUTPUT_DIR, `${Date.now()}.pdf`, buffer);
      return { file_path: filepath, url: page.url() };
    }
    case 'markdown': {
      const html = await page.content();
      const markdown = turndownService.turndown(html);
      return { markdown, url: page.url() };
    }
    case 'snapshot': {
      const html = await page.content();
      const buffer = await page.screenshot({ type: 'png', fullPage: options.full_page || false });
      const filepath = writeFile(OUTPUT_DIR, `${Date.now()}.png`, buffer);
      return { html, file_path: filepath, url: page.url() };
    }
    case 'scrape': {
      const selectors = options.selectors || ['h1', 'h2', 'p', 'a'];
      const data = {};
      for (const selector of selectors) {
        const elements = await page.locator(selector).all();
        const texts = [];
        for (const el of elements) {
          const text = await el.textContent();
          if (text) texts.push(text.trim());
        }
        data[selector] = texts;
      }
      return { data, url: page.url() };
    }
    case 'json': {
      const title = await page.title();
      // Check for the tag first: getAttribute() on a locator that matches
      // nothing waits for the action timeout before rejecting, which stalls
      // every page that has no meta description.
      const metaDescription = page.locator('meta[name="description"]');
      const description = (await metaDescription.count()) > 0
        ? await metaDescription.first().getAttribute('content').catch(() => null)
        : null;
      const headings = await page.locator('h1, h2, h3').allTextContents();
      const links = await page.locator('a[href]').evaluateAll(els =>
        els.map(el => ({ href: el.href, text: el.textContent?.trim() }))
      );
      // Only the first 50 links are returned by this endpoint.
      return { title, description, headings, links: links.slice(0, 50), url: page.url() };
    }
    case 'links': {
      const links = await page.locator('a[href]').evaluateAll(els =>
        els.map(el => ({ href: el.href, text: el.textContent?.trim() || '' })).filter(l => l.href)
      );
      // De-duplicate by href; for repeated hrefs the last occurrence wins.
      return { links: [...new Map(links.map(l => [l.href, l])).values()], url: page.url() };
    }
    default:
      throw new Error(`Unknown endpoint: ${endpoint}`);
  }
}

/**
 * CLI entry point: `node pool.js <endpoint> <url> [optionsJson]`.
 *
 * Acquires a page from the pool, applies the per-request options (session
 * cookies, headers, stealth, mobile/user-agent, viewport), loads the URL,
 * optionally solves a CAPTCHA, waits and scrolls, then prints the endpoint
 * result as one JSON line on stdout. Errors are written to stderr and set
 * `process.exitCode` to 1. The page is always released.
 *
 * @returns {Promise<void>}
 */
async function run() {
  const endpoint = process.argv[2];
  const url = parseUrl(process.argv[3]);
  const options = parseOptions(process.argv[4]);

  // Get proxy if enabled
  const proxy = options.use_proxy ? proxyRotator.getNext() : null;
  if (proxy) {
    console.error(`Using proxy: ${proxy}`);
  }

  const { page, entry } = await pool.acquirePage(proxy);

  try {
    // Load cookies if session provided
    if (options.session_id) {
      await loadCookies(page.context(), options.session_id);
    }

    // Apply stealth mode
    if (options.stealth !== false) {
      await applyStealth(page);
    }

    // Resolve user agent and default viewport. Mobile emulation takes
    // precedence over an explicit `user_agent`.
    let userAgent = options.user_agent;
    let defaultViewport = { width: 1440, height: 900 };
    if (options.mobile) {
      const { devices } = require('playwright');
      const device = devices['iPhone 14'];
      if (device) {
        userAgent = device.userAgent;
        defaultViewport = device.viewport;
      }
    }

    // Playwright pages have no setUserAgent(); the user agent can only be
    // chosen when the context is created, which the pool does not expose.
    // Send it as a request header instead. Custom headers and the user agent
    // go out in ONE call because each setExtraHTTPHeaders() call replaces the
    // previously set headers.
    // NOTE(review): this changes the HTTP header only; `navigator.userAgent`
    // and touch/scale emulation are unaffected — confirm that is acceptable.
    const extraHeaders = { ...(options.headers || {}) };
    if (userAgent) {
      extraHeaders['User-Agent'] = userAgent;
    }
    if (Object.keys(extraHeaders).length > 0) {
      await page.setExtraHTTPHeaders(extraHeaders);
    }

    // Explicit width/height override the defaults; the mobile viewport is no
    // longer clobbered by the desktop default.
    await page.setViewportSize({
      width: options.width || defaultViewport.width,
      height: options.height || defaultViewport.height,
    });

    // `options.timeout` is given in seconds; Playwright expects milliseconds.
    const timeout = (options.timeout || 30) * 1000;
    await page.goto(url, { waitUntil: 'networkidle', timeout });

    // Try to solve CAPTCHA if present
    if (options.solve_captcha !== false) {
      const solved = await solveCaptcha(page, url);
      if (solved) {
        // NOTE(review): navigating again presumably discards the token that
        // solveCaptcha() just injected into the page — confirm whether the
        // form should be submitted instead of reloading.
        await page.goto(url, { waitUntil: 'networkidle', timeout });
      }
    }

    if (options.wait_for) {
      await page.waitForSelector(options.wait_for, { timeout });
    }

    // Scroll for infinite scroll
    if (options.scroll_to_bottom) {
      await scrollToBottom(page);
    }

    const result = await buildResult(page, endpoint, options);

    // Save cookies if session provided
    if (options.session_id) {
      await saveCookies(page.context(), options.session_id);
    }

    console.log(JSON.stringify(result));
  } catch (error) {
    console.error(error.message);
    process.exitCode = 1;
  } finally {
    await pool.releasePage(page, entry);
  }
}

// Bootstrap: run once, then shut the pool down so no Chromium process is left
// behind, and finally exit with the recorded status.
run()
  .catch(err => {
    console.error(err);
    process.exitCode = 1;
  })
  .then(() => pool.close())
  .catch(err => {
    console.error(`Failed to close browser pool: ${err.message}`);
  })
  .finally(() => {
    // Short grace period before forcing exit, as in the original bootstrap.
    setTimeout(() => process.exit(process.exitCode || 0), 100);
  });
|
||||
Reference in New Issue
Block a user