// File-viewer residue: 413 lines, 12 KiB, JavaScript
const { chromium } = require('playwright');
|
|
const fs = require('fs');
|
|
const path = require('path');
|
|
const TurndownService = require('turndown');
|
|
const { execSync } = require('child_process');
|
|
|
|
// Shared Turndown converter; the 'markdown' endpoint calls turndownService.turndown(html) on the page HTML.
const turndownService = new TurndownService();
|
|
/**
 * Reads an environment variable, using `fallback` when it is unset or empty.
 */
const envOr = (name, fallback) => process.env[name] || fallback;

// Default directory for generated files.
const OUTPUT_DIR = envOr('OUTPUT_DIR', '/tmp/crawlapi');

// Directory holding one `<sessionId>.json` cookie file per session.
const COOKIE_DIR = envOr('COOKIE_DIR', '/tmp/crawlapi/cookies');

// Comma-separated proxy list; an empty string means no proxies are configured.
const PROXY_URL = envOr('PROXY_URL', '');

// Captcha-solver API key; an empty string disables captcha solving.
const CAPTCHA_API_KEY = envOr('CAPTCHA_API_KEY', '');
|
|
|
|
/**
 * Round-robin selector over the proxies listed in PROXY_URL.
 *
 * `proxies` holds the cleaned list and `index` points at the entry that the
 * next `getNext()` call will return.
 */
class ProxyRotator {
  constructor() {
    this.proxies = [];
    this.index = 0;
    this.loadProxies();
  }

  /**
   * Fills `proxies` from the comma-separated PROXY_URL value, trimming each
   * entry and dropping blanks. Leaves the list untouched when PROXY_URL is empty.
   */
  loadProxies() {
    if (!PROXY_URL) {
      return;
    }
    const cleaned = [];
    for (const piece of PROXY_URL.split(',')) {
      const candidate = piece.trim();
      if (candidate) {
        cleaned.push(candidate);
      }
    }
    this.proxies = cleaned;
  }

  /**
   * Returns the current proxy and advances the cursor, wrapping at the end.
   *
   * @returns {string|null} The next proxy, or null when none are configured.
   */
  getNext() {
    const total = this.proxies.length;
    if (total === 0) {
      return null;
    }
    const current = this.index;
    this.index = (current + 1) % total;
    return this.proxies[current];
  }
}
|
|
|
|
// Module-wide rotator; run() asks it for the next proxy when options.use_proxy is set.
const proxyRotator = new ProxyRotator();
|
|
|
|
/**
 * Fixed-size pool of headless Chromium browsers. Each acquired page lives in
 * its own browser context, which is closed again on release.
 *
 * Every pool entry has the shape `{ browser, pages, lock }`:
 *   - `pages` lists the pages currently handed out from that browser;
 *   - `lock` is true while a context/page is being created on that browser.
 */
class BrowserPool {
  /**
   * @param {number} [maxBrowsers=5] Number of browsers launched by init().
   * @param {number} [maxPagesPerBrowser=10] Max concurrently acquired pages per browser.
   */
  constructor(maxBrowsers = 5, maxPagesPerBrowser = 10) {
    this.maxBrowsers = maxBrowsers;
    this.maxPagesPerBrowser = maxPagesPerBrowser;
    this.browsers = [];
    this.initialized = false;
    // In-flight launch shared by concurrent init() callers.
    this.initPromise = null;
  }

  /**
   * Launches the browsers once. Concurrent callers share a single launch
   * instead of each starting their own set of browsers.
   *
   * @returns {Promise<void>}
   * @throws Whatever chromium.launch() rejects with; a later call retries.
   */
  async init() {
    if (this.initialized) return;
    if (!this.initPromise) {
      this.initPromise = this.launchBrowsers().finally(() => {
        // Cleared on success and failure so a failed launch can be retried.
        this.initPromise = null;
      });
    }
    await this.initPromise;
  }

  /**
   * Internal: launches `maxBrowsers` browsers one after another. If any
   * launch fails, the ones already started are closed before rethrowing so
   * a failed init does not leave stray browser processes behind.
   *
   * @returns {Promise<void>}
   */
  async launchBrowsers() {
    const launched = [];
    try {
      for (let i = 0; i < this.maxBrowsers; i++) {
        const browser = await chromium.launch({
          headless: true,
          args: [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage',
            '--disable-accelerated-2d-canvas',
            '--disable-gpu',
            '--window-size=1920,1080',
          ],
        });
        launched.push({ browser, pages: [], lock: false });
      }
    } catch (err) {
      await Promise.allSettled(launched.map(async ({ browser }) => browser.close()));
      throw err;
    }
    this.browsers.push(...launched);
    this.initialized = true;
    console.error(`Browser pool initialized with ${this.maxBrowsers} browsers`);
  }

  /**
   * Waits for a browser with spare capacity and opens a fresh context + page
   * on it. Polls every 50 ms while all browsers are busy or full.
   *
   * @param {string|null} [proxy=null] Proxy server URL for the new context.
   * @returns {Promise<{page: object, entry: object}>} The page and its pool entry
   *   (both must be passed back to releasePage()).
   * @throws {Error} When the pool contains no browsers, or when context/page
   *   creation fails.
   */
  async acquirePage(proxy = null) {
    await this.init();
    if (this.browsers.length === 0) {
      // Without this guard the polling loop below would never terminate.
      throw new Error('Browser pool has no browsers (maxBrowsers must be at least 1)');
    }

    const contextOptions = { viewport: { width: 1440, height: 900 } };
    if (proxy) {
      contextOptions.proxy = { server: proxy };
    }

    for (;;) {
      const entry = this.browsers.find(
        (candidate) => !candidate.lock && candidate.pages.length < this.maxPagesPerBrowser
      );
      if (!entry) {
        await new Promise((resolve) => setTimeout(resolve, 50));
        continue;
      }

      entry.lock = true;
      let context;
      try {
        context = await entry.browser.newContext(contextOptions);
        const page = await context.newPage();
        entry.pages.push(page);
        return { page, entry };
      } catch (err) {
        if (context) {
          // Best-effort cleanup of the half-built context; the original
          // failure is the error worth reporting.
          await context.close().catch(() => {});
        }
        throw err;
      } finally {
        entry.lock = false;
      }
    }
  }

  /**
   * Removes the page from its entry and closes the page's context.
   * Close failures are logged, not thrown, so release never masks the
   * caller's own error.
   *
   * @param {object} page Page returned by acquirePage().
   * @param {object} entry Pool entry returned alongside the page.
   * @returns {Promise<void>}
   */
  async releasePage(page, entry) {
    const idx = entry.pages.indexOf(page);
    if (idx > -1) {
      entry.pages.splice(idx, 1);
    }
    try {
      await page.context().close();
    } catch (err) {
      console.error('Failed to close browser context:', err.message);
    }
  }

  /**
   * Closes every browser and resets the pool so init() can run again.
   * All browsers are attempted even if one fails; the first failure is
   * rethrown afterwards.
   *
   * @returns {Promise<void>}
   */
  async close() {
    const entries = this.browsers;
    this.browsers = [];
    this.initialized = false;
    const outcomes = await Promise.allSettled(
      entries.map(async ({ browser }) => browser.close())
    );
    const failure = outcomes.find((outcome) => outcome.status === 'rejected');
    if (failure) {
      throw failure.reason;
    }
  }
}
|
|
|
|
/**
 * Reads a strictly positive base-10 integer from an environment variable.
 * Unset, non-numeric, zero or negative values yield `fallback`; a pool sized
 * zero or below could never hand out a page.
 */
function positiveIntFromEnv(name, fallback) {
  const parsed = Number.parseInt(process.env[name], 10);
  return Number.isInteger(parsed) && parsed > 0 ? parsed : fallback;
}

// Shared pool sized from BROWSER_POOL_SIZE / MAX_PAGES_PER_BROWSER (defaults 5 and 10).
const pool = new BrowserPool(
  positiveIntFromEnv('BROWSER_POOL_SIZE', 5),
  positiveIntFromEnv('MAX_PAGES_PER_BROWSER', 10)
);
|
|
|
|
/**
 * Parses the JSON options argument.
 *
 * Anything that is not a JSON object (missing argument, malformed JSON,
 * `null`, arrays, numbers, strings) yields an empty options object, so
 * callers can always read properties off the result.
 *
 * @param {string|undefined} raw Raw command-line argument.
 * @returns {object} The parsed options, or `{}`.
 */
function parseOptions(raw) {
  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch {
    return {};
  }
  const isPlainObject =
    parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
  return isPlainObject ? parsed : {};
}
|
|
|
|
/**
 * Extracts the target URL from the command-line argument, which may be a
 * JSON-encoded string (`"https://…"`) or a bare URL.
 *
 * Always returns a string: input that parses as non-string JSON (for
 * example `123` or `null`) is treated as plain text rather than returned
 * as a number/null.
 *
 * @param {string} raw Raw command-line argument.
 * @returns {string} The URL with JSON quoting removed.
 * @throws {TypeError} When `raw` is missing or not a string.
 */
function parseUrl(raw) {
  if (typeof raw !== 'string') {
    throw new TypeError('URL argument is required and must be a string');
  }
  try {
    const parsed = JSON.parse(raw);
    if (typeof parsed === 'string') {
      return parsed;
    }
  } catch {
    // Not JSON: fall through and treat the argument as a bare URL.
  }
  // Strip a stray leading and/or trailing double quote.
  return raw.replace(/^"|"$/g, '');
}
|
|
|
|
/**
 * Creates `dir` (including missing parents) unless the path already exists.
 *
 * @param {string} dir Directory path to create.
 */
function ensureDir(dir) {
  const alreadyPresent = fs.existsSync(dir);
  if (alreadyPresent) {
    return;
  }
  fs.mkdirSync(dir, { recursive: true });
}
|
|
|
|
/**
 * Writes `data` to `dir/filename`, creating `dir` first via ensureDir().
 *
 * @param {string} dir Target directory.
 * @param {string} filename File name inside `dir`.
 * @param {string|Buffer} data Content to write.
 * @returns {string} Full path of the written file.
 */
function writeFile(dir, filename, data) {
  ensureDir(dir);
  const destination = path.join(dir, filename);
  fs.writeFileSync(destination, data);
  return destination;
}
|
|
|
|
/**
 * Maps a session id to its cookie file inside COOKIE_DIR, creating the
 * directory if needed.
 *
 * The session id is caller-supplied, so every character outside
 * `A-Z a-z 0-9 . _ -` is replaced with `_`. This keeps the result a single
 * file name and prevents ids such as `../../x` from escaping COOKIE_DIR.
 * Ids that only use the allowed characters map to the same file as before.
 *
 * @param {string} sessionId Session identifier.
 * @returns {string|null} Path of `<sessionId>.json`, or null for an empty id.
 */
function getCookiePath(sessionId) {
  if (!sessionId) return null;
  ensureDir(COOKIE_DIR);
  const safeName = String(sessionId).replace(/[^A-Za-z0-9._-]/g, '_');
  return path.join(COOKIE_DIR, `${safeName}.json`);
}
|
|
|
|
/**
 * Restores previously saved cookies for a session into the given context.
 *
 * Does nothing when there is no session id or no cookie file yet. Read,
 * parse and addCookies failures are logged to stderr and not rethrown.
 *
 * @param {object} context Browser context exposing `addCookies()`.
 * @param {string} sessionId Session identifier.
 * @returns {Promise<void>}
 */
async function loadCookies(context, sessionId) {
  const file = getCookiePath(sessionId);
  if (!file || !fs.existsSync(file)) {
    return;
  }
  try {
    const serialized = fs.readFileSync(file, 'utf8');
    await context.addCookies(JSON.parse(serialized));
  } catch (err) {
    console.error('Failed to load cookies:', err.message);
  }
}
|
|
|
|
/**
 * Persists the context's current cookies to the session's cookie file.
 *
 * Does nothing without a session id. Failures while reading the cookies or
 * writing the file are logged to stderr and not rethrown.
 *
 * @param {object} context Browser context exposing `cookies()`.
 * @param {string} sessionId Session identifier.
 * @returns {Promise<void>}
 */
async function saveCookies(context, sessionId) {
  const target = getCookiePath(sessionId);
  if (!target) {
    return;
  }
  try {
    const jar = await context.cookies();
    const serialized = JSON.stringify(jar);
    fs.writeFileSync(target, serialized);
  } catch (err) {
    console.error('Failed to save cookies:', err.message);
  }
}
|
|
|
|
/**
 * Registers an init script that masks common automation fingerprints
 * before any page script runs.
 *
 * The callback is executed inside the page, so it must stay self-contained
 * and may not reference anything from this module.
 *
 * @param {object} page Page exposing `addInitScript()`.
 * @returns {Promise<void>}
 */
async function applyStealth(page) {
  await page.addInitScript(() => {
    Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
    Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
    Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
    window.chrome = { runtime: {} };

    const permissions = window.navigator.permissions;
    if (!permissions || typeof permissions.query !== 'function') {
      return;
    }
    // Bind to the Permissions object: calling the native method detached
    // from its receiver throws "Illegal invocation".
    const originalQuery = permissions.query.bind(permissions);
    permissions.query = (parameters) => (
      parameters && parameters.name === 'notifications'
        ? Promise.resolve({ state: Notification.permission })
        : originalQuery(parameters)
    );
  });
}
|
|
|
|
/**
 * Detects a reCAPTCHA widget on the page and, if CAPTCHA_API_KEY is set,
 * asks the CapSolver HTTP API to solve it, then writes the returned token
 * into `#g-recaptcha-response`.
 *
 * Never throws: any failure is logged to stderr and reported as `false`.
 *
 * @param {object} page Page to inspect.
 * @param {string} url URL of the page, sent to the solver as `websiteURL`.
 * @param {object} [settings]
 * @param {number} [settings.pollIntervalMs=5000] Delay before each result poll.
 * @param {number} [settings.maxPolls=60] Maximum number of result polls.
 * @returns {Promise<boolean>} true when a token was injected, otherwise false.
 */
async function solveCaptcha(page, url, { pollIntervalMs = 5000, maxPolls = 60 } = {}) {
  if (!CAPTCHA_API_KEY) return false;

  // Small helper: POST a JSON payload and return the decoded JSON reply.
  const postJson = async (endpoint, payload) => {
    const response = await fetch(endpoint, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(payload),
    });
    return response.json();
  };

  try {
    // Check for reCAPTCHA. count() answers immediately, whereas
    // getAttribute() on a missing element would wait out the whole action
    // timeout on every captcha-free page.
    const widget = page.locator('[data-sitekey]').first();
    if ((await widget.count()) === 0) return false;
    const recaptchaSiteKey = await widget.getAttribute('data-sitekey');
    if (!recaptchaSiteKey) return false;

    console.error('Found reCAPTCHA, attempting to solve...');
    const created = await postJson('https://api.capsolver.com/createTask', {
      clientKey: CAPTCHA_API_KEY,
      task: {
        type: 'NoCaptchaTaskProxyless',
        websiteURL: url,
        websiteKey: recaptchaSiteKey,
      },
    });
    if (created.errorId !== 0) {
      console.error('Captcha task was rejected:', created.errorDescription || created.errorCode || 'unknown error');
      return false;
    }

    // Poll for result
    for (let attempt = 0; attempt < maxPolls; attempt++) {
      await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
      const outcome = await postJson('https://api.capsolver.com/getTaskResult', {
        clientKey: CAPTCHA_API_KEY,
        taskId: created.taskId,
      });
      if (outcome.status === 'ready') {
        await page.locator('#g-recaptcha-response').evaluate((el, token) => {
          el.value = token;
        }, outcome.solution.gRecaptchaResponse);
        return true;
      }
      if (outcome.errorId || outcome.status === 'failed') {
        // The task will never become ready; stop instead of polling on.
        console.error('Captcha task failed:', outcome.errorDescription || outcome.errorCode || outcome.status);
        return false;
      }
    }
  } catch (e) {
    console.error('Captcha solving failed:', e.message);
  }
  return false;
}
|
|
|
|
async function run() {
|
|
const endpoint = process.argv[2];
|
|
const url = parseUrl(process.argv[3]);
|
|
const options = parseOptions(process.argv[4]);
|
|
const outputDir = process.env.OUTPUT_DIR || '/tmp/crawlapi';
|
|
|
|
// Get proxy if enabled
|
|
const proxy = options.use_proxy ? proxyRotator.getNext() : null;
|
|
if (proxy) {
|
|
console.error(`Using proxy: ${proxy}`);
|
|
}
|
|
|
|
const { page, entry } = await pool.acquirePage(proxy);
|
|
|
|
try {
|
|
// Load cookies if session provided
|
|
if (options.session_id) {
|
|
await loadCookies(page.context(), options.session_id);
|
|
}
|
|
|
|
// Set custom headers
|
|
if (options.headers) {
|
|
await page.setExtraHTTPHeaders(options.headers);
|
|
}
|
|
|
|
// Apply stealth mode
|
|
if (options.stealth !== false) {
|
|
await applyStealth(page);
|
|
}
|
|
|
|
// Mobile emulation
|
|
if (options.mobile) {
|
|
const devices = require('playwright').devices;
|
|
const device = devices['iPhone 14'];
|
|
if (device) {
|
|
await page.setViewportSize(device.viewport);
|
|
await page.setUserAgent(device.userAgent);
|
|
}
|
|
} else if (options.user_agent) {
|
|
await page.setExtraHTTPHeaders({ 'User-Agent': options.user_agent });
|
|
}
|
|
|
|
const timeout = (options.timeout || 30) * 1000;
|
|
const viewport = {
|
|
width: options.width || 1440,
|
|
height: options.height || 900,
|
|
};
|
|
await page.setViewportSize(viewport);
|
|
|
|
await page.goto(url, { waitUntil: 'networkidle', timeout });
|
|
|
|
// Try to solve CAPTCHA if present
|
|
if (options.solve_captcha !== false) {
|
|
const solved = await solveCaptcha(page, url);
|
|
if (solved) {
|
|
await page.goto(url, { waitUntil: 'networkidle', timeout });
|
|
}
|
|
}
|
|
|
|
if (options.wait_for) {
|
|
await page.waitForSelector(options.wait_for, { timeout });
|
|
}
|
|
|
|
// Scroll for infinite scroll
|
|
if (options.scroll_to_bottom) {
|
|
let lastHeight = 0;
|
|
let retries = 0;
|
|
while (retries < 10) {
|
|
const currentHeight = await page.evaluate(() => document.body.scrollHeight);
|
|
if (currentHeight === lastHeight) break;
|
|
lastHeight = currentHeight;
|
|
await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
|
|
await page.waitForTimeout(500);
|
|
retries++;
|
|
}
|
|
}
|
|
|
|
let result = {};
|
|
|
|
switch (endpoint) {
|
|
case 'crawl': {
|
|
const html = await page.content();
|
|
const title = await page.title();
|
|
result = { html, title, url: page.url() };
|
|
break;
|
|
}
|
|
case 'content': {
|
|
const html = await page.content();
|
|
result = { html, url: page.url() };
|
|
break;
|
|
}
|
|
case 'screenshot': {
|
|
const opts = { type: 'png' };
|
|
if (options.full_page) opts.fullPage = true;
|
|
const buffer = await page.screenshot(opts);
|
|
const filepath = writeFile(outputDir, `${Date.now()}.png`, buffer);
|
|
result = { file_path: filepath, url: page.url() };
|
|
break;
|
|
}
|
|
case 'pdf': {
|
|
const buffer = await page.pdf({ format: 'A4', printBackground: true });
|
|
const filepath = writeFile(outputDir, `${Date.now()}.pdf`, buffer);
|
|
result = { file_path: filepath, url: page.url() };
|
|
break;
|
|
}
|
|
case 'markdown': {
|
|
const html = await page.content();
|
|
const markdown = turndownService.turndown(html);
|
|
result = { markdown, url: page.url() };
|
|
break;
|
|
}
|
|
case 'snapshot': {
|
|
const html = await page.content();
|
|
const buffer = await page.screenshot({ type: 'png', fullPage: options.full_page || false });
|
|
const filepath = writeFile(outputDir, `${Date.now()}.png`, buffer);
|
|
result = { html, file_path: filepath, url: page.url() };
|
|
break;
|
|
}
|
|
case 'scrape': {
|
|
const selectors = options.selectors || ['h1', 'h2', 'p', 'a'];
|
|
const data = {};
|
|
for (const selector of selectors) {
|
|
const elements = await page.locator(selector).all();
|
|
const texts = [];
|
|
for (const el of elements) {
|
|
const text = await el.textContent();
|
|
if (text) texts.push(text.trim());
|
|
}
|
|
data[selector] = texts;
|
|
}
|
|
result = { data, url: page.url() };
|
|
break;
|
|
}
|
|
case 'json': {
|
|
const title = await page.title();
|
|
const description = await page.locator('meta[name="description"]').getAttribute('content').catch(() => null);
|
|
const headings = await page.locator('h1, h2, h3').allTextContents();
|
|
const links = await page.locator('a[href]').evaluateAll(els => els.map(el => ({ href: el.href, text: el.textContent?.trim() })));
|
|
result = { title, description, headings, links: links.slice(0, 50), url: page.url() };
|
|
break;
|
|
}
|
|
case 'links': {
|
|
const links = await page.locator('a[href]').evaluateAll(els =>
|
|
els.map(el => ({ href: el.href, text: el.textContent?.trim() || '' })).filter(l => l.href)
|
|
);
|
|
result = { links: [...new Map(links.map(l => [l.href, l])).values()], url: page.url() };
|
|
break;
|
|
}
|
|
default:
|
|
throw new Error(`Unknown endpoint: ${endpoint}`);
|
|
}
|
|
|
|
// Save cookies if session provided
|
|
if (options.session_id) {
|
|
await saveCookies(page.context(), options.session_id);
|
|
}
|
|
|
|
console.log(JSON.stringify(result));
|
|
} catch (error) {
|
|
console.error(error.message);
|
|
process.exitCode = 1;
|
|
} finally {
|
|
await pool.releasePage(page, entry);
|
|
}
|
|
}
|
|
|
|
// Script entry point.
// Success: exit with the exit code recorded by run(), after a 100 ms delay.
const exitAfterRun = () => {
  setTimeout(() => process.exit(process.exitCode || 0), 100);
};

// Failure that escaped run(): print the error and exit with status 1 at once.
const exitOnFailure = (err) => {
  console.error(err);
  process.exit(1);
};

run().then(exitAfterRun).catch(exitOnFailure);
|