/**
 * One-shot crawler CLI built on Playwright.
 *
 * Usage: node <script> <endpoint> <url> [optionsJson]
 *
 * `endpoint` is one of: crawl, content, screenshot, pdf, markdown, snapshot,
 * scrape, json, links. The result is printed to stdout as a single JSON
 * document; diagnostics and errors go to stderr, and the exit code is
 * non-zero on failure.
 */
const { chromium, devices } = require('playwright');
const fs = require('fs');
const path = require('path');
const TurndownService = require('turndown');
const { execSync } = require('child_process');

const turndownService = new TurndownService();

const OUTPUT_DIR = process.env.OUTPUT_DIR || '/tmp/crawlapi';
const COOKIE_DIR = process.env.COOKIE_DIR || '/tmp/crawlapi/cookies';
const PROXY_URL = process.env.PROXY_URL || '';
const CAPTCHA_API_KEY = process.env.CAPTCHA_API_KEY || '';

const DEFAULT_VIEWPORT = { width: 1440, height: 900 };
const DEFAULT_TIMEOUT_SECONDS = 30;
const ACQUIRE_RETRY_DELAY_MS = 50;
const MAX_SCROLL_ATTEMPTS = 10;
const SCROLL_SETTLE_MS = 500;
const MAX_JSON_LINKS = 50;
const CAPTCHA_POLL_INTERVAL_MS = 5000;
const CAPTCHA_MAX_POLLS = 60;

/**
 * Reads a strictly positive integer from an environment variable.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value used when the variable is unset, not a
 *   number, zero or negative.
 * @returns {number} The parsed value or the fallback.
 */
function positiveIntFromEnv(name, fallback) {
  const parsed = Number.parseInt(process.env[name], 10);
  return Number.isInteger(parsed) && parsed > 0 ? parsed : fallback;
}

/**
 * Percent-decodes a URL component, returning the input unchanged when it is
 * not validly encoded (decodeURIComponent throws on a stray '%').
 *
 * @param {string} value - Possibly percent-encoded text.
 * @returns {string} Decoded text, or `value` itself on malformed input.
 */
function safeDecode(value) {
  try {
    return decodeURIComponent(value);
  } catch {
    return value;
  }
}

/**
 * Converts a proxy string into Playwright proxy settings.
 *
 * Credentials embedded in the URL (`http://user:pass@host:port`) are moved
 * into the separate `username`/`password` fields so they are not lost. A
 * proxy without credentials is passed through untouched.
 *
 * @param {string} proxy - Proxy URL or `host:port` short form.
 * @returns {{server: string, username?: string, password?: string}}
 */
function toProxySettings(proxy) {
  let parsed;
  try {
    parsed = new URL(proxy);
    // 'host:8080' parses as scheme 'host:' with an empty host; retry as HTTP.
    if (!parsed.host) parsed = new URL(`http://${proxy}`);
  } catch {
    try {
      parsed = new URL(`http://${proxy}`);
    } catch {
      return { server: proxy };
    }
  }

  if (!parsed.username && !parsed.password) return { server: proxy };

  return {
    server: `${parsed.protocol}//${parsed.host}`,
    username: safeDecode(parsed.username),
    password: safeDecode(parsed.password),
  };
}

/** Round-robin selector over the comma-separated proxies in PROXY_URL. */
class ProxyRotator {
  constructor() {
    this.proxies = [];
    this.index = 0;
    this.loadProxies();
  }

  /** Parses PROXY_URL into the proxy list, dropping empty entries. */
  loadProxies() {
    if (PROXY_URL) {
      this.proxies = PROXY_URL.split(',')
        .map((candidate) => candidate.trim())
        .filter(Boolean);
    }
    // This script handles one request per process, so always starting at
    // index 0 would pin every invocation to the first proxy. A random start
    // spreads invocations across the list; within a process it is still
    // strict round-robin.
    this.index =
      this.proxies.length > 0
        ? Math.floor(Math.random() * this.proxies.length)
        : 0;
  }

  /**
   * @returns {string|null} The next proxy, or null when none are configured.
   */
  getNext() {
    if (this.proxies.length === 0) return null;
    const proxy = this.proxies[this.index];
    this.index = (this.index + 1) % this.proxies.length;
    return proxy;
  }
}

const proxyRotator = new ProxyRotator();

/**
 * Fixed-size pool of headless Chromium instances. Every acquired page lives
 * in its own browser context, which is closed when the page is released.
 *
 * NOTE(review): all `maxBrowsers` instances are launched eagerly on first
 * use even though this script only ever acquires one page per process —
 * consider lowering BROWSER_POOL_SIZE for CLI use.
 */
class BrowserPool {
  /**
   * @param {number} [maxBrowsers=5] - Number of browsers to launch.
   * @param {number} [maxPagesPerBrowser=10] - Page cap per browser.
   */
  constructor(maxBrowsers = 5, maxPagesPerBrowser = 10) {
    this.maxBrowsers = maxBrowsers;
    this.maxPagesPerBrowser = maxPagesPerBrowser;
    this.browsers = [];
    this.initialized = false;
    this.initPromise = null;
  }

  /**
   * Launches the browsers once. Concurrent callers share the same in-flight
   * launch instead of each starting their own set of browsers.
   *
   * @returns {Promise<void>}
   * @throws If any browser fails to launch.
   */
  async init() {
    if (this.initialized) return;
    if (!this.initPromise) {
      this.initPromise = this.launchAll().finally(() => {
        this.initPromise = null;
      });
    }
    await this.initPromise;
  }

  /**
   * Launches `maxBrowsers` browsers in parallel. If any launch fails, the
   * ones that did start are closed before the failure is rethrown so no
   * Chromium process is orphaned.
   *
   * @returns {Promise<void>}
   */
  async launchAll() {
    const launches = Array.from({ length: this.maxBrowsers }, () =>
      chromium.launch({
        headless: true,
        args: [
          '--no-sandbox',
          '--disable-setuid-sandbox',
          '--disable-dev-shm-usage',
          '--disable-accelerated-2d-canvas',
          '--disable-gpu',
          '--window-size=1920,1080',
        ],
      })
    );
    const outcomes = await Promise.allSettled(launches);
    const failure = outcomes.find((outcome) => outcome.status === 'rejected');

    if (failure) {
      await Promise.allSettled(
        outcomes
          .filter((outcome) => outcome.status === 'fulfilled')
          .map((outcome) => outcome.value.close())
      );
      throw failure.reason;
    }

    this.browsers = outcomes.map((outcome) => ({
      browser: outcome.value,
      pages: [],
      lock: false,
    }));
    this.initialized = true;
    console.error(`Browser pool initialized with ${this.maxBrowsers} browsers`);
  }

  /**
   * Opens a page in a fresh context on the first browser with spare
   * capacity, polling every 50 ms until one is available.
   *
   * @param {string|null} [proxy=null] - Proxy for the new context.
   * @param {object} [contextOverrides={}] - Extra `browser.newContext`
   *   options (e.g. `userAgent`, `viewport`, device descriptor fields)
   *   layered over the default viewport.
   * @returns {Promise<{page: object, entry: object}>} The page plus the pool
   *   entry to hand back to `releasePage`.
   * @throws If the context or page cannot be created.
   */
  async acquirePage(proxy = null, contextOverrides = {}) {
    await this.init();

    const contextOptions = {
      viewport: { ...DEFAULT_VIEWPORT },
      ...contextOverrides,
    };
    if (proxy) {
      contextOptions.proxy = toProxySettings(proxy);
    }

    for (;;) {
      const entry = this.browsers.find(
        (candidate) =>
          !candidate.lock && candidate.pages.length < this.maxPagesPerBrowser
      );

      if (entry) {
        // The lock stops concurrent acquirers from overshooting the page cap
        // while this context is still being created.
        entry.lock = true;
        let context = null;
        try {
          context = await entry.browser.newContext(contextOptions);
          const page = await context.newPage();
          entry.pages.push(page);
          return { page, entry };
        } catch (err) {
          // Do not leave a half-built context behind.
          if (context) await context.close().catch(() => {});
          throw err;
        } finally {
          entry.lock = false;
        }
      }

      await new Promise((resolve) => setTimeout(resolve, ACQUIRE_RETRY_DELAY_MS));
    }
  }

  /**
   * Returns a page to the pool and closes its context.
   *
   * @param {object} page - Page obtained from `acquirePage`.
   * @param {object} entry - Pool entry returned alongside the page.
   * @returns {Promise<void>}
   */
  async releasePage(page, entry) {
    const idx = entry.pages.indexOf(page);
    if (idx > -1) {
      entry.pages.splice(idx, 1);
    }
    try {
      await page.context().close();
    } catch (e) {
      // Best effort: the context may already be gone (e.g. browser crashed).
    }
  }

  /**
   * Closes every browser. A failure on one browser is logged and does not
   * prevent the others from being closed.
   *
   * @returns {Promise<void>}
   */
  async close() {
    const entries = this.browsers;
    this.browsers = [];
    this.initialized = false;

    const outcomes = await Promise.allSettled(
      entries.map((entry) => entry.browser.close())
    );
    for (const outcome of outcomes) {
      if (outcome.status === 'rejected') {
        console.error('Failed to close browser:', outcome.reason?.message);
      }
    }
  }
}

const pool = new BrowserPool(
  positiveIntFromEnv('BROWSER_POOL_SIZE', 5),
  positiveIntFromEnv('MAX_PAGES_PER_BROWSER', 10)
);

/**
 * Parses the options argument.
 *
 * @param {string|undefined} raw - JSON text of an options object.
 * @returns {object} The parsed object, or `{}` when the input is missing,
 *   invalid JSON, or JSON that is not a plain object (null, array, scalar).
 */
function parseOptions(raw) {
  try {
    const parsed = JSON.parse(raw);
    const isPlainObject =
      parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
    return isPlainObject ? parsed : {};
  } catch {
    return {};
  }
}

/**
 * Parses the URL argument, which may arrive bare or as a JSON string.
 *
 * @param {string|undefined} raw - The URL argument.
 * @returns {string} The URL, or '' when no argument was supplied.
 */
function parseUrl(raw) {
  if (typeof raw !== 'string') return '';
  try {
    const parsed = JSON.parse(raw);
    if (typeof parsed === 'string') return parsed;
  } catch {
    // Not JSON: treat it as a bare URL below.
  }
  return raw.replace(/^"|"$/g, '');
}

/** Creates `dir` (and any missing parents) if it does not exist. */
function ensureDir(dir) {
  if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
}

/**
 * Writes `data` to `dir/filename`, creating `dir` when needed.
 *
 * @param {string} dir - Target directory.
 * @param {string} filename - File name inside `dir`.
 * @param {string|Buffer} data - File contents.
 * @returns {string} The path that was written.
 */
function writeFile(dir, filename, data) {
  ensureDir(dir);
  const filepath = path.join(dir, filename);
  fs.writeFileSync(filepath, data);
  return filepath;
}

/**
 * Maps a session id to its cookie file inside COOKIE_DIR.
 *
 * @param {string} sessionId - Caller-supplied session identifier.
 * @returns {string|null} The cookie file path, or null when no id was given
 *   or the id would resolve outside COOKIE_DIR.
 */
function getCookiePath(sessionId) {
  if (!sessionId) return null;

  // session_id is caller-controlled: refuse ids containing path segments
  // ('../x', 'a/b') that would read or write files outside COOKIE_DIR.
  const baseDir = path.resolve(COOKIE_DIR);
  const resolved = path.resolve(baseDir, `${sessionId}.json`);
  if (path.dirname(resolved) !== baseDir) {
    console.error('Ignoring invalid session_id');
    return null;
  }

  ensureDir(COOKIE_DIR);
  return path.join(COOKIE_DIR, `${sessionId}.json`);
}

/**
 * Restores previously saved cookies into `context`. Failures are logged and
 * otherwise ignored so a corrupt cookie file does not abort the request.
 *
 * @param {object} context - Playwright browser context.
 * @param {string} sessionId - Session whose cookies should be loaded.
 * @returns {Promise<void>}
 */
async function loadCookies(context, sessionId) {
  const cookiePath = getCookiePath(sessionId);
  if (!cookiePath || !fs.existsSync(cookiePath)) return;

  try {
    const cookies = JSON.parse(fs.readFileSync(cookiePath, 'utf8'));
    await context.addCookies(cookies);
  } catch (e) {
    console.error('Failed to load cookies:', e.message);
  }
}

/**
 * Persists the cookies of `context` for `sessionId`. Failures are logged and
 * otherwise ignored.
 *
 * @param {object} context - Playwright browser context.
 * @param {string} sessionId - Session to save the cookies under.
 * @returns {Promise<void>}
 */
async function saveCookies(context, sessionId) {
  const cookiePath = getCookiePath(sessionId);
  if (!cookiePath) return;

  try {
    const cookies = await context.cookies();
    fs.writeFileSync(cookiePath, JSON.stringify(cookies));
  } catch (e) {
    console.error('Failed to save cookies:', e.message);
  }
}

/**
 * Registers an init script that masks common automation fingerprints. It
 * must be called before navigation so it runs in every new document.
 *
 * @param {object} page - Playwright page.
 * @returns {Promise<void>}
 */
async function applyStealth(page) {
  // Stealth patches to avoid detection
  await page.addInitScript(() => {
    Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
    Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
    Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
    window.chrome = { runtime: {} };

    const permissions = window.navigator.permissions;
    if (permissions && typeof permissions.query === 'function') {
      // Native methods must keep their receiver; calling the bare reference
      // throws "Illegal invocation".
      const originalQuery = permissions.query.bind(permissions);
      permissions.query = (parameters) => (
        parameters.name === 'notifications'
          ? Promise.resolve({ state: Notification.permission })
          : originalQuery(parameters)
      );
    }
  });
}

/**
 * POSTs a JSON body and returns the parsed JSON response.
 *
 * @param {string} endpointUrl - Target URL.
 * @param {object} payload - Request body.
 * @returns {Promise<object>} Parsed response body.
 */
async function postJson(endpointUrl, payload) {
  const response = await fetch(endpointUrl, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  });
  return response.json();
}

/**
 * Detects a reCAPTCHA widget on the page and, if CAPTCHA_API_KEY is set,
 * asks the solving service for a token and writes it into
 * `#g-recaptcha-response`.
 *
 * @param {object} page - Playwright page already navigated to `url`.
 * @param {string} url - URL reported to the solving service.
 * @returns {Promise<boolean>} True when a token was injected; false when no
 *   key is configured, no widget is present, or solving failed.
 */
async function solveCaptcha(page, url) {
  if (!CAPTCHA_API_KEY) return false;

  try {
    // Check for reCAPTCHA. count() answers immediately, whereas
    // getAttribute() on a missing element would block for the whole default
    // action timeout on every captcha-free page.
    const widgets = page.locator('[data-sitekey]');
    if ((await widgets.count()) === 0) return false;

    const recaptchaSiteKey = await widgets.first().getAttribute('data-sitekey');
    if (!recaptchaSiteKey) return false;

    console.error('Found reCAPTCHA, attempting to solve...');

    // Submit to 2captcha / CapSolver
    // NOTE(review): confirm the task type name against the provider's docs.
    const created = await postJson('https://api.capsolver.com/createTask', {
      clientKey: CAPTCHA_API_KEY,
      task: {
        type: 'NoCaptchaTaskProxyless',
        websiteURL: url,
        websiteKey: recaptchaSiteKey,
      },
    });
    if (created.errorId !== 0) {
      console.error(
        'Captcha task creation failed:',
        created.errorDescription || created.errorCode || 'unknown error'
      );
      return false;
    }

    const taskId = created.taskId;

    // Poll for result
    for (let attempt = 0; attempt < CAPTCHA_MAX_POLLS; attempt++) {
      await new Promise((resolve) => setTimeout(resolve, CAPTCHA_POLL_INTERVAL_MS));

      const resultData = await postJson('https://api.capsolver.com/getTaskResult', {
        clientKey: CAPTCHA_API_KEY,
        taskId,
      });

      if (resultData.status === 'ready') {
        await page.locator('#g-recaptcha-response').evaluate((el, token) => {
          el.value = token;
        }, resultData.solution.gRecaptchaResponse);
        return true;
      }

      // A failed task never becomes ready; stop instead of polling on.
      if (resultData.errorId || resultData.status === 'failed') {
        console.error(
          'Captcha task failed:',
          resultData.errorDescription || resultData.errorCode || 'unknown error'
        );
        return false;
      }
    }
  } catch (e) {
    console.error('Captcha solving failed:', e.message);
  }

  return false;
}

/**
 * Scrolls to the bottom repeatedly until the document height stops growing
 * or the attempt limit is reached, to trigger lazy/infinite loading.
 *
 * @param {object} page - Playwright page.
 * @returns {Promise<void>}
 */
async function scrollToBottom(page) {
  let lastHeight = 0;
  for (let attempt = 0; attempt < MAX_SCROLL_ATTEMPTS; attempt++) {
    const currentHeight = await page.evaluate(() => document.body.scrollHeight);
    if (currentHeight === lastHeight) break;
    lastHeight = currentHeight;
    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
    await page.waitForTimeout(SCROLL_SETTLE_MS);
  }
}

/**
 * Normalizes the `selectors` option of the scrape endpoint.
 *
 * @param {unknown} selectors - Array of CSS selectors, a single selector
 *   string, or nothing.
 * @returns {string[]} Selectors to scrape.
 */
function normalizeSelectors(selectors) {
  if (Array.isArray(selectors)) return selectors;
  // A bare string would otherwise be iterated character by character.
  if (typeof selectors === 'string' && selectors) return [selectors];
  return ['h1', 'h2', 'p', 'a'];
}

/**
 * Endpoint name -> handler producing the JSON result for a loaded page.
 * Each handler receives `(page, options)`. A Map is used so inherited object
 * keys such as 'constructor' are never mistaken for endpoints.
 */
const ENDPOINT_HANDLERS = new Map([
  ['crawl', async (page) => {
    const html = await page.content();
    const title = await page.title();
    return { html, title, url: page.url() };
  }],

  ['content', async (page) => {
    const html = await page.content();
    return { html, url: page.url() };
  }],

  ['screenshot', async (page, options) => {
    const shotOptions = { type: 'png' };
    if (options.full_page) shotOptions.fullPage = true;
    const buffer = await page.screenshot(shotOptions);
    const filepath = writeFile(OUTPUT_DIR, `${Date.now()}.png`, buffer);
    return { file_path: filepath, url: page.url() };
  }],

  ['pdf', async (page) => {
    const buffer = await page.pdf({ format: 'A4', printBackground: true });
    const filepath = writeFile(OUTPUT_DIR, `${Date.now()}.pdf`, buffer);
    return { file_path: filepath, url: page.url() };
  }],

  ['markdown', async (page) => {
    const html = await page.content();
    const markdown = turndownService.turndown(html);
    return { markdown, url: page.url() };
  }],

  ['snapshot', async (page, options) => {
    const html = await page.content();
    const buffer = await page.screenshot({
      type: 'png',
      fullPage: options.full_page || false,
    });
    const filepath = writeFile(OUTPUT_DIR, `${Date.now()}.png`, buffer);
    return { html, file_path: filepath, url: page.url() };
  }],

  ['scrape', async (page, options) => {
    // Null prototype so a selector such as '__proto__' is stored as data.
    const data = Object.create(null);
    for (const selector of normalizeSelectors(options.selectors)) {
      // One round-trip per selector instead of one per matched element.
      const texts = await page.locator(selector).allTextContents();
      data[selector] = texts.filter(Boolean).map((text) => text.trim());
    }
    return { data, url: page.url() };
  }],

  ['json', async (page) => {
    const title = await page.title();
    // Evaluated in-page so a missing tag yields null at once rather than
    // waiting out the locator timeout.
    const description = await page
      .evaluate(() => {
        const meta = document.querySelector('meta[name="description"]');
        return meta ? meta.getAttribute('content') : null;
      })
      .catch(() => null);
    const headings = await page.locator('h1, h2, h3').allTextContents();
    const links = await page.locator('a[href]').evaluateAll((els) =>
      els.map((el) => ({ href: el.href, text: el.textContent?.trim() }))
    );
    return {
      title,
      description,
      headings,
      links: links.slice(0, MAX_JSON_LINKS),
      url: page.url(),
    };
  }],

  ['links', async (page) => {
    const links = await page.locator('a[href]').evaluateAll((els) =>
      els
        .map((el) => ({ href: el.href, text: el.textContent?.trim() || '' }))
        .filter((link) => link.href)
    );
    // De-duplicate by href; the last occurrence's text wins.
    const unique = [...new Map(links.map((link) => [link.href, link])).values()];
    return { links: unique, url: page.url() };
  }],
]);

/**
 * Builds the `browser.newContext` options implied by the request options.
 *
 * The user agent and device emulation are context-level settings in
 * Playwright: pages have no `setUserAgent`, and sending `User-Agent` through
 * `setExtraHTTPHeaders` would replace the caller's custom headers.
 *
 * @param {object} options - Parsed request options.
 * @returns {object} Context options including the final viewport.
 */
function buildContextOptions(options) {
  const contextOptions = {};
  let baseViewport = DEFAULT_VIEWPORT;

  // Mobile emulation
  const device = options.mobile ? devices['iPhone 14'] : undefined;
  if (device) {
    // defaultBrowserType only matters when choosing a browser to launch.
    const { defaultBrowserType, ...deviceOptions } = device;
    Object.assign(contextOptions, deviceOptions);
    baseViewport = device.viewport || DEFAULT_VIEWPORT;
  } else if (!options.mobile && options.user_agent) {
    contextOptions.userAgent = options.user_agent;
  }

  // Explicit width/height win; otherwise keep the device (or default) size
  // instead of clobbering mobile emulation with the desktop viewport.
  contextOptions.viewport = {
    width: options.width || baseViewport.width,
    height: options.height || baseViewport.height,
  };
  return contextOptions;
}

/**
 * CLI entry point: loads the URL with the requested options, runs the
 * endpoint handler and prints its result as JSON on stdout. On failure the
 * error message is written to stderr and `process.exitCode` is set to 1.
 *
 * @returns {Promise<void>}
 */
async function run() {
  const [, , endpoint, rawUrl, rawOptions] = process.argv;

  // Validate cheap inputs before paying for browser launches.
  const handler = ENDPOINT_HANDLERS.get(endpoint);
  if (!handler) {
    console.error(`Unknown endpoint: ${endpoint}`);
    process.exitCode = 1;
    return;
  }

  // NOTE(review): the URL is not restricted by scheme, so file:// and
  // internal addresses are reachable; confirm callers are trusted.
  const url = parseUrl(rawUrl);
  if (!url) {
    console.error('Missing URL argument');
    process.exitCode = 1;
    return;
  }

  const options = parseOptions(rawOptions);

  // Get proxy if enabled
  const proxy = options.use_proxy ? proxyRotator.getNext() : null;
  if (proxy) {
    // Log the server only so proxy credentials never reach the logs.
    console.error(`Using proxy: ${toProxySettings(proxy).server}`);
  }

  const requestedSeconds = Number(options.timeout);
  const timeout =
    (Number.isFinite(requestedSeconds) && requestedSeconds > 0
      ? requestedSeconds
      : DEFAULT_TIMEOUT_SECONDS) * 1000;

  let lease = null;
  try {
    lease = await pool.acquirePage(proxy, buildContextOptions(options));
    const { page } = lease;

    // Load cookies if session provided
    if (options.session_id) {
      await loadCookies(page.context(), options.session_id);
    }

    // Set custom headers
    if (options.headers) {
      await page.setExtraHTTPHeaders(options.headers);
    }

    // Apply stealth mode
    if (options.stealth !== false) {
      await applyStealth(page);
    }

    await page.goto(url, { waitUntil: 'networkidle', timeout });

    // Try to solve CAPTCHA if present
    if (options.solve_captcha !== false) {
      const solved = await solveCaptcha(page, url);
      if (solved) {
        // NOTE(review): reloading discards the token that was just written
        // into the page; confirm whether the form should be submitted
        // instead.
        await page.goto(url, { waitUntil: 'networkidle', timeout });
      }
    }

    if (options.wait_for) {
      await page.waitForSelector(options.wait_for, { timeout });
    }

    // Scroll for infinite scroll
    if (options.scroll_to_bottom) {
      await scrollToBottom(page);
    }

    const result = await handler(page, options);

    // Save cookies if session provided
    if (options.session_id) {
      await saveCookies(page.context(), options.session_id);
    }

    console.log(JSON.stringify(result));
  } catch (error) {
    console.error(error.message);
    process.exitCode = 1;
  } finally {
    if (lease) {
      await pool.releasePage(lease.page, lease.entry);
    }
    // Shut the browsers down explicitly rather than relying on process exit.
    await pool.close();
  }
}

run()
  .then(() => {
    // Short grace period before exiting with the recorded status.
    setTimeout(() => process.exit(process.exitCode || 0), 100);
  })
  .catch((err) => {
    console.error(err);
    process.exit(1);
  });