Initial commit: Full Crawl API implementation
Some checks failed
CI / Test (push) Has been cancelled
Deploy / Deploy to Staging (push) Has been cancelled
CI / Build & Push (push) Has been cancelled
Deploy / Deploy to Production (push) Has been cancelled

This commit is contained in:
2026-04-29 07:03:48 +00:00
commit 62994d4f3d
92 changed files with 6176 additions and 0 deletions

412
playwright/pool.js Normal file
View File

@@ -0,0 +1,412 @@
const { chromium } = require('playwright');
const fs = require('fs');
const path = require('path');
const TurndownService = require('turndown');
const { execSync } = require('child_process');
// Shared HTML-to-Markdown converter (used by the 'markdown' endpoint in run()).
const turndownService = new TurndownService();
// Output directory from the environment, defaulting to /tmp/crawlapi.
// run() reads the same environment variable with the same default.
const OUTPUT_DIR = process.env.OUTPUT_DIR || '/tmp/crawlapi';
// Directory holding one `<sessionId>.json` cookie file per session (see getCookiePath).
const COOKIE_DIR = process.env.COOKIE_DIR || '/tmp/crawlapi/cookies';
// Comma-separated list of proxy server URLs consumed by ProxyRotator; empty means no proxies.
const PROXY_URL = process.env.PROXY_URL || '';
// API key sent to the captcha-solving service; solveCaptcha() is a no-op when it is empty.
const CAPTCHA_API_KEY = process.env.CAPTCHA_API_KEY || '';
/**
 * Round-robin selector over the proxy servers listed in PROXY_URL.
 *
 * PROXY_URL is treated as a comma-separated list: every entry is trimmed and
 * blank entries are discarded.
 */
class ProxyRotator {
  constructor() {
    this.proxies = [];
    this.index = 0;
    this.loadProxies();
  }

  /** Fills `proxies` from PROXY_URL; the list is left as-is when PROXY_URL is empty. */
  loadProxies() {
    if (!PROXY_URL) {
      return;
    }
    const cleaned = [];
    for (const piece of PROXY_URL.split(',')) {
      const candidate = piece.trim();
      if (candidate) {
        cleaned.push(candidate);
      }
    }
    this.proxies = cleaned;
  }

  /**
   * Returns the next proxy in rotation and advances the cursor, wrapping at
   * the end of the list.
   * @returns {string|null} The proxy URL, or null when no proxies are configured.
   */
  getNext() {
    const total = this.proxies.length;
    if (total === 0) {
      return null;
    }
    const current = this.index;
    this.index = (current + 1) % total;
    return this.proxies[current];
  }
}
// Module-wide rotator; run() asks it for a proxy when options.use_proxy is set.
const proxyRotator = new ProxyRotator();
/**
 * Fixed-size pool of headless Chromium instances. Each acquired page lives in
 * its own browser context, which is closed again by releasePage().
 *
 * Every pool entry has the shape `{ browser, pages, lock }`:
 *  - `pages` lists the pages currently handed out from that browser;
 *  - `lock` is true while a context/page is being created on that browser, so
 *    concurrent acquirers skip it until the page count has been updated.
 */
class BrowserPool {
  /**
   * @param {number} [maxBrowsers=5] Number of browsers launched by init().
   * @param {number} [maxPagesPerBrowser=10] Upper bound of open pages per browser.
   */
  constructor(maxBrowsers = 5, maxPagesPerBrowser = 10) {
    this.maxBrowsers = maxBrowsers;
    this.maxPagesPerBrowser = maxPagesPerBrowser;
    this.browsers = [];
    this.initialized = false;
    // In-flight launch shared by concurrent init() callers; null when idle.
    this.initPromise = null;
  }

  /**
   * Launches the browsers once. Safe to call repeatedly and concurrently:
   * callers arriving while a launch is in progress await that same launch
   * instead of starting a second set of browsers.
   * @returns {Promise<void>}
   * @throws Whatever chromium.launch() rejected with; the pool stays uninitialized.
   */
  async init() {
    if (this.initialized) return;
    if (this.initPromise === null) {
      this.initPromise = this.launchBrowsers().finally(() => {
        // Clear the marker so a failed launch can be retried later.
        this.initPromise = null;
      });
    }
    await this.initPromise;
  }

  /**
   * Launches `maxBrowsers` browsers and publishes them only when all of them
   * started, so a failure part-way never leaves orphaned browsers in the pool.
   * Internal helper for init().
   * @returns {Promise<void>}
   */
  async launchBrowsers() {
    const launched = [];
    try {
      for (let i = 0; i < this.maxBrowsers; i++) {
        const browser = await chromium.launch({
          headless: true,
          args: [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage',
            '--disable-accelerated-2d-canvas',
            '--disable-gpu',
            '--window-size=1920,1080',
          ],
        });
        launched.push({ browser, pages: [], lock: false });
      }
    } catch (err) {
      // Best-effort cleanup of the browsers that did start before the failure.
      await Promise.allSettled(launched.map(({ browser }) => browser.close()));
      throw err;
    }
    this.browsers = launched;
    this.initialized = true;
    console.error(`Browser pool initialized with ${this.maxBrowsers} browsers`);
  }

  /**
   * Opens a page in a fresh context on the first browser that is unlocked and
   * below `maxPagesPerBrowser`, polling every 50 ms until one is available.
   * @param {string|null} [proxy=null] Proxy server URL applied to the new context.
   * @returns {Promise<{page: object, entry: object}>} The page and its pool entry
   *   (pass both back to releasePage()).
   * @throws When the context or page cannot be created.
   */
  async acquirePage(proxy = null) {
    await this.init();
    for (;;) {
      const entry = this.browsers.find(
        (candidate) => !candidate.lock && candidate.pages.length < this.maxPagesPerBrowser
      );
      if (entry) {
        const contextOptions = {
          viewport: { width: 1440, height: 900 },
        };
        if (proxy) {
          contextOptions.proxy = { server: proxy };
        }
        entry.lock = true;
        let context = null;
        try {
          context = await entry.browser.newContext(contextOptions);
          const page = await context.newPage();
          entry.pages.push(page);
          return { page, entry };
        } catch (err) {
          // The context exists but has no usable page: close it so it does not leak.
          if (context) {
            await context.close().catch(() => {});
          }
          throw err;
        } finally {
          entry.lock = false;
        }
      }
      await new Promise((resolve) => setTimeout(resolve, 50));
    }
  }

  /**
   * Returns a page to the pool: removes it from the entry's bookkeeping and
   * closes its context. Closing is best-effort; a failure is logged, not thrown.
   * @param {object} page Page obtained from acquirePage().
   * @param {object} entry Pool entry returned alongside the page.
   * @returns {Promise<void>}
   */
  async releasePage(page, entry) {
    const idx = entry.pages.indexOf(page);
    if (idx > -1) {
      entry.pages.splice(idx, 1);
    }
    try {
      await page.context().close();
    } catch (e) {
      console.error('Failed to close browser context:', e.message);
    }
  }

  /**
   * Closes every browser and resets the pool so init() can run again. All
   * browsers are attempted even if one fails; the first failure is rethrown
   * after the pool state has been reset.
   * @returns {Promise<void>}
   */
  async close() {
    const entries = this.browsers;
    this.browsers = [];
    this.initialized = false;
    const outcomes = await Promise.allSettled(entries.map(({ browser }) => browser.close()));
    const failure = outcomes.find((outcome) => outcome.status === 'rejected');
    if (failure) {
      throw failure.reason;
    }
  }
}
// Module-wide browser pool. Sizes come from the environment and fall back to
// the defaults when a variable is unset, not a decimal number, or zero.
// The explicit radix keeps values such as "0x10" from being read as hex.
const pool = new BrowserPool(
  Number.parseInt(process.env.BROWSER_POOL_SIZE, 10) || 5,
  Number.parseInt(process.env.MAX_PAGES_PER_BROWSER, 10) || 10
);
/**
 * Parses the JSON-encoded options argument.
 *
 * Only a JSON object is accepted as options. Anything else (missing argument,
 * malformed JSON, `null`, arrays, numbers, strings) yields an empty options
 * object so callers can always read properties from the result.
 *
 * @param {string} raw JSON text, typically a command-line argument.
 * @returns {object} The parsed options object, or `{}`.
 */
function parseOptions(raw) {
  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch {
    return {};
  }
  const isPlainObject = parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
  return isPlainObject ? parsed : {};
}
/**
 * Normalizes the URL argument, which may arrive either JSON-encoded
 * (`"https://example.com"`, quotes included) or as a bare string.
 *
 * A JSON string is decoded. Any other input (including text that happens to
 * be valid JSON of another type, such as `123`) is returned as text with one
 * leading and one trailing double quote stripped, so the result is always a
 * string.
 *
 * @param {string} raw The URL argument.
 * @returns {string} The URL text.
 * @throws {TypeError} When `raw` is not a string (e.g. the argument is missing).
 */
function parseUrl(raw) {
  if (typeof raw !== 'string') {
    throw new TypeError('URL argument is required');
  }
  try {
    const decoded = JSON.parse(raw);
    if (typeof decoded === 'string') {
      return decoded;
    }
  } catch {
    // Not JSON: fall through and treat the argument as a bare URL.
  }
  return raw.replace(/^"|"$/g, '');
}
/**
 * Makes sure `dir` exists as a directory, creating missing parents.
 *
 * `mkdirSync` with `recursive: true` already succeeds when the directory
 * exists, so no separate existence check is made (that check would race with
 * other processes creating the same directory).
 *
 * @param {string} dir Directory path.
 * @throws When the directory cannot be created, including when `dir` already
 *   exists but is not a directory.
 */
function ensureDir(dir) {
  fs.mkdirSync(dir, { recursive: true });
}
/**
 * Stores `data` as `filename` inside `dir`, creating the directory first when
 * it does not exist yet.
 *
 * @param {string} dir Target directory.
 * @param {string} filename File name within `dir`.
 * @param {string|Buffer} data Content handed to fs.writeFileSync.
 * @returns {string} Full path of the written file.
 */
function writeFile(dir, filename, data) {
  const destination = path.join(dir, filename);
  ensureDir(dir);
  fs.writeFileSync(destination, data);
  return destination;
}
/**
 * Maps a session id to its cookie file, `<COOKIE_DIR>/<sessionId>.json`.
 *
 * The session id arrives with the request options, so it is checked before it
 * is used in a path: an id whose file would resolve outside COOKIE_DIR (for
 * example one containing `../` or an absolute path) is rejected with a
 * warning instead of being joined into the path.
 *
 * @param {string} sessionId Session identifier; falsy means "no session".
 * @returns {string|null} The cookie file path, or null when there is no
 *   session id or the id is rejected.
 */
function getCookiePath(sessionId) {
  if (!sessionId) return null;
  const fileName = `${sessionId}.json`;
  const root = path.resolve(COOKIE_DIR);
  // The file must sit directly inside COOKIE_DIR once the path is resolved.
  if (path.dirname(path.resolve(root, fileName)) !== root) {
    console.error('Ignoring session id that resolves outside the cookie directory');
    return null;
  }
  ensureDir(COOKIE_DIR);
  return path.join(COOKIE_DIR, fileName);
}
/**
 * Restores previously saved cookies for `sessionId` into a browser context.
 *
 * Does nothing when there is no cookie path for the session or the cookie
 * file does not exist. Read, parse and addCookies failures are logged to
 * stderr and otherwise ignored.
 *
 * @param {object} context Browser context exposing `addCookies(cookies)`.
 * @param {string} sessionId Session identifier passed to getCookiePath.
 * @returns {Promise<void>}
 */
async function loadCookies(context, sessionId) {
  const cookieFile = getCookiePath(sessionId);
  if (!cookieFile || !fs.existsSync(cookieFile)) {
    return;
  }
  try {
    const serialized = fs.readFileSync(cookieFile, 'utf8');
    await context.addCookies(JSON.parse(serialized));
  } catch (err) {
    console.error('Failed to load cookies:', err.message);
  }
}
/**
 * Persists the context's current cookies to the session's cookie file as JSON.
 *
 * Does nothing when there is no cookie path for the session. Failures while
 * reading the cookies or writing the file are logged to stderr and otherwise
 * ignored.
 *
 * @param {object} context Browser context exposing `cookies()`.
 * @param {string} sessionId Session identifier passed to getCookiePath.
 * @returns {Promise<void>}
 */
async function saveCookies(context, sessionId) {
  const cookieFile = getCookiePath(sessionId);
  if (!cookieFile) {
    return;
  }
  try {
    const jar = await context.cookies();
    const serialized = JSON.stringify(jar);
    fs.writeFileSync(cookieFile, serialized);
  } catch (err) {
    console.error('Failed to save cookies:', err.message);
  }
}
/**
 * Registers an init script on `page` that masks common automation signals.
 * The script is handed to `page.addInitScript` and runs in the page, not in
 * Node, so it may only use browser globals.
 *
 * Patches applied in the page:
 *  - `navigator.webdriver` reads as undefined;
 *  - `navigator.plugins` and `navigator.languages` return fixed values;
 *  - `window.chrome` is replaced with a stub exposing `runtime`;
 *  - `navigator.permissions.query` answers 'notifications' from
 *    `Notification.permission` and forwards every other query to the
 *    original implementation.
 *
 * @param {object} page Page exposing `addInitScript(fn)`.
 * @returns {Promise<void>}
 */
async function applyStealth(page) {
  // Stealth patches to avoid detection
  await page.addInitScript(() => {
    Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
    Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
    Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
    window.chrome = { runtime: {} };

    const permissions = window.navigator.permissions;
    if (!permissions || typeof permissions.query !== 'function') {
      // No Permissions API to patch; the patches above still apply.
      return;
    }
    // Keep the receiver bound: calling the saved native method without its
    // `this` fails with "Illegal invocation" for every forwarded query.
    const originalQuery = permissions.query.bind(permissions);
    permissions.query = (parameters) => (
      parameters && parameters.name === 'notifications'
        ? Promise.resolve({ state: Notification.permission })
        : originalQuery(parameters)
    );
  });
}
/**
 * Tries to solve a reCAPTCHA on the current page through the CapSolver HTTP
 * API and writes the returned token into `#g-recaptcha-response`.
 *
 * Returns false without any network call when CAPTCHA_API_KEY is empty or the
 * page has no element carrying a `data-sitekey` attribute. Polling stops as
 * soon as the service reports the task as ready, failed or errored, or after
 * `maxPolls` attempts. Every error is logged to stderr and reported as false;
 * this function does not throw.
 *
 * NOTE(review): the task type 'NoCaptchaTaskProxyless' is kept from the
 * original code — confirm it matches the task names the service expects.
 *
 * @param {object} page Playwright page to inspect and update.
 * @param {string} url URL of the page, sent to the service as `websiteURL`.
 * @param {object} [settings] Optional polling settings.
 * @param {number} [settings.pollIntervalMs=5000] Delay before each result poll.
 * @param {number} [settings.maxPolls=60] Maximum number of result polls.
 * @returns {Promise<boolean>} True when a token was injected into the page.
 */
async function solveCaptcha(page, url, { pollIntervalMs = 5000, maxPolls = 60 } = {}) {
  if (!CAPTCHA_API_KEY) return false;

  // POSTs a JSON payload and returns the decoded JSON response.
  const postJson = async (endpoint, payload) => {
    const response = await fetch(endpoint, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(payload),
    });
    return response.json();
  };

  try {
    // Check for reCAPTCHA. count() answers immediately, whereas getAttribute()
    // on a missing element waits for the locator timeout before failing.
    const widget = page.locator('[data-sitekey]');
    if ((await widget.count()) === 0) return false;
    const recaptchaSiteKey = await widget.first().getAttribute('data-sitekey').catch(() => null);
    if (!recaptchaSiteKey) return false;

    console.error('Found reCAPTCHA, attempting to solve...');
    // Submit to 2captcha / CapSolver
    const created = await postJson('https://api.capsolver.com/createTask', {
      clientKey: CAPTCHA_API_KEY,
      task: {
        type: 'NoCaptchaTaskProxyless',
        websiteURL: url,
        websiteKey: recaptchaSiteKey,
      },
    });
    if (created.errorId !== 0) {
      console.error('Captcha task was rejected:', created.errorDescription || created.errorCode || 'unknown error');
      return false;
    }

    // Poll for result
    for (let attempt = 0; attempt < maxPolls; attempt++) {
      await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
      const outcome = await postJson('https://api.capsolver.com/getTaskResult', {
        clientKey: CAPTCHA_API_KEY,
        taskId: created.taskId,
      });
      if (outcome.status === 'ready') {
        await page.locator('#g-recaptcha-response').evaluate((el, token) => {
          el.value = token;
        }, outcome.solution.gRecaptchaResponse);
        return true;
      }
      if (outcome.status === 'failed' || outcome.errorId) {
        // The task will never become ready; stop instead of polling to the limit.
        console.error('Captcha task failed:', outcome.errorDescription || outcome.errorCode || 'unknown error');
        return false;
      }
    }
  } catch (e) {
    console.error('Captcha solving failed:', e.message);
  }
  return false;
}
async function run() {
const endpoint = process.argv[2];
const url = parseUrl(process.argv[3]);
const options = parseOptions(process.argv[4]);
const outputDir = process.env.OUTPUT_DIR || '/tmp/crawlapi';
// Get proxy if enabled
const proxy = options.use_proxy ? proxyRotator.getNext() : null;
if (proxy) {
console.error(`Using proxy: ${proxy}`);
}
const { page, entry } = await pool.acquirePage(proxy);
try {
// Load cookies if session provided
if (options.session_id) {
await loadCookies(page.context(), options.session_id);
}
// Set custom headers
if (options.headers) {
await page.setExtraHTTPHeaders(options.headers);
}
// Apply stealth mode
if (options.stealth !== false) {
await applyStealth(page);
}
// Mobile emulation
if (options.mobile) {
const devices = require('playwright').devices;
const device = devices['iPhone 14'];
if (device) {
await page.setViewportSize(device.viewport);
await page.setUserAgent(device.userAgent);
}
} else if (options.user_agent) {
await page.setExtraHTTPHeaders({ 'User-Agent': options.user_agent });
}
const timeout = (options.timeout || 30) * 1000;
const viewport = {
width: options.width || 1440,
height: options.height || 900,
};
await page.setViewportSize(viewport);
await page.goto(url, { waitUntil: 'networkidle', timeout });
// Try to solve CAPTCHA if present
if (options.solve_captcha !== false) {
const solved = await solveCaptcha(page, url);
if (solved) {
await page.goto(url, { waitUntil: 'networkidle', timeout });
}
}
if (options.wait_for) {
await page.waitForSelector(options.wait_for, { timeout });
}
// Scroll for infinite scroll
if (options.scroll_to_bottom) {
let lastHeight = 0;
let retries = 0;
while (retries < 10) {
const currentHeight = await page.evaluate(() => document.body.scrollHeight);
if (currentHeight === lastHeight) break;
lastHeight = currentHeight;
await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
await page.waitForTimeout(500);
retries++;
}
}
let result = {};
switch (endpoint) {
case 'crawl': {
const html = await page.content();
const title = await page.title();
result = { html, title, url: page.url() };
break;
}
case 'content': {
const html = await page.content();
result = { html, url: page.url() };
break;
}
case 'screenshot': {
const opts = { type: 'png' };
if (options.full_page) opts.fullPage = true;
const buffer = await page.screenshot(opts);
const filepath = writeFile(outputDir, `${Date.now()}.png`, buffer);
result = { file_path: filepath, url: page.url() };
break;
}
case 'pdf': {
const buffer = await page.pdf({ format: 'A4', printBackground: true });
const filepath = writeFile(outputDir, `${Date.now()}.pdf`, buffer);
result = { file_path: filepath, url: page.url() };
break;
}
case 'markdown': {
const html = await page.content();
const markdown = turndownService.turndown(html);
result = { markdown, url: page.url() };
break;
}
case 'snapshot': {
const html = await page.content();
const buffer = await page.screenshot({ type: 'png', fullPage: options.full_page || false });
const filepath = writeFile(outputDir, `${Date.now()}.png`, buffer);
result = { html, file_path: filepath, url: page.url() };
break;
}
case 'scrape': {
const selectors = options.selectors || ['h1', 'h2', 'p', 'a'];
const data = {};
for (const selector of selectors) {
const elements = await page.locator(selector).all();
const texts = [];
for (const el of elements) {
const text = await el.textContent();
if (text) texts.push(text.trim());
}
data[selector] = texts;
}
result = { data, url: page.url() };
break;
}
case 'json': {
const title = await page.title();
const description = await page.locator('meta[name="description"]').getAttribute('content').catch(() => null);
const headings = await page.locator('h1, h2, h3').allTextContents();
const links = await page.locator('a[href]').evaluateAll(els => els.map(el => ({ href: el.href, text: el.textContent?.trim() })));
result = { title, description, headings, links: links.slice(0, 50), url: page.url() };
break;
}
case 'links': {
const links = await page.locator('a[href]').evaluateAll(els =>
els.map(el => ({ href: el.href, text: el.textContent?.trim() || '' })).filter(l => l.href)
);
result = { links: [...new Map(links.map(l => [l.href, l])).values()], url: page.url() };
break;
}
default:
throw new Error(`Unknown endpoint: ${endpoint}`);
}
// Save cookies if session provided
if (options.session_id) {
await saveCookies(page.context(), options.session_id);
}
console.log(JSON.stringify(result));
} catch (error) {
console.error(error.message);
process.exitCode = 1;
} finally {
await pool.releasePage(page, entry);
}
}
// Script entry point: execute the request, then terminate the process
// explicitly with the exit code recorded by run() (1 on failure, else 0).
run().then(() => {
// Exit 100 ms after run() settles rather than immediately.
// NOTE(review): the delay presumably gives stdout/stderr time to flush — confirm.
setTimeout(() => process.exit(process.exitCode || 0), 100);
}).catch(err => {
// Reached when run() itself rejects, e.g. for errors raised before its
// try/catch begins (argument parsing, acquiring a page).
console.error(err);
process.exit(1);
});