#!/usr/bin/env node
/**
 * Page capture CLI.
 *
 * Opens a page in Chromium (launched locally, or attached over CDP) with a
 * mobile viewport, saves a screenshot, records the image/media URLs the page
 * requests, and downloads them into `downloads/<date>-<title>-<pageId>/media`.
 */
import fs from 'fs/promises';
import path from 'path';
import process from 'process';
import { chromium } from 'playwright';
import http from 'http';
import https from 'https';
import { pipeline } from 'stream/promises';

const VIEWPORT_SIZE = { width: 375, height: 667 };

const CONTEXT_OPTIONS = {
  viewport: VIEWPORT_SIZE,
  deviceScaleFactor: 3,
  isMobile: true,
  hasTouch: true,
};

const MEDIA_MIME_PREFIXES = ['audio/', 'video/', 'image/'];

const MEDIA_EXTENSIONS = [
  '.mp3', '.wav', '.ogg', '.aac', '.m4a', '.flac',
  '.mp4', '.webm', '.ogv', '.avi', '.mov',
  '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.bmp', '.ico',
];

const MAX_RETRIES = 2;
const TIMEOUT = 30000;

const USAGE =
  'Usage: node run.mjs <page-url> [--cdp ws://host:port/devtools/browser/<id>|http://host:port] (or ENV_CDP)';

/**
 * Formats the current local date as `YYYYMMDD`.
 *
 * @returns {string} Eight-digit date stamp.
 */
function getCurrentDate() {
  const now = new Date();
  const year = now.getFullYear();
  const month = String(now.getMonth() + 1).padStart(2, '0');
  const day = String(now.getDate()).padStart(2, '0');
  return `${year}${month}${day}`;
}

/**
 * Extracts the alphanumeric page id from a `storeview/<id>` or `/v/<id>` URL.
 *
 * @param {string} url - Page URL supplied by the user.
 * @returns {string|null} The page id, or `null` when neither pattern matches.
 */
function extractPageId(url) {
  const patterns = [
    /storeview\/([a-z0-9]+)/i,
    /\/v\/([a-z0-9]+)/i,
  ];
  for (const pattern of patterns) {
    const match = url.match(pattern);
    if (match && match[1]) {
      return match[1];
    }
  }
  return null;
}

/**
 * Turns a page title into a string usable as part of a directory name.
 *
 * @param {string} title - Raw page title.
 * @returns {string} Title without `<>:"/\|?*`, trimmed, capped at 100
 *   characters; `'page'` when the result would be empty.
 */
function sanitizeTitle(title) {
  if (!title) {
    return 'page';
  }
  return title.replace(/[<>:"/\\|?*]/g, '').trim().slice(0, 100) || 'page';
}

/**
 * Parses the command line: first positional argument is the page URL,
 * `--cdp <url>` (or the `ENV_CDP` environment variable) selects a remote
 * browser. Prints an error and exits with status 1 on invalid input.
 *
 * @returns {{pageId: string, pageUrl: string, cdpUrl: string|null}}
 */
function parseArgs() {
  const args = process.argv.slice(2);
  let pageUrl = null;
  let cdpUrl = process.env.ENV_CDP || null;

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    if (arg === '--cdp') {
      const value = args[i + 1];
      // A dangling `--cdp` used to set cdpUrl to undefined and silently fall
      // back to launching a local browser; treat it as a usage error instead.
      if (!value || value.startsWith('--')) {
        console.error('Error: --cdp requires a value');
        console.error(USAGE);
        process.exit(1);
      }
      cdpUrl = value;
      i += 1;
    } else if (!arg.startsWith('--') && !pageUrl) {
      pageUrl = arg;
    }
  }

  if (!pageUrl) {
    console.error(USAGE);
    process.exit(1);
  }

  const pageId = extractPageId(pageUrl);
  if (!pageId) {
    console.error('Error: Could not extract page-id from the URL');
    process.exit(1);
  }

  return { pageId, pageUrl, cdpUrl };
}

/**
 * Decides whether a URL / content-type pair looks like an image, audio or
 * video resource.
 *
 * @param {string} url - Resource URL.
 * @param {string} [mimeType] - Value of the `content-type` header, if any.
 * @returns {boolean} `true` when the MIME type has a media prefix or the URL's
 *   path/query contains a known media extension.
 */
function isMediaUrl(url, mimeType) {
  if (mimeType) {
    const lowerMime = mimeType.toLowerCase();
    if (MEDIA_MIME_PREFIXES.some((prefix) => lowerMime.startsWith(prefix))) {
      return true;
    }
  }

  // Only look at path + query: scanning the full URL made hosts such as
  // "img.icons8.com" match ".ico" and flagged every response from them.
  let haystack = url;
  try {
    const parsed = new URL(url);
    haystack = `${parsed.pathname}${parsed.search}`;
  } catch {
    // Not an absolute URL; fall back to scanning the raw string.
  }
  const lowerHaystack = haystack.toLowerCase();
  return MEDIA_EXTENSIONS.some((ext) => lowerHaystack.includes(ext));
}

/**
 * Builds the URL that is actually loaded for capture from the user's page URL.
 *
 * @param {string} pageUrl - Page URL supplied by the user.
 * @returns {string} Capture URL on `ca.rrx.cn` for the extracted page id.
 * @throws {Error} When no page id can be extracted from `pageUrl`.
 */
function buildCaptureUrl(pageUrl) {
  const pageId = extractPageId(pageUrl);
  if (!pageId) {
    throw new Error(`Could not extract page-id from: ${pageUrl}`);
  }
  return `https://ca.rrx.cn/v/${pageId}?rrxsrc=2&iframe=1&tpl=1`;
}

/**
 * GETs a URL and parses the response body as JSON.
 *
 * @param {string} url - http(s) URL to fetch.
 * @param {number} [timeout=TIMEOUT] - Socket idle timeout in milliseconds.
 * @returns {Promise<any>} Parsed JSON body.
 * @throws {Error} On a non-200 status, network error, timeout or invalid JSON.
 */
async function fetchJson(url, timeout = TIMEOUT) {
  return new Promise((resolve, reject) => {
    const client = url.startsWith('https:') ? https : http;
    const req = client.get(url, { timeout }, (res) => {
      if (res.statusCode !== 200) {
        reject(new Error(`HTTP ${res.statusCode}`));
        res.resume();
        return;
      }
      let data = '';
      res.setEncoding('utf8');
      res.on('data', (chunk) => {
        data += chunk;
      });
      res.on('error', reject);
      res.on('end', () => {
        try {
          resolve(JSON.parse(data));
        } catch (err) {
          reject(err);
        }
      });
    });
    req.on('error', reject);
    // The 'timeout' event only signals idleness; the request must be torn
    // down explicitly or an unresponsive endpoint hangs the CLI forever.
    req.on('timeout', () => {
      req.destroy(new Error(`Request timed out after ${timeout} ms: ${url}`));
    });
  });
}

/**
 * Resolves a CDP endpoint to a WebSocket debugger URL.
 *
 * `ws://` / `wss://` values are returned unchanged; anything else is treated
 * as an HTTP base URL whose `/json/version` document is queried.
 *
 * @param {string|null} cdpUrl - WebSocket URL, HTTP base URL, or falsy.
 * @returns {Promise<string|null>} WebSocket URL, or `null` for falsy input.
 * @throws {Error} When the endpoint cannot be fetched or lacks
 *   `webSocketDebuggerUrl`.
 */
async function resolveCdpUrl(cdpUrl) {
  if (!cdpUrl) {
    return null;
  }
  if (cdpUrl.startsWith('ws://') || cdpUrl.startsWith('wss://')) {
    return cdpUrl;
  }

  const base = cdpUrl.endsWith('/') ? cdpUrl.slice(0, -1) : cdpUrl;
  const endpointUrl = `${base}/json/version`;
  const data = await fetchJson(endpointUrl);
  if (!data || !data.webSocketDebuggerUrl) {
    throw new Error(`No webSocketDebuggerUrl found at ${endpointUrl}`);
  }
  return data.webSocketDebuggerUrl;
}

/**
 * Maps a `content-type` header value to a file extension.
 *
 * @param {string|null|undefined} contentType - Header value, parameters allowed.
 * @returns {string} Extension including the dot; `'.bin'` when unknown.
 */
function guessExtension(contentType) {
  if (!contentType) {
    return '.bin';
  }
  // MIME types are case-insensitive, so normalise before the lookup.
  const mime = contentType.split(';')[0].trim().toLowerCase();
  const map = {
    'image/jpeg': '.jpg',
    'image/png': '.png',
    'image/gif': '.gif',
    'image/webp': '.webp',
    'image/svg+xml': '.svg',
    'audio/mpeg': '.mp3',
    'audio/wav': '.wav',
    'audio/ogg': '.ogg',
    'audio/aac': '.aac',
    'video/mp4': '.mp4',
    'video/webm': '.webm',
    'video/ogg': '.ogv',
    'application/octet-stream': '.bin',
  };
  return map[mime] || '.bin';
}

/**
 * Derives a local file name from a resource URL.
 *
 * Uses the last path segment, drops any `@!...` suffix, and appends an
 * extension guessed from `contentType` when the segment has none.
 *
 * @param {string} url - Resource URL.
 * @param {string|null|undefined} contentType - `content-type` header value.
 * @returns {string} File name; `'file.bin'` when the URL cannot be parsed.
 */
function sanitizeFilename(url, contentType) {
  try {
    const parsed = new URL(url);
    let basename = path.basename(parsed.pathname);
    if (basename.includes('@!')) {
      basename = basename.split('@!')[0];
    }
    if (!basename) {
      basename = 'file';
    }
    const { name, ext } = path.parse(basename);
    if (!ext) {
      return name + guessExtension(contentType);
    }
    return basename;
  } catch {
    return 'file.bin';
  }
}

/**
 * Returns a path that does not exist yet, adding `_1`, `_2`, ... before the
 * extension when `basePath` is already taken.
 *
 * @param {string} basePath - Preferred file path.
 * @returns {Promise<string>} `basePath` or the first free numbered variant.
 */
async function uniqueName(basePath) {
  try {
    await fs.access(basePath);
  } catch {
    return basePath;
  }

  const { dir, name, ext } = path.parse(basePath);
  let idx = 1;
  while (true) {
    const candidate = path.join(dir, `${name}_${idx}${ext}`);
    try {
      await fs.access(candidate);
      idx += 1;
    } catch {
      return candidate;
    }
  }
}

/**
 * Streams an HTTP response body into a fresh file under `destDir`.
 * Removes the partially written file when anything fails.
 *
 * @param {import('http').IncomingMessage} resp - Response with status 200.
 * @param {string} url - URL the response belongs to (used for the file name).
 * @param {string} destDir - Directory to write into.
 * @returns {Promise<void>}
 * @throws {Error} On file-system or stream errors.
 */
async function saveResponse(resp, url, destDir) {
  // The body is not consumed until the file is open; remember an error that
  // arrives in that window so it is neither lost nor raised as uncaught.
  let earlyError = null;
  resp.on('error', (err) => {
    earlyError = err;
  });

  let target = null;
  let handle = null;
  try {
    const fname = sanitizeFilename(url, resp.headers['content-type']);
    target = await uniqueName(path.join(destDir, fname));
    handle = await fs.open(target, 'w');
    if (earlyError) {
      throw earlyError;
    }
    const output = handle.createWriteStream();
    handle = null; // the write stream now owns (and auto-closes) the handle
    await pipeline(resp, output);
  } catch (err) {
    resp.destroy();
    // Best-effort cleanup: the original failure is what gets reported.
    if (handle) {
      await handle.close().catch(() => {});
    }
    if (target) {
      await fs.unlink(target).catch(() => {});
    }
    throw err;
  }
}

/**
 * Downloads one URL into `destDir`. Never rejects; failures are reported in
 * the returned tuple.
 *
 * @param {string} url - http(s) URL to download.
 * @param {string} destDir - Directory to write into.
 * @param {number} timeout - Socket idle timeout in milliseconds.
 * @returns {Promise<[boolean, string|null]>} `[true, null]` on success,
 *   `[false, message]` on failure.
 */
async function downloadOne(url, destDir, timeout) {
  return new Promise((resolve) => {
    let settled = false;
    let responseStarted = false;
    let timedOut = false;
    const finish = (ok, message) => {
      if (!settled) {
        settled = true;
        resolve([ok, message]);
      }
    };

    let req;
    try {
      const parsed = new URL(url);
      const client = parsed.protocol === 'https:' ? https : http;
      req = client.get(url, { timeout }, (resp) => {
        responseStarted = true;
        if (resp.statusCode !== 200) {
          resp.resume();
          finish(false, `HTTP ${resp.statusCode}`);
          return;
        }
        // Every outcome of the save settles the promise; previously a failing
        // fs.open inside the async callback left the caller waiting forever.
        saveResponse(resp, url, destDir).then(
          () => finish(true, null),
          (err) => finish(false, timedOut ? 'Request timeout' : err.message),
        );
      });
    } catch (err) {
      finish(false, err.message);
      return;
    }

    req.on('error', (err) => {
      // Once the body is streaming, saveResponse reports the failure after it
      // has cleaned up the partial file.
      if (!responseStarted) {
        finish(false, timedOut ? 'Request timeout' : err.message);
      }
    });
    req.on('timeout', () => {
      timedOut = true;
      if (!responseStarted) {
        finish(false, 'Request timeout');
      }
      req.destroy();
    });
  });
}

/**
 * Deletes every zero-byte regular file directly inside `destDir`.
 *
 * @param {string} destDir - Directory to scan (not recursive).
 * @returns {Promise<string[]>} Names of the files that were removed.
 */
async function deleteZeroByteFiles(destDir) {
  const removed = [];
  const files = await fs.readdir(destDir);
  for (const file of files) {
    const filePath = path.join(destDir, file);
    const stats = await fs.stat(filePath);
    if (stats.isFile() && stats.size === 0) {
      await fs.unlink(filePath);
      removed.push(file);
    }
  }
  return removed;
}

/**
 * Maps each URL to the file name it is expected to produce (computed without
 * a content type, so extension-less URLs map to `<name>.bin`).
 *
 * @param {string[]} urls - URLs to be downloaded.
 * @returns {Object<string, string>} URL -> planned file name.
 */
function planDownloads(urls) {
  const plan = {};
  for (const url of urls) {
    plan[url] = sanitizeFilename(url, null);
  }
  return plan;
}

/**
 * Lists planned URLs for which no matching file exists in `destDir`.
 *
 * A URL counts as present when its planned name exists, or when any file name
 * starts with the planned base name (covers `_N` suffixes and extensions
 * guessed from the content type).
 *
 * @param {string} destDir - Directory holding the downloaded files.
 * @param {Object<string, string>} plan - URL -> planned file name.
 * @returns {Promise<string[]>} URLs considered missing.
 */
async function verifyMissing(destDir, plan) {
  const missing = [];
  const files = await fs.readdir(destDir);
  const existing = new Set();
  for (const file of files) {
    const filePath = path.join(destDir, file);
    const stats = await fs.stat(filePath);
    if (stats.isFile()) {
      existing.add(file);
    }
  }

  const existingNames = [...existing];
  for (const [url, fname] of Object.entries(plan)) {
    if (existing.has(fname)) {
      continue;
    }
    const baseName = path.parse(fname).name;
    const hasPrefix = existingNames.some((name) => name.startsWith(baseName));
    if (!hasPrefix) {
      missing.push(url);
    }
  }
  return missing;
}

/**
 * Splits captured URLs into those worth downloading and those to skip.
 *
 * @param {string[]} urls - Captured resource URLs.
 * @returns {{filtered: string[], skipped: string[], unsupported: string[]}}
 *   `filtered`: first occurrence of each http(s) URL (compared without query
 *   string); `skipped`: later duplicates; `unsupported`: URLs that are not
 *   http(s), e.g. `data:` or `blob:`.
 */
function dedupeAndFilterUrls(urls) {
  const seen = new Set();
  const filtered = [];
  const skipped = [];
  const unsupported = [];

  for (const url of urls) {
    // data:/blob: URLs show up as image requests but cannot be fetched with
    // http.get; sending them on only produced failures and pointless retries.
    if (!/^https?:\/\//i.test(url)) {
      unsupported.push(url);
      continue;
    }
    const cleanUrl = url.split('?')[0];
    if (seen.has(cleanUrl)) {
      skipped.push(url);
      continue;
    }
    seen.add(cleanUrl);
    filtered.push(url);
  }

  return { filtered, skipped, unsupported };
}

/**
 * Copies the source-URL list into `<destRoot>/urls.txt`, trimmed and
 * terminated by a single newline.
 *
 * @param {string} destRoot - Output directory of the capture.
 * @param {string} sourcesPath - File containing the source URLs.
 * @returns {Promise<void>}
 */
async function saveSourceUrls(destRoot, sourcesPath) {
  const sources = await fs.readFile(sourcesPath, 'utf-8');
  await fs.writeFile(
    path.join(destRoot, 'urls.txt'),
    sources.trim() + '\n',
    'utf-8',
  );
}

/**
 * Resolves after `ms` milliseconds.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Loads the capture URL in a new page, records image/media URLs seen on the
 * network, and saves a viewport screenshot as `index.png`.
 *
 * @param {import('playwright').BrowserContext} context - Context to open the page in.
 * @param {string} captureUrl - URL to navigate to.
 * @param {string} pageId - Page id, used in the output directory name.
 * @param {string} today - Date stamp, used in the output directory name.
 * @returns {Promise<{mediaUrls: string[], pageTitle: string, destRoot: string}>}
 */
async function captureNetworkAndScreenshot(context, captureUrl, pageId, today) {
  const page = await context.newPage();
  const requests = new Set();

  page.on('request', (request) => {
    const resourceType = request.resourceType();
    if (resourceType === 'image' || resourceType === 'media') {
      requests.add(request.url());
    }
  });

  page.on('response', (response) => {
    const url = response.url();
    const mimeType = response.headers()['content-type'] || '';
    if (isMediaUrl(url, mimeType)) {
      requests.add(url);
    }
  });

  try {
    console.log('Navigating to:', captureUrl);
    await page.goto(captureUrl, { waitUntil: 'networkidle', timeout: TIMEOUT });
    // Give late-loading resources a moment after the network went idle.
    await page.waitForTimeout(3000);
    await page.setViewportSize(VIEWPORT_SIZE);

    const pageTitle = sanitizeTitle(await page.title());
    const destRoot = path.join('downloads', `${today}-${pageTitle}-${pageId}`);
    await fs.mkdir(destRoot, { recursive: true });

    const screenshotPath = path.join(destRoot, 'index.png');
    console.log('Taking screenshot...');
    await page.screenshot({ path: screenshotPath, fullPage: false });
    console.log('Screenshot saved:', screenshotPath);

    return { mediaUrls: [...requests], pageTitle, destRoot };
  } finally {
    await page.close();
  }
}

/**
 * Downloads the given URLs sequentially into `<destRoot>/media`, removes
 * zero-byte results, and retries whatever is still missing.
 *
 * @param {string[]} urls - Captured resource URLs.
 * @param {string} destRoot - Output directory of the capture.
 * @param {number} timeout - Per-request socket idle timeout in milliseconds.
 * @param {number} retries - Maximum number of retry rounds.
 * @returns {Promise<void>}
 */
async function downloadAssets(urls, destRoot, timeout, retries) {
  const mediaDir = path.join(destRoot, 'media');
  await fs.mkdir(mediaDir, { recursive: true });

  const { filtered, skipped, unsupported } = dedupeAndFilterUrls(urls);
  const plan = planDownloads(filtered);

  console.log(`Starting download of ${filtered.length} URLs...`);
  if (skipped.length > 0) {
    console.log(`Skipped ${skipped.length} duplicate URLs after cleaning.`);
  }
  if (unsupported.length > 0) {
    // Count only: data: URLs can be arbitrarily long.
    console.log(`Skipped ${unsupported.length} non-HTTP(S) URLs (e.g. data: or blob:).`);
  }

  for (const url of filtered) {
    console.log(`Downloading: ${url}`);
    const [ok, err] = await downloadOne(url, mediaDir, timeout);
    if (!ok) {
      console.error(`WARN: download failed for ${url}: ${err}`);
    } else {
      console.log(`Downloaded: ${url}`);
    }
    await sleep(50);
  }

  const removed = await deleteZeroByteFiles(mediaDir);
  if (removed.length > 0) {
    console.log(`Removed zero-byte files: ${removed.join(', ')}`);
  }

  let missing = await verifyMissing(mediaDir, plan);
  let attempts = 0;

  while (missing.length > 0 && attempts < retries) {
    attempts += 1;
    console.log(`Retry round ${attempts}: missing ${missing.length} files`);

    const stillMissing = [];
    for (const url of missing) {
      console.log(`Retrying: ${url}`);
      const [ok, err] = await downloadOne(url, mediaDir, timeout);
      if (!ok) {
        console.error(`WARN: retry failed for ${url}: ${err}`);
        stillMissing.push(url);
      } else {
        console.log(`Downloaded on retry: ${url}`);
      }
    }

    const removedRetry = await deleteZeroByteFiles(mediaDir);
    if (removedRetry.length > 0) {
      console.log(`Removed zero-byte files after retry: ${removedRetry.join(', ')}`);
    }

    const retryPlan = {};
    for (const url of stillMissing) {
      retryPlan[url] = plan[url];
    }
    missing = stillMissing.length > 0 ? await verifyMissing(mediaDir, retryPlan) : [];
  }

  if (missing.length > 0) {
    console.error('FAILED to fetch these URLs after retries:');
    for (const url of missing) {
      console.error(url);
    }
  } else {
    console.log('All planned files downloaded.');
  }
}

/**
 * CLI entry point: parses arguments, obtains a browser, captures the page and
 * downloads its media. Sets a non-zero exit code when the capture fails.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const { pageId, pageUrl, cdpUrl } = parseArgs();
  console.log('Extracted page-id:', pageId);

  const today = getCurrentDate();
  const tempDir = 'temp';
  await fs.mkdir(tempDir, { recursive: true });
  const sourcesFile = path.join(tempDir, 'sources.txt');
  await fs.writeFile(sourcesFile, pageUrl + '\n', 'utf-8');

  let browser;
  if (cdpUrl) {
    console.log('Resolving Chrome DevTools endpoint from:', cdpUrl);
    const resolvedCdpUrl = await resolveCdpUrl(cdpUrl);
    console.log('Connecting to remote Chrome DevTools:', resolvedCdpUrl);
    browser = await chromium.connectOverCDP(resolvedCdpUrl);
  } else {
    console.log('Launching Playwright Chromium...');
    browser = await chromium.launch({ headless: true });
  }

  let context = null;
  let failed = false;
  try {
    // Created inside the try so the browser is still closed if this throws.
    context = await browser.newContext(CONTEXT_OPTIONS);
    const captureUrl = buildCaptureUrl(pageUrl);
    console.log('Capture URL:', captureUrl);

    const { mediaUrls, destRoot } = await captureNetworkAndScreenshot(
      context,
      captureUrl,
      pageId,
      today,
    );

    await saveSourceUrls(destRoot, sourcesFile);
    console.log(`Saved source URLs to ${path.join(destRoot, 'urls.txt')}`);

    console.log('Found', mediaUrls.length, 'media URLs');
    if (mediaUrls.length > 0) {
      await downloadAssets(mediaUrls, destRoot, TIMEOUT, MAX_RETRIES);
    } else {
      console.log('No media URLs found, skipping download');
    }
  } catch (error) {
    console.error('Error during capture:', error.message);
    // Let the browser shut down cleanly, but do not report success to callers.
    failed = true;
    process.exitCode = 1;
  } finally {
    console.log('Closing browser...');
    try {
      if (context) {
        await context.close();
      }
    } finally {
      await browser.close();
    }
  }

  if (!failed) {
    console.log('Workflow completed.');
  }
}

main().catch((err) => {
  console.error(err);
  process.exit(1);
});