#!/usr/bin/env node
import fs from 'fs/promises';
import path from 'path';
import { spawn } from 'child_process';
import process from 'process';
import { chromium } from 'playwright';
import http from 'http';
import https from 'https';

/** Viewport size (CSS pixels) used for the capture context. */
const VIEWPORT = {
  width: 390,
  height: 844,
};

/**
 * Options handed to `browser.newContext()`.
 * Playwright expects width/height nested under `viewport`; the remaining
 * keys are top-level context options.
 */
const CONTEXT_OPTIONS = {
  viewport: VIEWPORT,
  deviceScaleFactor: 3,
  isMobile: true,
  hasTouch: true,
};

/** Content-type prefixes that mark a response as media. */
const MEDIA_MIME_PREFIXES = ['audio/', 'video/', 'image/'];

/** File extensions that mark a URL as media when no usable content-type is available. */
const MEDIA_EXTENSIONS = [
  '.mp3', '.wav', '.ogg', '.aac', '.m4a', '.flac',
  '.mp4', '.webm', '.ogv', '.avi', '.mov',
  '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.bmp', '.ico',
];

const USAGE =
  'Usage: node run.mjs <page-url> [--cdp ws://host:port/devtools/browser/<id>|http://host:port] (or ENV_CDP)';

/**
 * Escapes regular-expression metacharacters in a literal string.
 * @param {string} text
 * @returns {string}
 */
function escapeRegExp(text) {
  return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Matches any known media extension that is NOT immediately followed by
 * another letter or digit, so that e.g. ".ico" does not match "www.icons8.com".
 * Not a global regex, so `.test()` carries no state between calls.
 */
const MEDIA_EXTENSION_PATTERN = new RegExp(
  `(?:${MEDIA_EXTENSIONS.map(escapeRegExp).join('|')})(?![a-z0-9])`,
  'i',
);

/**
 * Formats a date as `YYYYMMDD` using local time.
 * @param {Date} [now] - Date to format; defaults to the current time.
 * @returns {string}
 */
function getCurrentDate(now = new Date()) {
  const year = now.getFullYear();
  const month = String(now.getMonth() + 1).padStart(2, '0');
  const day = String(now.getDate()).padStart(2, '0');
  return `${year}${month}${day}`;
}

/**
 * Extracts the alphanumeric page id from a `storeview/<id>` or `/v/<id>` URL.
 * @param {string} url
 * @returns {string|null} The id, or null when neither pattern matches.
 */
function extractPageId(url) {
  const patterns = [
    /storeview\/([a-z0-9]+)/i,
    /\/v\/([a-z0-9]+)/i,
  ];
  for (const pattern of patterns) {
    const id = url.match(pattern)?.[1];
    if (id) {
      return id;
    }
  }
  return null;
}

/**
 * Turns a page title into a string usable as part of a directory name:
 * strips `<>:"/\|?*`, trims, and caps the length at 100 characters.
 * @param {string|null|undefined} title
 * @returns {string} The sanitized title, or 'page' when nothing is left.
 */
function sanitizeTitle(title) {
  if (!title) {
    return 'page';
  }
  // Trim again after slicing: the cut can land right after a space, which
  // would otherwise leave trailing whitespace in the directory name.
  return title.replace(/[<>:"/\\|?*]/g, '').trim().slice(0, 100).trim() || 'page';
}

/**
 * Parses command-line arguments. The first non-flag argument is the page URL;
 * `--cdp <url>` overrides the `ENV_CDP` environment variable.
 * Prints an error and exits the process with status 1 on invalid input.
 * @param {string[]} [argv] - Arguments to parse; defaults to `process.argv.slice(2)`.
 * @returns {{ pageId: string, pageUrl: string, cdpUrl: string|null }}
 */
function parseArgs(argv = process.argv.slice(2)) {
  let pageUrl = null;
  let cdpUrl = process.env.ENV_CDP || null;

  for (let i = 0; i < argv.length; i++) {
    const arg = argv[i];
    if (arg === '--cdp') {
      const value = argv[i + 1];
      // Without this check a trailing `--cdp` silently discarded ENV_CDP.
      if (value === undefined || value.startsWith('--')) {
        console.error('Error: --cdp requires a value');
        console.error(USAGE);
        process.exit(1);
      }
      cdpUrl = value;
      i += 1;
    } else if (!arg.startsWith('--') && !pageUrl) {
      pageUrl = arg;
    }
  }

  if (!pageUrl) {
    console.error(USAGE);
    process.exit(1);
  }

  const pageId = extractPageId(pageUrl);
  if (!pageId) {
    console.error('Error: Could not extract page-id from the URL');
    process.exit(1);
  }

  return { pageId, pageUrl, cdpUrl };
}

/**
 * Decides whether a URL refers to a media resource, first by content-type
 * prefix and then by file extension appearing in the URL.
 * @param {string} url
 * @param {string} [mimeType] - Content-type header value, if known.
 * @returns {boolean}
 */
function isMediaUrl(url, mimeType) {
  if (mimeType) {
    const lowerMime = mimeType.toLowerCase();
    if (MEDIA_MIME_PREFIXES.some((prefix) => lowerMime.startsWith(prefix))) {
      return true;
    }
  }
  return MEDIA_EXTENSION_PATTERN.test(url);
}

/**
 * Builds the URL that is actually loaded for capturing, from the page id
 * found in `pageUrl`.
 * @param {string} pageUrl
 * @returns {string}
 * @throws {Error} When no page id can be extracted from `pageUrl`.
 */
function buildCaptureUrl(pageUrl) {
  const pageId = extractPageId(pageUrl);
  if (!pageId) {
    throw new Error(`Could not extract page-id from URL: ${pageUrl}`);
  }
  return `https://ca.rrx.cn/v/${pageId}?rrxsrc=2&iframe=1&tpl=1`;
}

/**
 * Opens `captureUrl` in a new page, records every image/media request and
 * every response that looks like media, and saves a viewport screenshot to
 * `downloads/<today>-<title>-<pageId>/index.png`. The page is always closed.
 * @param {import('playwright').BrowserContext} context
 * @param {string} captureUrl
 * @param {string} pageId
 * @param {string} today - Date prefix for the output directory name.
 * @returns {Promise<{ mediaUrls: string[], pageTitle: string }>}
 *   De-duplicated media URLs and the sanitized page title.
 */
async function captureNetworkAndScreenshot(context, captureUrl, pageId, today) {
  const page = await context.newPage();
  const requests = new Set();

  page.on('request', (request) => {
    const resourceType = request.resourceType();
    if (resourceType === 'image' || resourceType === 'media') {
      requests.add(request.url());
    }
  });

  page.on('response', (response) => {
    const url = response.url();
    const mimeType = response.headers()['content-type'] || '';
    if (isMediaUrl(url, mimeType)) {
      requests.add(url);
    }
  });

  try {
    console.log('Navigating to:', captureUrl);
    await page.goto(captureUrl, { waitUntil: 'networkidle', timeout: 30000 });
    // Extra settle time after network idle before reading the title and capturing.
    await page.waitForTimeout(3000);

    const pageTitle = sanitizeTitle(await page.title());
    const destRoot = path.join('downloads', `${today}-${pageTitle}-${pageId}`);
    await fs.mkdir(destRoot, { recursive: true });

    const screenshotPath = path.join(destRoot, 'index.png');
    console.log('Taking screenshot...');
    await page.screenshot({ path: screenshotPath, fullPage: false });
    console.log('Screenshot saved:', screenshotPath);

    return { mediaUrls: [...requests], pageTitle };
  } finally {
    await page.close();
  }
}

/**
 * Runs `node download.mjs` (resolved against the current working directory)
 * with the given arguments, inheriting stdio.
 * @param {string} pageId
 * @param {string} title
 * @param {string} urlsFile
 * @param {string} sourcesFile
 * @returns {Promise<number|null>} The child's exit code (null if it was
 *   terminated by a signal). Rejects if the child process cannot be spawned.
 */
function runDownloadScript(pageId, title, urlsFile, sourcesFile) {
  return new Promise((resolve, reject) => {
    const args = [
      'download.mjs',
      '--page-id', pageId,
      '--title', title,
      '--urls', urlsFile,
      '--sources', sourcesFile,
    ];
    const proc = spawn('node', args, { stdio: 'inherit' });
    // Without an 'error' listener a spawn failure is an unhandled 'error'
    // event, which crashes the process instead of reaching the caller.
    proc.on('error', reject);
    proc.on('close', (code) => resolve(code));
  });
}

/**
 * Performs an HTTP(S) GET and parses the response body as JSON.
 * @param {string} url
 * @param {number} [timeoutMs=10000] - Socket inactivity timeout in milliseconds.
 * @returns {Promise<any>} The parsed JSON body.
 *   Rejects on network errors, timeout, non-200 status, or invalid JSON.
 */
function fetchJson(url, timeoutMs = 10000) {
  return new Promise((resolve, reject) => {
    const client = url.startsWith('https:') ? https : http;
    const req = client.get(url, { timeout: timeoutMs }, (res) => {
      if (res.statusCode !== 200) {
        reject(new Error(`HTTP ${res.statusCode}`));
        res.resume(); // drain the body so the socket is released
        return;
      }
      let data = '';
      res.setEncoding('utf8');
      res.on('data', (chunk) => {
        data += chunk;
      });
      res.on('end', () => {
        try {
          resolve(JSON.parse(data));
        } catch (err) {
          reject(err);
        }
      });
    });
    // The 'timeout' event does not abort the request by itself; destroying
    // it with an error routes the failure through the 'error' handler.
    req.on('timeout', () => {
      req.destroy(new Error(`Request to ${url} timed out after ${timeoutMs} ms`));
    });
    req.on('error', reject);
  });
}

/**
 * Resolves a user-supplied CDP address to a WebSocket debugger URL.
 * `ws://`/`wss://` URLs are returned unchanged; anything else is treated as
 * an HTTP base URL whose `/json/version` endpoint is queried.
 * @param {string|null} cdpUrl
 * @returns {Promise<string|null>} The WebSocket URL, or null when `cdpUrl` is empty.
 * @throws {Error} When the endpoint does not report a `webSocketDebuggerUrl`.
 */
async function resolveCdpUrl(cdpUrl) {
  if (!cdpUrl) {
    return null;
  }
  if (cdpUrl.startsWith('ws://') || cdpUrl.startsWith('wss://')) {
    return cdpUrl;
  }

  const base = cdpUrl.endsWith('/') ? cdpUrl.slice(0, -1) : cdpUrl;
  const endpointUrl = `${base}/json/version`;
  const data = await fetchJson(endpointUrl);
  if (!data?.webSocketDebuggerUrl) {
    throw new Error(`No webSocketDebuggerUrl found at ${endpointUrl}`);
  }
  return data.webSocketDebuggerUrl;
}

/**
 * Entry point: parses arguments, opens the capture page in a local or
 * remote Chromium, saves a screenshot, writes the collected media URLs to
 * `temp/urls.txt`, and hands them to `download.mjs`.
 * Sets `process.exitCode` to 1 when the capture or the download fails.
 * @returns {Promise<void>}
 */
async function main() {
  const { pageId, pageUrl, cdpUrl } = parseArgs();
  console.log('Extracted page-id:', pageId);

  const today = getCurrentDate();
  const tempDir = 'temp';
  await fs.mkdir(tempDir, { recursive: true });

  const urlsFile = path.join(tempDir, 'urls.txt');
  const sourcesFile = path.join(tempDir, 'sources.txt');
  await fs.writeFile(sourcesFile, `${pageUrl}\n`, 'utf-8');

  let browser;
  if (cdpUrl) {
    console.log('Resolving Chrome DevTools endpoint from:', cdpUrl);
    const resolvedCdpUrl = await resolveCdpUrl(cdpUrl);
    console.log('Connecting to remote Chrome DevTools:', resolvedCdpUrl);
    browser = await chromium.connectOverCDP(resolvedCdpUrl);
  } else {
    console.log('Launching Playwright Chromium...');
    browser = await chromium.launch({ headless: true });
  }

  let context;
  let failed = false;
  try {
    // Created inside the try so the browser is still closed if this throws.
    context = await browser.newContext(CONTEXT_OPTIONS);
    const captureUrl = buildCaptureUrl(pageUrl);
    console.log('Capture URL:', captureUrl);

    const { mediaUrls, pageTitle } = await captureNetworkAndScreenshot(
      context,
      captureUrl,
      pageId,
      today,
    );
    console.log('Found', mediaUrls.length, 'media URLs');

    if (mediaUrls.length > 0) {
      await fs.writeFile(urlsFile, `${mediaUrls.join('\n')}\n`, 'utf-8');
      const exitCode = await runDownloadScript(pageId, pageTitle, urlsFile, sourcesFile);
      if (exitCode !== 0) {
        console.error('Download script exited with code:', exitCode);
        failed = true;
      }
    } else {
      console.log('No media URLs found, skipping download');
    }
  } catch (error) {
    console.error('Error during capture:', error.message);
    failed = true;
  } finally {
    console.log('Closing browser...');
    try {
      if (context) {
        await context.close();
      }
    } finally {
      await browser.close();
    }
  }

  if (failed) {
    // Report failure to the caller instead of exiting 0 after an error.
    process.exitCode = 1;
    console.log('Workflow finished with errors.');
  } else {
    console.log('Workflow completed.');
  }
}

main().catch((err) => {
  console.error(err);
  process.exit(1);
});