Files
rrx-downloader/run.mjs

513 lines
13 KiB
JavaScript

#!/usr/bin/env node
import fs from 'fs/promises';
import http from 'http';
import https from 'https';
import path from 'path';
import process from 'process';
import { pipeline } from 'stream/promises';
import { chromium } from 'playwright';
// Viewport (in CSS pixels) applied to the browser context and re-applied
// to the page right before the screenshot is taken.
const VIEWPORT_SIZE = { width: 375, height: 667 };

// Browser-context options: a touch-enabled mobile device at 3x pixel density.
const CONTEXT_OPTIONS = {
  viewport: VIEWPORT_SIZE,
  deviceScaleFactor: 3,
  isMobile: true,
  hasTouch: true,
};

// A response whose Content-Type starts with one of these prefixes is media.
const MEDIA_MIME_PREFIXES = ['audio/', 'video/', 'image/'];

// A URL containing one of these extensions is treated as media as well.
const MEDIA_EXTENSIONS = [
  // audio
  '.mp3',
  '.wav',
  '.ogg',
  '.aac',
  '.m4a',
  '.flac',
  // video
  '.mp4',
  '.webm',
  '.ogv',
  '.avi',
  '.mov',
  // images
  '.jpg',
  '.jpeg',
  '.png',
  '.gif',
  '.webp',
  '.svg',
  '.bmp',
  '.ico',
];

// Number of retry rounds handed to downloadAssets() by main().
const MAX_RETRIES = 2;

// Per-request download timeout in milliseconds handed to downloadAssets().
const TIMEOUT = 30000;
/**
 * Formats today's local date as a compact `YYYYMMDD` stamp.
 *
 * @returns {string} Eight-digit date string, month and day zero-padded.
 */
function getCurrentDate() {
  const today = new Date();
  const twoDigits = (value) => String(value).padStart(2, '0');
  return [
    today.getFullYear(),
    twoDigits(today.getMonth() + 1), // getMonth() is zero-based
    twoDigits(today.getDate()),
  ].join('');
}
/**
 * Pulls the page id out of a page URL.
 *
 * Two URL shapes are recognised, tried in this order: `storeview/<id>` and
 * `/v/<id>`, where `<id>` is a run of ASCII letters and digits.
 *
 * @param {string} url - Page URL to inspect.
 * @returns {string|null} The captured id, or `null` when neither shape matches.
 */
function extractPageId(url) {
  const idPatterns = [/storeview\/([a-z0-9]+)/i, /\/v\/([a-z0-9]+)/i];
  const firstHit = idPatterns
    .map((pattern) => url.match(pattern))
    .find((result) => result && result[1]);
  return firstHit ? firstHit[1] : null;
}
/**
 * Turns a page title into a string usable inside a directory name.
 *
 * Removes the characters `< > : " / \ | ? *`, trims surrounding whitespace
 * and keeps at most the first 100 characters.
 *
 * @param {string} [title] - Raw page title.
 * @returns {string} Cleaned title, or `'page'` when nothing usable remains.
 */
function sanitizeTitle(title) {
  const fallback = 'page';
  if (!title) {
    return fallback;
  }
  const withoutReserved = title.replace(/[<>:"/\\|?*]/g, '');
  const shortened = withoutReserved.trim().slice(0, 100);
  return shortened || fallback;
}
/**
 * Reads the command line: the first positional argument is the page URL and
 * `--cdp <url>` supplies a DevTools endpoint (default: the `ENV_CDP`
 * environment variable). Other `--` flags and extra positionals are ignored.
 *
 * Prints a message and exits with status 1 when no page URL is given or no
 * page id can be extracted from it.
 *
 * @returns {{pageId: string, pageUrl: string, cdpUrl: (string|null|undefined)}}
 */
function parseArgs() {
  const argv = process.argv.slice(2);
  let cdpUrl = process.env.ENV_CDP || null;
  let pageUrl = null;

  let cursor = 0;
  while (cursor < argv.length) {
    const token = argv[cursor];
    cursor += 1;
    if (token === '--cdp') {
      // The flag's value is the following argument; consume it too.
      cdpUrl = argv[cursor];
      cursor += 1;
      continue;
    }
    if (!pageUrl && !token.startsWith('--')) {
      pageUrl = token;
    }
  }

  if (!pageUrl) {
    console.error('Usage: node run.mjs <page-url> [--cdp ws://host:port/devtools/browser/<id>|http://host:port] (or ENV_CDP)');
    process.exit(1);
  }

  const pageId = extractPageId(pageUrl);
  if (!pageId) {
    console.error('Error: Could not extract page-id from the URL');
    process.exit(1);
  }

  return { pageId, pageUrl, cdpUrl };
}
/**
 * Decides whether a network resource should be treated as media.
 *
 * A resource qualifies when its MIME type starts with one of
 * MEDIA_MIME_PREFIXES, or when its URL contains one of MEDIA_EXTENSIONS
 * anywhere in the string. Both checks are case-insensitive.
 *
 * @param {string} url - Resource URL.
 * @param {string} [mimeType] - Content-Type value, if known.
 * @returns {boolean} `true` when either check matches.
 */
function isMediaUrl(url, mimeType) {
  if (mimeType) {
    const normalizedMime = mimeType.toLowerCase();
    const mimeMatches = MEDIA_MIME_PREFIXES.some((prefix) => normalizedMime.startsWith(prefix));
    if (mimeMatches) {
      return true;
    }
  }
  const haystack = url.toLowerCase();
  return MEDIA_EXTENSIONS.some((extension) => haystack.includes(extension));
}
/**
 * Builds the URL that is loaded for capturing: the `ca.rrx.cn/v/<id>` page
 * for the id found in `pageUrl`, with the query `rrxsrc=2&iframe=1&tpl=1`.
 *
 * @param {string} pageUrl - URL the page id is extracted from.
 * @returns {string} The capture URL.
 */
function buildCaptureUrl(pageUrl) {
  return `https://ca.rrx.cn/v/${extractPageId(pageUrl)}?rrxsrc=2&iframe=1&tpl=1`;
}
/**
 * Performs a GET request and parses the response body as JSON.
 *
 * The `https` module is used when the URL starts with `https:`, otherwise
 * `http`. Redirects are not followed: any status other than 200 rejects.
 *
 * @param {string} url - Endpoint to fetch.
 * @returns {Promise<*>} The parsed JSON value.
 * @throws {Error} `HTTP <status>` for non-200 responses; the request error
 *   for transport failures; the `JSON.parse` error for an invalid body.
 */
async function fetchJson(url) {
  return new Promise((resolve, reject) => {
    const transport = url.startsWith('https:') ? https : http;

    const handleResponse = (res) => {
      if (res.statusCode !== 200) {
        reject(new Error(`HTTP ${res.statusCode}`));
        // Drain the body so the socket is released.
        res.resume();
        return;
      }

      const pieces = [];
      res.setEncoding('utf8');
      res.on('data', (piece) => {
        pieces.push(piece);
      });
      res.on('end', () => {
        let payload;
        try {
          payload = JSON.parse(pieces.join(''));
        } catch (err) {
          reject(err);
          return;
        }
        resolve(payload);
      });
    };

    transport.get(url, handleResponse).on('error', reject);
  });
}
/**
 * Turns a user-supplied DevTools address into a WebSocket debugger URL.
 *
 * `ws://` and `wss://` values are returned unchanged. Anything else is
 * treated as an HTTP base URL whose `/json/version` document is fetched and
 * whose `webSocketDebuggerUrl` field is returned.
 *
 * @param {string|null} cdpUrl - WebSocket URL, HTTP base URL, or empty.
 * @returns {Promise<string|null>} WebSocket URL, or `null` for empty input.
 * @throws {Error} When the version document has no `webSocketDebuggerUrl`,
 *   or when fetching it fails.
 */
async function resolveCdpUrl(cdpUrl) {
  if (!cdpUrl) {
    return null;
  }

  const isWebSocketUrl = ['ws://', 'wss://'].some((scheme) => cdpUrl.startsWith(scheme));
  if (isWebSocketUrl) {
    return cdpUrl;
  }

  // Drop a single trailing slash so the joined URL has no `//`.
  const trimmedBase = cdpUrl.replace(/\/$/, '');
  const endpointUrl = `${trimmedBase}/json/version`;
  const versionInfo = await fetchJson(endpointUrl);
  const wsUrl = versionInfo && versionInfo.webSocketDebuggerUrl;
  if (!wsUrl) {
    throw new Error(`No webSocketDebuggerUrl found at ${endpointUrl}`);
  }
  return wsUrl;
}
/**
 * File extension for each recognised (lower-case) MIME type. A Map is used
 * so that inherited object members such as `constructor` can never be
 * returned as an "extension".
 */
const EXTENSION_BY_MIME = new Map([
  ['image/jpeg', '.jpg'],
  ['image/png', '.png'],
  ['image/gif', '.gif'],
  ['image/webp', '.webp'],
  ['image/svg+xml', '.svg'],
  ['image/bmp', '.bmp'],
  ['image/x-icon', '.ico'],
  ['image/vnd.microsoft.icon', '.ico'],
  ['audio/mpeg', '.mp3'],
  ['audio/wav', '.wav'],
  ['audio/ogg', '.ogg'],
  ['audio/aac', '.aac'],
  ['audio/mp4', '.m4a'],
  ['audio/flac', '.flac'],
  ['video/mp4', '.mp4'],
  ['video/webm', '.webm'],
  ['video/ogg', '.ogv'],
  ['video/quicktime', '.mov'],
  ['video/x-msvideo', '.avi'],
  ['application/octet-stream', '.bin'],
]);

/**
 * Maps a Content-Type header value to a file extension.
 *
 * Parameters after `;` (for example `charset`) are ignored and the type is
 * compared case-insensitively, because MIME type names are not
 * case-sensitive.
 *
 * @param {string} [contentType] - Content-Type value, possibly with parameters.
 * @returns {string} Extension including the leading dot; `'.bin'` when the
 *   type is missing or not recognised.
 */
function guessExtension(contentType) {
  if (!contentType) {
    return '.bin';
  }
  const mime = String(contentType).split(';')[0].trim().toLowerCase();
  return EXTENSION_BY_MIME.get(mime) || '.bin';
}
/**
 * Derives a local file name from a resource URL.
 *
 * The name is the last path segment of the URL with anything from `@!`
 * onwards cut off. An empty segment becomes `file`, and a name without an
 * extension gets one from guessExtension(contentType).
 *
 * @param {string} url - Resource URL.
 * @param {string|null} contentType - Content-Type used only when the name
 *   has no extension of its own.
 * @returns {string} File name; `'file.bin'` when anything throws (for
 *   example an unparsable URL).
 */
function sanitizeFilename(url, contentType) {
  try {
    const { pathname } = new URL(url);
    // `split` returns the whole segment when no `@!` marker is present.
    const [segment] = path.basename(pathname).split('@!');
    const candidate = segment || 'file';
    const pieces = path.parse(candidate);
    return pieces.ext ? candidate : pieces.name + guessExtension(contentType);
  } catch {
    return 'file.bin';
  }
}
/**
 * Finds a path that does not exist yet.
 *
 * Returns `basePath` itself when nothing is accessible there; otherwise
 * tries `<name>_1<ext>`, `<name>_2<ext>`, ... in the same directory and
 * returns the first candidate that is not accessible.
 *
 * @param {string} basePath - Preferred target path.
 * @returns {Promise<string>} A path for which `fs.access` currently fails.
 */
async function uniqueName(basePath) {
  const isTaken = (candidate) => fs.access(candidate).then(
    () => true,
    () => false,
  );

  if (!(await isTaken(basePath))) {
    return basePath;
  }

  const { dir, name, ext } = path.parse(basePath);
  for (let suffix = 1; ; suffix += 1) {
    const candidate = path.join(dir, `${name}_${suffix}${ext}`);
    if (!(await isTaken(candidate))) {
      return candidate;
    }
  }
}
/**
 * Streams an HTTP response body into `target`.
 *
 * On any failure the file handle is closed and the partially written file
 * is removed before the error is re-thrown, so a failed download never
 * leaves a truncated file behind.
 *
 * @param {import('http').IncomingMessage} resp - Response to read from.
 * @param {string} target - Destination file path.
 * @returns {Promise<void>}
 */
async function writeResponseToFile(resp, target) {
  const handle = await fs.open(target, 'w');
  try {
    // pipeline() rejects on errors from either stream (including an aborted
    // response) and destroys both, which plain `.pipe()` does not do.
    await pipeline(resp, handle.createWriteStream());
    await handle.close();
  } catch (err) {
    // Best-effort cleanup: the original error is the one worth reporting.
    await handle.close().catch(() => {});
    await fs.unlink(target).catch(() => {});
    throw err;
  }
}

/**
 * Downloads a single URL into `destDir`.
 *
 * The file name comes from sanitizeFilename() and is made unique with
 * uniqueName(). Redirects are not followed: only a 200 response is saved.
 *
 * @param {string} url - Resource to download.
 * @param {string} destDir - Existing directory to write into.
 * @param {number} timeout - Socket timeout in milliseconds.
 * @returns {Promise<[boolean, (string|null)]>} `[true, null]` on success,
 *   `[false, message]` on failure. The promise never rejects and settles
 *   exactly once.
 */
async function downloadOne(url, destDir, timeout) {
  return new Promise((resolve) => {
    let settled = false;
    // Several events can report the same failure (a timeout is followed by
    // an 'error' once the request is destroyed); only the first one counts.
    const settle = (ok, message) => {
      if (settled) {
        return;
      }
      settled = true;
      resolve([ok, message]);
    };

    const saveBody = async (resp) => {
      const fname = sanitizeFilename(url, resp.headers['content-type']);
      const target = await uniqueName(path.join(destDir, fname));
      await writeResponseToFile(resp, target);
    };

    try {
      const parsed = new URL(url);
      const client = parsed.protocol === 'https:' ? https : http;
      const req = client.get(url, { timeout }, (resp) => {
        if (resp.statusCode !== 200) {
          // Drain the body so the socket is released.
          resp.resume();
          settle(false, `HTTP ${resp.statusCode}`);
          return;
        }
        saveBody(resp).then(
          () => settle(true, null),
          (err) => {
            // Make sure an unread response does not keep the socket open.
            resp.destroy();
            settle(false, err.message);
          },
        );
      });
      req.on('error', (err) => {
        settle(false, err.message);
      });
      req.on('timeout', () => {
        settle(false, 'Request timeout');
        // 'timeout' only signals idleness; the request must be torn down
        // explicitly. This also aborts a body transfer that is in flight.
        req.destroy();
      });
    } catch (err) {
      // Invalid URL or unsupported protocol.
      settle(false, err.message);
    }
  });
}
/**
 * Deletes every regular file of size 0 directly inside `destDir`.
 *
 * Subdirectories are not descended into.
 *
 * @param {string} destDir - Directory to clean.
 * @returns {Promise<string[]>} Names (not paths) of the deleted files, in
 *   directory-listing order.
 */
async function deleteZeroByteFiles(destDir) {
  const deleted = [];
  for (const entry of await fs.readdir(destDir)) {
    const entryPath = path.join(destDir, entry);
    const info = await fs.stat(entryPath);
    const isEmptyFile = info.isFile() && info.size === 0;
    if (!isEmptyFile) {
      continue;
    }
    await fs.unlink(entryPath);
    deleted.push(entry);
  }
  return deleted;
}
/**
 * Builds the download plan: the file name each URL is expected to produce.
 *
 * Names are computed without a content type, so a URL lacking an extension
 * is planned with whatever sanitizeFilename() yields for `null`.
 *
 * @param {Iterable<string>} urls - URLs to plan for.
 * @returns {Object<string, string>} Map from URL to planned file name.
 */
function planDownloads(urls) {
  const plannedNames = {};
  [...urls].forEach((url) => {
    plannedNames[url] = sanitizeFilename(url, null);
  });
  return plannedNames;
}
/**
 * Reports which planned downloads have no corresponding file in `destDir`.
 *
 * A planned name counts as present when a regular file exists whose name,
 * ignoring the extension, is either the planned stem or the planned stem
 * followed by `_<digits>`. That tolerates the two ways a saved name can
 * differ from the planned one: a different extension, and a numeric suffix
 * added to avoid overwriting an existing file. Unrelated files that merely
 * share a prefix (for example `abc.png` for a planned `a.png`) do not count.
 *
 * @param {string} destDir - Directory that was downloaded into.
 * @param {Object<string, string>} plan - Map from URL to planned file name.
 * @returns {Promise<string[]>} URLs whose file is missing, in plan order.
 */
async function verifyMissing(destDir, plan) {
  // Collect stems once so each plan entry is a constant-time lookup.
  const presentStems = new Set();
  for (const file of await fs.readdir(destDir)) {
    const stats = await fs.stat(path.join(destDir, file));
    if (!stats.isFile()) {
      continue;
    }
    const stem = path.parse(file).name;
    presentStems.add(stem);
    // Also register the stem without a trailing `_<n>` collision suffix.
    presentStems.add(stem.replace(/_\d+$/, ''));
  }

  return Object.entries(plan)
    .filter(([, fname]) => !presentStems.has(path.parse(fname).name))
    .map(([url]) => url);
}
/**
 * Splits URLs into those to download and those skipped as duplicates.
 *
 * Two URLs are duplicates when they are equal up to the first `?`; the
 * first occurrence is kept (with its query string intact).
 *
 * @param {Iterable<string>} urls - Candidate URLs.
 * @returns {{filtered: string[], skipped: string[]}} Kept and skipped URLs,
 *   each in input order.
 */
function dedupeAndFilterUrls(urls) {
  const seenKeys = new Set();
  const result = { filtered: [], skipped: [] };
  for (const url of urls) {
    const [key] = url.split('?');
    const bucket = seenKeys.has(key) ? result.skipped : result.filtered;
    seenKeys.add(key);
    bucket.push(url);
  }
  return result;
}
/**
 * Copies the source-URL list into `<destRoot>/urls.txt`.
 *
 * The content is trimmed and written with exactly one trailing newline.
 *
 * @param {string} destRoot - Existing output directory.
 * @param {string} sourcesPath - File holding the source URLs.
 * @returns {Promise<void>}
 */
async function saveSourceUrls(destRoot, sourcesPath) {
  const outputPath = path.join(destRoot, 'urls.txt');
  const raw = await fs.readFile(sourcesPath, 'utf-8');
  const normalized = raw.trim() + '\n';
  await fs.writeFile(outputPath, normalized, 'utf-8');
}
/**
 * Returns a promise that resolves (with no value) after `ms` milliseconds.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Loads the capture URL in a new page, records media URLs seen on the
 * network, and saves a viewport screenshot.
 *
 * A URL is recorded when its request has resource type `image` or `media`,
 * or when its response passes isMediaUrl(). The output directory
 * `downloads/<today>-<title>-<pageId>` is created and the screenshot is
 * written there as `index.png`. The page is closed in all cases.
 *
 * @param {object} context - Browser context providing `newPage()`.
 * @param {string} captureUrl - URL to navigate to.
 * @param {string} pageId - Page id used in the directory name.
 * @param {string} today - Date stamp used in the directory name.
 * @returns {Promise<{mediaUrls: string[], pageTitle: string, destRoot: string}>}
 *   Recorded URLs in first-seen order, the sanitized title and the output
 *   directory.
 */
async function captureNetworkAndScreenshot(context, captureUrl, pageId, today) {
  const page = await context.newPage();
  const seenMedia = new Set();

  const recordRequest = (request) => {
    if (['image', 'media'].includes(request.resourceType())) {
      seenMedia.add(request.url());
    }
  };
  const recordResponse = (response) => {
    const responseUrl = response.url();
    const contentType = response.headers()['content-type'] || '';
    if (isMediaUrl(responseUrl, contentType)) {
      seenMedia.add(responseUrl);
    }
  };
  page.on('request', recordRequest);
  page.on('response', recordResponse);

  try {
    console.log('Navigating to:', captureUrl);
    await page.goto(captureUrl, { waitUntil: 'networkidle', timeout: 30000 });
    // Extra settle time after the network goes idle.
    await page.waitForTimeout(3000);
    await page.setViewportSize(VIEWPORT_SIZE);

    const pageTitle = sanitizeTitle(await page.title());
    const destRoot = path.join('downloads', `${today}-${pageTitle}-${pageId}`);
    await fs.mkdir(destRoot, { recursive: true });

    const screenshotPath = path.join(destRoot, 'index.png');
    console.log('Taking screenshot...');
    await page.screenshot({ path: screenshotPath, fullPage: false });
    console.log('Screenshot saved:', screenshotPath);

    return { mediaUrls: Array.from(seenMedia), pageTitle, destRoot };
  } finally {
    await page.close();
  }
}
/**
 * Narrows `candidates` down to the URLs that still need fetching: those
 * whose last download attempt failed, plus those whose planned file is not
 * present in `mediaDir` (for example because it was removed as zero-byte).
 *
 * @param {string[]} candidates - URLs attempted in the last round.
 * @param {Set<string>} failedUrls - URLs whose attempt reported failure.
 * @param {string} mediaDir - Directory downloaded into.
 * @param {Object<string, string>} plan - Map from URL to planned file name.
 * @returns {Promise<string[]>} Unfetched URLs, in `candidates` order.
 */
async function findUnfetched(candidates, failedUrls, mediaDir, plan) {
  const roundPlan = {};
  for (const url of candidates) {
    roundPlan[url] = plan[url];
  }
  const absent = new Set(await verifyMissing(mediaDir, roundPlan));
  return candidates.filter((url) => failedUrls.has(url) || absent.has(url));
}

/**
 * Downloads all media URLs into `<destRoot>/media`, retrying failures.
 *
 * URLs are de-duplicated first, then fetched one at a time. After each pass
 * zero-byte files are deleted. A URL is retried (up to `retries` rounds)
 * when its download reported failure OR its planned file is absent. Relying
 * on file presence alone would miss a failed URL that shares a file name
 * with another, successful, download; relying on the download result alone
 * would miss a "successful" download that produced an empty file.
 *
 * Failures are logged, not thrown.
 *
 * @param {string[]} urls - Media URLs collected from the page.
 * @param {string} destRoot - Output directory for this page.
 * @param {number} timeout - Per-request timeout in milliseconds.
 * @param {number} retries - Maximum number of retry rounds.
 * @returns {Promise<void>}
 */
async function downloadAssets(urls, destRoot, timeout, retries) {
  const mediaDir = path.join(destRoot, 'media');
  await fs.mkdir(mediaDir, { recursive: true });

  const { filtered, skipped } = dedupeAndFilterUrls(urls);
  const plan = planDownloads(filtered);
  console.log(`Starting download of ${filtered.length} URLs...`);
  if (skipped.length > 0) {
    console.log(`Skipped ${skipped.length} duplicate URLs after cleaning.`);
  }

  const failedFirstPass = new Set();
  for (const url of filtered) {
    console.log(`Downloading: ${url}`);
    const [ok, err] = await downloadOne(url, mediaDir, timeout);
    if (!ok) {
      console.error(`WARN: download failed for ${url}: ${err}`);
      failedFirstPass.add(url);
    } else {
      console.log(`Downloaded: ${url}`);
    }
    // Small pause between requests.
    await sleep(50);
  }

  const removed = await deleteZeroByteFiles(mediaDir);
  if (removed.length > 0) {
    console.log(`Removed zero-byte files: ${removed.join(', ')}`);
  }

  let missing = await findUnfetched(filtered, failedFirstPass, mediaDir, plan);
  let attempts = 0;
  while (missing.length > 0 && attempts < retries) {
    attempts += 1;
    console.log(`Retry round ${attempts}: missing ${missing.length} files`);

    const failedThisRound = new Set();
    for (const url of missing) {
      console.log(`Retrying: ${url}`);
      const [ok, err] = await downloadOne(url, mediaDir, timeout);
      if (!ok) {
        console.error(`WARN: retry failed for ${url}: ${err}`);
        failedThisRound.add(url);
      } else {
        console.log(`Downloaded on retry: ${url}`);
      }
    }

    const removedRetry = await deleteZeroByteFiles(mediaDir);
    if (removedRetry.length > 0) {
      console.log(`Removed zero-byte files after retry: ${removedRetry.join(', ')}`);
    }

    missing = await findUnfetched(missing, failedThisRound, mediaDir, plan);
  }

  if (missing.length > 0) {
    console.error('FAILED to fetch these URLs after retries:');
    for (const url of missing) {
      console.error(url);
    }
  } else {
    console.log('All planned files downloaded.');
  }
}
/**
 * Entry point: parses arguments, opens a browser (remote over CDP when a
 * CDP URL is given, otherwise a locally launched headless Chromium),
 * captures the page, and downloads its media.
 *
 * A failure during capture or download is logged and reflected in a
 * non-zero exit code, so callers and scripts can detect it. The browser
 * context and the browser are closed on every path.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const { pageId, pageUrl, cdpUrl } = parseArgs();
  console.log('Extracted page-id:', pageId);
  const today = getCurrentDate();

  // The source URL is staged in temp/ and copied into the output directory
  // once that directory's name (which includes the page title) is known.
  const tempDir = 'temp';
  await fs.mkdir(tempDir, { recursive: true });
  const sourcesFile = path.join(tempDir, 'sources.txt');
  await fs.writeFile(sourcesFile, pageUrl + '\n', 'utf-8');

  let browser;
  if (cdpUrl) {
    console.log('Resolving Chrome DevTools endpoint from:', cdpUrl);
    const resolvedCdpUrl = await resolveCdpUrl(cdpUrl);
    console.log('Connecting to remote Chrome DevTools:', resolvedCdpUrl);
    browser = await chromium.connectOverCDP(resolvedCdpUrl);
  } else {
    console.log('Launching Playwright Chromium...');
    browser = await chromium.launch({ headless: true });
  }

  try {
    // Created inside the outer try so the browser is closed even if
    // context creation fails.
    const context = await browser.newContext(CONTEXT_OPTIONS);
    try {
      const captureUrl = buildCaptureUrl(pageUrl);
      console.log('Capture URL:', captureUrl);
      const { mediaUrls, destRoot } = await captureNetworkAndScreenshot(
        context,
        captureUrl,
        pageId,
        today,
      );
      await saveSourceUrls(destRoot, sourcesFile);
      console.log(`Saved source URLs to ${path.join(destRoot, 'urls.txt')}`);
      console.log('Found', mediaUrls.length, 'media URLs');
      if (mediaUrls.length > 0) {
        await downloadAssets(mediaUrls, destRoot, TIMEOUT, MAX_RETRIES);
      } else {
        console.log('No media URLs found, skipping download');
      }
    } catch (error) {
      console.error('Error during capture:', error.message);
      // Keep running the cleanup below, but report the failure to the shell.
      process.exitCode = 1;
    } finally {
      console.log('Closing browser...');
      await context.close();
    }
  } finally {
    // Runs even when context.close() throws.
    await browser.close();
  }
  console.log('Workflow completed.');
}

main().catch((err) => {
  console.error(err);
  process.exit(1);
});