Add Node.js workflow implementation with Puppeteer integration

- Introduce run.mjs to orchestrate the full workflow: Puppeteer opens a browser with a mobile viewport (390x844 @ dpr 3), navigates to the capture URL, records network requests for media, takes a viewport screenshot, then calls download.mjs to batch-download the assets.
- Implement download.mjs to handle asset downloading with the same behavior as the Python version (retry, zero-byte cleanup, unique naming, save urls.txt, delete the input file); see the sketch after this list.
- Update AGENTS.md to document the new primary workflow and keep manual download instructions for both Python and Node.js.
- Derive the download folder name from the extracted page title.
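
Since download.mjs's diff is not shown in this view, here is a minimal sketch of the retry and zero-byte-cleanup behavior described above; the function name, signature, and retry count are illustrative, not the committed implementation:

import fs from 'fs/promises';

// Illustrative sketch only: retry a download and remove empty files,
// mirroring the "retry, zero-byte cleanup" behavior described above.
async function downloadWithRetry(url, destPath, maxRetries = 3) {
  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    try {
      const res = await fetch(url); // global fetch, Node 18+
      if (!res.ok) throw new Error(`HTTP ${res.status}`);
      const buf = Buffer.from(await res.arrayBuffer());
      await fs.writeFile(destPath, buf);
      if (buf.length === 0) {
        // Zero-byte cleanup: an empty file counts as a failed attempt.
        await fs.unlink(destPath);
        throw new Error('empty response body');
      }
      return true;
    } catch (err) {
      console.warn(`Attempt ${attempt}/${maxRetries} failed for ${url}: ${err.message}`);
    }
  }
  return false;
}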
2026-02-02 17:42:38 +08:00
parent 96afeb5e1c
commit dc6cb1513b
3 changed files with 526 additions and 1 deletion

run.mjs (new file, +171 lines)

@@ -0,0 +1,171 @@
#!/usr/bin/env node
import fs from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';
import { spawn } from 'child_process';
import process from 'process';
import puppeteer from 'puppeteer';
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
const VIEWPORT = { width: 390, height: 844, deviceScaleFactor: 3, isMobile: true, hasTouch: true };
const MEDIA_MIME_PREFIXES = ['audio/', 'video/', 'image/'];
const MEDIA_EXTENSIONS = ['.mp3', '.wav', '.ogg', '.aac', '.m4a', '.flac', '.mp4', '.webm', '.ogv', '.avi', '.mov', '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.bmp', '.ico'];
function getCurrentDate() {
  const now = new Date();
  const year = now.getFullYear();
  const month = String(now.getMonth() + 1).padStart(2, '0');
  const day = String(now.getDate()).padStart(2, '0');
  return `${year}${month}${day}`;
}
function extractPageId(url) {
  // Accept both the storeview and the short /v/ URL forms.
  const patterns = [
    /storeview\/([a-z0-9]+)/i,
    /\/v\/([a-z0-9]+)/i,
  ];
  for (const pattern of patterns) {
    const match = url.match(pattern);
    if (match && match[1]) {
      return match[1];
    }
  }
  return null;
}
function parseArgs() {
  const args = process.argv.slice(2);
  const pageUrl = args[0];
  if (!pageUrl) {
    console.error('Usage: node run.mjs <page-url>');
    process.exit(1);
  }
  const pageId = extractPageId(pageUrl);
  if (!pageId) {
    console.error('Error: Could not extract page-id from the URL');
    process.exit(1);
  }
  return { pageId, pageUrl };
}
function isMediaUrl(url, mimeType) {
  if (mimeType && MEDIA_MIME_PREFIXES.some(prefix => mimeType.toLowerCase().startsWith(prefix))) {
    return true;
  }
  // Loose substring check, so extensions inside query strings also match.
  const lowerUrl = url.toLowerCase();
  return MEDIA_EXTENSIONS.some(ext => lowerUrl.includes(ext));
}
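// Illustrative examples (hypothetical URLs):
//   isMediaUrl('https://cdn.example.com/a.mp3', null)          -> true (extension match)
//   isMediaUrl('https://cdn.example.com/stream', 'audio/mpeg') -> true (MIME prefix match)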
function buildCaptureUrl(pageUrl) {
  const pageId = extractPageId(pageUrl);
  return `https://ca.rrx.cn/v/${pageId}?rrxsrc=2&iframe=1&tpl=1`;
}
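// Illustrative example (hypothetical page ID):
//   buildCaptureUrl('https://ca.rrx.cn/storeview/abc123')
//   -> 'https://ca.rrx.cn/v/abc123?rrxsrc=2&iframe=1&tpl=1'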
async function captureNetworkAndScreenshot(browser, captureUrl, pageId, today) {
  const page = await browser.newPage();
  // Collect candidate media URLs from requests and responses alike;
  // a Set de-duplicates as we go, and MIME-identified URLs are kept
  // rather than being dropped by a later extension-only filter.
  const mediaUrlSet = new Set();
  page.on('request', request => {
    if (['image', 'media'].includes(request.resourceType())) {
      mediaUrlSet.add(request.url());
    }
  });
  page.on('response', response => {
    const mimeType = response.headers()['content-type'] || '';
    if (isMediaUrl(response.url(), mimeType)) {
      mediaUrlSet.add(response.url());
    }
  });
  // page.emulate() expects a full device descriptor ({ viewport, userAgent }),
  // so a bare viewport object goes through setViewport() instead.
  await page.setViewport(VIEWPORT);
  console.log('Navigating to:', captureUrl);
  await page.goto(captureUrl, { waitUntil: 'networkidle2', timeout: 30000 });
  // Give late-loading media a few extra seconds to fire their requests.
  await new Promise(resolve => setTimeout(resolve, 3000));
  let pageTitle = 'page';
  try {
    // Strip characters that are invalid in folder names and cap the length.
    pageTitle = (await page.title()).replace(/[<>:"/\\|?*]/g, '').trim().slice(0, 100) || 'page';
  } catch {}
  const destRoot = path.join('downloads', `${today}-${pageTitle}-${pageId}`);
  await fs.mkdir(destRoot, { recursive: true });
  const screenshotPath = path.join(destRoot, 'index.png');
  console.log('Taking screenshot...');
  await page.screenshot({ path: screenshotPath, fullPage: false });
  console.log('Screenshot saved:', screenshotPath);
  const mediaUrls = [...mediaUrlSet];
  return { page, mediaUrls, pageTitle };
}
function runDownloadScript(pageId, title, urlsFile, sourcesFile) {
  return new Promise((resolve) => {
    const args = [
      path.join(__dirname, 'download.mjs'), // resolve next to this script, not the cwd
      '--page-id', pageId,
      '--title', title,
      '--urls', urlsFile,
      '--sources', sourcesFile,
    ];
    const proc = spawn('node', args, { stdio: 'inherit' });
    // Resolve with the child's exit code; the caller treats failures as non-fatal.
    proc.on('close', resolve);
  });
}
async function main() {
  const { pageId, pageUrl } = parseArgs();
  console.log('Extracted page-id:', pageId);
  const today = getCurrentDate();
  const tempDir = 'temp';
  await fs.mkdir(tempDir, { recursive: true });
  const urlsFile = path.join(tempDir, 'urls.txt');
  const sourcesFile = path.join(tempDir, 'sources.txt');
  await fs.writeFile(sourcesFile, pageUrl + '\n', 'utf-8');
  console.log('Launching browser...');
  const browser = await puppeteer.launch({ headless: 'new' });
  const captureUrl = buildCaptureUrl(pageUrl);
  console.log('Capture URL:', captureUrl);
  try {
    const { page, mediaUrls, pageTitle } = await captureNetworkAndScreenshot(browser, captureUrl, pageId, today);
    console.log('Found', mediaUrls.length, 'media URLs');
    if (mediaUrls.length > 0) {
      await fs.writeFile(urlsFile, mediaUrls.join('\n') + '\n', 'utf-8');
      await runDownloadScript(pageId, pageTitle, urlsFile, sourcesFile);
    } else {
      console.log('No media URLs found, skipping download');
    }
    console.log('Closing captured page...');
    await page.close();
  } catch (error) {
    console.error('Error during capture:', error.message);
  }
  console.log('Closing browser...');
  await browser.close();
  console.log('Workflow completed.');
}
main().catch(err => {
  console.error(err);
  process.exit(1);
});