Support ENV_CDP and HTTP CDP endpoints

This commit is contained in:
2026-02-02 18:13:51 +08:00
parent dc6cb1513b
commit 047e5f2766
2 changed files with 142 additions and 67 deletions

View File

@@ -15,7 +15,7 @@ This repository has no existing conventions. These guidelines apply to all work
- After collecting all requested resources and screenshots, close any additional tabs/pages opened for capture. This is mandatory; do not leave capture tabs open. - After collecting all requested resources and screenshots, close any additional tabs/pages opened for capture. This is mandatory; do not leave capture tabs open.
## Download script usage ## Download script usage
- Primary workflow: run `node run.mjs <page-url>` to capture network requests, screenshot, and download media in one step. This script uses Puppeteer to open a browser with mobile viewport (390×844 @ dpr 3), navigate to the page, capture audio/video/image URLs, take a viewport screenshot, then call `download.mjs` to batch download assets. - Primary workflow: run `node run.mjs <page-url>` to capture network requests, screenshot, and download media in one step. This script uses Playwright Chromium to open a browser with mobile viewport (390×844 @ dpr 3), navigate to the page, capture audio/video/image URLs, take a viewport screenshot, then call `download.mjs` to batch download assets. For remote debugging, pass `--cdp ws://host:port/devtools/browser/<id>` or `--cdp http://host:port` (or set `ENV_CDP`) to resolve and connect to a Chrome DevTools endpoint.
- For manual control: use `python download.py --page-id <id> --title "<title>" --urls urls.txt --sources source_urls.txt` (Python) or `node download.mjs --page-id <id> --title "<title>" --urls urls.txt --sources source_urls.txt` (Node.js) to batch download assets. The script generates `<date>` using format `YYYYMMDD`. - For manual control: use `python download.py --page-id <id> --title "<title>" --urls urls.txt --sources source_urls.txt` (Python) or `node download.mjs --page-id <id> --title "<title>" --urls urls.txt --sources source_urls.txt` (Node.js) to batch download assets. The script generates `<date>` using format `YYYYMMDD`.
- `urls.txt` should list the target asset URLs (one per line) already filtered to the requested scope (e.g., media only). - `urls.txt` should list the target asset URLs (one per line) already filtered to the requested scope (e.g., media only).
- Downloads go to `downloads/<date>-<title>-<page-id>/media/`; filenames are cleaned (query/`@!` removed) and extensions retained/guessed; duplicates get numeric suffixes. - Downloads go to `downloads/<date>-<title>-<page-id>/media/`; filenames are cleaned (query/`@!` removed) and extensions retained/guessed; duplicates get numeric suffixes.

185
run.mjs
View File

@@ -1,17 +1,26 @@
#!/usr/bin/env node #!/usr/bin/env node
import fs from 'fs/promises'; import fs from 'fs/promises';
import path from 'path'; import path from 'path';
import { fileURLToPath } from 'url';
import { spawn } from 'child_process'; import { spawn } from 'child_process';
import process from 'process'; import process from 'process';
import puppeteer from 'puppeteer'; import { chromium } from 'playwright';
import http from 'http';
import https from 'https';
const __filename = fileURLToPath(import.meta.url); const VIEWPORT = {
const __dirname = path.dirname(__filename); width: 390,
height: 844,
deviceScaleFactor: 3,
isMobile: true,
hasTouch: true,
};
const VIEWPORT = { width: 390, height: 844, deviceScaleFactor: 3, isMobile: true, hasTouch: true };
const MEDIA_MIME_PREFIXES = ['audio/', 'video/', 'image/']; const MEDIA_MIME_PREFIXES = ['audio/', 'video/', 'image/'];
const MEDIA_EXTENSIONS = ['.mp3', '.wav', '.ogg', '.aac', '.m4a', '.flac', '.mp4', '.webm', '.ogv', '.avi', '.mov', '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.bmp', '.ico']; const MEDIA_EXTENSIONS = [
'.mp3', '.wav', '.ogg', '.aac', '.m4a', '.flac',
'.mp4', '.webm', '.ogv', '.avi', '.mov',
'.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.bmp', '.ico',
];
function getCurrentDate() { function getCurrentDate() {
const now = new Date(); const now = new Date();
@@ -35,11 +44,30 @@ function extractPageId(url) {
return null; return null;
} }
/**
 * Turn a raw page title into a safe directory-name component.
 * Strips characters invalid in file names, trims whitespace, and caps
 * the length at 100 characters; falls back to 'page' when nothing usable
 * remains (or when no title was provided at all).
 *
 * @param {string|null|undefined} title - Raw page title.
 * @returns {string} Sanitized, non-empty title suitable for a path segment.
 */
function sanitizeTitle(title) {
  const fallback = 'page';
  if (!title) {
    return fallback;
  }
  const cleaned = title
    .replace(/[<>:"/\\|?*]/g, '')
    .trim()
    .slice(0, 100);
  return cleaned || fallback;
}
function parseArgs() { function parseArgs() {
const args = process.argv.slice(2); const args = process.argv.slice(2);
const pageUrl = args[0]; let pageUrl = null;
let cdpUrl = process.env.ENV_CDP || null;
for (let i = 0; i < args.length; i++) {
const arg = args[i];
if (arg === '--cdp') {
cdpUrl = args[i + 1];
i += 1;
} else if (!arg.startsWith('--') && !pageUrl) {
pageUrl = arg;
}
}
if (!pageUrl) { if (!pageUrl) {
console.error('Usage: node run.mjs <page-url>'); console.error('Usage: node run.mjs <page-url> [--cdp ws://host:port/devtools/browser/<id>|http://host:port] (or ENV_CDP)');
process.exit(1); process.exit(1);
} }
const pageId = extractPageId(pageUrl); const pageId = extractPageId(pageUrl);
@@ -47,15 +75,15 @@ function parseArgs() {
console.error('Error: Could not extract page-id from the URL'); console.error('Error: Could not extract page-id from the URL');
process.exit(1); process.exit(1);
} }
return { pageId, pageUrl }; return { pageId, pageUrl, cdpUrl };
} }
/**
 * Decide whether a request looks like a media asset.
 * A response MIME type matching one of MEDIA_MIME_PREFIXES wins outright;
 * otherwise fall back to scanning the URL for a known media file extension.
 *
 * @param {string} url - Request/response URL.
 * @param {string|null} mimeType - Content-Type header value, if available.
 * @returns {boolean} True when the URL or MIME type indicates media.
 */
function isMediaUrl(url, mimeType) {
  const mime = (mimeType || '').toLowerCase();
  if (mime && MEDIA_MIME_PREFIXES.some((prefix) => mime.startsWith(prefix))) {
    return true;
  }
  const lowered = url.toLowerCase();
  return MEDIA_EXTENSIONS.some((ext) => lowered.includes(ext));
}
function buildCaptureUrl(pageUrl) { function buildCaptureUrl(pageUrl) {
@@ -63,52 +91,43 @@ function buildCaptureUrl(pageUrl) {
return `https://ca.rrx.cn/v/${pageId}?rrxsrc=2&iframe=1&tpl=1`; return `https://ca.rrx.cn/v/${pageId}?rrxsrc=2&iframe=1&tpl=1`;
} }
async function captureNetworkAndScreenshot(browser, captureUrl, pageId, today) { async function captureNetworkAndScreenshot(context, captureUrl, pageId, today) {
const page = await browser.newPage(); const page = await context.newPage();
const requests = []; const requests = new Set();
page.on('request', request => { page.on('request', (request) => {
const url = request.url();
const resourceType = request.resourceType(); const resourceType = request.resourceType();
if (['image', 'media', 'font'].includes(resourceType)) { if (resourceType === 'image' || resourceType === 'media') {
requests.push(url); requests.add(request.url());
} }
}); });
page.on('response', response => { page.on('response', (response) => {
const url = response.url(); const url = response.url();
const headers = response.headers(); const mimeType = response.headers()['content-type'] || '';
const mimeType = headers['content-type'] || '';
if (isMediaUrl(url, mimeType)) { if (isMediaUrl(url, mimeType)) {
if (!requests.includes(url)) { requests.add(url);
requests.push(url);
}
} }
}); });
await page.emulate(VIEWPORT);
await page.setViewport(VIEWPORT);
console.log('Navigating to:', captureUrl);
await page.goto(captureUrl, { waitUntil: 'networkidle2', timeout: 30000 });
await new Promise(resolve => setTimeout(resolve, 3000));
let pageTitle = 'page';
try { try {
pageTitle = await page.title(); console.log('Navigating to:', captureUrl);
pageTitle = pageTitle.replace(/[<>:"/\\|?*]/g, '').trim().slice(0, 100) || 'page'; await page.goto(captureUrl, { waitUntil: 'networkidle', timeout: 30000 });
} catch {} await page.waitForTimeout(3000);
const destRoot = path.join('downloads', `${today}-${pageTitle}-${pageId}`); const pageTitle = sanitizeTitle(await page.title());
await fs.mkdir(destRoot, { recursive: true }); const destRoot = path.join('downloads', `${today}-${pageTitle}-${pageId}`);
const screenshotPath = path.join(destRoot, 'index.png'); await fs.mkdir(destRoot, { recursive: true });
console.log('Taking screenshot...');
await page.screenshot({ path: screenshotPath, fullPage: false });
console.log('Screenshot saved:', screenshotPath);
const mediaUrls = [...new Set(requests.filter(url => isMediaUrl(url, null)))]; const screenshotPath = path.join(destRoot, 'index.png');
return { page, mediaUrls, pageTitle }; console.log('Taking screenshot...');
await page.screenshot({ path: screenshotPath, fullPage: false });
console.log('Screenshot saved:', screenshotPath);
return { mediaUrls: [...requests], pageTitle };
} finally {
await page.close();
}
} }
function runDownloadScript(pageId, title, urlsFile, sourcesFile) { function runDownloadScript(pageId, title, urlsFile, sourcesFile) {
@@ -125,8 +144,50 @@ function runDownloadScript(pageId, title, urlsFile, sourcesFile) {
}); });
} }
/**
 * GET a URL and parse the response body as JSON.
 * Used to query a Chrome DevTools HTTP endpoint (e.g. /json/version), so it
 * intentionally supports plain http as well as https.
 *
 * Fixes over the previous version: a 10s socket timeout (a hung CDP endpoint
 * previously stalled the whole script forever), and error messages that
 * include the offending URL for easier diagnosis.
 *
 * @param {string} url - Absolute http(s) URL to fetch.
 * @returns {Promise<object>} Parsed JSON response body.
 * @throws {Error} On non-200 status, network failure, timeout, or invalid JSON.
 */
async function fetchJson(url) {
  return new Promise((resolve, reject) => {
    const client = url.startsWith('https:') ? https : http;
    const req = client.get(url, (res) => {
      if (res.statusCode !== 200) {
        reject(new Error(`HTTP ${res.statusCode} from ${url}`));
        // Drain the body so the socket is released cleanly.
        res.resume();
        return;
      }
      let data = '';
      res.setEncoding('utf8');
      res.on('data', (chunk) => {
        data += chunk;
      });
      res.on('end', () => {
        try {
          resolve(JSON.parse(data));
        } catch (err) {
          reject(new Error(`Invalid JSON from ${url}`, { cause: err }));
        }
      });
    });
    // Guard against an unresponsive endpoint: destroy() fires 'error' with
    // this Error, which the handler below turns into a rejection.
    req.setTimeout(10000, () => {
      req.destroy(new Error(`Timeout fetching ${url}`));
    });
    req.on('error', reject);
  });
}
/**
 * Normalize a user-supplied CDP endpoint into a WebSocket debugger URL.
 * ws:// and wss:// values pass through untouched; an http(s) endpoint is
 * queried at /json/version to discover its webSocketDebuggerUrl.
 *
 * @param {string|null} cdpUrl - Raw --cdp flag / ENV_CDP value.
 * @returns {Promise<string|null>} WebSocket debugger URL, or null when unset.
 * @throws {Error} When the HTTP endpoint does not expose a webSocketDebuggerUrl.
 */
async function resolveCdpUrl(cdpUrl) {
  if (!cdpUrl) {
    return null;
  }
  const isWebSocket = cdpUrl.startsWith('ws://') || cdpUrl.startsWith('wss://');
  if (isWebSocket) {
    return cdpUrl;
  }
  // HTTP form: ask the DevTools /json/version endpoint for the ws URL.
  const base = cdpUrl.replace(/\/$/, '');
  const endpointUrl = `${base}/json/version`;
  const data = await fetchJson(endpointUrl);
  const wsUrl = data?.webSocketDebuggerUrl;
  if (!wsUrl) {
    throw new Error(`No webSocketDebuggerUrl found at ${endpointUrl}`);
  }
  return wsUrl;
}
async function main() { async function main() {
const { pageId, pageUrl } = parseArgs(); const { pageId, pageUrl, cdpUrl } = parseArgs();
console.log('Extracted page-id:', pageId); console.log('Extracted page-id:', pageId);
const today = getCurrentDate(); const today = getCurrentDate();
@@ -137,35 +198,49 @@ async function main() {
const sourcesFile = path.join(tempDir, 'sources.txt'); const sourcesFile = path.join(tempDir, 'sources.txt');
await fs.writeFile(sourcesFile, pageUrl + '\n', 'utf-8'); await fs.writeFile(sourcesFile, pageUrl + '\n', 'utf-8');
console.log('Launching browser...'); let browser;
const browser = await puppeteer.launch({ headless: 'new' }); if (cdpUrl) {
console.log('Resolving Chrome DevTools endpoint from:', cdpUrl);
const resolvedCdpUrl = await resolveCdpUrl(cdpUrl);
console.log('Connecting to remote Chrome DevTools:', resolvedCdpUrl);
browser = await chromium.connectOverCDP(resolvedCdpUrl);
} else {
console.log('Launching Playwright Chromium...');
browser = await chromium.launch({ headless: true });
}
const context = await browser.newContext(VIEWPORT);
const captureUrl = buildCaptureUrl(pageUrl); const captureUrl = buildCaptureUrl(pageUrl);
console.log('Capture URL:', captureUrl); console.log('Capture URL:', captureUrl);
try { try {
const { page, mediaUrls, pageTitle } = await captureNetworkAndScreenshot(browser, captureUrl, pageId, today); const { mediaUrls, pageTitle } = await captureNetworkAndScreenshot(
console.log('Found', mediaUrls.length, 'media URLs'); context,
captureUrl,
pageId,
today,
);
console.log('Found', mediaUrls.length, 'media URLs');
if (mediaUrls.length > 0) { if (mediaUrls.length > 0) {
await fs.writeFile(urlsFile, mediaUrls.join('\n') + '\n', 'utf-8'); await fs.writeFile(urlsFile, mediaUrls.join('\n') + '\n', 'utf-8');
await runDownloadScript(pageId, pageTitle, urlsFile, sourcesFile); await runDownloadScript(pageId, pageTitle, urlsFile, sourcesFile);
} else { } else {
console.log('No media URLs found, skipping download'); console.log('No media URLs found, skipping download');
} }
console.log('Closing captured page...');
await page.close();
} catch (error) { } catch (error) {
console.error('Error during capture:', error.message); console.error('Error during capture:', error.message);
} finally {
console.log('Closing browser...');
await context.close();
await browser.close();
} }
console.log('Closing browser...');
await browser.close();
console.log('Workflow completed.'); console.log('Workflow completed.');
} }
main().catch(err => { main().catch((err) => {
console.error(err); console.error(err);
process.exit(1); process.exit(1);
}); });