diff --git a/AGENTS.md b/AGENTS.md index a601296..e097a13 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -15,7 +15,7 @@ This repository has no existing conventions. These guidelines apply to all work - After collecting all requested resources and screenshots, close any additional tabs/pages opened for capture. This is mandatory; do not leave capture tabs open. ## Download script usage -- Primary workflow: run `node run.mjs <page-url>` to capture network requests, screenshot, and download media in one step. This script uses Puppeteer to open a browser with mobile viewport (390×844 @ dpr 3), navigate to the page, capture audio/video/image URLs, take a viewport screenshot, then call `download.mjs` to batch download assets. +- Primary workflow: run `node run.mjs <page-url>` to capture network requests, screenshot, and download media in one step. This script uses Playwright Chromium to open a browser with mobile viewport (390×844 @ dpr 3), navigate to the page, capture audio/video/image URLs, take a viewport screenshot, then call `download.mjs` to batch download assets. For remote debugging, pass `--cdp ws://host:port/devtools/browser/<id>` or `--cdp http://host:port` (or set `ENV_CDP`) to resolve and connect to a Chrome DevTools endpoint. - For manual control: use `python download.py --page-id <page-id> --title "<title>" --urls urls.txt --sources source_urls.txt` (Python) or `node download.mjs --page-id <id> --title "<title>" --urls urls.txt --sources source_urls.txt` (Node.js) to batch download assets. The script generates `<date>` using format `YYYYMMDD`. - `urls.txt` should list the target asset URLs (one per line) already filtered to the requested scope (e.g., media only). - Downloads go to `downloads/<date>-<title>-<page-id>/media/`; filenames are cleaned (query/`@!` removed) and extensions retained/guessed; duplicates get numeric suffixes. 
diff --git a/run.mjs b/run.mjs index a5905e4..4e36f78 100644 --- a/run.mjs +++ b/run.mjs @@ -1,17 +1,26 @@ #!/usr/bin/env node import fs from 'fs/promises'; import path from 'path'; -import { fileURLToPath } from 'url'; import { spawn } from 'child_process'; import process from 'process'; -import puppeteer from 'puppeteer'; +import { chromium } from 'playwright'; +import http from 'http'; +import https from 'https'; -const __filename = fileURLToPath(import.meta.url); -const __dirname = path.dirname(__filename); +const VIEWPORT = { + width: 390, + height: 844, + deviceScaleFactor: 3, + isMobile: true, + hasTouch: true, +}; -const VIEWPORT = { width: 390, height: 844, deviceScaleFactor: 3, isMobile: true, hasTouch: true }; const MEDIA_MIME_PREFIXES = ['audio/', 'video/', 'image/']; -const MEDIA_EXTENSIONS = ['.mp3', '.wav', '.ogg', '.aac', '.m4a', '.flac', '.mp4', '.webm', '.ogv', '.avi', '.mov', '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.bmp', '.ico']; +const MEDIA_EXTENSIONS = [ + '.mp3', '.wav', '.ogg', '.aac', '.m4a', '.flac', + '.mp4', '.webm', '.ogv', '.avi', '.mov', + '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.bmp', '.ico', +]; function getCurrentDate() { const now = new Date(); @@ -35,11 +44,30 @@ function extractPageId(url) { return null; } +function sanitizeTitle(title) { + if (!title) { + return 'page'; + } + return title.replace(/[<>:"/\\|?*]/g, '').trim().slice(0, 100) || 'page'; +} + function parseArgs() { const args = process.argv.slice(2); - const pageUrl = args[0]; + let pageUrl = null; + let cdpUrl = process.env.ENV_CDP || null; + + for (let i = 0; i < args.length; i++) { + const arg = args[i]; + if (arg === '--cdp') { + cdpUrl = args[i + 1]; + i += 1; + } else if (!arg.startsWith('--') && !pageUrl) { + pageUrl = arg; + } + } + if (!pageUrl) { - console.error('Usage: node run.mjs <page-url>'); + console.error('Usage: node run.mjs <page-url> [--cdp ws://host:port/devtools/browser/<id>|http://host:port] (or ENV_CDP)'); 
process.exit(1); } const pageId = extractPageId(pageUrl); @@ -47,15 +75,15 @@ function parseArgs() { console.error('Error: Could not extract page-id from the URL'); process.exit(1); } - return { pageId, pageUrl }; + return { pageId, pageUrl, cdpUrl }; } function isMediaUrl(url, mimeType) { - if (mimeType && MEDIA_MIME_PREFIXES.some(prefix => mimeType.toLowerCase().startsWith(prefix))) { + if (mimeType && MEDIA_MIME_PREFIXES.some((prefix) => mimeType.toLowerCase().startsWith(prefix))) { return true; } const lowerUrl = url.toLowerCase(); - return MEDIA_EXTENSIONS.some(ext => lowerUrl.includes(ext)); + return MEDIA_EXTENSIONS.some((ext) => lowerUrl.includes(ext)); } function buildCaptureUrl(pageUrl) { @@ -63,52 +91,43 @@ function buildCaptureUrl(pageUrl) { return `https://ca.rrx.cn/v/${pageId}?rrxsrc=2&iframe=1&tpl=1`; } -async function captureNetworkAndScreenshot(browser, captureUrl, pageId, today) { - const page = await browser.newPage(); - const requests = []; - - page.on('request', request => { - const url = request.url(); +async function captureNetworkAndScreenshot(context, captureUrl, pageId, today) { + const page = await context.newPage(); + const requests = new Set(); + + page.on('request', (request) => { const resourceType = request.resourceType(); - if (['image', 'media', 'font'].includes(resourceType)) { - requests.push(url); + if (resourceType === 'image' || resourceType === 'media') { + requests.add(request.url()); } }); - - page.on('response', response => { + + page.on('response', (response) => { const url = response.url(); - const headers = response.headers(); - const mimeType = headers['content-type'] || ''; + const mimeType = response.headers()['content-type'] || ''; if (isMediaUrl(url, mimeType)) { - if (!requests.includes(url)) { - requests.push(url); - } + requests.add(url); } }); - - await page.emulate(VIEWPORT); - await page.setViewport(VIEWPORT); - - console.log('Navigating to:', captureUrl); - await page.goto(captureUrl, { waitUntil: 
'networkidle2', timeout: 30000 }); - - await new Promise(resolve => setTimeout(resolve, 3000)); - - let pageTitle = 'page'; + try { - pageTitle = await page.title(); - pageTitle = pageTitle.replace(/[<>:"/\\|?*]/g, '').trim().slice(0, 100) || 'page'; - } catch {} - - const destRoot = path.join('downloads', `${today}-${pageTitle}-${pageId}`); - await fs.mkdir(destRoot, { recursive: true }); - const screenshotPath = path.join(destRoot, 'index.png'); - console.log('Taking screenshot...'); - await page.screenshot({ path: screenshotPath, fullPage: false }); - console.log('Screenshot saved:', screenshotPath); - - const mediaUrls = [...new Set(requests.filter(url => isMediaUrl(url, null)))]; - return { page, mediaUrls, pageTitle }; + console.log('Navigating to:', captureUrl); + await page.goto(captureUrl, { waitUntil: 'networkidle', timeout: 30000 }); + await page.waitForTimeout(3000); + + const pageTitle = sanitizeTitle(await page.title()); + const destRoot = path.join('downloads', `${today}-${pageTitle}-${pageId}`); + await fs.mkdir(destRoot, { recursive: true }); + + const screenshotPath = path.join(destRoot, 'index.png'); + console.log('Taking screenshot...'); + await page.screenshot({ path: screenshotPath, fullPage: false }); + console.log('Screenshot saved:', screenshotPath); + + return { mediaUrls: [...requests], pageTitle }; + } finally { + await page.close(); + } } function runDownloadScript(pageId, title, urlsFile, sourcesFile) { @@ -125,47 +144,103 @@ function runDownloadScript(pageId, title, urlsFile, sourcesFile) { }); } +async function fetchJson(url) { + return new Promise((resolve, reject) => { + const client = url.startsWith('https:') ? 
https : http; + const req = client.get(url, (res) => { + if (res.statusCode !== 200) { + reject(new Error(`HTTP ${res.statusCode}`)); + res.resume(); + return; + } + let data = ''; + res.setEncoding('utf8'); + res.on('data', (chunk) => { + data += chunk; + }); + res.on('end', () => { + try { + resolve(JSON.parse(data)); + } catch (err) { + reject(err); + } + }); + }); + req.on('error', reject); + }); +} + +async function resolveCdpUrl(cdpUrl) { + if (!cdpUrl) { + return null; + } + if (cdpUrl.startsWith('ws://') || cdpUrl.startsWith('wss://')) { + return cdpUrl; + } + const base = cdpUrl.endsWith('/') ? cdpUrl.slice(0, -1) : cdpUrl; + const endpointUrl = `${base}/json/version`; + const data = await fetchJson(endpointUrl); + if (!data || !data.webSocketDebuggerUrl) { + throw new Error(`No webSocketDebuggerUrl found at ${endpointUrl}`); + } + return data.webSocketDebuggerUrl; +} + async function main() { - const { pageId, pageUrl } = parseArgs(); + const { pageId, pageUrl, cdpUrl } = parseArgs(); console.log('Extracted page-id:', pageId); - + const today = getCurrentDate(); const tempDir = 'temp'; await fs.mkdir(tempDir, { recursive: true }); - + const urlsFile = path.join(tempDir, 'urls.txt'); const sourcesFile = path.join(tempDir, 'sources.txt'); await fs.writeFile(sourcesFile, pageUrl + '\n', 'utf-8'); - - console.log('Launching browser...'); - const browser = await puppeteer.launch({ headless: 'new' }); - + + let browser; + if (cdpUrl) { + console.log('Resolving Chrome DevTools endpoint from:', cdpUrl); + const resolvedCdpUrl = await resolveCdpUrl(cdpUrl); + console.log('Connecting to remote Chrome DevTools:', resolvedCdpUrl); + browser = await chromium.connectOverCDP(resolvedCdpUrl); + } else { + console.log('Launching Playwright Chromium...'); + browser = await chromium.launch({ headless: true }); + } + + const context = await browser.newContext(VIEWPORT); + const captureUrl = buildCaptureUrl(pageUrl); console.log('Capture URL:', captureUrl); - + try { - const 
{ page, mediaUrls, pageTitle } = await captureNetworkAndScreenshot(browser, captureUrl, pageId, today); + const { mediaUrls, pageTitle } = await captureNetworkAndScreenshot( + context, + captureUrl, + pageId, + today, + ); + console.log('Found', mediaUrls.length, 'media URLs'); - if (mediaUrls.length > 0) { await fs.writeFile(urlsFile, mediaUrls.join('\n') + '\n', 'utf-8'); await runDownloadScript(pageId, pageTitle, urlsFile, sourcesFile); } else { console.log('No media URLs found, skipping download'); } - - console.log('Closing captured page...'); - await page.close(); } catch (error) { console.error('Error during capture:', error.message); + } finally { + console.log('Closing browser...'); + await context.close(); + await browser.close(); } - - console.log('Closing browser...'); - await browser.close(); + console.log('Workflow completed.'); } -main().catch(err => { +main().catch((err) => { console.error(err); process.exit(1); });