From 883188246296ce18ddbf81664c025766555f07ea Mon Sep 17 00:00:00 2001 From: Rogee Date: Mon, 2 Feb 2026 18:26:25 +0800 Subject: [PATCH] Merge download workflow into run script --- AGENTS.md | 9 +- download.mjs | 352 ---------------------------------------------- run.mjs | 390 +++++++++++++++++++++++++++++++++++++++++++-------- 3 files changed, 332 insertions(+), 419 deletions(-) delete mode 100644 download.mjs diff --git a/AGENTS.md b/AGENTS.md index e097a13..b34f2b8 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -6,7 +6,7 @@ This repository has no existing conventions. These guidelines apply to all work ## Network capture and assets - Resource scope is limited to: audio, video, images, and viewport screenshots; do not collect other asset types unless explicitly requested. - When a task asks to harvest page resources, prioritize what the user requests (e.g., only media or only core assets). Ask for scope if unclear. -- If the user provides a URL like `https://h5.rrx.cn/storeview/<page-id>.html`, extract `<page-id>`. Open a blank tab first, apply viewport override (width 390, height 844, devicePixelRatio 3, mobile: true, hasTouch: true), then navigate that tab to `https://ca.rrx.cn/v/<page-id>?rrxsrc=2&iframe=1&tpl=1`. Equivalent automation: call DevTools/Emulation to override device metrics with `{width:390,height:844,deviceScaleFactor:3, mobile:true, hasTouch:true}` before navigation to avoid double-loading assets. +- If the user provides a URL like `https://h5.rrx.cn/storeview/<page-id>.html`, extract `<page-id>`. Open a blank tab first, apply viewport override (width 375, height 667, devicePixelRatio 3, mobile: true, hasTouch: true), then navigate that tab to `https://ca.rrx.cn/v/<page-id>?rrxsrc=2&iframe=1&tpl=1`. Equivalent automation: call DevTools/Emulation to override device metrics with `{width:375,height:667,deviceScaleFactor:3, mobile:true, hasTouch:true}` before navigation to avoid double-loading assets. - Use DevTools network captures to list requests; identify media by MIME or URL suffix. 
- Save assets under `downloads/<date>-<title>-<page-id>/media/` (title from current page; date format `YYYYMMDD`) with clean filenames (strip query strings and `@!` size suffixes; keep proper extensions). After download, rename any files still containing size tokens or missing extensions to the original base name + proper extension. - Also save the source page URL(s) provided by the user into the folder root as `downloads/<date>-<title>-<page-id>/urls.txt`. @@ -15,16 +15,15 @@ This repository has no existing conventions. These guidelines apply to all work - After collecting all requested resources and screenshots, close any additional tabs/pages opened for capture. This is mandatory; do not leave capture tabs open. ## Download script usage -- Primary workflow: run `node run.mjs <page-url>` to capture network requests, screenshot, and download media in one step. This script uses Playwright Chromium to open a browser with mobile viewport (390×844 @ dpr 3), navigate to the page, capture audio/video/image URLs, take a viewport screenshot, then call `download.mjs` to batch download assets. For remote debugging, pass `--cdp ws://host:port/devtools/browser/<id>` or `--cdp http://host:port` (or set `ENV_CDP`) to resolve and connect to a Chrome DevTools endpoint. +- Primary workflow: run `node run.mjs <page-url>` to capture network requests, screenshot, and download media in one step. This script uses Playwright Chromium to open a browser with mobile viewport (375×667 @ dpr 3), navigate to the page, capture audio/video/image URLs, take a viewport screenshot, then download assets directly (no separate `download.mjs`). 
For remote debugging, pass `--cdp ws://host:port/devtools/browser/<id>` or `--cdp http://host:port` (or set `ENV_CDP`) to resolve and connect to a Chrome DevTools endpoint. +- For manual control: use `python download.py --page-id <id> --title "<title>" --urls urls.txt --sources source_urls.txt` to batch download assets. The script generates `<date>` using format `YYYYMMDD`. - `urls.txt` should list the target asset URLs (one per line) already filtered to the requested scope (e.g., media only). - Downloads go to `downloads/<date>-<title>-<page-id>/media/`; filenames are cleaned (query/`@!` removed) and extensions retained/guessed; duplicates get numeric suffixes. - After the batch finishes, the script deletes 0-byte files, compares against the planned list, retries missing items up to 2 times, and reports any still-missing resources. - `urls.txt` is written to `downloads/<date>-<title>-<page-id>/urls.txt` to record user-provided page URLs. -- The script also deletes the `--urls` input file upon completion. ## Screenshots -- Default viewport for screenshots: width 390, height 844, devicePixelRatio 3 (mobile portrait). Do not change unless the user explicitly requests another size. +- Default viewport for screenshots: width 375, height 667, devicePixelRatio 3 (mobile portrait). Do not change unless the user explicitly requests another size. - Match the screenshot to the user’s requested viewport. If they mention a size, emulate it and verify with `window.innerWidth/innerHeight` and `devicePixelRatio`. - Capture screenshots with Chrome DevTools (device emulation per above) and save to `downloads/<date>-<title>-<page-id>/index.png` (title from current page; date format `YYYYMMDD`); use full-page only when explicitly asked. 
diff --git a/download.mjs b/download.mjs deleted file mode 100644 index ab02b6e..0000000 --- a/download.mjs +++ /dev/null @@ -1,352 +0,0 @@ -#!/usr/bin/env node -import fs from 'fs/promises'; -import path from 'path'; -import { fileURLToPath } from 'url'; -import https from 'https'; -import http from 'http'; -import { URL as URLParser } from 'url'; - -const __filename = fileURLToPath(import.meta.url); -const __dirname = path.dirname(__filename); - -const MAX_RETRIES = 2; -const TIMEOUT = 30000; -const DATE_FMT = '%Y%m%d'; - -function getCurrentDate() { - const now = new Date(); - const year = now.getFullYear(); - const month = String(now.getMonth() + 1).padStart(2, '0'); - const day = String(now.getDate()).padStart(2, '0'); - return `${year}${month}${day}`; -} - -function parseArgs() { - const args = process.argv.slice(2); - const result = { - pageId: null, - title: null, - urls: null, - sources: null, - retries: MAX_RETRIES, - timeout: TIMEOUT - }; - - for (let i = 0; i < args.length; i++) { - const arg = args[i]; - if (arg === '--page-id') { - result.pageId = args[++i]; - } else if (arg === '--title') { - result.title = args[++i]; - } else if (arg === '--urls') { - result.urls = args[++i]; - } else if (arg === '--sources') { - result.sources = args[++i]; - } else if (arg === '--retries') { - result.retries = parseInt(args[++i], 10); - } else if (arg === '--timeout') { - result.timeout = parseInt(args[++i], 10); - } - } - - if (!result.pageId || !result.title || !result.urls || !result.sources) { - console.error('Usage: node download.mjs --page-id <id> --title "<title>" --urls urls.txt --sources source_urls.txt'); - process.exit(1); - } - - return result; -} - -async function loadLines(filepath) { - const content = await fs.readFile(filepath, 'utf-8'); - return content.split('\n').map(line => line.trim()).filter(line => line.length > 0); -} - -async function ensureDir(dirPath) { - await fs.mkdir(dirPath, { recursive: true }); -} - -function sanitizeFilename(url, 
contentType) { - try { - const parsed = new URLParser(url); - let basename = path.basename(parsed.pathname); - - if (basename.includes('@!')) { - basename = basename.split('@!')[0]; - } - - if (!basename) { - basename = 'file'; - } - - const { name, ext } = path.parse(basename); - if (!ext) { - const guessedExt = guessExtension(contentType); - return name + guessedExt; - } - - return basename; - } catch { - return 'file.bin'; - } -} - -function guessExtension(contentType) { - if (contentType) { - const mime = contentType.split(';')[0].trim(); - const map = { - 'image/jpeg': '.jpg', - 'image/png': '.png', - 'image/gif': '.gif', - 'image/webp': '.webp', - 'image/svg+xml': '.svg', - 'audio/mpeg': '.mp3', - 'audio/wav': '.wav', - 'audio/ogg': '.ogg', - 'audio/aac': '.aac', - 'video/mp4': '.mp4', - 'video/webm': '.webm', - 'video/ogg': '.ogv', - 'application/octet-stream': '.bin' - }; - return map[mime] || '.bin'; - } - return '.bin'; -} - -async function uniqueName(basePath) { - try { - await fs.access(basePath); - const { dir, name, ext } = path.parse(basePath); - let idx = 1; - while (true) { - const candidate = path.join(dir, `${name}_${idx}${ext}`); - try { - await fs.access(candidate); - idx++; - } catch { - return candidate; - } - } - } catch { - return basePath; - } -} - -function downloadOne(url, destDir, timeout) { - return new Promise((resolve) => { - try { - const parsed = new URLParser(url); - const client = parsed.protocol === 'https:' ? 
https : http; - - const options = { - rejectUnauthorized: false, - timeout - }; - - const req = client.get(url, options, async (resp) => { - if (resp.statusCode !== 200) { - resolve([false, `HTTP ${resp.statusCode}`]); - resp.resume(); - return; - } - - const contentType = resp.headers['content-type']; - const fname = sanitizeFilename(url, contentType); - const target = await uniqueName(path.join(destDir, fname)); - - const fileStream = await fs.open(target, 'w'); - const writeStream = fileStream.createWriteStream(); - - let failed = false; - let errorMsg = null; - - writeStream.on('error', (err) => { - failed = true; - errorMsg = err.message; - writeStream.close(); - fileStream.close(); - }); - - writeStream.on('finish', () => { - writeStream.close(); - fileStream.close(); - }); - - writeStream.on('close', () => { - if (failed) { - fs.unlink(target).catch(() => {}); - resolve([false, errorMsg]); - } else { - resolve([true, null]); - } - }); - - resp.pipe(writeStream); - }); - - req.on('error', (err) => { - resolve([false, err.message]); - }); - - req.on('timeout', () => { - req.destroy(); - resolve([false, 'Request timeout']); - }); - - } catch (err) { - resolve([false, err.message]); - } - }); -} - -async function deleteZeroByteFiles(destDir) { - const removed = []; - try { - const files = await fs.readdir(destDir); - for (const file of files) { - const filePath = path.join(destDir, file); - const stats = await fs.stat(filePath); - if (stats.isFile() && stats.size === 0) { - await fs.unlink(filePath); - removed.push(file); - } - } - } catch (err) { - console.error(`Error deleting zero-byte files: ${err.message}`); - } - return removed; -} - -function planDownloads(urls) { - const plan = {}; - for (const url of urls) { - plan[url] = sanitizeFilename(url, null); - } - return plan; -} - -async function verifyMissing(destDir, plan) { - const missing = []; - try { - const files = await fs.readdir(destDir); - const existing = new Set(); - - for (const file of files) { 
- const filePath = path.join(destDir, file); - const stats = await fs.stat(filePath); - if (stats.isFile()) { - existing.add(file); - } - } - - for (const [url, fname] of Object.entries(plan)) { - if (!existing.has(fname)) { - const baseName = path.parse(fname).name; - const hasPrefix = Array.from(existing).some( - name => name.startsWith(baseName) - ); - if (!hasPrefix) { - missing.push(url); - } - } - } - } catch (err) { - console.error(`Error verifying missing files: ${err.message}`); - } - return missing; -} - -async function saveSourceUrls(destRoot, sourcesPath) { - await ensureDir(destRoot); - const sources = await loadLines(sourcesPath); - await fs.writeFile( - path.join(destRoot, 'urls.txt'), - sources.join('\n') + '\n', - 'utf-8' - ); -} - -function sleep(ms) { - return new Promise(resolve => setTimeout(resolve, ms)); -} - -async function main() { - const args = parseArgs(); - const today = getCurrentDate(); - const destRoot = path.join('downloads', `${today}-${args.title}-${args.pageId}`); - const mediaDir = path.join(destRoot, 'media'); - - await ensureDir(mediaDir); - - const urlsPath = args.urls; - const urls = await loadLines(urlsPath); - const plan = planDownloads(urls); - - console.log(`Starting download of ${urls.length} URLs...`); - - for (const url of urls) { - const [ok, err] = await downloadOne(url, mediaDir, args.timeout); - if (!ok) { - console.error(`WARN: download failed for ${url}: ${err}`); - } - await sleep(50); - } - - const removed = await deleteZeroByteFiles(mediaDir); - if (removed.length > 0) { - console.log(`Removed zero-byte files: ${removed.join(', ')}`); - } - - let missing = await verifyMissing(mediaDir, plan); - let attempts = 0; - - while (missing.length > 0 && attempts < args.retries) { - attempts++; - console.log(`Retry round ${attempts}: missing ${missing.length} files`); - - const stillMissing = []; - for (const url of missing) { - const [ok, err] = await downloadOne(url, mediaDir, args.timeout); - if (!ok) { - 
console.error(`WARN: retry failed for ${url}: ${err}`); - stillMissing.push(url); - } - } - - const removedRetry = await deleteZeroByteFiles(mediaDir); - if (removedRetry.length > 0) { - console.log(`Removed zero-byte files after retry: ${removedRetry.join(', ')}`); - } - - const retryPlan = {}; - for (const url of stillMissing) { - retryPlan[url] = plan[url]; - } - missing = stillMissing.length > 0 ? await verifyMissing(mediaDir, retryPlan) : []; - } - - if (missing.length > 0) { - console.error('FAILED to fetch these URLs after retries:'); - for (const url of missing) { - console.error(url); - } - } else { - console.log('All planned files downloaded.'); - } - - await saveSourceUrls(destRoot, args.sources); - console.log(`Saved source URLs to ${path.join(destRoot, 'urls.txt')}`); - - try { - await fs.unlink(urlsPath); - console.log(`Removed input file: ${urlsPath}`); - } catch (err) { - console.error(`WARN: failed to remove input file ${urlsPath}: ${err.message}`); - } -} - -main().catch(err => { - console.error(err); - process.exit(1); -}); diff --git a/run.mjs b/run.mjs index 4e36f78..c0e1e59 100644 --- a/run.mjs +++ b/run.mjs @@ -1,15 +1,14 @@ #!/usr/bin/env node import fs from 'fs/promises'; import path from 'path'; -import { spawn } from 'child_process'; import process from 'process'; import { chromium } from 'playwright'; import http from 'http'; import https from 'https'; -const VIEWPORT = { - width: 390, - height: 844, +const VIEWPORT_SIZE = { width: 375, height: 667 }; +const CONTEXT_OPTIONS = { + viewport: VIEWPORT_SIZE, deviceScaleFactor: 3, isMobile: true, hasTouch: true, @@ -22,6 +21,9 @@ const MEDIA_EXTENSIONS = [ '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.bmp', '.ico', ]; +const MAX_RETRIES = 2; +const TIMEOUT = 30000; + function getCurrentDate() { const now = new Date(); const year = now.getFullYear(); @@ -91,59 +93,6 @@ function buildCaptureUrl(pageUrl) { return `https://ca.rrx.cn/v/${pageId}?rrxsrc=2&iframe=1&tpl=1`; } -async function 
captureNetworkAndScreenshot(context, captureUrl, pageId, today) { - const page = await context.newPage(); - const requests = new Set(); - - page.on('request', (request) => { - const resourceType = request.resourceType(); - if (resourceType === 'image' || resourceType === 'media') { - requests.add(request.url()); - } - }); - - page.on('response', (response) => { - const url = response.url(); - const mimeType = response.headers()['content-type'] || ''; - if (isMediaUrl(url, mimeType)) { - requests.add(url); - } - }); - - try { - console.log('Navigating to:', captureUrl); - await page.goto(captureUrl, { waitUntil: 'networkidle', timeout: 30000 }); - await page.waitForTimeout(3000); - - const pageTitle = sanitizeTitle(await page.title()); - const destRoot = path.join('downloads', `${today}-${pageTitle}-${pageId}`); - await fs.mkdir(destRoot, { recursive: true }); - - const screenshotPath = path.join(destRoot, 'index.png'); - console.log('Taking screenshot...'); - await page.screenshot({ path: screenshotPath, fullPage: false }); - console.log('Screenshot saved:', screenshotPath); - - return { mediaUrls: [...requests], pageTitle }; - } finally { - await page.close(); - } -} - -function runDownloadScript(pageId, title, urlsFile, sourcesFile) { - return new Promise((resolve) => { - const args = [ - 'download.mjs', - '--page-id', pageId, - '--title', title, - '--urls', urlsFile, - '--sources', sourcesFile, - ]; - const proc = spawn('node', args, { stdio: 'inherit' }); - proc.on('close', resolve); - }); -} - async function fetchJson(url) { return new Promise((resolve, reject) => { const client = url.startsWith('https:') ? 
https : http; @@ -186,6 +135,322 @@ async function resolveCdpUrl(cdpUrl) { return data.webSocketDebuggerUrl; } +function guessExtension(contentType) { + if (!contentType) { + return '.bin'; + } + const mime = contentType.split(';')[0].trim(); + const map = { + 'image/jpeg': '.jpg', + 'image/png': '.png', + 'image/gif': '.gif', + 'image/webp': '.webp', + 'image/svg+xml': '.svg', + 'audio/mpeg': '.mp3', + 'audio/wav': '.wav', + 'audio/ogg': '.ogg', + 'audio/aac': '.aac', + 'video/mp4': '.mp4', + 'video/webm': '.webm', + 'video/ogg': '.ogv', + 'application/octet-stream': '.bin', + }; + return map[mime] || '.bin'; +} + +function sanitizeFilename(url, contentType) { + try { + const parsed = new URL(url); + let basename = path.basename(parsed.pathname); + + if (basename.includes('@!')) { + basename = basename.split('@!')[0]; + } + + if (!basename) { + basename = 'file'; + } + + const { name, ext } = path.parse(basename); + if (!ext) { + return name + guessExtension(contentType); + } + + return basename; + } catch { + return 'file.bin'; + } +} + +async function uniqueName(basePath) { + try { + await fs.access(basePath); + const { dir, name, ext } = path.parse(basePath); + let idx = 1; + while (true) { + const candidate = path.join(dir, `${name}_${idx}${ext}`); + try { + await fs.access(candidate); + idx += 1; + } catch { + return candidate; + } + } + } catch { + return basePath; + } +} + +async function downloadOne(url, destDir, timeout) { + return new Promise((resolve) => { + try { + const parsed = new URL(url); + const client = parsed.protocol === 'https:' ? 
https : http; + + const req = client.get(url, { timeout }, async (resp) => { + if (resp.statusCode !== 200) { + resolve([false, `HTTP ${resp.statusCode}`]); + resp.resume(); + return; + } + + const contentType = resp.headers['content-type']; + const fname = sanitizeFilename(url, contentType); + const target = await uniqueName(path.join(destDir, fname)); + + const fileStream = await fs.open(target, 'w'); + const writeStream = fileStream.createWriteStream(); + + let failed = false; + let errorMsg = null; + + writeStream.on('error', (err) => { + failed = true; + errorMsg = err.message; + writeStream.close(); + fileStream.close(); + }); + + writeStream.on('finish', () => { + writeStream.close(); + fileStream.close(); + }); + + writeStream.on('close', () => { + if (failed) { + fs.unlink(target).catch(() => {}); + resolve([false, errorMsg]); + } else { + resolve([true, null]); + } + }); + + resp.pipe(writeStream); + }); + + req.on('error', (err) => { + resolve([false, err.message]); + }); + + req.on('timeout', () => { + req.destroy(); + resolve([false, 'Request timeout']); + }); + } catch (err) { + resolve([false, err.message]); + } + }); +} + +async function deleteZeroByteFiles(destDir) { + const removed = []; + const files = await fs.readdir(destDir); + for (const file of files) { + const filePath = path.join(destDir, file); + const stats = await fs.stat(filePath); + if (stats.isFile() && stats.size === 0) { + await fs.unlink(filePath); + removed.push(file); + } + } + return removed; +} + +function planDownloads(urls) { + const plan = {}; + for (const url of urls) { + plan[url] = sanitizeFilename(url, null); + } + return plan; +} + +async function verifyMissing(destDir, plan) { + const missing = []; + const files = await fs.readdir(destDir); + const existing = new Set(); + + for (const file of files) { + const filePath = path.join(destDir, file); + const stats = await fs.stat(filePath); + if (stats.isFile()) { + existing.add(file); + } + } + + for (const [url, fname] 
of Object.entries(plan)) { + if (!existing.has(fname)) { + const baseName = path.parse(fname).name; + const hasPrefix = Array.from(existing).some((name) => name.startsWith(baseName)); + if (!hasPrefix) { + missing.push(url); + } + } + } + + return missing; +} + +function dedupeAndFilterUrls(urls) { + const seen = new Set(); + const filtered = []; + const skipped = []; + + for (const url of urls) { + const cleanUrl = url.split('?')[0]; + if (seen.has(cleanUrl)) { + skipped.push(url); + continue; + } + seen.add(cleanUrl); + filtered.push(url); + } + + return { filtered, skipped }; +} + +async function saveSourceUrls(destRoot, sourcesPath) { + const sources = await fs.readFile(sourcesPath, 'utf-8'); + await fs.writeFile( + path.join(destRoot, 'urls.txt'), + sources.trim() + '\n', + 'utf-8', + ); +} + +function sleep(ms) { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +async function captureNetworkAndScreenshot(context, captureUrl, pageId, today) { + const page = await context.newPage(); + const requests = new Set(); + + page.on('request', (request) => { + const resourceType = request.resourceType(); + if (resourceType === 'image' || resourceType === 'media') { + requests.add(request.url()); + } + }); + + page.on('response', (response) => { + const url = response.url(); + const mimeType = response.headers()['content-type'] || ''; + if (isMediaUrl(url, mimeType)) { + requests.add(url); + } + }); + + try { + console.log('Navigating to:', captureUrl); + await page.goto(captureUrl, { waitUntil: 'networkidle', timeout: 30000 }); + await page.waitForTimeout(3000); + + await page.setViewportSize(VIEWPORT_SIZE); + + const pageTitle = sanitizeTitle(await page.title()); + const destRoot = path.join('downloads', `${today}-${pageTitle}-${pageId}`); + await fs.mkdir(destRoot, { recursive: true }); + + const screenshotPath = path.join(destRoot, 'index.png'); + console.log('Taking screenshot...'); + await page.screenshot({ path: screenshotPath, fullPage: false }); 
+ console.log('Screenshot saved:', screenshotPath); + + return { mediaUrls: [...requests], pageTitle, destRoot }; + } finally { + await page.close(); + } +} + +async function downloadAssets(urls, destRoot, timeout, retries) { + const mediaDir = path.join(destRoot, 'media'); + await fs.mkdir(mediaDir, { recursive: true }); + + const { filtered, skipped } = dedupeAndFilterUrls(urls); + const plan = planDownloads(filtered); + + console.log(`Starting download of ${filtered.length} URLs...`); + if (skipped.length > 0) { + console.log(`Skipped ${skipped.length} duplicate URLs after cleaning.`); + } + + for (const url of filtered) { + console.log(`Downloading: ${url}`); + const [ok, err] = await downloadOne(url, mediaDir, timeout); + if (!ok) { + console.error(`WARN: download failed for ${url}: ${err}`); + } else { + console.log(`Downloaded: ${url}`); + } + await sleep(50); + } + + const removed = await deleteZeroByteFiles(mediaDir); + if (removed.length > 0) { + console.log(`Removed zero-byte files: ${removed.join(', ')}`); + } + + let missing = await verifyMissing(mediaDir, plan); + let attempts = 0; + + while (missing.length > 0 && attempts < retries) { + attempts += 1; + console.log(`Retry round ${attempts}: missing ${missing.length} files`); + + const stillMissing = []; + for (const url of missing) { + console.log(`Retrying: ${url}`); + const [ok, err] = await downloadOne(url, mediaDir, timeout); + if (!ok) { + console.error(`WARN: retry failed for ${url}: ${err}`); + stillMissing.push(url); + } else { + console.log(`Downloaded on retry: ${url}`); + } + } + + const removedRetry = await deleteZeroByteFiles(mediaDir); + if (removedRetry.length > 0) { + console.log(`Removed zero-byte files after retry: ${removedRetry.join(', ')}`); + } + + const retryPlan = {}; + for (const url of stillMissing) { + retryPlan[url] = plan[url]; + } + missing = stillMissing.length > 0 ? 
await verifyMissing(mediaDir, retryPlan) : []; + } + + if (missing.length > 0) { + console.error('FAILED to fetch these URLs after retries:'); + for (const url of missing) { + console.error(url); + } + } else { + console.log('All planned files downloaded.'); + } +} + async function main() { const { pageId, pageUrl, cdpUrl } = parseArgs(); console.log('Extracted page-id:', pageId); @@ -194,7 +459,6 @@ async function main() { const tempDir = 'temp'; await fs.mkdir(tempDir, { recursive: true }); - const urlsFile = path.join(tempDir, 'urls.txt'); const sourcesFile = path.join(tempDir, 'sources.txt'); await fs.writeFile(sourcesFile, pageUrl + '\n', 'utf-8'); @@ -209,23 +473,25 @@ async function main() { browser = await chromium.launch({ headless: true }); } - const context = await browser.newContext(VIEWPORT); + const context = await browser.newContext(CONTEXT_OPTIONS); const captureUrl = buildCaptureUrl(pageUrl); console.log('Capture URL:', captureUrl); try { - const { mediaUrls, pageTitle } = await captureNetworkAndScreenshot( + const { mediaUrls, pageTitle, destRoot } = await captureNetworkAndScreenshot( context, captureUrl, pageId, today, ); + await saveSourceUrls(destRoot, sourcesFile); + console.log(`Saved source URLs to ${path.join(destRoot, 'urls.txt')}`); + console.log('Found', mediaUrls.length, 'media URLs'); if (mediaUrls.length > 0) { - await fs.writeFile(urlsFile, mediaUrls.join('\n') + '\n', 'utf-8'); - await runDownloadScript(pageId, pageTitle, urlsFile, sourcesFile); + await downloadAssets(mediaUrls, destRoot, TIMEOUT, MAX_RETRIES); } else { console.log('No media URLs found, skipping download'); }