diff --git a/AGENTS.md b/AGENTS.md index b5b9b59..a601296 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -15,11 +15,13 @@ This repository has no existing conventions. These guidelines apply to all work - After collecting all requested resources and screenshots, close any additional tabs/pages opened for capture. This is mandatory; do not leave capture tabs open. ## Download script usage -- Use `python download.py --page-id --title "" --urls urls.txt --sources source_urls.txt` to batch download assets. The script generates `<date>` using format `YYYYMMDD`. +- Primary workflow: run `node run.mjs <page-url>` to capture network requests, screenshot, and download media in one step. This script uses Puppeteer to open a browser with mobile viewport (390×844 @ dpr 3), navigate to the page, capture audio/video/image URLs, take a viewport screenshot, then call `download.mjs` to batch download assets. +- For manual control: use `python download.py --page-id <id> --title "<title>" --urls urls.txt --sources source_urls.txt` (Python) or `node download.mjs --page-id <id> --title "<title>" --urls urls.txt --sources source_urls.txt` (Node.js) to batch download assets. The script generates `<date>` using format `YYYYMMDD`. - `urls.txt` should list the target asset URLs (one per line) already filtered to the requested scope (e.g., media only). - Downloads go to `downloads/<date>-<title>-<page-id>/media/`; filenames are cleaned (query/`@!` removed) and extensions retained/guessed; duplicates get numeric suffixes. - After the batch finishes, the script deletes 0-byte files, compares against the planned list, retries missing items up to 2 times, and reports any still-missing resources. - `urls.txt` is written to `downloads/<date>-<title>-<page-id>/urls.txt` to record user-provided page URLs. +- The script also deletes the `--urls` input file upon completion. ## Screenshots - Default viewport for screenshots: width 390, height 844, devicePixelRatio 3 (mobile portrait). 
/** Default number of retry rounds for files still missing after the first pass. */
const MAX_RETRIES = 2;
/** Default per-request timeout in milliseconds. */
const TIMEOUT = 30000;
/** strftime-style description of the stamp built by getCurrentDate(); not read by the code below. */
const DATE_FMT = '%Y%m%d';

/**
 * Format a date as `YYYYMMDD` using local time.
 *
 * @param {Date} [now] - Date to format; defaults to the current time.
 * @returns {string} Year, zero-padded month and zero-padded day, concatenated.
 */
function getCurrentDate(now = new Date()) {
  const month = String(now.getMonth() + 1).padStart(2, '0'); // getMonth() is 0-based
  const day = String(now.getDate()).padStart(2, '0');
  return `${now.getFullYear()}${month}${day}`;
}

/**
 * Parse the command-line flags of the download script.
 *
 * Required: `--page-id`, `--title`, `--urls`, `--sources`.
 * Optional: `--retries`, `--timeout` (non-negative integers). An invalid
 * numeric value is reported and the default is kept, instead of letting
 * `NaN` silently disable retries or reach the HTTP client.
 * Prints usage and exits with status 1 when a required flag is missing.
 *
 * @param {string[]} [argv] - Arguments to parse; defaults to `process.argv.slice(2)`.
 * @returns {{pageId: string, title: string, urls: string, sources: string,
 *            retries: number, timeout: number}}
 */
function parseArgs(argv = process.argv.slice(2)) {
  const textFlags = new Map([
    ['--page-id', 'pageId'],
    ['--title', 'title'],
    ['--urls', 'urls'],
    ['--sources', 'sources'],
  ]);
  const numberFlags = new Map([
    ['--retries', 'retries'],
    ['--timeout', 'timeout'],
  ]);
  const result = {
    pageId: null,
    title: null,
    urls: null,
    sources: null,
    retries: MAX_RETRIES,
    timeout: TIMEOUT,
  };

  for (let i = 0; i < argv.length; i++) {
    const flag = argv[i];
    if (textFlags.has(flag)) {
      // A flag at the very end has no value; keep null so the usage check fires.
      result[textFlags.get(flag)] = argv[++i] ?? null;
    } else if (numberFlags.has(flag)) {
      const raw = argv[++i];
      const value = Number.parseInt(raw, 10);
      if (Number.isInteger(value) && value >= 0) {
        result[numberFlags.get(flag)] = value;
      } else {
        console.error(`WARN: ignoring invalid value for ${flag}: ${raw}`);
      }
    }
  }

  if (!result.pageId || !result.title || !result.urls || !result.sources) {
    console.error('Usage: node download.mjs --page-id <id> --title "<title>" --urls urls.txt --sources source_urls.txt');
    process.exit(1);
  }

  return result;
}

/**
 * Read a UTF-8 text file and return its non-empty lines, trimmed.
 *
 * @param {string} filepath - File to read.
 * @returns {Promise<string[]>} Trimmed lines with blank lines removed.
 */
async function loadLines(filepath) {
  const content = await fs.readFile(filepath, 'utf-8');
  return content
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter((line) => line.length > 0);
}

/**
 * Create a directory and any missing parents; a no-op when it already exists.
 *
 * @param {string} dirPath - Directory to create.
 * @returns {Promise<void>}
 */
async function ensureDir(dirPath) {
  await fs.mkdir(dirPath, { recursive: true });
}
/** File extension for each recognised MIME type (keys are lower-case). */
const MIME_EXTENSIONS = new Map([
  ['image/jpeg', '.jpg'],
  ['image/png', '.png'],
  ['image/gif', '.gif'],
  ['image/webp', '.webp'],
  ['image/svg+xml', '.svg'],
  ['image/bmp', '.bmp'],
  ['audio/mpeg', '.mp3'],
  ['audio/wav', '.wav'],
  ['audio/ogg', '.ogg'],
  ['audio/aac', '.aac'],
  ['audio/mp4', '.m4a'],
  ['audio/flac', '.flac'],
  ['video/mp4', '.mp4'],
  ['video/webm', '.webm'],
  ['video/ogg', '.ogv'],
  ['video/quicktime', '.mov'],
  ['application/octet-stream', '.bin'],
]);

/**
 * Derive a local filename from a URL.
 *
 * Uses the last path segment (the query string is not part of it), drops any
 * `@!...` suffix, and falls back to `file` for an empty segment. When the
 * segment has no extension, one is guessed from `contentType`.
 *
 * @param {string} url - Absolute URL of the asset.
 * @param {?string} contentType - Content-Type header value, if known.
 * @returns {string} Filename; `file.bin` when `url` cannot be parsed.
 */
function sanitizeFilename(url, contentType) {
  let segment;
  try {
    segment = path.basename(new URLParser(url).pathname);
  } catch {
    return 'file.bin';
  }

  const cleaned = segment.split('@!')[0] || 'file';
  const { name, ext } = path.parse(cleaned);
  return ext ? cleaned : name + guessExtension(contentType);
}

/**
 * Map a Content-Type header value to a file extension.
 *
 * Parameters after `;` are ignored and matching is case-insensitive. The
 * lookup uses a Map so header values such as `constructor` cannot resolve to
 * inherited object properties.
 *
 * @param {?string} contentType - Header value, or a nullish value when unknown.
 * @returns {string} Extension including the leading dot; `.bin` when unknown.
 */
function guessExtension(contentType) {
  if (typeof contentType !== 'string') {
    return '.bin';
  }
  const mime = contentType.split(';')[0].trim().toLowerCase();
  return MIME_EXTENSIONS.get(mime) ?? '.bin';
}

/**
 * Return a path that does not exist yet, derived from `basePath`.
 *
 * When `basePath` is taken, `_1`, `_2`, ... is appended to the file stem
 * until a free name is found. The check is not atomic with file creation.
 *
 * @param {string} basePath - Preferred target path.
 * @returns {Promise<string>} `basePath` or the first free numbered variant.
 */
async function uniqueName(basePath) {
  const exists = async (candidate) => {
    try {
      await fs.access(candidate);
      return true;
    } catch {
      return false;
    }
  };

  if (!(await exists(basePath))) {
    return basePath;
  }
  const { dir, name, ext } = path.parse(basePath);
  for (let idx = 1; ; idx++) {
    const candidate = path.join(dir, `${name}_${idx}${ext}`);
    if (!(await exists(candidate))) {
      return candidate;
    }
  }
}

/**
 * Issue a GET request and resolve with the response once headers arrive.
 * Rejects on an invalid URL, unsupported protocol, connection error or timeout.
 */
function openResponse(url, timeout) {
  return new Promise((resolve, reject) => {
    const client = new URLParser(url).protocol === 'https:' ? https : http;
    // NOTE(review): rejectUnauthorized=false disables TLS certificate
    // verification; kept as in the original, confirm this is intended.
    const options = { rejectUnauthorized: false, timeout };
    const req = client.get(url, options, (resp) => {
      // Keep a listener attached so an error raised before the body is piped
      // cannot surface as an uncaught 'error' event; the consumer still
      // observes the failure as a premature close.
      resp.on('error', () => {});
      resolve(resp);
    });
    req.on('timeout', () => req.destroy(new Error('Request timeout')));
    req.on('error', reject);
  });
}

/**
 * Download one URL into `destDir`.
 *
 * Only a 200 response is accepted; any other status (redirects included) is
 * reported as `HTTP <status>`. The returned promise never rejects: every
 * failure, including a filesystem error or a connection dropped mid-body, is
 * reported in the result, and a partially written file is removed.
 *
 * @param {string} url - Asset URL (http or https).
 * @param {string} destDir - Existing directory to write into.
 * @param {number} timeout - Socket timeout in milliseconds.
 * @returns {Promise<[boolean, ?string]>} `[true, null]` on success, otherwise
 *   `[false, message]`.
 */
async function downloadOne(url, destDir, timeout) {
  let resp = null;
  let target = null;
  try {
    const { pipeline } = await import('node:stream/promises');
    resp = await openResponse(url, timeout);
    if (resp.statusCode !== 200) {
      resp.resume(); // drain so the socket is released
      return [false, `HTTP ${resp.statusCode}`];
    }

    const fname = sanitizeFilename(url, resp.headers['content-type']);
    target = await uniqueName(path.join(destDir, fname));
    const handle = await fs.open(target, 'w');
    // pipeline() rejects on a read or write error and destroys both streams;
    // the write stream closes the file handle when it is destroyed.
    await pipeline(resp, handle.createWriteStream());
    return [true, null];
  } catch (err) {
    resp?.destroy();
    if (target !== null) {
      // Best effort: the file may not have been created yet.
      await fs.unlink(target).catch(() => {});
    }
    return [false, err.message];
  }
}
/**
 * Delete every empty regular file directly inside `destDir`.
 *
 * A failure on one entry is logged and does not stop the scan of the others.
 *
 * @param {string} destDir - Directory to scan (not recursive).
 * @returns {Promise<string[]>} Names of the files that were removed.
 */
async function deleteZeroByteFiles(destDir) {
  const removed = [];
  let entries;
  try {
    entries = await fs.readdir(destDir);
  } catch (err) {
    console.error(`Error deleting zero-byte files: ${err.message}`);
    return removed;
  }

  for (const entry of entries) {
    const entryPath = path.join(destDir, entry);
    try {
      const stats = await fs.stat(entryPath);
      if (stats.isFile() && stats.size === 0) {
        await fs.unlink(entryPath);
        removed.push(entry);
      }
    } catch (err) {
      console.error(`Error deleting zero-byte files: ${err.message}`);
    }
  }
  return removed;
}

/**
 * Map each URL to the filename it is expected to be saved under.
 *
 * The content type is unknown at planning time, so extension-less URLs are
 * planned with the `.bin` fallback; verifyMissing() compares file stems only.
 *
 * @param {string[]} urls - Asset URLs.
 * @returns {Object<string, string>} URL -> planned filename.
 */
function planDownloads(urls) {
  return Object.fromEntries(urls.map((url) => [url, sanitizeFilename(url, null)]));
}

/**
 * List the planned URLs that have no matching file in `destDir`.
 *
 * A planned file counts as present when a regular file exists whose stem
 * (name without extension) equals the planned stem, optionally followed by a
 * `_<n>` duplicate suffix. The extension is ignored because the real one may
 * have been guessed from the response. A bare prefix is not enough: `a.jpg`
 * is not satisfied by `abc.jpg`.
 *
 * When the directory cannot be read, every planned URL is reported missing.
 *
 * @param {string} destDir - Directory holding the downloaded files.
 * @param {Object<string, string>} plan - URL -> planned filename.
 * @returns {Promise<string[]>} URLs without a matching file, in plan order.
 */
async function verifyMissing(destDir, plan) {
  let entries;
  try {
    entries = await fs.readdir(destDir);
  } catch (err) {
    console.error(`Error verifying missing files: ${err.message}`);
    return Object.keys(plan);
  }

  const presentStems = new Set();
  for (const entry of entries) {
    try {
      const stats = await fs.stat(path.join(destDir, entry));
      if (!stats.isFile()) {
        continue;
      }
    } catch (err) {
      console.error(`Error verifying missing files: ${err.message}`);
      continue;
    }
    const stem = path.parse(entry).name;
    presentStems.add(stem);
    presentStems.add(stem.replace(/_\d+$/, '')); // name given to a duplicate
  }

  return Object.entries(plan)
    .filter(([, fname]) => !presentStems.has(path.parse(fname).name))
    .map(([url]) => url);
}
/**
 * Copy the list of source page URLs into `<destRoot>/urls.txt`.
 *
 * @param {string} destRoot - Download root directory (created if missing).
 * @param {string} sourcesPath - Text file with one source URL per line.
 * @returns {Promise<void>}
 */
async function saveSourceUrls(destRoot, sourcesPath) {
  await ensureDir(destRoot);
  const sources = await loadLines(sourcesPath);
  const outPath = path.join(destRoot, 'urls.txt');
  await fs.writeFile(outPath, `${sources.join('\n')}\n`, 'utf-8');
}

/**
 * Resolve after `ms` milliseconds.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}

/** Strip path separators and characters that are invalid in filenames. */
function toPathComponent(value) {
  return String(value).replace(/[<>:"/\\|?*]/g, '');
}

/** Restrict `plan` (URL -> filename) to the given URLs. */
function pickPlan(plan, urls) {
  return Object.fromEntries(urls.map((url) => [url, plan[url]]));
}

/**
 * Entry point of the download script.
 *
 * Downloads every URL listed in `--urls` into
 * `downloads/<date>-<title>-<page-id>/media/`, removes empty files, retries
 * whatever is still missing for up to `--retries` rounds, reports the
 * remainder, writes the source URLs to `urls.txt` in the download root and
 * finally deletes the `--urls` input file.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = parseArgs();
  const today = getCurrentDate();
  // Title and page id come from the command line; keep them inside a single
  // directory name so a '/' or '\' cannot redirect the output elsewhere.
  const folder = `${today}-${toPathComponent(args.title)}-${toPathComponent(args.pageId)}`;
  const destRoot = path.join('downloads', folder);
  const mediaDir = path.join(destRoot, 'media');

  await ensureDir(mediaDir);

  const urlsPath = args.urls;
  const urls = await loadLines(urlsPath);
  const plan = planDownloads(urls);

  console.log(`Starting download of ${urls.length} URLs...`);

  for (const url of urls) {
    const [ok, err] = await downloadOne(url, mediaDir, args.timeout);
    if (!ok) {
      console.error(`WARN: download failed for ${url}: ${err}`);
    }
    await sleep(50); // brief pause between requests
  }

  const removed = await deleteZeroByteFiles(mediaDir);
  if (removed.length > 0) {
    console.log(`Removed zero-byte files: ${removed.join(', ')}`);
  }

  let missing = await verifyMissing(mediaDir, plan);

  for (let attempt = 1; missing.length > 0 && attempt <= args.retries; attempt++) {
    console.log(`Retry round ${attempt}: missing ${missing.length} files`);

    for (const url of missing) {
      const [ok, err] = await downloadOne(url, mediaDir, args.timeout);
      if (!ok) {
        console.error(`WARN: retry failed for ${url}: ${err}`);
      }
    }

    const removedRetry = await deleteZeroByteFiles(mediaDir);
    if (removedRetry.length > 0) {
      console.log(`Removed zero-byte files after retry: ${removedRetry.join(', ')}`);
    }

    // Re-check every URL retried in this round, not only those whose request
    // failed: a retry that "succeeded" with an empty body was just deleted
    // above and must still count as missing.
    missing = await verifyMissing(mediaDir, pickPlan(plan, missing));
  }

  if (missing.length > 0) {
    console.error('FAILED to fetch these URLs after retries:');
    for (const url of missing) {
      console.error(url);
    }
  } else {
    console.log('All planned files downloaded.');
  }

  await saveSourceUrls(destRoot, args.sources);
  console.log(`Saved source URLs to ${path.join(destRoot, 'urls.txt')}`);

  try {
    await fs.unlink(urlsPath);
    console.log(`Removed input file: ${urlsPath}`);
  } catch (err) {
    console.error(`WARN: failed to remove input file ${urlsPath}: ${err.message}`);
  }
}
/** Mobile portrait viewport used for the capture (390x844 at device pixel ratio 3). */
const VIEWPORT = { width: 390, height: 844, deviceScaleFactor: 3, isMobile: true, hasTouch: true };
/** Content-Type prefixes treated as media. */
const MEDIA_MIME_PREFIXES = ['audio/', 'video/', 'image/'];
/** File extensions (lower-case, with leading dot) treated as media. */
const MEDIA_EXTENSIONS = ['.mp3', '.wav', '.ogg', '.aac', '.m4a', '.flac', '.mp4', '.webm', '.ogv', '.avi', '.mov', '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.bmp', '.ico'];

/**
 * Format a date as `YYYYMMDD` using local time.
 *
 * @param {Date} [now] - Date to format; defaults to the current time.
 * @returns {string} Year, zero-padded month and zero-padded day, concatenated.
 */
function getCurrentDate(now = new Date()) {
  const month = String(now.getMonth() + 1).padStart(2, '0'); // getMonth() is 0-based
  const day = String(now.getDate()).padStart(2, '0');
  return `${now.getFullYear()}${month}${day}`;
}
/**
 * Extract the page id from a page URL.
 *
 * Recognises `storeview/<id>` and `/v/<id>` where `<id>` is alphanumeric.
 *
 * @param {string} url - Page URL.
 * @returns {?string} The id, or null when no pattern matches.
 */
function extractPageId(url) {
  const patterns = [/storeview\/([a-z0-9]+)/i, /\/v\/([a-z0-9]+)/i];
  for (const pattern of patterns) {
    const id = url.match(pattern)?.[1];
    if (id) {
      return id;
    }
  }
  return null;
}

/**
 * Parse the command line of the capture script: one positional page URL.
 * Prints an error and exits with status 1 when the URL is absent or carries
 * no recognisable page id.
 *
 * @param {string[]} [argv] - Arguments to parse; defaults to `process.argv.slice(2)`.
 * @returns {{pageId: string, pageUrl: string}}
 */
function parseArgs(argv = process.argv.slice(2)) {
  const [pageUrl] = argv;
  if (!pageUrl) {
    console.error('Usage: node run.mjs <page-url>');
    process.exit(1);
  }
  const pageId = extractPageId(pageUrl);
  if (!pageId) {
    console.error('Error: Could not extract page-id from the URL');
    process.exit(1);
  }
  return { pageId, pageUrl };
}

/**
 * Decide whether a request/response refers to an audio, video or image asset.
 *
 * True when `mimeType` starts with a media prefix, or when the URL *path*
 * (query string and any `@!...` suffix ignored) ends with a known media
 * extension. Matching the extension anywhere in the URL would misclassify
 * paths such as `/api.movies/list`.
 *
 * @param {string} url - Resource URL.
 * @param {?string} mimeType - Content-Type header value, if known.
 * @returns {boolean}
 */
function isMediaUrl(url, mimeType) {
  if (typeof mimeType === 'string') {
    const mime = mimeType.trim().toLowerCase();
    if (MEDIA_MIME_PREFIXES.some((prefix) => mime.startsWith(prefix))) {
      return true;
    }
  }

  let pathname;
  try {
    pathname = new URL(url).pathname;
  } catch {
    pathname = String(url).split(/[?#]/)[0]; // not an absolute URL
  }
  const cleaned = pathname.toLowerCase().split('@!')[0];
  return MEDIA_EXTENSIONS.some((ext) => cleaned.endsWith(ext));
}

/**
 * Build the URL that is actually loaded for capture from a page URL.
 *
 * @param {string} pageUrl - URL containing a page id (see extractPageId).
 * @returns {string} Capture URL on `ca.rrx.cn` for that id.
 */
function buildCaptureUrl(pageUrl) {
  const pageId = extractPageId(pageUrl);
  return `https://ca.rrx.cn/v/${pageId}?rrxsrc=2&iframe=1&tpl=1`;
}

/** Turn a page title into a directory-name fragment; `page` when nothing is left. */
function sanitizeTitle(rawTitle) {
  return rawTitle.replace(/[<>:"/\\|?*]/g, '').trim().slice(0, 100) || 'page';
}

/**
 * Open the capture URL in a new tab, record media requests and take a
 * viewport screenshot.
 *
 * The screenshot is written to `downloads/<today>-<title>-<pageId>/index.png`.
 * A URL is reported as media when its path has a media extension or when a
 * response for it carried a media Content-Type; only http(s) URLs are
 * returned. The tab is left open for the caller to close, except when this
 * function fails, in which case it is closed before the error propagates.
 *
 * @param {object} browser - Puppeteer browser instance.
 * @param {string} captureUrl - URL to load.
 * @param {string} pageId - Page id used in the output directory name.
 * @param {string} today - Date stamp used in the output directory name.
 * @returns {Promise<{page: object, mediaUrls: string[], pageTitle: string}>}
 */
async function captureNetworkAndScreenshot(browser, captureUrl, pageId, today) {
  const page = await browser.newPage();
  const seen = new Set(); // keeps first-seen order and removes duplicates
  const confirmedByMime = new Set();

  try {
    page.on('request', (request) => {
      if (['image', 'media', 'font'].includes(request.resourceType())) {
        seen.add(request.url());
      }
    });

    page.on('response', (response) => {
      const url = response.url();
      const mimeType = response.headers()['content-type'] || '';
      if (isMediaUrl(url, mimeType)) {
        seen.add(url);
        // Remember the positive match: the URL alone may not look like media.
        confirmedByMime.add(url);
      }
    });

    // page.emulate() expects a device descriptor ({ viewport, userAgent });
    // VIEWPORT is only a viewport, so it is applied with setViewport() alone.
    await page.setViewport(VIEWPORT);

    console.log('Navigating to:', captureUrl);
    await page.goto(captureUrl, { waitUntil: 'networkidle2', timeout: 30000 });

    // Grace period for requests issued after the network went idle.
    await new Promise((resolve) => {
      setTimeout(resolve, 3000);
    });

    let pageTitle = 'page';
    try {
      pageTitle = sanitizeTitle(await page.title());
    } catch (err) {
      console.error(`WARN: could not read page title: ${err.message}`);
    }

    const destRoot = path.join('downloads', `${today}-${pageTitle}-${pageId}`);
    await fs.mkdir(destRoot, { recursive: true });
    const screenshotPath = path.join(destRoot, 'index.png');
    console.log('Taking screenshot...');
    await page.screenshot({ path: screenshotPath, fullPage: false });
    console.log('Screenshot saved:', screenshotPath);

    const mediaUrls = [...seen].filter(
      (url) => /^https?:/i.test(url) && (confirmedByMime.has(url) || isMediaUrl(url, null)),
    );
    return { page, mediaUrls, pageTitle };
  } catch (err) {
    // Do not leave the capture tab open when the capture fails.
    await page.close().catch(() => {});
    throw err;
  }
}
/**
 * Run `download.mjs` as a child process and wait for it to finish.
 *
 * The script is resolved next to this file rather than against the current
 * working directory, and it runs under the same Node.js binary as the
 * caller. Output is inherited. The returned promise never rejects.
 *
 * @param {string} pageId - Value for `--page-id`.
 * @param {string} title - Value for `--title`.
 * @param {string} urlsFile - Value for `--urls`.
 * @param {string} sourcesFile - Value for `--sources`.
 * @returns {Promise<?number>} Exit code of the child (null when it was ended
 *   by a signal); 1 when the child could not be started.
 */
function runDownloadScript(pageId, title, urlsFile, sourcesFile) {
  return new Promise((resolve) => {
    const scriptPath = path.join(__dirname, 'download.mjs');
    const args = [
      scriptPath,
      '--page-id', pageId,
      '--title', title,
      '--urls', urlsFile,
      '--sources', sourcesFile,
    ];
    const proc = spawn(process.execPath, args, { stdio: 'inherit' });
    // Without this listener a failed spawn raises an unhandled 'error' event.
    proc.on('error', (err) => {
      console.error(`Error: failed to start download.mjs: ${err.message}`);
      resolve(1);
    });
    proc.on('close', resolve);
  });
}

/**
 * Entry point of the capture workflow: open the page, record media URLs,
 * take the screenshot, then hand the URL list to `download.mjs`.
 *
 * The browser is always closed. The process exit code is set to 1 when the
 * capture fails or the download step exits with a non-zero status.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const { pageId, pageUrl } = parseArgs();
  console.log('Extracted page-id:', pageId);

  const today = getCurrentDate();
  const tempDir = 'temp';
  await fs.mkdir(tempDir, { recursive: true });

  const urlsFile = path.join(tempDir, 'urls.txt');
  const sourcesFile = path.join(tempDir, 'sources.txt');
  await fs.writeFile(sourcesFile, `${pageUrl}\n`, 'utf-8');

  console.log('Launching browser...');
  const browser = await puppeteer.launch({ headless: 'new' });

  try {
    const captureUrl = buildCaptureUrl(pageUrl);
    console.log('Capture URL:', captureUrl);

    const { page, mediaUrls, pageTitle } = await captureNetworkAndScreenshot(browser, captureUrl, pageId, today);
    console.log('Found', mediaUrls.length, 'media URLs');

    if (mediaUrls.length > 0) {
      await fs.writeFile(urlsFile, `${mediaUrls.join('\n')}\n`, 'utf-8');
      const exitCode = await runDownloadScript(pageId, pageTitle, urlsFile, sourcesFile);
      if (exitCode !== 0) {
        console.error(`WARN: download.mjs exited with code ${exitCode}`);
        process.exitCode = 1;
      }
    } else {
      console.log('No media URLs found, skipping download');
    }

    console.log('Closing captured page...');
    await page.close();
  } catch (error) {
    console.error('Error during capture:', error.message);
    process.exitCode = 1;
  } finally {
    // Runs on every path so no browser process is left behind.
    console.log('Closing browser...');
    await browser.close();
  }

  console.log('Workflow completed.');
}
console.log('Closing captured page...'); + await page.close(); + } catch (error) { + console.error('Error during capture:', error.message); + } + + console.log('Closing browser...'); + await browser.close(); + console.log('Workflow completed.'); +} + +main().catch(err => { + console.error(err); + process.exit(1); +});