- Introduce run.mjs to orchestrate full workflow: Puppeteer opens a browser with mobile viewport (390x844 @ dpr 3), navigates to capture URL, records network requests for media, takes a viewport screenshot, then calls download.mjs to batch download assets. - Implement download.mjs to handle asset downloading with same behavior as Python version (retry, zero-byte cleanup, unique naming, save urls.txt, delete input file). - Update AGENTS.md to document the new primary workflow and keep manual download instructions for both Python and Node.js. - Support title extraction from page title for folder naming.
353 lines
8.9 KiB
JavaScript
353 lines
8.9 KiB
JavaScript
#!/usr/bin/env node
|
|
import fs from 'fs/promises';
|
|
import path from 'path';
|
|
import { fileURLToPath } from 'url';
|
|
import https from 'https';
|
|
import http from 'http';
|
|
import { URL as URLParser } from 'url';
|
|
|
|
const __filename = fileURLToPath(import.meta.url);
|
|
const __dirname = path.dirname(__filename);
|
|
|
|
const MAX_RETRIES = 2;
|
|
const TIMEOUT = 30000;
|
|
const DATE_FMT = '%Y%m%d';
|
|
|
|
function getCurrentDate() {
|
|
const now = new Date();
|
|
const year = now.getFullYear();
|
|
const month = String(now.getMonth() + 1).padStart(2, '0');
|
|
const day = String(now.getDate()).padStart(2, '0');
|
|
return `${year}${month}${day}`;
|
|
}
|
|
|
|
function parseArgs() {
|
|
const args = process.argv.slice(2);
|
|
const result = {
|
|
pageId: null,
|
|
title: null,
|
|
urls: null,
|
|
sources: null,
|
|
retries: MAX_RETRIES,
|
|
timeout: TIMEOUT
|
|
};
|
|
|
|
for (let i = 0; i < args.length; i++) {
|
|
const arg = args[i];
|
|
if (arg === '--page-id') {
|
|
result.pageId = args[++i];
|
|
} else if (arg === '--title') {
|
|
result.title = args[++i];
|
|
} else if (arg === '--urls') {
|
|
result.urls = args[++i];
|
|
} else if (arg === '--sources') {
|
|
result.sources = args[++i];
|
|
} else if (arg === '--retries') {
|
|
result.retries = parseInt(args[++i], 10);
|
|
} else if (arg === '--timeout') {
|
|
result.timeout = parseInt(args[++i], 10);
|
|
}
|
|
}
|
|
|
|
if (!result.pageId || !result.title || !result.urls || !result.sources) {
|
|
console.error('Usage: node download.mjs --page-id <id> --title "<title>" --urls urls.txt --sources source_urls.txt');
|
|
process.exit(1);
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
async function loadLines(filepath) {
|
|
const content = await fs.readFile(filepath, 'utf-8');
|
|
return content.split('\n').map(line => line.trim()).filter(line => line.length > 0);
|
|
}
|
|
|
|
async function ensureDir(dirPath) {
|
|
await fs.mkdir(dirPath, { recursive: true });
|
|
}
|
|
|
|
function sanitizeFilename(url, contentType) {
|
|
try {
|
|
const parsed = new URLParser(url);
|
|
let basename = path.basename(parsed.pathname);
|
|
|
|
if (basename.includes('@!')) {
|
|
basename = basename.split('@!')[0];
|
|
}
|
|
|
|
if (!basename) {
|
|
basename = 'file';
|
|
}
|
|
|
|
const { name, ext } = path.parse(basename);
|
|
if (!ext) {
|
|
const guessedExt = guessExtension(contentType);
|
|
return name + guessedExt;
|
|
}
|
|
|
|
return basename;
|
|
} catch {
|
|
return 'file.bin';
|
|
}
|
|
}
|
|
|
|
function guessExtension(contentType) {
|
|
if (contentType) {
|
|
const mime = contentType.split(';')[0].trim();
|
|
const map = {
|
|
'image/jpeg': '.jpg',
|
|
'image/png': '.png',
|
|
'image/gif': '.gif',
|
|
'image/webp': '.webp',
|
|
'image/svg+xml': '.svg',
|
|
'audio/mpeg': '.mp3',
|
|
'audio/wav': '.wav',
|
|
'audio/ogg': '.ogg',
|
|
'audio/aac': '.aac',
|
|
'video/mp4': '.mp4',
|
|
'video/webm': '.webm',
|
|
'video/ogg': '.ogv',
|
|
'application/octet-stream': '.bin'
|
|
};
|
|
return map[mime] || '.bin';
|
|
}
|
|
return '.bin';
|
|
}
|
|
|
|
async function uniqueName(basePath) {
|
|
try {
|
|
await fs.access(basePath);
|
|
const { dir, name, ext } = path.parse(basePath);
|
|
let idx = 1;
|
|
while (true) {
|
|
const candidate = path.join(dir, `${name}_${idx}${ext}`);
|
|
try {
|
|
await fs.access(candidate);
|
|
idx++;
|
|
} catch {
|
|
return candidate;
|
|
}
|
|
}
|
|
} catch {
|
|
return basePath;
|
|
}
|
|
}
|
|
|
|
function downloadOne(url, destDir, timeout) {
|
|
return new Promise((resolve) => {
|
|
try {
|
|
const parsed = new URLParser(url);
|
|
const client = parsed.protocol === 'https:' ? https : http;
|
|
|
|
const options = {
|
|
rejectUnauthorized: false,
|
|
timeout
|
|
};
|
|
|
|
const req = client.get(url, options, async (resp) => {
|
|
if (resp.statusCode !== 200) {
|
|
resolve([false, `HTTP ${resp.statusCode}`]);
|
|
resp.resume();
|
|
return;
|
|
}
|
|
|
|
const contentType = resp.headers['content-type'];
|
|
const fname = sanitizeFilename(url, contentType);
|
|
const target = await uniqueName(path.join(destDir, fname));
|
|
|
|
const fileStream = await fs.open(target, 'w');
|
|
const writeStream = fileStream.createWriteStream();
|
|
|
|
let failed = false;
|
|
let errorMsg = null;
|
|
|
|
writeStream.on('error', (err) => {
|
|
failed = true;
|
|
errorMsg = err.message;
|
|
writeStream.close();
|
|
fileStream.close();
|
|
});
|
|
|
|
writeStream.on('finish', () => {
|
|
writeStream.close();
|
|
fileStream.close();
|
|
});
|
|
|
|
writeStream.on('close', () => {
|
|
if (failed) {
|
|
fs.unlink(target).catch(() => {});
|
|
resolve([false, errorMsg]);
|
|
} else {
|
|
resolve([true, null]);
|
|
}
|
|
});
|
|
|
|
resp.pipe(writeStream);
|
|
});
|
|
|
|
req.on('error', (err) => {
|
|
resolve([false, err.message]);
|
|
});
|
|
|
|
req.on('timeout', () => {
|
|
req.destroy();
|
|
resolve([false, 'Request timeout']);
|
|
});
|
|
|
|
} catch (err) {
|
|
resolve([false, err.message]);
|
|
}
|
|
});
|
|
}
|
|
|
|
async function deleteZeroByteFiles(destDir) {
|
|
const removed = [];
|
|
try {
|
|
const files = await fs.readdir(destDir);
|
|
for (const file of files) {
|
|
const filePath = path.join(destDir, file);
|
|
const stats = await fs.stat(filePath);
|
|
if (stats.isFile() && stats.size === 0) {
|
|
await fs.unlink(filePath);
|
|
removed.push(file);
|
|
}
|
|
}
|
|
} catch (err) {
|
|
console.error(`Error deleting zero-byte files: ${err.message}`);
|
|
}
|
|
return removed;
|
|
}
|
|
|
|
function planDownloads(urls) {
|
|
const plan = {};
|
|
for (const url of urls) {
|
|
plan[url] = sanitizeFilename(url, null);
|
|
}
|
|
return plan;
|
|
}
|
|
|
|
async function verifyMissing(destDir, plan) {
|
|
const missing = [];
|
|
try {
|
|
const files = await fs.readdir(destDir);
|
|
const existing = new Set();
|
|
|
|
for (const file of files) {
|
|
const filePath = path.join(destDir, file);
|
|
const stats = await fs.stat(filePath);
|
|
if (stats.isFile()) {
|
|
existing.add(file);
|
|
}
|
|
}
|
|
|
|
for (const [url, fname] of Object.entries(plan)) {
|
|
if (!existing.has(fname)) {
|
|
const baseName = path.parse(fname).name;
|
|
const hasPrefix = Array.from(existing).some(
|
|
name => name.startsWith(baseName)
|
|
);
|
|
if (!hasPrefix) {
|
|
missing.push(url);
|
|
}
|
|
}
|
|
}
|
|
} catch (err) {
|
|
console.error(`Error verifying missing files: ${err.message}`);
|
|
}
|
|
return missing;
|
|
}
|
|
|
|
async function saveSourceUrls(destRoot, sourcesPath) {
|
|
await ensureDir(destRoot);
|
|
const sources = await loadLines(sourcesPath);
|
|
await fs.writeFile(
|
|
path.join(destRoot, 'urls.txt'),
|
|
sources.join('\n') + '\n',
|
|
'utf-8'
|
|
);
|
|
}
|
|
|
|
function sleep(ms) {
|
|
return new Promise(resolve => setTimeout(resolve, ms));
|
|
}
|
|
|
|
async function main() {
|
|
const args = parseArgs();
|
|
const today = getCurrentDate();
|
|
const destRoot = path.join('downloads', `${today}-${args.title}-${args.pageId}`);
|
|
const mediaDir = path.join(destRoot, 'media');
|
|
|
|
await ensureDir(mediaDir);
|
|
|
|
const urlsPath = args.urls;
|
|
const urls = await loadLines(urlsPath);
|
|
const plan = planDownloads(urls);
|
|
|
|
console.log(`Starting download of ${urls.length} URLs...`);
|
|
|
|
for (const url of urls) {
|
|
const [ok, err] = await downloadOne(url, mediaDir, args.timeout);
|
|
if (!ok) {
|
|
console.error(`WARN: download failed for ${url}: ${err}`);
|
|
}
|
|
await sleep(50);
|
|
}
|
|
|
|
const removed = await deleteZeroByteFiles(mediaDir);
|
|
if (removed.length > 0) {
|
|
console.log(`Removed zero-byte files: ${removed.join(', ')}`);
|
|
}
|
|
|
|
let missing = await verifyMissing(mediaDir, plan);
|
|
let attempts = 0;
|
|
|
|
while (missing.length > 0 && attempts < args.retries) {
|
|
attempts++;
|
|
console.log(`Retry round ${attempts}: missing ${missing.length} files`);
|
|
|
|
const stillMissing = [];
|
|
for (const url of missing) {
|
|
const [ok, err] = await downloadOne(url, mediaDir, args.timeout);
|
|
if (!ok) {
|
|
console.error(`WARN: retry failed for ${url}: ${err}`);
|
|
stillMissing.push(url);
|
|
}
|
|
}
|
|
|
|
const removedRetry = await deleteZeroByteFiles(mediaDir);
|
|
if (removedRetry.length > 0) {
|
|
console.log(`Removed zero-byte files after retry: ${removedRetry.join(', ')}`);
|
|
}
|
|
|
|
const retryPlan = {};
|
|
for (const url of stillMissing) {
|
|
retryPlan[url] = plan[url];
|
|
}
|
|
missing = stillMissing.length > 0 ? await verifyMissing(mediaDir, retryPlan) : [];
|
|
}
|
|
|
|
if (missing.length > 0) {
|
|
console.error('FAILED to fetch these URLs after retries:');
|
|
for (const url of missing) {
|
|
console.error(url);
|
|
}
|
|
} else {
|
|
console.log('All planned files downloaded.');
|
|
}
|
|
|
|
await saveSourceUrls(destRoot, args.sources);
|
|
console.log(`Saved source URLs to ${path.join(destRoot, 'urls.txt')}`);
|
|
|
|
try {
|
|
await fs.unlink(urlsPath);
|
|
console.log(`Removed input file: ${urlsPath}`);
|
|
} catch (err) {
|
|
console.error(`WARN: failed to remove input file ${urlsPath}: ${err.message}`);
|
|
}
|
|
}
|
|
|
|
main().catch(err => {
|
|
console.error(err);
|
|
process.exit(1);
|
|
});
|