#!/usr/bin/env node
/**
 * Media downloader CLI.
 *
 * Reads a list of URLs from a text file, downloads each one into
 * downloads/<YYYYMMDD>-<title>-<pageId>/media, retries the URLs whose files
 * are still missing, writes the source URL list to urls.txt next to the
 * media directory and finally removes the input URL list.
 */
import fs from 'fs/promises';
import path from 'path';
import https from 'https';
import http from 'http';
import { fileURLToPath, URL as URLParser } from 'url';
import { pipeline } from 'stream/promises';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

/** Default number of retry rounds for URLs whose files are missing. */
const MAX_RETRIES = 2;
/** Default value for the `timeout` option of http(s).get (milliseconds). */
const TIMEOUT = 30000;
// Not referenced in this file; getCurrentDate() builds the same YYYYMMDD
// layout by hand.
const DATE_FMT = '%Y%m%d';

/** Content-Type (lower case, without parameters) -> file extension. */
const MIME_EXTENSIONS = new Map([
  ['image/jpeg', '.jpg'],
  ['image/png', '.png'],
  ['image/gif', '.gif'],
  ['image/webp', '.webp'],
  ['image/svg+xml', '.svg'],
  ['audio/mpeg', '.mp3'],
  ['audio/wav', '.wav'],
  ['audio/ogg', '.ogg'],
  ['audio/aac', '.aac'],
  ['video/mp4', '.mp4'],
  ['video/webm', '.webm'],
  ['video/ogg', '.ogv'],
  ['application/octet-stream', '.bin'],
]);

/**
 * Formats the current local date as YYYYMMDD.
 *
 * @returns {string} e.g. "20240131".
 */
function getCurrentDate() {
  const now = new Date();
  const year = now.getFullYear();
  const month = String(now.getMonth() + 1).padStart(2, '0');
  const day = String(now.getDate()).padStart(2, '0');
  return `${year}${month}${day}`;
}

/**
 * Parses process.argv into an options object.
 *
 * Prints a usage message and exits with status 1 when any of --page-id,
 * --title, --urls or --sources is missing. Unknown arguments are ignored.
 * An unparsable or negative --retries / --timeout falls back to the default.
 *
 * @returns {{pageId: string, title: string, urls: string, sources: string,
 *            retries: number, timeout: number}}
 */
function parseArgs() {
  const args = process.argv.slice(2);
  const result = {
    pageId: null,
    title: null,
    urls: null,
    sources: null,
    retries: MAX_RETRIES,
    timeout: TIMEOUT,
  };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--page-id':
        result.pageId = args[++i];
        break;
      case '--title':
        result.title = args[++i];
        break;
      case '--urls':
        result.urls = args[++i];
        break;
      case '--sources':
        result.sources = args[++i];
        break;
      case '--retries':
        result.retries = Number.parseInt(args[++i], 10);
        break;
      case '--timeout':
        result.timeout = Number.parseInt(args[++i], 10);
        break;
      default:
        break;
    }
  }

  // parseInt yields NaN for a missing or non-numeric value; NaN would
  // silently disable the retry loop (`attempts < NaN` is always false).
  if (!Number.isInteger(result.retries) || result.retries < 0) {
    console.error(`WARN: invalid --retries value, using default ${MAX_RETRIES}`);
    result.retries = MAX_RETRIES;
  }
  if (!Number.isInteger(result.timeout) || result.timeout < 0) {
    console.error(`WARN: invalid --timeout value, using default ${TIMEOUT}`);
    result.timeout = TIMEOUT;
  }

  if (!result.pageId || !result.title || !result.urls || !result.sources) {
    console.error(
      'Usage: node download.mjs --page-id <id> --title "<title>" --urls urls.txt --sources source_urls.txt [--retries N] [--timeout MS]'
    );
    process.exit(1);
  }

  return result;
}

/**
 * Reads a UTF-8 text file and returns its non-empty lines, trimmed.
 *
 * @param {string} filepath
 * @returns {Promise<string[]>}
 */
async function loadLines(filepath) {
  const content = await fs.readFile(filepath, 'utf-8');
  return content
    .split('\n')
    .map((line) => line.trim())
    .filter((line) => line.length > 0);
}

/**
 * Creates a directory (and any missing parents); no error if it exists.
 *
 * @param {string} dirPath
 * @returns {Promise<void>}
 */
async function ensureDir(dirPath) {
  await fs.mkdir(dirPath, { recursive: true });
}

/**
 * Derives a local file name from the last path segment of a URL.
 *
 * Anything from "@!" onwards in that segment is dropped. When the segment
 * has no extension, one is guessed from the Content-Type.
 *
 * @param {string} url
 * @param {string|null|undefined} contentType Content-Type header value, if known.
 * @returns {string} File name; "file.bin" when the URL cannot be parsed.
 */
function sanitizeFilename(url, contentType) {
  try {
    const parsed = new URLParser(url);
    let basename = path.basename(parsed.pathname);
    if (basename.includes('@!')) {
      basename = basename.split('@!')[0];
    }
    if (!basename) {
      basename = 'file';
    }
    const { name, ext } = path.parse(basename);
    if (!ext) {
      return name + guessExtension(contentType);
    }
    return basename;
  } catch {
    return 'file.bin';
  }
}

/**
 * Maps a Content-Type header value to a file extension.
 *
 * @param {string|null|undefined} contentType
 * @returns {string} Extension including the dot; ".bin" when unknown.
 */
function guessExtension(contentType) {
  if (!contentType) {
    return '.bin';
  }
  // Media types are case-insensitive; parameters (";charset=...") are ignored.
  const mime = contentType.split(';')[0].trim().toLowerCase();
  return MIME_EXTENSIONS.get(mime) ?? '.bin';
}

/**
 * Returns a path that does not exist yet: basePath itself, or the first
 * free "<name>_<n><ext>" sibling (n = 1, 2, ...).
 *
 * @param {string} basePath
 * @returns {Promise<string>}
 */
async function uniqueName(basePath) {
  try {
    await fs.access(basePath);
  } catch {
    return basePath;
  }

  const { dir, name, ext } = path.parse(basePath);
  for (let idx = 1; ; idx++) {
    const candidate = path.join(dir, `${name}_${idx}${ext}`);
    try {
      await fs.access(candidate);
    } catch {
      return candidate;
    }
  }
}

/**
 * Streams an HTTP response body into a new file inside destDir.
 *
 * On any failure the response is destroyed, the partially written file is
 * removed and the error is rethrown.
 *
 * @param {import('http').IncomingMessage} resp
 * @param {string} url URL the response belongs to (used for the file name).
 * @param {string} destDir
 * @returns {Promise<string>} Path of the written file.
 */
async function saveResponse(resp, url, destDir) {
  // Keep a listener attached while the target file is being prepared so an
  // aborted response cannot surface as an unhandled 'error' event.
  let streamError = null;
  resp.on('error', (err) => {
    if (!streamError) {
      streamError = err;
    }
  });

  let handle = null;
  let target = null;
  try {
    const fname = sanitizeFilename(url, resp.headers['content-type']);
    target = await uniqueName(path.join(destDir, fname));
    handle = await fs.open(target, 'w');
    if (streamError) {
      throw streamError;
    }
    await pipeline(resp, handle.createWriteStream());
    return target;
  } catch (err) {
    resp.destroy();
    if (handle) {
      // Best-effort cleanup; the original error is what gets reported.
      await handle.close().catch(() => {});
      await fs.unlink(target).catch(() => {});
    }
    throw err;
  }
}

/**
 * Downloads one URL into destDir.
 *
 * Never rejects: the outcome is reported as a tuple. Any status other than
 * 200 (redirects included) is reported as a failure.
 *
 * @param {string} url
 * @param {string} destDir
 * @param {number} timeout Passed as the `timeout` option of http(s).get.
 * @returns {Promise<[boolean, string|null]>} [true, null] on success,
 *          [false, message] on failure.
 */
function downloadOne(url, destDir, timeout) {
  return new Promise((resolve) => {
    let settled = false;
    let timedOut = false;
    let responseStarted = false;

    const settle = (ok, message) => {
      if (settled) {
        return;
      }
      settled = true;
      resolve([ok, message]);
    };
    const fail = (err) => {
      settle(false, timedOut ? 'Request timeout' : err.message);
    };

    const storeBody = async (resp) => {
      try {
        const target = await saveResponse(resp, url, destDir);
        if (timedOut) {
          // The request was torn down while the body was being written, so
          // the file may be truncated.
          await fs.unlink(target).catch(() => {});
          settle(false, 'Request timeout');
          return;
        }
        settle(true, null);
      } catch (err) {
        fail(err);
      }
    };

    try {
      const parsed = new URLParser(url);
      const client = parsed.protocol === 'https:' ? https : http;
      // SECURITY NOTE(review): rejectUnauthorized: false disables TLS
      // certificate verification. Kept as-is because existing callers may
      // depend on it; confirm whether it is really required.
      const options = { rejectUnauthorized: false, timeout };

      const req = client.get(url, options, (resp) => {
        if (resp.statusCode !== 200) {
          // The outcome is already known; just drain the body.
          resp.on('error', () => {});
          resp.resume();
          settle(false, `HTTP ${resp.statusCode}`);
          return;
        }
        responseStarted = true;
        void storeBody(resp);
      });

      req.on('error', (err) => {
        // Once the body is being written, storeBody() reports the failure
        // after it has removed the partial file.
        if (!responseStarted) {
          fail(err);
        }
      });

      req.on('timeout', () => {
        timedOut = true;
        req.destroy(new Error('Request timeout'));
      });
    } catch (err) {
      settle(false, err.message);
    }
  });
}

/**
 * Deletes every zero-byte regular file directly inside destDir.
 *
 * Errors are logged, not thrown.
 *
 * @param {string} destDir
 * @returns {Promise<string[]>} Names of the files removed before any error.
 */
async function deleteZeroByteFiles(destDir) {
  const removed = [];
  try {
    const files = await fs.readdir(destDir);
    for (const file of files) {
      const filePath = path.join(destDir, file);
      const stats = await fs.stat(filePath);
      if (stats.isFile() && stats.size === 0) {
        await fs.unlink(filePath);
        removed.push(file);
      }
    }
  } catch (err) {
    console.error(`Error deleting zero-byte files: ${err.message}`);
  }
  return removed;
}

/**
 * Maps each URL to the file name expected for it when the Content-Type is
 * not known yet.
 *
 * @param {string[]} urls
 * @returns {Object<string, string>} url -> planned file name.
 */
function planDownloads(urls) {
  const plan = {};
  for (const url of urls) {
    plan[url] = sanitizeFilename(url, null);
  }
  return plan;
}

/**
 * Lists the planned URLs for which no matching file exists in destDir.
 *
 * A file matches when it has the planned name, or when its name without the
 * extension equals the planned one, optionally followed by the "_<n>"
 * suffix that uniqueName() appends. The extension is ignored because the
 * real one may have been guessed from the Content-Type at download time.
 *
 * Errors are logged, not thrown.
 *
 * @param {string} destDir
 * @param {Object<string, string>} plan url -> planned file name.
 * @returns {Promise<string[]>} URLs considered missing.
 */
async function verifyMissing(destDir, plan) {
  const missing = [];
  try {
    const files = await fs.readdir(destDir);
    const existing = new Set();
    const existingStems = new Set();
    for (const file of files) {
      const stats = await fs.stat(path.join(destDir, file));
      if (!stats.isFile()) {
        continue;
      }
      existing.add(file);
      const stem = path.parse(file).name;
      existingStems.add(stem);
      // "photo_2" also satisfies a planned "photo".
      existingStems.add(stem.replace(/_\d+$/, ''));
    }

    for (const [url, fname] of Object.entries(plan)) {
      if (existing.has(fname)) {
        continue;
      }
      if (!existingStems.has(path.parse(fname).name)) {
        missing.push(url);
      }
    }
  } catch (err) {
    console.error(`Error verifying missing files: ${err.message}`);
  }
  return missing;
}

/**
 * Copies the non-empty lines of sourcesPath to <destRoot>/urls.txt.
 *
 * @param {string} destRoot Created if it does not exist.
 * @param {string} sourcesPath
 * @returns {Promise<void>}
 */
async function saveSourceUrls(destRoot, sourcesPath) {
  await ensureDir(destRoot);
  const sources = await loadLines(sourcesPath);
  await fs.writeFile(
    path.join(destRoot, 'urls.txt'),
    `${sources.join('\n')}\n`,
    'utf-8'
  );
}

/**
 * Resolves after the given delay.
 *
 * @param {number} ms
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Entry point: download everything, retry what is missing, record the
 * source URLs and remove the input URL list.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = parseArgs();
  const today = getCurrentDate();
  // NOTE(review): args.title is used verbatim, so a title containing a path
  // separator creates nested directories - confirm whether that is intended.
  const destRoot = path.join('downloads', `${today}-${args.title}-${args.pageId}`);
  const mediaDir = path.join(destRoot, 'media');
  await ensureDir(mediaDir);

  const urlsPath = args.urls;
  const urls = await loadLines(urlsPath);
  const plan = planDownloads(urls);

  console.log(`Starting download of ${urls.length} URLs...`);
  for (const url of urls) {
    const [ok, err] = await downloadOne(url, mediaDir, args.timeout);
    if (!ok) {
      console.error(`WARN: download failed for ${url}: ${err}`);
    }
    await sleep(50);
  }

  const removed = await deleteZeroByteFiles(mediaDir);
  if (removed.length > 0) {
    console.log(`Removed zero-byte files: ${removed.join(', ')}`);
  }

  let missing = await verifyMissing(mediaDir, plan);
  let attempts = 0;
  while (missing.length > 0 && attempts < args.retries) {
    attempts++;
    console.log(`Retry round ${attempts}: missing ${missing.length} files`);

    for (const url of missing) {
      const [ok, err] = await downloadOne(url, mediaDir, args.timeout);
      if (!ok) {
        console.error(`WARN: retry failed for ${url}: ${err}`);
      }
    }

    const removedRetry = await deleteZeroByteFiles(mediaDir);
    if (removedRetry.length > 0) {
      console.log(`Removed zero-byte files after retry: ${removedRetry.join(', ')}`);
    }

    // Re-check every retried URL on disk, exactly like the first pass, so a
    // "successful" retry whose zero-byte file was just deleted is not
    // silently counted as downloaded.
    const retryPlan = Object.fromEntries(missing.map((url) => [url, plan[url]]));
    missing = await verifyMissing(mediaDir, retryPlan);
  }

  if (missing.length > 0) {
    console.error('FAILED to fetch these URLs after retries:');
    for (const url of missing) {
      console.error(url);
    }
  } else {
    console.log('All planned files downloaded.');
  }

  await saveSourceUrls(destRoot, args.sources);
  console.log(`Saved source URLs to ${path.join(destRoot, 'urls.txt')}`);

  try {
    await fs.unlink(urlsPath);
    console.log(`Removed input file: ${urlsPath}`);
  } catch (err) {
    console.error(`WARN: failed to remove input file ${urlsPath}: ${err.message}`);
  }
}

main().catch((err) => {
  console.error(err);
  process.exit(1);
});