rrx-downloader/download.mjs

#!/usr/bin/env node
import fs from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';
import https from 'https';
import http from 'http';
import { URL as URLParser } from 'url';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

const MAX_RETRIES = 2;
const TIMEOUT = 30000;
const DATE_FMT = '%Y%m%d';

function getCurrentDate() {
  const now = new Date();
  const year = now.getFullYear();
  const month = String(now.getMonth() + 1).padStart(2, '0');
  const day = String(now.getDate()).padStart(2, '0');
  return `${year}${month}${day}`;
}

function parseArgs() {
  const args = process.argv.slice(2);
  const result = {
    pageId: null,
    title: null,
    urls: null,
    sources: null,
    retries: MAX_RETRIES,
    timeout: TIMEOUT
  };

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    if (arg === '--page-id') {
      result.pageId = args[++i];
    } else if (arg === '--title') {
      result.title = args[++i];
    } else if (arg === '--urls') {
      result.urls = args[++i];
    } else if (arg === '--sources') {
      result.sources = args[++i];
    } else if (arg === '--retries') {
      result.retries = parseInt(args[++i], 10);
    } else if (arg === '--timeout') {
      result.timeout = parseInt(args[++i], 10);
    }
  }

  if (!result.pageId || !result.title || !result.urls || !result.sources) {
    console.error('Usage: node download.mjs --page-id <id> --title "<title>" --urls urls.txt --sources source_urls.txt');
    process.exit(1);
  }

  return result;
}

async function loadLines(filepath) {
  const content = await fs.readFile(filepath, 'utf-8');
  return content.split('\n').map(line => line.trim()).filter(line => line.length > 0);
}

async function ensureDir(dirPath) {
  await fs.mkdir(dirPath, { recursive: true });
}

function sanitizeFilename(url, contentType) {
  try {
    const parsed = new URLParser(url);
    let basename = path.basename(parsed.pathname);

    if (basename.includes('@!')) {
      basename = basename.split('@!')[0];
    }

    if (!basename) {
      basename = 'file';
    }

    const { name, ext } = path.parse(basename);
    if (!ext) {
      const guessedExt = guessExtension(contentType);
      return name + guessedExt;
    }

    return basename;
  } catch {
    return 'file.bin';
  }
}

function guessExtension(contentType) {
  if (contentType) {
    const mime = contentType.split(';')[0].trim();
    const map = {
      'image/jpeg': '.jpg',
      'image/png': '.png',
      'image/gif': '.gif',
      'image/webp': '.webp',
      'image/svg+xml': '.svg',
      'audio/mpeg': '.mp3',
      'audio/wav': '.wav',
      'audio/ogg': '.ogg',
      'audio/aac': '.aac',
      'video/mp4': '.mp4',
      'video/webm': '.webm',
      'video/ogg': '.ogv',
      'application/octet-stream': '.bin'
    };
    return map[mime] || '.bin';
  }
  return '.bin';
}

async function uniqueName(basePath) {
  try {
    await fs.access(basePath);
    const { dir, name, ext } = path.parse(basePath);
    let idx = 1;
    while (true) {
      const candidate = path.join(dir, `${name}_${idx}${ext}`);
      try {
        await fs.access(candidate);
        idx++;
      } catch {
        return candidate;
      }
    }
  } catch {
    return basePath;
  }
}

function downloadOne(url, destDir, timeout) {
  return new Promise((resolve) => {
    try {
      const parsed = new URLParser(url);
      const client = parsed.protocol === 'https:' ? https : http;

      const options = {
        rejectUnauthorized: false,
        timeout
      };

      const req = client.get(url, options, async (resp) => {
        if (resp.statusCode !== 200) {
          resolve([false, `HTTP ${resp.statusCode}`]);
          resp.resume();
          return;
        }

        const contentType = resp.headers['content-type'];
        const fname = sanitizeFilename(url, contentType);
        const target = await uniqueName(path.join(destDir, fname));

        const fileStream = await fs.open(target, 'w');
        const writeStream = fileStream.createWriteStream();

        let failed = false;
        let errorMsg = null;

        writeStream.on('error', (err) => {
          failed = true;
          errorMsg = err.message;
          writeStream.close();
          fileStream.close();
        });

        writeStream.on('finish', () => {
          writeStream.close();
          fileStream.close();
        });

        writeStream.on('close', () => {
          if (failed) {
            fs.unlink(target).catch(() => {});
            resolve([false, errorMsg]);
          } else {
            resolve([true, null]);
          }
        });

        resp.pipe(writeStream);
      });

      req.on('error', (err) => {
        resolve([false, err.message]);
      });

      req.on('timeout', () => {
        req.destroy();
        resolve([false, 'Request timeout']);
      });

    } catch (err) {
      resolve([false, err.message]);
    }
  });
}

async function deleteZeroByteFiles(destDir) {
  const removed = [];
  try {
    const files = await fs.readdir(destDir);
    for (const file of files) {
      const filePath = path.join(destDir, file);
      const stats = await fs.stat(filePath);
      if (stats.isFile() && stats.size === 0) {
        await fs.unlink(filePath);
        removed.push(file);
      }
    }
  } catch (err) {
    console.error(`Error deleting zero-byte files: ${err.message}`);
  }
  return removed;
}

function planDownloads(urls) {
  const plan = {};
  for (const url of urls) {
    plan[url] = sanitizeFilename(url, null);
  }
  return plan;
}

async function verifyMissing(destDir, plan) {
  const missing = [];
  try {
    const files = await fs.readdir(destDir);
    const existing = new Set();

    for (const file of files) {
      const filePath = path.join(destDir, file);
      const stats = await fs.stat(filePath);
      if (stats.isFile()) {
        existing.add(file);
      }
    }

    for (const [url, fname] of Object.entries(plan)) {
      if (!existing.has(fname)) {
        const baseName = path.parse(fname).name;
        const hasPrefix = Array.from(existing).some(
          name => name.startsWith(baseName)
        );
        if (!hasPrefix) {
          missing.push(url);
        }
      }
    }
  } catch (err) {
    console.error(`Error verifying missing files: ${err.message}`);
  }
  return missing;
}

async function saveSourceUrls(destRoot, sourcesPath) {
  await ensureDir(destRoot);
  const sources = await loadLines(sourcesPath);
  await fs.writeFile(
    path.join(destRoot, 'urls.txt'),
    sources.join('\n') + '\n',
    'utf-8'
  );
}

function sleep(ms) {
  return new Promise(resolve => setTimeout(resolve, ms));
}

async function main() {
  const args = parseArgs();
  const today = getCurrentDate();
  const destRoot = path.join('downloads', `${today}-${args.title}-${args.pageId}`);
  const mediaDir = path.join(destRoot, 'media');

  await ensureDir(mediaDir);

  const urlsPath = args.urls;
  const urls = await loadLines(urlsPath);
  const plan = planDownloads(urls);

  console.log(`Starting download of ${urls.length} URLs...`);

  for (const url of urls) {
    const [ok, err] = await downloadOne(url, mediaDir, args.timeout);
    if (!ok) {
      console.error(`WARN: download failed for ${url}: ${err}`);
    }
    await sleep(50);
  }

  const removed = await deleteZeroByteFiles(mediaDir);
  if (removed.length > 0) {
    console.log(`Removed zero-byte files: ${removed.join(', ')}`);
  }

  let missing = await verifyMissing(mediaDir, plan);
  let attempts = 0;

  while (missing.length > 0 && attempts < args.retries) {
    attempts++;
    console.log(`Retry round ${attempts}: missing ${missing.length} files`);

    const stillMissing = [];
    for (const url of missing) {
      const [ok, err] = await downloadOne(url, mediaDir, args.timeout);
      if (!ok) {
        console.error(`WARN: retry failed for ${url}: ${err}`);
        stillMissing.push(url);
      }
    }

    const removedRetry = await deleteZeroByteFiles(mediaDir);
    if (removedRetry.length > 0) {
      console.log(`Removed zero-byte files after retry: ${removedRetry.join(', ')}`);
    }

    const retryPlan = {};
    for (const url of stillMissing) {
      retryPlan[url] = plan[url];
    }
    missing = stillMissing.length > 0 ? await verifyMissing(mediaDir, retryPlan) : [];
  }

  if (missing.length > 0) {
    console.error('FAILED to fetch these URLs after retries:');
    for (const url of missing) {
      console.error(url);
    }
  } else {
    console.log('All planned files downloaded.');
  }

  await saveSourceUrls(destRoot, args.sources);
  console.log(`Saved source URLs to ${path.join(destRoot, 'urls.txt')}`);

  try {
    await fs.unlink(urlsPath);
    console.log(`Removed input file: ${urlsPath}`);
  } catch (err) {
    console.error(`WARN: failed to remove input file ${urlsPath}: ${err.message}`);
  }
}

main().catch(err => {
  console.error(err);
  process.exit(1);
});