247 lines
6.7 KiB
JavaScript
247 lines
6.7 KiB
JavaScript
#!/usr/bin/env node
|
|
import fs from 'fs/promises';
|
|
import path from 'path';
|
|
import { spawn } from 'child_process';
|
|
import process from 'process';
|
|
import { chromium } from 'playwright';
|
|
import http from 'http';
|
|
import https from 'https';
|
|
|
|
/**
 * Device-emulation settings for the capture browser context.
 *
 * The object is handed unchanged to `browser.newContext()` in `main`.
 * NOTE(review): Playwright documents the viewport size as a nested
 * `viewport: { width, height }` option — confirm that the top-level
 * `width`/`height` keys used here are actually honored.
 */
const VIEWPORT = {
  // Logical page size.
  width: 390,
  height: 844,

  // Pixel density and mobile/touch emulation flags.
  deviceScaleFactor: 3,
  isMobile: true,
  hasTouch: true,
};
|
|
|
|
// Content-type prefixes that mark a response as media (compared lowercase).
const MEDIA_MIME_PREFIXES = [
  'audio/',
  'video/',
  'image/',
];

// File extensions that mark a URL as media when no media content-type is seen.
const MEDIA_EXTENSIONS = [
  // audio
  '.mp3',
  '.wav',
  '.ogg',
  '.aac',
  '.m4a',
  '.flac',
  // video
  '.mp4',
  '.webm',
  '.ogv',
  '.avi',
  '.mov',
  // images
  '.jpg',
  '.jpeg',
  '.png',
  '.gif',
  '.webp',
  '.svg',
  '.bmp',
  '.ico',
];
|
|
|
|
/**
 * Format today's local date as a compact `YYYYMMDD` string.
 *
 * @returns {string} Local year followed by zero-padded month and day.
 */
function getCurrentDate() {
  const today = new Date();
  const twoDigits = (value) => String(value).padStart(2, '0');

  // getMonth() is zero-based, hence the +1.
  const parts = [
    String(today.getFullYear()),
    twoDigits(today.getMonth() + 1),
    twoDigits(today.getDate()),
  ];
  return parts.join('');
}
|
|
|
|
/**
 * Pull the page identifier out of a page URL.
 *
 * Two URL shapes are recognised, tried in this order:
 * `storeview/<id>` and `/v/<id>`, where `<id>` is a run of ASCII letters
 * and digits (case-insensitive).
 *
 * @param {string} url - Page URL to inspect.
 * @returns {string|null} The captured id, or `null` when neither shape matches.
 */
function extractPageId(url) {
  const idPatterns = [/storeview\/([a-z0-9]+)/i, /\/v\/([a-z0-9]+)/i];

  // First pattern that yields a non-empty capture wins.
  const hit = idPatterns
    .map((pattern) => url.match(pattern))
    .find((found) => found && found[1]);

  return hit ? hit[1] : null;
}
|
|
|
|
/**
 * Turn a page title into a string that is safe to embed in a directory name.
 *
 * Removes characters that are illegal in file names (`< > : " / \ | ? *`)
 * as well as ASCII control characters (newlines, tabs, ...), trims
 * surrounding whitespace and limits the result to 100 UTF-16 code units.
 * The result never ends in whitespace or in half of a surrogate pair, so
 * sanitizing an already sanitized title is a no-op.
 *
 * @param {string|null|undefined} title - Raw page title.
 * @returns {string} The cleaned title, or `'page'` when nothing usable remains.
 */
function sanitizeTitle(title) {
  if (!title) {
    return 'page';
  }

  const cleaned = title
    // Strip reserved file-name characters and control characters.
    .replace(/[<>:"/\\|?*\u0000-\u001f]/g, '')
    .trim()
    .slice(0, 100)
    // The cut may have split a surrogate pair; drop the orphaned half.
    .replace(/[\uD800-\uDBFF]$/, '')
    // The cut may also have exposed interior whitespace at the end.
    .trim();

  return cleaned || 'page';
}
|
|
|
|
/**
 * Parse the command line (`process.argv`) for this script.
 *
 * Recognised input:
 *   - the first argument that does not start with `--` is the page URL;
 *   - `--cdp <url>` or `--cdp=<url>` sets the Chrome DevTools endpoint,
 *     overriding the `ENV_CDP` environment variable.
 * Other `--` options are ignored.
 *
 * Prints a message and exits the process with status 1 when the page URL is
 * missing, when `--cdp` has no value, or when no page id can be extracted
 * from the URL.
 *
 * @returns {{pageId: string, pageUrl: string, cdpUrl: (string|null)}}
 */
function parseArgs() {
  const usage =
    'Usage: node run.mjs <page-url> [--cdp ws://host:port/devtools/browser/<id>|http://host:port] (or ENV_CDP)';
  const cdpAssignPrefix = '--cdp=';

  const failMissingCdpValue = () => {
    console.error('Error: --cdp requires a value');
    console.error(usage);
    process.exit(1);
  };

  const args = process.argv.slice(2);
  let pageUrl = null;
  let cdpUrl = process.env.ENV_CDP || null;

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    if (arg === '--cdp') {
      const value = args[i + 1];
      // Without this check a trailing `--cdp` would silently discard ENV_CDP
      // and fall back to launching a local browser.
      if (value === undefined || value.startsWith('--')) {
        failMissingCdpValue();
      }
      cdpUrl = value;
      i += 1;
    } else if (arg.startsWith(cdpAssignPrefix)) {
      const value = arg.slice(cdpAssignPrefix.length);
      if (value === '') {
        failMissingCdpValue();
      }
      cdpUrl = value;
    } else if (!arg.startsWith('--') && !pageUrl) {
      pageUrl = arg;
    }
  }

  if (!pageUrl) {
    console.error(usage);
    process.exit(1);
  }

  const pageId = extractPageId(pageUrl);
  if (!pageId) {
    console.error('Error: Could not extract page-id from the URL');
    process.exit(1);
  }

  return { pageId, pageUrl, cdpUrl };
}
|
|
|
|
/**
 * Decide whether a network resource is an audio, video or image asset.
 *
 * A resource counts as media when either
 *   - its content-type starts with one of `MEDIA_MIME_PREFIXES`, or
 *   - its path or query string contains one of `MEDIA_EXTENSIONS` that is
 *     not followed by a further letter or digit.
 *
 * The boundary rule keeps CDN-style suffixes working (`a.jpg?x=1`,
 * `a.png!w750`, `a.jpg_200x200`) while rejecting look-alikes such as
 * `/assets.icons/` (contains `.ico`). The host name is not inspected, so
 * `cdn.movies.example.com` no longer matches `.mov`.
 *
 * @param {string} url - Request or response URL.
 * @param {string} [mimeType] - Value of the content-type header, if any.
 * @returns {boolean} `true` when the resource looks like media.
 */
function isMediaUrl(url, mimeType) {
  if (mimeType) {
    const lowerMime = mimeType.toLowerCase();
    if (MEDIA_MIME_PREFIXES.some((prefix) => lowerMime.startsWith(prefix))) {
      return true;
    }
  }

  // Only the path and query can carry a file extension; fall back to the raw
  // string for relative or otherwise unparsable URLs.
  let haystack;
  try {
    const parsed = new URL(url);
    haystack = `${parsed.pathname}${parsed.search}`;
  } catch {
    haystack = String(url);
  }
  haystack = haystack.toLowerCase();

  return MEDIA_EXTENSIONS.some((ext) => {
    let at = haystack.indexOf(ext);
    while (at !== -1) {
      const following = haystack.charAt(at + ext.length);
      if (following === '' || !/[a-z0-9]/.test(following)) {
        return true;
      }
      at = haystack.indexOf(ext, at + 1);
    }
    return false;
  });
}
|
|
|
|
/**
 * Build the URL that is actually loaded for capturing.
 *
 * The page id is extracted from `pageUrl` with `extractPageId` and placed
 * into a fixed `https://ca.rrx.cn/v/<id>` address with a fixed query string.
 *
 * @param {string} pageUrl - Page URL as given by the user.
 * @returns {string} The capture URL.
 */
function buildCaptureUrl(pageUrl) {
  const id = extractPageId(pageUrl);
  const captureUrl = `https://ca.rrx.cn/v/${id}?rrxsrc=2&iframe=1&tpl=1`;
  return captureUrl;
}
|
|
|
|
/**
 * Load the capture URL in a new page, record media URLs seen on the network
 * and save a screenshot of the viewport.
 *
 * Media URLs are collected from two sources and de-duplicated:
 *   - requests whose resource type is `image` or `media`;
 *   - responses for which `isMediaUrl(url, content-type)` is true.
 *
 * The screenshot is written to
 * `downloads/<today>-<sanitized title>-<pageId>/index.png`; the directory is
 * created if needed. The page is always closed, even when navigation fails.
 *
 * @param {object} context - Browser context providing `newPage()`.
 * @param {string} captureUrl - URL to navigate to.
 * @param {string} pageId - Page id used in the output directory name.
 * @param {string} today - Date string used in the output directory name.
 * @returns {Promise<{mediaUrls: string[], pageTitle: string}>}
 */
async function captureNetworkAndScreenshot(context, captureUrl, pageId, today) {
  const page = await context.newPage();
  const seenMediaUrls = new Set();

  const recordRequest = (req) => {
    const kind = req.resourceType();
    if (['image', 'media'].includes(kind)) {
      seenMediaUrls.add(req.url());
    }
  };

  const recordResponse = (res) => {
    const resUrl = res.url();
    const contentType = res.headers()['content-type'] || '';
    if (isMediaUrl(resUrl, contentType)) {
      seenMediaUrls.add(resUrl);
    }
  };

  // Listeners must be attached before navigation so nothing is missed.
  page.on('request', recordRequest);
  page.on('response', recordResponse);

  try {
    console.log('Navigating to:', captureUrl);
    await page.goto(captureUrl, { waitUntil: 'networkidle', timeout: 30000 });
    // Extra grace period after the network went idle.
    await page.waitForTimeout(3000);

    const rawTitle = await page.title();
    const pageTitle = sanitizeTitle(rawTitle);

    const folderName = `${today}-${pageTitle}-${pageId}`;
    const destRoot = path.join('downloads', folderName);
    await fs.mkdir(destRoot, { recursive: true });

    const screenshotPath = path.join(destRoot, 'index.png');
    console.log('Taking screenshot...');
    await page.screenshot({ path: screenshotPath, fullPage: false });
    console.log('Screenshot saved:', screenshotPath);

    return { mediaUrls: Array.from(seenMediaUrls), pageTitle };
  } finally {
    await page.close();
  }
}
|
|
|
|
/**
 * Run `download.mjs` as a child process and wait for it to finish.
 *
 * The child inherits this process's stdio and is started with the same Node
 * binary that runs this script (`process.execPath`), so it does not depend
 * on `node` being on PATH.
 *
 * @param {string} pageId - Value for `--page-id`.
 * @param {string} title - Value for `--title`.
 * @param {string} urlsFile - Value for `--urls`.
 * @param {string} sourcesFile - Value for `--sources`.
 * @returns {Promise<number|null>} Resolves with the child's exit code
 *   (`null` when it was terminated by a signal). Rejects when the child
 *   process could not be started.
 */
function runDownloadScript(pageId, title, urlsFile, sourcesFile) {
  return new Promise((resolve, reject) => {
    const args = [
      'download.mjs',
      '--page-id', pageId,
      '--title', title,
      '--urls', urlsFile,
      '--sources', sourcesFile,
    ];

    const proc = spawn(process.execPath, args, { stdio: 'inherit' });

    // Without an 'error' listener a failed spawn becomes an uncaught
    // exception and this promise would never settle.
    proc.once('error', (err) => {
      reject(new Error(`Failed to start download.mjs: ${err.message}`, { cause: err }));
    });
    proc.once('close', (code) => {
      resolve(code);
    });
  });
}
|
|
|
|
/**
 * GET a URL over HTTP or HTTPS and parse the response body as JSON.
 *
 * The client module is chosen from the URL scheme (`https:` uses `https`,
 * anything else uses `http`).
 *
 * @param {string} url - Absolute URL to fetch.
 * @param {number} [timeoutMs=10000] - Socket inactivity timeout in
 *   milliseconds; the request is aborted when it elapses.
 * @returns {Promise<*>} Resolves with the parsed JSON value. Rejects on a
 *   non-200 status (`HTTP <status>`), a network or stream error, a timeout,
 *   or an unparsable body.
 */
function fetchJson(url, timeoutMs = 10000) {
  return new Promise((resolve, reject) => {
    const client = url.startsWith('https:') ? https : http;

    const req = client.get(url, (res) => {
      if (res.statusCode !== 200) {
        // Drain the body so the socket is released.
        res.resume();
        reject(new Error(`HTTP ${res.statusCode}`));
        return;
      }

      let body = '';
      res.setEncoding('utf8');
      res.on('data', (chunk) => {
        body += chunk;
      });
      res.on('error', reject);
      res.on('end', () => {
        try {
          resolve(JSON.parse(body));
        } catch (err) {
          reject(err);
        }
      });
    });

    req.on('error', reject);

    // An unresponsive endpoint would otherwise keep the caller waiting
    // forever; destroying the request surfaces the error via 'error'.
    req.setTimeout(timeoutMs, () => {
      req.destroy(new Error(`Request to ${url} timed out after ${timeoutMs} ms`));
    });
  });
}
|
|
|
|
/**
 * Turn a user-supplied DevTools address into a WebSocket debugger URL.
 *
 *   - empty / missing value  -> `null`
 *   - `ws://` or `wss://` URL -> returned unchanged
 *   - anything else           -> `<address>/json/version` is fetched with
 *     `fetchJson` and its `webSocketDebuggerUrl` field is returned
 *
 * @param {string|null} cdpUrl - WebSocket URL or HTTP base address.
 * @returns {Promise<string|null>} The WebSocket debugger URL, or `null`.
 * @throws {Error} When the version document has no `webSocketDebuggerUrl`.
 */
async function resolveCdpUrl(cdpUrl) {
  if (!cdpUrl) {
    return null;
  }

  const isWebSocket = ['ws://', 'wss://'].some((scheme) => cdpUrl.startsWith(scheme));
  if (isWebSocket) {
    return cdpUrl;
  }

  // Drop a single trailing slash so the joined path has no `//`.
  const base = cdpUrl.replace(/\/$/, '');
  const endpointUrl = `${base}/json/version`;

  const versionInfo = await fetchJson(endpointUrl);
  const wsUrl = versionInfo && versionInfo.webSocketDebuggerUrl;
  if (!wsUrl) {
    throw new Error(`No webSocketDebuggerUrl found at ${endpointUrl}`);
  }
  return wsUrl;
}
|
|
|
|
/**
 * Script entry point: capture a page and hand its media URLs to the
 * download script.
 *
 * Steps:
 *   1. Parse the command line and write the source URL to `temp/sources.txt`.
 *   2. Connect to a remote browser (when a CDP address is configured) or
 *      launch a local headless Chromium.
 *   3. Open the capture URL in a mobile-emulating context, take a screenshot
 *      and collect media URLs.
 *   4. Write the URLs to `temp/urls.txt` and run `download.mjs` on them.
 *
 * Failures during capture or download are logged and reported through
 * `process.exitCode = 1`; the browser is closed in every case.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const { pageId, pageUrl, cdpUrl } = parseArgs();
  console.log('Extracted page-id:', pageId);

  const today = getCurrentDate();
  const tempDir = 'temp';
  await fs.mkdir(tempDir, { recursive: true });

  const urlsFile = path.join(tempDir, 'urls.txt');
  const sourcesFile = path.join(tempDir, 'sources.txt');
  await fs.writeFile(sourcesFile, `${pageUrl}\n`, 'utf-8');

  let browser;
  if (cdpUrl) {
    console.log('Resolving Chrome DevTools endpoint from:', cdpUrl);
    const resolvedCdpUrl = await resolveCdpUrl(cdpUrl);
    console.log('Connecting to remote Chrome DevTools:', resolvedCdpUrl);
    browser = await chromium.connectOverCDP(resolvedCdpUrl);
  } else {
    console.log('Launching Playwright Chromium...');
    browser = await chromium.launch({ headless: true });
  }

  const captureUrl = buildCaptureUrl(pageUrl);
  console.log('Capture URL:', captureUrl);

  // Playwright expects the page size under `viewport`; top-level
  // `width`/`height` are not context options. Build the nested form from
  // VIEWPORT, keeping an explicit `viewport` entry if one is ever added.
  const { width, height, viewport = { width, height }, ...deviceOptions } = VIEWPORT;
  const contextOptions = { ...deviceOptions, viewport };

  let context;
  let failed = false;
  try {
    // Created inside the try so the browser is closed even if this fails.
    context = await browser.newContext(contextOptions);

    const { mediaUrls, pageTitle } = await captureNetworkAndScreenshot(
      context,
      captureUrl,
      pageId,
      today,
    );

    console.log('Found', mediaUrls.length, 'media URLs');
    if (mediaUrls.length > 0) {
      await fs.writeFile(urlsFile, `${mediaUrls.join('\n')}\n`, 'utf-8');
      const exitCode = await runDownloadScript(pageId, pageTitle, urlsFile, sourcesFile);
      if (exitCode !== 0) {
        console.error('Download script exited with code:', exitCode);
        failed = true;
      }
    } else {
      console.log('No media URLs found, skipping download');
    }
  } catch (error) {
    console.error('Error during capture:', error.message);
    failed = true;
  } finally {
    console.log('Closing browser...');
    try {
      if (context) {
        await context.close();
      }
    } finally {
      // Runs even when closing the context throws.
      await browser.close();
    }
  }

  if (failed) {
    // Let callers and shell pipelines see the failure.
    process.exitCode = 1;
    console.error('Workflow finished with errors.');
    return;
  }

  console.log('Workflow completed.');
}
|
|
|
|
// Start the workflow; anything that escapes main() is printed in full and
// the process exits with status 1.
main().catch((fatalError) => {
  console.error(fatalError);
  process.exit(1);
});
|