From 96afeb5e1c7c8e98836f00a457aeaa223d3fd490 Mon Sep 17 00:00:00 2001 From: Rogee Date: Mon, 2 Feb 2026 17:11:34 +0800 Subject: [PATCH] Initialize repo and update asset guidelines --- .gitignore | 1 + AGENTS.md | 34 ++++++++ download.py | 216 ++++++++++++++++++++++++++++++++++++++++++++++++ source_urls.txt | 2 + 4 files changed, 253 insertions(+) create mode 100644 .gitignore create mode 100644 AGENTS.md create mode 100644 download.py create mode 100644 source_urls.txt diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c4ffae9 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +downloads/ diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..b5b9b59 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,34 @@ +# AGENTS + +## Scope +This repository has no existing conventions. These guidelines apply to all work in this repo. + +## Network capture and assets +- Resource scope is limited to: audio, video, images, and viewport screenshots; do not collect other asset types unless explicitly requested. +- When a task asks to harvest page resources, prioritize what the user requests (e.g., only media or only core assets). Ask for scope if unclear. +- If the user provides a URL like `https://h5.rrx.cn/storeview/<page-id>.html`, extract `<page-id>`. Open a blank tab first, apply viewport override (width 390, height 844, devicePixelRatio 3, mobile: true, hasTouch: true), then navigate that tab to `https://ca.rrx.cn/v/<page-id>?rrxsrc=2&iframe=1&tpl=1`. Equivalent automation: call DevTools/Emulation to override device metrics with `{width:390,height:844,deviceScaleFactor:3, mobile:true, hasTouch:true}` before navigation to avoid double-loading assets. +- Use DevTools network captures to list requests; identify media by MIME or URL suffix. +- Save assets under `downloads/<date>-<title>-<page-id>/media/` (title from current page; date format `YYYYMMDD`) with clean filenames (strip query strings and `@!` size suffixes; keep proper extensions). 
After download, rename any files still containing size tokens or missing extensions to the original base name + proper extension. +- Also save the source page URL(s) provided by the user into the folder root as `downloads/<date>-<title>-<page-id>/urls.txt`. +- Prefer direct downloads (e.g., curl) if DevTools bodies are unavailable or truncated. +- After batch downloading, delete any 0-byte files, verify against the planned download list, and retry missing items up to 2 times; if still failing, stop and report the missing resources. +- After collecting all requested resources and screenshots, close any additional tabs/pages opened for capture. This is mandatory; do not leave capture tabs open. + +## Download script usage +- Use `python download.py --page-id <id> --title "<title>" --urls urls.txt --sources source_urls.txt` to batch download assets. The script generates `<date>` using format `YYYYMMDD`. +- `urls.txt` should list the target asset URLs (one per line) already filtered to the requested scope (e.g., media only). +- Downloads go to `downloads/<date>-<title>-<page-id>/media/`; filenames are cleaned (query/`@!` removed) and extensions retained/guessed; duplicates get numeric suffixes. +- After the batch finishes, the script deletes 0-byte files, compares against the planned list, retries missing items up to 2 times, and reports any still-missing resources. +- `urls.txt` is written to `downloads/<date>-<title>-<page-id>/urls.txt` to record user-provided page URLs. + +## Screenshots +- Default viewport for screenshots: width 390, height 844, devicePixelRatio 3 (mobile portrait). Do not change unless the user explicitly requests another size. +- Match the screenshot to the user’s requested viewport. If they mention a size, emulate it and verify with `window.innerWidth/innerHeight` and `devicePixelRatio`. 
+- Capture screenshots with Chrome DevTools (device emulation per above) and save to `downloads/<date>-<title>-<page-id>/index.png` (title from current page; date format `YYYYMMDD`); use full-page only when explicitly asked. + +## Communication and confirmation +- Do not ask for pre-work confirmation; proceed with default scope (media + viewport screenshot) unless the user explicitly specifies otherwise. +- After completion, briefly confirm collected assets (paths + key filenames); do not prompt for extra formats unless the user asks. + +## Safety and precision +- Avoid downloading unrequested resources. If download failures occur, retry and report any missing items clearly. diff --git a/download.py b/download.py new file mode 100644 index 0000000..168a552 --- /dev/null +++ b/download.py @@ -0,0 +1,216 @@ +#!/usr/bin/env python3 +"""Download assets to structured folder per AGENTS rules. + +Usage: + python download.py --page-id j4lexk --title "My Page" --urls urls.txt --sources source_urls.txt + +- Creates `downloads/<date>-<title>-<page-id>/media/` and stores downloads there (date format `YYYYMMDD`, generated by the script). +- Saves source URLs provided by user into `downloads/<date>-<title>-<page-id>/urls.txt` (input file given by `--sources`). +- Cleans filenames (strip query strings and `@!` size tokens; keep/guess extensions). +- Batch downloads, then deletes 0-byte files, compares against planned URLs, retries missing up to 2 times, and reports any remaining failures. Deletes the `--urls` input file when finished. 
+""" + +import argparse +import os +import pathlib +import sys +import time +from typing import Dict, List, Optional, Tuple + +import mimetypes +import urllib.parse +import requests + +DATE_FMT = "%Y%m%d" + +# Default retry policy +MAX_RETRIES = 2 +TIMEOUT = 30 + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Batch download assets per AGENTS rules" + ) + parser.add_argument("--page-id", required=True, help="Page ID, e.g. j4lexk") + parser.add_argument("--title", required=True, help="Page title for folder naming") + parser.add_argument( + "--urls", required=True, help="Path to text file with asset URLs (one per line)" + ) + parser.add_argument( + "--sources", + required=True, + help="Path to text file containing source page URLs to record (will be saved as urls.txt)", + ) + parser.add_argument( + "--retries", type=int, default=MAX_RETRIES, help="Retry count for missing files" + ) + parser.add_argument( + "--timeout", type=int, default=TIMEOUT, help="Request timeout seconds" + ) + return parser.parse_args() + + +def load_lines(path: str) -> List[str]: + with open(path, "r", encoding="utf-8") as f: + return [ln.strip() for ln in f if ln.strip()] + + +def ensure_dir(path: pathlib.Path) -> None: + path.mkdir(parents=True, exist_ok=True) + + +def sanitize_filename(url: str, content_type: Optional[str]) -> str: + """Strip query/size tokens and ensure extension.""" + parsed = urllib.parse.urlparse(url) + basename = os.path.basename(parsed.path) + # remove query-derived size tokens like @!user_image_800x1 + if "@!" 
in basename: + basename = basename.split("@!", 1)[0] + # fallback name + if not basename: + basename = "file" + root, ext = os.path.splitext(basename) + if not ext: + ext = guess_extension(content_type) + return root + ext + + +def guess_extension(content_type: Optional[str]) -> str: + if content_type: + ext = mimetypes.guess_extension(content_type.split(";")[0].strip()) + if ext: + return ext + return ".bin" + + +def unique_name(base: pathlib.Path) -> pathlib.Path: + if not base.exists(): + return base + stem = base.stem + suffix = base.suffix + parent = base.parent + idx = 1 + while True: + candidate = parent / f"{stem}_{idx}{suffix}" + if not candidate.exists(): + return candidate + idx += 1 + + +def download_one( + url: str, dest_dir: pathlib.Path, timeout: int +) -> Tuple[bool, Optional[str]]: + try: + resp = requests.get(url, timeout=timeout, stream=True) + if resp.status_code != 200: + return False, f"HTTP {resp.status_code}" + fname = sanitize_filename(url, resp.headers.get("Content-Type")) + target = unique_name(dest_dir / fname) + with open(target, "wb") as f: + for chunk in resp.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) + return True, None + except Exception as exc: # pylint: disable=broad-except + return False, str(exc) + + +def delete_zero_byte_files(dest_dir: pathlib.Path) -> List[str]: + removed = [] + for p in dest_dir.glob("*"): + if p.is_file() and p.stat().st_size == 0: + removed.append(p.name) + p.unlink() + return removed + + +def plan_downloads(urls: List[str]) -> Dict[str, str]: + """Return mapping of url -> planned base filename (sanitized) for verification.""" + plan = {} + for u in urls: + plan[u] = sanitize_filename(u, None) + return plan + + +def verify_missing(dest_dir: pathlib.Path, plan: Dict[str, str]) -> List[str]: + existing = {p.name for p in dest_dir.glob("*") if p.is_file()} + missing = [] + for url, fname in plan.items(): + # consider potential uniqueness suffix; check prefix match + if fname not in existing 
and not any( + name.startswith(fname.rsplit(".", 1)[0]) for name in existing + ): + missing.append(url) + return missing + + +def save_source_urls(dest_root: pathlib.Path, sources_path: pathlib.Path) -> None: + ensure_dir(dest_root) + sources = load_lines(str(sources_path)) + with open(dest_root / "urls.txt", "w", encoding="utf-8") as f: + for line in sources: + f.write(line + "\n") + + +def main() -> None: + args = parse_args() + today = time.strftime(DATE_FMT) + dest_root = pathlib.Path("downloads") / f"{today}-{args.title}-{args.page_id}" + media_dir = dest_root / "media" + ensure_dir(media_dir) + + urls_path = pathlib.Path(args.urls) + urls = load_lines(args.urls) + plan = plan_downloads(urls) + + # initial batch download + for url in urls: + ok, err = download_one(url, media_dir, args.timeout) + if not ok: + print(f"WARN: download failed for {url}: {err}", file=sys.stderr) + time.sleep(0.05) # mild pacing + + removed = delete_zero_byte_files(media_dir) + if removed: + print(f"Removed zero-byte files: {removed}") + + missing = verify_missing(media_dir, plan) + attempts = 0 + while missing and attempts < args.retries: + attempts += 1 + print(f"Retry round {attempts}: missing {len(missing)} files") + still_missing = [] + for url in missing: + ok, err = download_one(url, media_dir, args.timeout) + if not ok: + print(f"WARN: retry failed for {url}: {err}", file=sys.stderr) + still_missing.append(url) + removed_retry = delete_zero_byte_files(media_dir) + if removed_retry: + print(f"Removed zero-byte files after retry: {removed_retry}") + missing = ( + verify_missing(media_dir, {u: plan[u] for u in still_missing}) + if still_missing + else [] + ) + + if missing: + print("FAILED to fetch these URLs after retries:", file=sys.stderr) + for url in missing: + print(url, file=sys.stderr) + else: + print("All planned files downloaded.") + + save_source_urls(dest_root, pathlib.Path(args.sources)) + print(f"Saved source URLs to {dest_root / 'urls.txt'}") + + try: + 
urls_path.unlink() + print(f"Removed input file: {urls_path}") + except Exception as exc: # pylint: disable=broad-except + print(f"WARN: failed to remove input file {urls_path}: {exc}", file=sys.stderr) + + +if __name__ == "__main__": + main() diff --git a/source_urls.txt b/source_urls.txt new file mode 100644 index 0000000..b71100e --- /dev/null +++ b/source_urls.txt @@ -0,0 +1,2 @@ +https://h5.rrx.cn/storeview/f4kz2m.html +https://ca.rrx.cn/v/f4kz2m?rrxsrc=2&iframe=1&tpl=1