# hometown/scripts/fetch_720yun.py

#!/usr/bin/env python3
"""
从 720yun 页面抓取全景资源并本地化。
脚本位于 scripts/，输出到项目根目录的 panorama/、config.json。
若页面由 JS 动态加载，请使用「手动获取」方式（见 README）。

用法（在项目根目录执行）:
  python3 scripts/fetch_720yun.py "https://www.720yun.com/vr/c8525usOunr"
"""
import re
import sys
import json
import urllib.request
import urllib.error
from pathlib import Path

# Project root directory: the parent of the directory that contains this
# script (the script path is resolved to an absolute path first).
ROOT = Path(__file__).resolve().parent.parent


def fetch_html(url):
    """Download *url* and return the response body as text.

    The request carries a desktop-browser User-Agent and uses a 15 second
    timeout. The body is decoded with the charset declared in the response's
    Content-Type header when one is present and known to Python; otherwise it
    is decoded as UTF-8. Undecodable bytes are replaced instead of raising.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response body as a ``str``.

    Raises:
        Whatever ``urllib.request.urlopen`` raises (for example
        ``urllib.error.URLError``) is propagated to the caller.
    """
    req = urllib.request.Request(url, headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    })
    with urllib.request.urlopen(req, timeout=15) as r:
        body = r.read()
        charset = r.headers.get_content_charset()
    if charset:
        try:
            return body.decode(charset, errors='replace')
        except LookupError:
            # The server declared a codec name Python does not know;
            # fall through to the UTF-8 default below.
            pass
    return body.decode('utf-8', errors='replace')


def _extract_balanced_json(html, start_marker, open_char='{', close_char='}'):
    results = []
    i = 0
    if open_char == '{':
        other_open, other_close = '[', ']'
    else:
        other_open, other_close = '{', '}'
    while True:
        pos = html.find(start_marker, i)
        if pos < 0:
            break
        start = pos + len(start_marker)
        while start < len(html) and html[start] in ' \t\n=':
            start += 1
        if start >= len(html):
            i = pos + 1
            continue
        if html[start] == open_char:
            depth = 0
            in_string = None
            escape = False
            j = start
            while j < len(html):
                c = html[j]
                if escape:
                    escape = False
                    j += 1
                    continue
                if c == '\\' and in_string:
                    escape = True
                    j += 1
                    continue
                if in_string:
                    if c == in_string:
                        in_string = None
                    j += 1
                    continue
                if c in '"\'':
                    in_string = c
                    j += 1
                    continue
                if c == open_char:
                    depth += 1
                elif c == close_char:
                    depth -= 1
                    if depth == 0:
                        results.append(html[start:j + 1])
                        break
                elif c == other_open:
                    depth += 1
                elif c == other_close:
                    depth -= 1
                j += 1
        elif html[start] == other_open:
            depth = 0
            in_string = None
            escape = False
            j = start
            while j < len(html):
                c = html[j]
                if escape:
                    escape = False
                    j += 1
                    continue
                if c == '\\' and in_string:
                    escape = True
                    j += 1
                    continue
                if in_string:
                    if c == in_string:
                        in_string = None
                    j += 1
                    continue
                if c in '"\'':
                    in_string = c
                    j += 1
                    continue
                if c == other_open:
                    depth += 1
                elif c == other_close:
                    depth -= 1
                    if depth == 0:
                        results.append(html[start:j + 1])
                        break
                elif c == open_char:
                    depth += 1
                elif c == close_char:
                    depth -= 1
                j += 1
        i = pos + 1
    return results


def find_json_assignments(html):
    """Collect embedded state blobs and ``"panorama"`` string values from *html*.

    Two kinds of candidates are gathered:

    * bracket-balanced literals that follow one of the known global-state
      markers (extracted with ``_extract_balanced_json``);
    * the string value of every ``"panorama": "..."`` pair found by regex.

    Args:
        html: Page source to scan.

    Returns:
        List of unique candidate strings in order of first appearance. Items
        are either raw object/array text or a ``"panorama"`` value.
    """
    markers = [
        'window.__INITIAL_STATE__',
        '__INITIAL_STATE__',
        'window.__DATA__',
        'window.__NUXT_DATA__',
    ]
    results = []
    for marker in markers:
        results.extend(_extract_balanced_json(html, marker, '{', '}'))
    for m in re.finditer(r'"panorama"\s*:\s*"([^"]+)"', html):
        value = m.group(1)
        # The capture is the inside of a JSON string literal, so escapes such
        # as "\/" or "\u0026" are still encoded. Decode them when the text is
        # a valid JSON string; otherwise keep the raw capture.
        try:
            value = json.loads('"' + value + '"')
        except ValueError:
            pass
        results.append(value)
    # '__INITIAL_STATE__' also matches inside 'window.__INITIAL_STATE__', so
    # the same blob is found twice; drop repeats while keeping first-seen order.
    return list(dict.fromkeys(results))


def find_image_urls(html):
    """Find image URLs in *html* that look like panorama resources.

    Two case-insensitive patterns are applied, both limited to URLs ending in
    ``.jpg``, ``.jpeg``, ``.png`` or ``.webp``:

    1. URLs whose text contains ``.720yun.com``, ``.qpic.cn`` or ``.gtimg.com``;
    2. URLs that contain a ``/panorama``, ``/scene``, ``/photo``, ``/pano``
       or ``/vr`` segment prefix.

    Args:
        html: Page source to scan.

    Returns:
        List of unique URLs: matches of the first pattern in document order,
        followed by matches of the second pattern not already present. The
        order is deterministic, which matters because the caller downloads
        the first entry.
    """
    url_pattern = re.compile(
        r'https?://[^\s"\'<>]+?\.(?:720yun\.com|qpic\.cn|gtimg\.com)[^\s"\'<>]*\.(?:jpg|jpeg|png|webp)',
        re.I
    )
    alt_pattern = re.compile(
        r'https?://[^\s"\'<>]+?/(?:panorama|scene|photo|pano|vr)[^\s"\'<>]*\.(?:jpg|jpeg|png|webp)',
        re.I
    )
    # A dict keeps insertion order and gives O(1) duplicate detection;
    # update() leaves keys that already exist at their original position.
    ordered = dict.fromkeys(url_pattern.findall(html))
    ordered.update(dict.fromkeys(alt_pattern.findall(html)))
    return list(ordered)


def _browser_headers(url=''):
    h = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.9',
    }
    if '720static.com' in url or '720yun.com' in url:
        h['Referer'] = 'https://www.720yun.com/'
    return h


def download_file(url, dest_path):
    """Fetch *url* and store the response body at *dest_path*.

    The request uses the headers produced by ``_browser_headers`` and a
    30 second timeout. The whole body is read into memory before anything is
    written.

    Args:
        url: Address of the resource to download.
        dest_path: Destination object providing ``write_bytes``
            (``main`` passes a ``pathlib.Path``).
    """
    request = urllib.request.Request(url, headers=_browser_headers(url))
    with urllib.request.urlopen(request, timeout=30) as response:
        payload = response.read()
    dest_path.write_bytes(payload)


def _collect_urls(obj, out):
    """Append http(s) URLs found under known image keys of *obj* to *out*.

    Dicts and lists are walked recursively. A string value is taken when its
    key is one of ``url``, ``panorama``, ``image``, ``src`` or ``pic`` and it
    starts with ``http``; every other value is descended into.
    """
    if isinstance(obj, dict):
        for k, v in obj.items():
            if k in ('url', 'panorama', 'image', 'src', 'pic') and isinstance(v, str) and v.startswith('http'):
                out.append(v)
            else:
                _collect_urls(v, out)
    elif isinstance(obj, list):
        for x in obj:
            _collect_urls(x, out)


def main():
    """Command-line entry point: fetch a page, download one image, update config.

    Steps:

    1. Read the page URL from ``sys.argv[1]`` (print usage and exit with
       status 1 when it is missing).
    2. Fetch the page and gather candidate image URLs, both directly from the
       HTML and from JSON blobs embedded in it.
    3. Download the first candidate to ``ROOT/panorama/panorama.jpg``.
    4. Create or update ``ROOT/config.json`` so that ``panorama`` points at
       the downloaded file; existing ``type`` and ``title`` values are kept.

    Exits with status 1 when the page request, the download, or reading an
    existing ``config.json`` fails, and with status 0 when no candidate URL
    is found.
    """
    if len(sys.argv) < 2:
        print('用法: python3 scripts/fetch_720yun.py <720yun页面URL>')
        print('例:   python3 scripts/fetch_720yun.py "https://www.720yun.com/vr/c8525usOunr"')
        sys.exit(1)
    url = sys.argv[1].strip()
    panorama_dir = ROOT / 'panorama'
    panorama_dir.mkdir(exist_ok=True)
    config_path = ROOT / 'config.json'

    print('正在请求页面...')
    try:
        html = fetch_html(url)
    except Exception as e:
        print('请求失败:', e)
        print('请使用 README 中的「手动获取」方式在浏览器中抓取资源。')
        sys.exit(1)

    image_urls = find_image_urls(html)
    json_candidates = find_json_assignments(html)

    for raw in json_candidates:
        if raw.startswith('http'):
            # A bare URL captured from a "panorama": "..." pair.
            image_urls.append(raw)
            continue
        if not raw.startswith(('{', '[')):
            continue
        try:
            data = json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            # Not valid JSON (for example a JavaScript object literal); skip it.
            continue
        if isinstance(data, dict):
            _collect_urls(data, image_urls)
        elif data and isinstance(data[0], dict) and 'url' in data[0]:
            # Array of records: take each record's top-level "url". Elements
            # that are not dicts are skipped instead of raising.
            image_urls.extend(
                s['url'] for s in data if isinstance(s, dict) and s.get('url')
            )

    # Keep only http(s) strings, de-duplicated in first-seen order. The
    # isinstance check guards against non-string "url" values from JSON.
    image_urls = list(dict.fromkeys(
        u for u in image_urls if isinstance(u, str) and u.startswith('http')
    ))

    if not image_urls:
        print('未在页面 HTML 中发现全景图 URL（页面可能由 JavaScript 动态加载）。')
        print('请按 README 使用浏览器开发者工具手动获取。')
        sys.exit(0)

    print('发现可能的全景图 URL:', len(image_urls))
    local_path = panorama_dir / 'panorama.jpg'
    try:
        first = image_urls[0]
        print('正在下载:', first[:80], '...')
        download_file(first, local_path)
        print('已保存到:', local_path)
    except Exception as e:
        print('下载失败:', e)
        sys.exit(1)

    config = {}
    if config_path.exists():
        try:
            with open(config_path, 'r', encoding='utf-8') as f:
                config = json.load(f)
        except (OSError, ValueError) as e:
            # Unreadable or malformed config: report it instead of crashing
            # with a traceback, and do not overwrite the user's file.
            print('读取 config.json 失败:', e)
            sys.exit(1)
        if not isinstance(config, dict):
            print('读取 config.json 失败: 顶层必须是 JSON 对象')
            sys.exit(1)
    config['panorama'] = 'panorama/panorama.jpg'
    config.setdefault('type', 'equirectangular')
    config.setdefault('title', '本地全景')
    with open(config_path, 'w', encoding='utf-8') as f:
        json.dump(config, f, ensure_ascii=False, indent=2)
    print('已更新 config.json。运行 npm start 后即可离线查看。')


# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()