fix:优化代码
This commit is contained in:
26
scripts/README.md
Normal file
26
scripts/README.md
Normal file
@@ -0,0 +1,26 @@
|
||||
# 720 云资源提取脚本
|
||||
|
||||
本目录为 Python 脚本,用于从 720yun 页面或保存的 HTML 中提取并下载全景图资源。**请在项目根目录下执行**,脚本会自动读写根目录下的 `text.md`、`image/`、`panorama/`、`config.json` 等。
|
||||
|
||||
## 脚本说明
|
||||
|
||||
| 脚本 | 用途 |
|
||||
|------|------|
|
||||
| **fetch_720yun.py** | 根据 720yun 页面 URL 抓取 HTML,解析其中的全景图 URL 并下载到 `panorama/panorama.jpg`,同时更新根目录 `config.json`。适用于页面内直接包含图片链接的情况。 |
|
||||
| **parse_720yun_doc.py** | 从项目根目录的 `text.md`(720yun 页面另存为的文档)解析 `window.data` / `window.json`,得到六面图、缩略图等 URL;可选 `--fetch` 请求场景 JSON,`--download` 将六面图 + 缩略图下载到根目录 `image/`。 |
|
||||
|
||||
## 使用示例
|
||||
|
||||
```bash
|
||||
# 在项目根目录 720yun-offline/ 下执行
|
||||
|
||||
# 方式一:按 URL 抓取(若页面由 JS 动态加载可能无结果)
|
||||
python3 scripts/fetch_720yun.py "https://www.720yun.com/vr/xxxxx"
|
||||
|
||||
# 方式二:先浏览器打开 720 链接,整页另存为 text.md 放到项目根目录,再解析并下载六面图
|
||||
python3 scripts/parse_720yun_doc.py # 仅解析,输出 parsed_720yun_resources.json
|
||||
python3 scripts/parse_720yun_doc.py --fetch # 解析并请求场景 JSON
|
||||
python3 scripts/parse_720yun_doc.py --download # 解析并将六面图、缩略图下载到 image/
|
||||
```
|
||||
|
||||
下载到 `image/` 的文件可直接被前端使用(`config.json` 中已配置 `image/mobile_*.jpg`)。
|
||||
251
scripts/fetch_720yun.py
Normal file
251
scripts/fetch_720yun.py
Normal file
@@ -0,0 +1,251 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
从 720yun 页面抓取全景资源并本地化。
|
||||
脚本位于 scripts/,输出到项目根目录的 panorama/、config.json。
|
||||
若页面由 JS 动态加载,请使用「手动获取」方式(见 README)。
|
||||
|
||||
用法(在项目根目录执行):
|
||||
python3 scripts/fetch_720yun.py "https://www.720yun.com/vr/c8525usOunr"
|
||||
"""
|
||||
import re
|
||||
import sys
|
||||
import json
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
from pathlib import Path
|
||||
|
||||
# Project root directory (one level above scripts/); all outputs are
# written relative to this path so the script can run from anywhere.
ROOT = Path(__file__).resolve().parent.parent
|
||||
|
||||
|
||||
def fetch_html(url):
    """Fetch *url* with a browser-like User-Agent and return the body as text.

    Undecodable bytes are replaced rather than raising.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    }
    request = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(request, timeout=15) as response:
        return response.read().decode('utf-8', errors='replace')
|
||||
|
||||
|
||||
def _extract_balanced_json(html, start_marker, open_char='{', close_char='}'):
|
||||
results = []
|
||||
i = 0
|
||||
if open_char == '{':
|
||||
other_open, other_close = '[', ']'
|
||||
else:
|
||||
other_open, other_close = '{', '}'
|
||||
while True:
|
||||
pos = html.find(start_marker, i)
|
||||
if pos < 0:
|
||||
break
|
||||
start = pos + len(start_marker)
|
||||
while start < len(html) and html[start] in ' \t\n=':
|
||||
start += 1
|
||||
if start >= len(html):
|
||||
i = pos + 1
|
||||
continue
|
||||
if html[start] == open_char:
|
||||
depth = 0
|
||||
in_string = None
|
||||
escape = False
|
||||
j = start
|
||||
while j < len(html):
|
||||
c = html[j]
|
||||
if escape:
|
||||
escape = False
|
||||
j += 1
|
||||
continue
|
||||
if c == '\\' and in_string:
|
||||
escape = True
|
||||
j += 1
|
||||
continue
|
||||
if in_string:
|
||||
if c == in_string:
|
||||
in_string = None
|
||||
j += 1
|
||||
continue
|
||||
if c in '"\'':
|
||||
in_string = c
|
||||
j += 1
|
||||
continue
|
||||
if c == open_char:
|
||||
depth += 1
|
||||
elif c == close_char:
|
||||
depth -= 1
|
||||
if depth == 0:
|
||||
results.append(html[start:j + 1])
|
||||
break
|
||||
elif c == other_open:
|
||||
depth += 1
|
||||
elif c == other_close:
|
||||
depth -= 1
|
||||
j += 1
|
||||
elif html[start] == other_open:
|
||||
depth = 0
|
||||
in_string = None
|
||||
escape = False
|
||||
j = start
|
||||
while j < len(html):
|
||||
c = html[j]
|
||||
if escape:
|
||||
escape = False
|
||||
j += 1
|
||||
continue
|
||||
if c == '\\' and in_string:
|
||||
escape = True
|
||||
j += 1
|
||||
continue
|
||||
if in_string:
|
||||
if c == in_string:
|
||||
in_string = None
|
||||
j += 1
|
||||
continue
|
||||
if c in '"\'':
|
||||
in_string = c
|
||||
j += 1
|
||||
continue
|
||||
if c == other_open:
|
||||
depth += 1
|
||||
elif c == other_close:
|
||||
depth -= 1
|
||||
if depth == 0:
|
||||
results.append(html[start:j + 1])
|
||||
break
|
||||
elif c == open_char:
|
||||
depth += 1
|
||||
elif c == close_char:
|
||||
depth -= 1
|
||||
j += 1
|
||||
i = pos + 1
|
||||
return results
|
||||
|
||||
|
||||
def find_json_assignments(html):
    """Collect JSON payloads assigned to known globals, plus direct panorama URLs.

    Returns a list of raw strings: balanced object/array spans for each known
    marker, followed by any "panorama": "..." string values found in *html*.
    """
    known_markers = (
        'window.__INITIAL_STATE__',
        '__INITIAL_STATE__',
        'window.__DATA__',
        'window.__NUXT_DATA__',
    )
    found = []
    for marker in known_markers:
        found.extend(_extract_balanced_json(html, marker, '{', '}'))
    found.extend(
        m.group(1) for m in re.finditer(r'"panorama"\s*:\s*"([^"]+)"', html)
    )
    return found
|
||||
|
||||
|
||||
def find_image_urls(html):
    """Return de-duplicated candidate panorama image URLs found in *html*.

    Two passes: URLs on known 720yun/Tencent CDN hosts, then URLs whose path
    contains a panorama-ish segment (panorama/scene/photo/pano/vr).
    """
    cdn_pattern = re.compile(
        r'https?://[^\s"\'<>]+?\.(?:720yun\.com|qpic\.cn|gtimg\.com)[^\s"\'<>]*\.(?:jpg|jpeg|png|webp)',
        re.I,
    )
    path_pattern = re.compile(
        r'https?://[^\s"\'<>]+?/(?:panorama|scene|photo|pano|vr)[^\s"\'<>]*\.(?:jpg|jpeg|png|webp)',
        re.I,
    )
    urls = list(set(cdn_pattern.findall(html)))
    seen = set(urls)
    for candidate in path_pattern.findall(html):
        if candidate not in seen:
            seen.add(candidate)
            urls.append(candidate)
    return urls
|
||||
|
||||
|
||||
def _browser_headers(url=''):
|
||||
h = {
|
||||
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Accept': 'image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.9',
|
||||
}
|
||||
if '720static.com' in url or '720yun.com' in url:
|
||||
h['Referer'] = 'https://www.720yun.com/'
|
||||
return h
|
||||
|
||||
|
||||
def download_file(url, dest_path):
    """Download *url* (with browser-like headers) and write the bytes to *dest_path*."""
    request = urllib.request.Request(url, headers=_browser_headers(url))
    with urllib.request.urlopen(request, timeout=30) as response:
        dest_path.write_bytes(response.read())
|
||||
|
||||
|
||||
def _collect_urls(obj, out):
    """Recursively append http(s) URL string values under image-ish keys to *out*.

    Hoisted to module level: the original re-defined this closure on every
    iteration of the JSON-candidate loop.
    """
    if isinstance(obj, dict):
        for k, v in obj.items():
            if k in ('url', 'panorama', 'image', 'src', 'pic') and isinstance(v, str) and v.startswith('http'):
                out.append(v)
            else:
                _collect_urls(v, out)
    elif isinstance(obj, list):
        for item in obj:
            _collect_urls(item, out)


def main():
    """CLI entry point.

    Fetches the given 720yun page, extracts candidate panorama URLs from the
    HTML and embedded JSON, downloads the first URL to panorama/panorama.jpg
    and points config.json at the local copy.  Exits 1 on fetch/download
    failure, 0 when no URL was found (manual fallback applies).
    """
    if len(sys.argv) < 2:
        print('用法: python3 scripts/fetch_720yun.py <720yun页面URL>')
        print('例: python3 scripts/fetch_720yun.py "https://www.720yun.com/vr/c8525usOunr"')
        sys.exit(1)
    url = sys.argv[1].strip()
    panorama_dir = ROOT / 'panorama'
    panorama_dir.mkdir(exist_ok=True)
    config_path = ROOT / 'config.json'

    print('正在请求页面...')
    try:
        html = fetch_html(url)
    except Exception as e:
        print('请求失败:', e)
        print('请使用 README 中的「手动获取」方式在浏览器中抓取资源。')
        sys.exit(1)

    image_urls = find_image_urls(html)

    for raw in find_json_assignments(html):
        try:
            if raw.startswith('http'):
                # Bare URL captured straight from a "panorama" string value.
                image_urls.append(raw)
                continue
            if not raw.startswith(('{', '[')):
                continue
            data = json.loads(raw)  # single parse; original had duplicated branches
            if raw.startswith('[') and data and isinstance(data[0], dict) and 'url' in data[0]:
                image_urls.extend([s.get('url') for s in data if s.get('url')])
                continue
            _collect_urls(data, image_urls)
        except (json.JSONDecodeError, TypeError):
            # Best-effort: skip malformed candidates silently, as before.
            pass

    # De-duplicate while keeping first-seen order.
    image_urls = list(dict.fromkeys(u for u in image_urls if u and u.startswith('http')))

    if not image_urls:
        print('未在页面 HTML 中发现全景图 URL(页面可能由 JavaScript 动态加载)。')
        print('请按 README 使用浏览器开发者工具手动获取。')
        sys.exit(0)

    print('发现可能的全景图 URL:', len(image_urls))
    local_path = panorama_dir / 'panorama.jpg'
    try:
        first = image_urls[0]
        print('正在下载:', first[:80], '...')
        download_file(first, local_path)
        print('已保存到:', local_path)
    except Exception as e:
        print('下载失败:', e)
        sys.exit(1)

    # Merge into the existing config (if any) and repoint the panorama path.
    if config_path.exists():
        with open(config_path, 'r', encoding='utf-8') as f:
            config = json.load(f)
    else:
        config = {}
    config['panorama'] = 'panorama/panorama.jpg'
    config.setdefault('type', 'equirectangular')
    config.setdefault('title', '本地全景')
    with open(config_path, 'w', encoding='utf-8') as f:
        json.dump(config, f, ensure_ascii=False, indent=2)
    print('已更新 config.json。运行 npm start 后即可离线查看。')
|
||||
|
||||
|
||||
# Run the CLI only when executed as a script (not on import).
if __name__ == '__main__':
    main()
|
||||
234
scripts/parse_720yun_doc.py
Normal file
234
scripts/parse_720yun_doc.py
Normal file
@@ -0,0 +1,234 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
从 text.md(720yun 页面保存的文档)中解析 window.data / window.json,
|
||||
并解析出最终的全景图片资源 URL。
|
||||
脚本位于 scripts/,读写路径均相对于项目根目录。
|
||||
|
||||
用法(在项目根目录执行):
|
||||
python3 scripts/parse_720yun_doc.py [text.md]
|
||||
python3 scripts/parse_720yun_doc.py --fetch # 并请求场景 JSON
|
||||
python3 scripts/parse_720yun_doc.py --download # 下载六面图到 image/
|
||||
"""
|
||||
import re
|
||||
import sys
|
||||
import json
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
|
||||
# Project root directory (one level above scripts/); read/write paths
# (text.md, image/, parsed_720yun_resources.json) are resolved against it.
ROOT = Path(__file__).resolve().parent.parent
|
||||
|
||||
|
||||
def read_doc(path):
    """Return the file contents as text, replacing undecodable UTF-8 bytes."""
    return Path(path).read_text(encoding='utf-8', errors='replace')
|
||||
|
||||
|
||||
def parse_window_data(html):
    """Parse the object assigned to ``window.data`` (nested braces supported).

    Finds ``window.data = {`` and scans forward, tracking brace depth while
    skipping quoted string literals (with backslash escapes), until the
    matching closing brace.  Returns the parsed dict, or None when the
    assignment is absent or the span is not valid JSON.
    """
    match = re.search(r'window\.data\s*=\s*\{', html)
    if not match:
        return None
    start = match.end() - 1  # index of the opening '{'
    depth = 0
    quote = None     # active string delimiter, if any
    escaped = False  # previous char was a backslash inside a string
    for i in range(start, len(html)):
        c = html[i]
        if escaped:
            escaped = False
            continue
        if quote:
            if c == '\\':
                escaped = True
            elif c == quote:
                quote = None
            continue
        if c in '"\'':
            quote = c
        elif c == '{':
            depth += 1
        elif c == '}':
            depth -= 1
            if depth == 0:
                try:
                    return json.loads(html[start:i + 1])
                except json.JSONDecodeError:
                    return None
    return None
|
||||
|
||||
|
||||
def parse_window_json(html):
    """Return the quoted string assigned to ``window.json``, or None."""
    match = re.search(r'window\.json\s*=\s*["\']([^"\']+)["\']', html)
    if match is None:
        return None
    return match.group(1)
|
||||
|
||||
|
||||
# Actual 720yun panorama CDN host reachable from a browser (differs from the
# resource-t host embedded in the page data; fetches need this host to work).
RESOURCE_CDN_HOST = 'ssl-panoimg130.720static.com'


def build_resource_base(thumb_url):
    """Derive the resource directory base URL (with trailing '/') from a thumbUrl.

    The host is rewritten to RESOURCE_CDN_HOST so that script downloads match
    what the browser fetches; the final path component (the thumb filename)
    is stripped off.
    """
    if thumb_url.startswith('http'):
        rebased = re.sub(r'^https?://[^/]+', 'https://' + RESOURCE_CDN_HOST, thumb_url)
    else:
        rebased = 'https://' + RESOURCE_CDN_HOST + thumb_url
    directory, _sep, _name = rebased.rpartition('/')
    return directory + '/'
|
||||
|
||||
|
||||
def infer_cube_urls(resource_base):
    """Infer the six cube-face URLs from 720yun's usual naming.

    Faces are front, right, back, left, up, down — matching the local
    image/mobile_*.jpg files.
    """
    return [f'{resource_base}mobile_{face}.jpg' for face in 'frblud']
|
||||
|
||||
|
||||
def _browser_headers(referer='https://www.720yun.com/'):
|
||||
return {
|
||||
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Accept': 'image/avif,image/webp,image/apng,image/*,*/*;q=0.9',
|
||||
'Referer': referer,
|
||||
}
|
||||
|
||||
|
||||
def fetch_tour_json(json_path, base_url='https://www.720yun.com/'):
    """Fetch and parse the scene JSON at base_url + json_path.

    *json_path* is the window.json value, e.g. ``json/4ca3fae5e7x/.../3.json``.
    Returns the parsed object, or None on any network/parse failure.
    """
    full_url = '{}/{}'.format(base_url.rstrip('/'), json_path.lstrip('/'))
    request = urllib.request.Request(full_url, headers=_browser_headers())
    try:
        with urllib.request.urlopen(request, timeout=15) as response:
            return json.loads(response.read().decode('utf-8', errors='replace'))
    except Exception:
        return None
|
||||
|
||||
|
||||
def download_to_file(url, dest_path):
    """Fetch *url* with browser-like headers and save the bytes to *dest_path*.

    Using the same headers Chrome sends keeps the CDN response identical to
    what the browser would receive.
    """
    request = urllib.request.Request(url, headers=_browser_headers())
    with urllib.request.urlopen(request, timeout=30) as response:
        dest_path.write_bytes(response.read())
|
||||
|
||||
|
||||
def extract_image_urls_from_tour(tour_data, resource_base):
    """Recursively gather every image URL mentioned in a krpano/720 tour JSON.

    String values under known URL-ish keys are collected (relative paths are
    prefixed with *resource_base*); ``cubeMap`` lists are expanded entry by
    entry.  Returns the URLs de-duplicated in first-seen order.
    """
    url_keys = ('url', 'panorama', 'image', 'src', 'path', 'thumbUrl', 'basePath')
    base = resource_base.rstrip('/')
    found = []

    def walk(node):
        if isinstance(node, dict):
            for key, value in node.items():
                if key in url_keys:
                    if isinstance(value, str) and value.startswith(('http', '/')):
                        found.append(value if value.startswith('http') else base + value)
                elif key == 'cubeMap' and isinstance(value, list):
                    for entry in value:
                        if not isinstance(entry, str):
                            continue
                        if entry.startswith('http'):
                            found.append(entry)
                        else:
                            found.append(base + '/' + entry.lstrip('/'))
                else:
                    walk(value)
        elif isinstance(node, list):
            for entry in node:
                walk(entry)

    walk(tour_data)
    return list(dict.fromkeys(found))
|
||||
|
||||
|
||||
def main():
    """CLI entry point.

    Parses text.md (or a path given as argv[1]) for window.data/window.json,
    writes the findings to parsed_720yun_resources.json, prints every
    resolved image URL, and — with ``--fetch`` / ``--download`` — requests
    the scene JSON and downloads thumb + cube faces into image/.
    Returns the result dict; exits 1 when the document is missing or yields
    no data.
    """
    doc_path = ROOT / 'text.md'
    if len(sys.argv) >= 2 and not sys.argv[1].startswith('-'):
        doc_path = Path(sys.argv[1])
    do_fetch = '--fetch' in sys.argv
    do_download = '--download' in sys.argv

    if not doc_path.exists():
        print('未找到文档:', doc_path)
        sys.exit(1)

    html = read_doc(doc_path)
    data = parse_window_data(html)
    json_path = parse_window_json(html)

    if not data and not json_path:
        print('未能从文档中解析出 window.data 或 window.json')
        sys.exit(1)

    result = {
        'window_data': data,
        'window_json_path': json_path,
        'resource_base': None,
        'thumb_url': None,
        'inferred_cube_urls': [],
        'tour_json_url': None,
        'tour_image_urls': [],
    }

    if data:
        thumb = data.get('thumbUrl') or ''
        # Relative thumb paths live on the thumb-t host; absolute ones pass through.
        if thumb and not thumb.startswith('http'):
            result['thumb_url'] = 'https://thumb-t.720static.com' + thumb
        else:
            result['thumb_url'] = thumb or None
        result['resource_base'] = build_resource_base(thumb) if thumb else None
        result['tid'] = data.get('tid')
        result['name'] = data.get('name')
        result['sceneCount'] = data.get('sceneCount')

    if result['resource_base']:
        result['inferred_cube_urls'] = infer_cube_urls(result['resource_base'])

    if json_path:
        result['tour_json_url'] = 'https://www.720yun.com/' + json_path.lstrip('/')
        if do_fetch and result['resource_base']:
            tour = fetch_tour_json(json_path)
            if tour:
                result['tour_image_urls'] = extract_image_urls_from_tour(tour, result['resource_base'])
            else:
                print('请求场景 JSON 失败:', result['tour_json_url'], file=sys.stderr)

    out_path = ROOT / 'parsed_720yun_resources.json'
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    print('已写入:', out_path)

    all_urls = []
    if result.get('thumb_url'):
        all_urls.append(('thumb', result['thumb_url']))
    for u in result.get('inferred_cube_urls', []):
        all_urls.append(('cube', u))
    # Seen-set membership instead of the previous O(n^2) scan over all_urls.
    seen = {u for _, u in all_urls}
    for u in result.get('tour_image_urls', []):
        if u not in seen:
            seen.add(u)
            all_urls.append(('tour', u))

    print('\n--- 解析出的图片资源 ---')
    for kind, url in all_urls:
        print(kind, url)
    print('\n共', len(all_urls), '个 URL')

    if do_download and result.get('resource_base'):
        out_dir = ROOT / 'image'
        out_dir.mkdir(exist_ok=True)
        print('\n--- 使用浏览器头下载到 image/ ---')
        face_names = ['mobile_f', 'mobile_r', 'mobile_b', 'mobile_l', 'mobile_u', 'mobile_d']
        targets = [('thumb', result.get('thumb_url'))]
        targets += list(zip(face_names, result.get('inferred_cube_urls', [])))
        for face, url in targets:
            if not url:
                continue
            # The old `face + '.jpg' if face != 'thumb' else 'thumb.jpg'` was a
            # no-op conditional: both arms produced the same filename.
            name = face + '.jpg'
            dest = out_dir / name
            try:
                download_to_file(url, dest)
                print('OK', name)
            except Exception as e:
                print('FAIL', name, e)
    return result
|
||||
|
||||
|
||||
# Run the CLI only when executed as a script (not on import).
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user