#!/usr/bin/env python3
"""
Parse window.data / window.json out of text.md (a saved 720yun page) and
resolve the final panorama image resource URLs.

The script lives in scripts/; all read/write paths are relative to the
project root.

Usage (run from the project root):
  python3 scripts/parse_720yun_doc.py [text.md]
  python3 scripts/parse_720yun_doc.py --fetch     # also request the scene JSON
  python3 scripts/parse_720yun_doc.py --download  # download the six cube faces to image/
"""
import re
import sys
import json
import urllib.request
from pathlib import Path

# Project root (the parent of the directory that contains this script).
ROOT = Path(__file__).resolve().parent.parent

# CDN host that actually serves the panorama images in a browser (it differs
# from the resource-t host; this one must be used for fetches to succeed).
RESOURCE_CDN_HOST = 'ssl-panoimg130.720static.com'

# Origin prepended to a relative thumbUrl taken from window.data.
_THUMB_ORIGIN = 'https://thumb-t.720static.com'

# Cube face suffixes: front, right, back, left, up, down.
_CUBE_FACES = ('f', 'r', 'b', 'l', 'u', 'd')

# Keys in the scene JSON whose string values are treated as image URLs.
_URL_KEYS = frozenset(
    ('url', 'panorama', 'image', 'src', 'path', 'thumbUrl', 'basePath')
)


def read_doc(path):
    """Return the text of *path*, decoded as UTF-8 with invalid bytes replaced."""
    with open(path, 'r', encoding='utf-8', errors='replace') as f:
        return f.read()


def parse_window_data(html):
    """Parse the object assigned to ``window.data`` (nesting supported).

    Returns the decoded dict, or None when no ``window.data = {`` assignment
    is found or the object literal is not valid JSON.
    """
    m = re.search(r'window\.data\s*=\s*\{', html)
    if not m:
        return None
    try:
        # raw_decode parses exactly one JSON value starting at the '{' and
        # ignores whatever follows it (the trailing ';' and the rest of the page).
        data, _end = json.JSONDecoder().raw_decode(html, m.end() - 1)
    except json.JSONDecodeError:
        return None
    return data


def parse_window_json(html):
    """Return the quoted string assigned to ``window.json``, or None if absent."""
    m = re.search(r'window\.json\s*=\s*["\']([^"\']+)["\']', html)
    return m.group(1) if m else None


def build_resource_base(thumb_url):
    """Return the resource directory URL (with trailing '/') for *thumb_url*.

    Any scheme/host on *thumb_url* is replaced with RESOURCE_CDN_HOST so that
    the script fetches from the same host a browser does. Absolute URLs,
    protocol-relative URLs (``//host/...``), and paths with or without a
    leading slash are all accepted.
    """
    path = re.sub(r'^(?:https?:)?//[^/]+', '', thumb_url)
    if not path.startswith('/'):
        # Without this a bare relative path would be glued onto the host name.
        path = '/' + path
    return 'https://' + RESOURCE_CDN_HOST + path.rsplit('/', 1)[0] + '/'


def infer_cube_urls(resource_base):
    """Guess the six cube-face URLs (``mobile_<face>.jpg``) under *resource_base*.

    The names mirror the local image/mobile_*.jpg files.
    """
    return [resource_base + 'mobile_' + face + '.jpg' for face in _CUBE_FACES]


def _browser_headers(referer='https://www.720yun.com/'):
    """Return request headers that imitate a desktop Chrome image request."""
    return {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'image/avif,image/webp,image/apng,image/*,*/*;q=0.9',
        'Referer': referer,
    }


def fetch_tour_json(json_path, base_url='https://www.720yun.com/'):
    """Request and decode the scene JSON.

    *json_path* is the value of window.json, e.g. ``json/4ca3fae5e7x/.../3.json``.
    Returns the decoded JSON, or None on any failure.
    """
    url = base_url.rstrip('/') + '/' + json_path.lstrip('/')
    req = urllib.request.Request(url, headers=_browser_headers())
    try:
        with urllib.request.urlopen(req, timeout=15) as r:
            return json.loads(r.read().decode('utf-8', errors='replace'))
    except Exception:
        # Deliberately best-effort: the caller reports the failure.
        return None


def download_to_file(url, dest_path):
    """Fetch *url* with browser-like headers and write the body to *dest_path*.

    *dest_path* must be a ``pathlib.Path``. Network errors propagate to the
    caller.
    """
    req = urllib.request.Request(url, headers=_browser_headers())
    with urllib.request.urlopen(req, timeout=30) as r:
        dest_path.write_bytes(r.read())


def extract_image_urls_from_tour(tour_data, resource_base):
    """Recursively collect image URLs from a krpano/720 scene JSON structure.

    String values under the keys in ``_URL_KEYS`` are collected when they
    start with ``http`` or ``/``; ``cubeMap`` list entries are always
    collected. Relative values are joined onto *resource_base*. Duplicates
    are removed while keeping first-seen order.
    """
    base = resource_base.rstrip('/')
    urls = []

    def collect(obj):
        if isinstance(obj, dict):
            for key, value in obj.items():
                if isinstance(value, str):
                    if key in _URL_KEYS and value.startswith(('http', '/')):
                        urls.append(value if value.startswith('http') else base + value)
                elif key == 'cubeMap' and isinstance(value, list):
                    for item in value:
                        if isinstance(item, str):
                            urls.append(
                                item if item.startswith('http')
                                else base + '/' + item.lstrip('/')
                            )
                        else:
                            collect(item)
                else:
                    # Containers are always descended into, even under a URL
                    # key (e.g. {'image': {'url': ...}}), so nested URLs are
                    # not lost.
                    collect(value)
        elif isinstance(obj, list):
            for item in obj:
                collect(item)

    collect(tour_data)
    return list(dict.fromkeys(urls))


def _absolute_thumb_url(thumb):
    """Turn a non-empty thumbUrl value into an absolute URL."""
    if thumb.startswith('http'):
        return thumb
    if thumb.startswith('//'):
        return 'https:' + thumb
    return _THUMB_ORIGIN + '/' + thumb.lstrip('/')


def _build_result(data, json_path, do_fetch):
    """Assemble the result dict from the parsed window.data / window.json values."""
    result = {
        'window_data': data,
        'window_json_path': json_path,
        'resource_base': None,
        'thumb_url': None,
        'inferred_cube_urls': [],
        'tour_json_url': None,
        'tour_image_urls': [],
    }
    if data:
        thumb = data.get('thumbUrl')
        if not isinstance(thumb, str):
            thumb = ''
        if thumb:
            result['thumb_url'] = _absolute_thumb_url(thumb)
            result['resource_base'] = build_resource_base(thumb)
        result['tid'] = data.get('tid')
        result['name'] = data.get('name')
        result['sceneCount'] = data.get('sceneCount')
        if result['resource_base']:
            result['inferred_cube_urls'] = infer_cube_urls(result['resource_base'])
    if json_path:
        result['tour_json_url'] = 'https://www.720yun.com/' + json_path.lstrip('/')
        # Relative URLs in the scene JSON can only be resolved when a resource
        # base is known, so the fetch is skipped without one.
        if do_fetch and result['resource_base']:
            tour = fetch_tour_json(json_path)
            if tour:
                result['tour_image_urls'] = extract_image_urls_from_tour(
                    tour, result['resource_base']
                )
            else:
                print('请求场景 JSON 失败:', result['tour_json_url'], file=sys.stderr)
    return result


def _collect_all_urls(result):
    """Return (kind, url) pairs: thumb, cube faces, then tour URLs not already listed."""
    all_urls = []
    if result.get('thumb_url'):
        all_urls.append(('thumb', result['thumb_url']))
    all_urls.extend(('cube', u) for u in result.get('inferred_cube_urls', []))
    seen = {url for _, url in all_urls}
    for u in result.get('tour_image_urls', []):
        if u not in seen:
            seen.add(u)
            all_urls.append(('tour', u))
    return all_urls


def _download_images(result):
    """Download the thumbnail and the inferred cube faces into ROOT/image/."""
    out_dir = ROOT / 'image'
    out_dir.mkdir(exist_ok=True)
    print('\n--- 使用浏览器头下载到 image/ ---')
    targets = [('thumb', result.get('thumb_url'))]
    targets.extend(
        ('mobile_' + face, url)
        for face, url in zip(_CUBE_FACES, result.get('inferred_cube_urls', []))
    )
    for stem, url in targets:
        if not url:
            continue
        name = stem + '.jpg'
        try:
            download_to_file(url, out_dir / name)
            print('OK', name)
        except Exception as e:
            # One failed image should not abort the remaining downloads.
            print('FAIL', name, e)


def main(argv=None):
    """Parse the document, write parsed_720yun_resources.json, and print the URLs.

    *argv* is the argument list without the program name; it defaults to
    ``sys.argv[1:]``. The first argument that does not start with ``-`` is
    the document path (default: ROOT/text.md). ``--fetch`` also requests the
    scene JSON; ``--download`` saves the thumbnail and cube faces to image/.

    Returns the result dict. Exits with status 1 when the document is missing
    or contains neither window.data nor window.json.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    positional = [a for a in args if not a.startswith('-')]
    doc_path = Path(positional[0]) if positional else ROOT / 'text.md'
    do_fetch = '--fetch' in args
    do_download = '--download' in args

    if not doc_path.exists():
        print('未找到文档:', doc_path)
        sys.exit(1)

    html = read_doc(doc_path)
    data = parse_window_data(html)
    json_path = parse_window_json(html)
    if not data and not json_path:
        print('未能从文档中解析出 window.data 或 window.json')
        sys.exit(1)

    result = _build_result(data, json_path, do_fetch)

    out_path = ROOT / 'parsed_720yun_resources.json'
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    print('已写入:', out_path)

    all_urls = _collect_all_urls(result)
    print('\n--- 解析出的图片资源 ---')
    for kind, url in all_urls:
        print(kind, url)
    print('\n共', len(all_urls), '个 URL')

    if do_download and result.get('resource_base'):
        _download_images(result)

    return result


if __name__ == '__main__':
    main()