#!/usr/bin/env python3
"""Fetch panorama assets from a 720yun page and store them locally.

Usage: python3 fetch_720yun.py "https://www.720yun.com/vr/c8525usOunr"

If the page is rendered dynamically by JavaScript, use the "manual fetch"
procedure instead (see README).
"""
import re
import sys
import json
import urllib.request
import urllib.error
from pathlib import Path

# Image URLs hosted on the known 720yun-related CDN domains.
_CDN_IMAGE_RE = re.compile(
    r'https?://[^\s"\'<>]+?\.(?:720yun\.com|720static\.com|qpic\.cn|gtimg\.com)'
    r'[^\s"\'<>]*\.(?:jpg|jpeg|png|webp)',
    re.I,
)
# Any image URL with a path segment starting with panorama/scene/photo/pano/vr.
_ALT_IMAGE_RE = re.compile(
    r'https?://[^\s"\'<>]+?/(?:panorama|scene|photo|pano|vr)[^\s"\'<>]*\.(?:jpg|jpeg|png|webp)',
    re.I,
)
_PANORAMA_FIELD_RE = re.compile(r'"panorama"\s*:\s*"([^"]+)"')

# Dictionary keys whose string values are treated as image URLs.
_URL_KEYS = ('url', 'panorama', 'image', 'src', 'pic')


def fetch_html(url):
    """Download *url* and return the body decoded as UTF-8.

    Undecodable bytes are replaced rather than raising. Network errors from
    ``urllib`` propagate to the caller.
    """
    req = urllib.request.Request(url, headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    })
    with urllib.request.urlopen(req, timeout=15) as r:
        return r.read().decode('utf-8', errors='replace')


def _scan_balanced(html, start, primary, secondary):
    """Return the bracket-balanced span beginning at ``html[start]``, or None.

    *primary* and *secondary* are ``(open, close)`` pairs. Both pairs share a
    single depth counter, but only a *primary* close that brings the depth
    back to zero ends the span. Brackets inside single- or double-quoted
    strings are ignored, and a backslash inside a string escapes the next
    character.
    """
    p_open, p_close = primary
    s_open, s_close = secondary
    depth = 0
    quote = None
    escaped = False
    for j in range(start, len(html)):
        c = html[j]
        if escaped:
            escaped = False
        elif quote:
            if c == '\\':
                escaped = True
            elif c == quote:
                quote = None
        elif c in '"\'':
            quote = c
        elif c == p_open or c == s_open:
            depth += 1
        elif c == p_close:
            depth -= 1
            if depth == 0:
                return html[start:j + 1]
        elif c == s_close:
            depth -= 1
    return None


def _extract_balanced_json(html, start_marker, open_char='{', close_char='}'):
    """Find every bracket-balanced literal that follows *start_marker*.

    After each occurrence of *start_marker*, whitespace and ``=`` are skipped.
    If the next character is *open_char* or the opening character of the
    other bracket kind (``[`` when *open_char* is ``{``, otherwise ``{``),
    the balanced text starting there is collected.

    Returns the list of extracted substrings in document order.
    """
    wanted = (open_char, close_char)
    other = ('[', ']') if open_char == '{' else ('{', '}')
    results = []
    pos = html.find(start_marker)
    while pos >= 0:
        start = pos + len(start_marker)
        # Skip whitespace and the assignment sign.
        while start < len(html) and html[start] in ' \t\r\n=':
            start += 1
        if start < len(html):
            found = None
            if html[start] == open_char:
                found = _scan_balanced(html, start, wanted, other)
            elif html[start] == other[0]:
                found = _scan_balanced(html, start, other, wanted)
            if found is not None:
                results.append(found)
        pos = html.find(start_marker, pos + 1)
    return results


def find_json_assignments(html):
    """Collect JSON blobs assigned to well-known globals, plus panorama URLs.

    Looks for ``__INITIAL_STATE__``, ``window.__DATA__`` and
    ``window.__NUXT_DATA__`` assignments (nested structures supported) and for
    simple ``"panorama":"<value>"`` pairs anywhere in *html*.

    Returns a list of unique strings in first-seen order: raw JSON text for
    the assignments and the decoded value for each ``"panorama"`` pair.
    """
    markers = [
        'window.__INITIAL_STATE__',
        '__INITIAL_STATE__',
        'window.__DATA__',
        'window.__NUXT_DATA__',
    ]
    results = []
    for marker in markers:
        results.extend(_extract_balanced_json(html, marker, '{', '}'))
    for m in _PANORAMA_FIELD_RE.finditer(html):
        value = m.group(1)
        try:
            # The capture is the body of a JSON string; decode escapes such
            # as "\/" so the result is a usable URL.
            decoded = json.loads('"' + value + '"')
        except ValueError:
            decoded = value
        results.append(decoded if isinstance(decoded, str) else value)
    # The bare "__INITIAL_STATE__" marker also matches inside the "window."
    # form, so identical blobs are dropped here.
    return list(dict.fromkeys(results))


def find_image_urls(html):
    """Extract URLs from *html* that may point at panorama images.

    Matches image URLs on the known CDN domains first, then any image URL
    whose path contains a segment starting with panorama/scene/photo/pano/vr.

    Returns unique URLs in document order (CDN matches before the others), so
    the first candidate is stable across runs.
    """
    found = _CDN_IMAGE_RE.findall(html)
    found.extend(_ALT_IMAGE_RE.findall(html))
    return list(dict.fromkeys(found))


# The 720yun CDN checks the Referer header; requests from this script must
# look like the browser's to receive the real data.
def _browser_headers(url=''):
    """Return browser-like request headers for *url*.

    A 720yun ``Referer`` is added when *url* contains ``720static.com`` or
    ``720yun.com``.
    """
    h = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.9',
    }
    if '720static.com' in url or '720yun.com' in url:
        h['Referer'] = 'https://www.720yun.com/'
    return h


def download_file(url, dest_path):
    """Download *url* and write the response body to *dest_path* (a ``Path``).

    The body is read fully before writing, so a failed transfer does not
    leave a partial file behind.
    """
    req = urllib.request.Request(url, headers=_browser_headers(url))
    with urllib.request.urlopen(req, timeout=30) as r:
        dest_path.write_bytes(r.read())


def _collect_urls(obj, out):
    """Recursively append http(s) values stored under URL-like keys to *out*."""
    if isinstance(obj, dict):
        for key, value in obj.items():
            if key in _URL_KEYS and isinstance(value, str) and value.startswith('http'):
                out.append(value)
            else:
                _collect_urls(value, out)
    elif isinstance(obj, list):
        for item in obj:
            _collect_urls(item, out)


def _urls_from_json_candidates(candidates):
    """Turn the strings from ``find_json_assignments`` into candidate URLs."""
    urls = []
    for raw in candidates:
        if raw.startswith('http'):
            urls.append(raw)
            continue
        if not raw.startswith(('{', '[')):
            continue
        try:
            data = json.loads(raw)
        except json.JSONDecodeError:
            # Script literals are often not strict JSON; skip them.
            continue
        if raw.startswith('[') and data and isinstance(data[0], dict) and 'url' in data[0]:
            # A flat list of scene objects: take each usable "url" directly.
            urls.extend(
                scene['url'] for scene in data
                if isinstance(scene, dict)
                and isinstance(scene.get('url'), str)
                and scene['url']
            )
            continue
        _collect_urls(data, urls)
    return urls


def _update_config(config_path):
    """Point config.json at the local panorama, keeping existing settings."""
    config = {}
    if config_path.exists():
        try:
            with open(config_path, 'r', encoding='utf-8') as f:
                loaded = json.load(f)
        except ValueError as e:
            # Covers invalid JSON and undecodable bytes.
            print('config.json 无法解析,将重新生成:', e)
        else:
            if isinstance(loaded, dict):
                config = loaded
            else:
                print('config.json 不是 JSON 对象,将重新生成。')
    config['panorama'] = 'panorama/panorama.jpg'
    config.setdefault('type', 'equirectangular')
    config.setdefault('title', '本地全景')
    with open(config_path, 'w', encoding='utf-8') as f:
        json.dump(config, f, ensure_ascii=False, indent=2)


def main():
    """Command-line entry point.

    Fetches the page given as the first argument, looks for panorama image
    URLs, downloads the first candidate to ``panorama/panorama.jpg`` next to
    this script and updates ``config.json`` to reference it.

    Exits with status 1 on usage, request or download errors, and with
    status 0 after printing manual instructions when no URL is found.
    """
    if len(sys.argv) < 2:
        print('用法: python3 fetch_720yun.py <720yun页面URL>')
        print('例: python3 fetch_720yun.py "https://www.720yun.com/vr/c8525usOunr"')
        sys.exit(1)

    url = sys.argv[1].strip()
    base = Path(__file__).resolve().parent
    panorama_dir = base / 'panorama'
    panorama_dir.mkdir(exist_ok=True)
    config_path = base / 'config.json'

    print('正在请求页面...')
    try:
        html = fetch_html(url)
    except Exception as e:
        print('请求失败:', e)
        print('请使用 README 中的「手动获取」方式在浏览器中抓取资源。')
        sys.exit(1)

    image_urls = find_image_urls(html)
    image_urls.extend(_urls_from_json_candidates(find_json_assignments(html)))
    # Keep only http(s) strings, deduplicated in first-seen order.
    image_urls = list(dict.fromkeys(
        u for u in image_urls if isinstance(u, str) and u.startswith('http')
    ))

    if not image_urls:
        print('未在页面 HTML 中发现全景图 URL(页面可能由 JavaScript 动态加载)。')
        print('请按 README 使用浏览器开发者工具手动获取:')
        print(' 1. 打开该 720yun 链接')
        print(' 2. F12 -> Network -> 刷新 -> 筛选 Img 或 XHR')
        print(' 3. 找到全景图或 scene 接口返回的图片 URL,下载到 panorama/ 并命名为 panorama.jpg')
        print(' 4. 确保 config.json 中 panorama 为 "panorama/panorama.jpg"')
        sys.exit(0)

    print('发现可能的全景图 URL:', len(image_urls))
    # List the candidates so the manual-download hint below has something to
    # refer to.
    for candidate in image_urls:
        print('  -', candidate)

    local_path = panorama_dir / 'panorama.jpg'
    try:
        first = image_urls[0]
        print('正在下载:', first[:80], '...')
        download_file(first, local_path)
        print('已保存到:', local_path)
    except Exception as e:
        print('下载失败:', e)
        print('请手动将上面列出的任一 URL 在浏览器中打开并另存为 panorama/panorama.jpg')
        sys.exit(1)

    # Make sure the config points at the local copy.
    _update_config(config_path)
    print('已更新 config.json。运行本地服务器后打开 index.html 即可离线查看。')


if __name__ == '__main__':
    main()