Files
hometown/scripts/parse_720yun_doc.py
2026-03-07 23:35:08 +08:00

235 lines
8.0 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Parse window.data / window.json out of text.md (a 720yun page saved as a
document) and resolve the final panorama image resource URLs.

The script lives in scripts/; all paths it reads and writes are relative to
the project root. Usage (run from the project root):
    python3 scripts/parse_720yun_doc.py [text.md]
    python3 scripts/parse_720yun_doc.py --fetch     # also request the scene JSON
    python3 scripts/parse_720yun_doc.py --download  # download the six cube faces into image/
"""
import re
import sys
import json
import urllib.request
from pathlib import Path

# Project root (one directory above the scripts/ directory holding this file).
ROOT = Path(__file__).resolve().parent.parent
def read_doc(path):
    """Return the whole document as UTF-8 text, replacing undecodable bytes."""
    return Path(path).read_text(encoding='utf-8', errors='replace')
def parse_window_data(html):
    """Locate `window.data = {...}` (nested braces allowed) and return the
    parsed dict, or None if absent or not valid JSON.

    Scans character by character, tracking string literals (both quote
    styles) and backslash escapes so braces inside strings don't affect
    the nesting count.
    """
    match = re.search(r'window\.data\s*=\s*\{', html)
    if match is None:
        return None
    begin = match.end() - 1  # index of the opening '{'
    nesting = 0
    quote = None        # active string delimiter, or None when outside strings
    skip_next = False   # previous char was a backslash inside a string
    pos = begin
    length = len(html)
    while pos < length:
        ch = html[pos]
        if skip_next:
            skip_next = False
        elif quote is not None:
            if ch == '\\':
                skip_next = True
            elif ch == quote:
                quote = None
        elif ch in '"\'':
            quote = ch
        elif ch == '{':
            nesting += 1
        elif ch == '}':
            nesting -= 1
            if nesting == 0:
                # Balanced: attempt to parse the exact object span.
                try:
                    return json.loads(html[begin:pos + 1])
                except json.JSONDecodeError:
                    return None
        pos += 1
    return None
def parse_window_json(html):
    """Extract the value of `window.json = "...";`, or None when missing."""
    found = re.search(r'window\.json\s*=\s*["\']([^"\']+)["\']', html)
    if found is None:
        return None
    return found.group(1)
# Actual CDN host serving 720yun panorama assets. The host embedded in the
# saved document (resource-t) differs from the one the browser hits; only
# this host returns the images correctly when fetched from a script.
RESOURCE_CDN_HOST = 'ssl-panoimg130.720static.com'


def build_resource_base(thumb_url):
    """Turn a thumbUrl into the resource directory base URL (with trailing
    slash), rehosted on the real CDN so script fetches match the browser."""
    if thumb_url.startswith('http'):
        rehosted = re.sub(r'^https?://[^/]+', 'https://' + RESOURCE_CDN_HOST, thumb_url)
    else:
        rehosted = 'https://' + RESOURCE_CDN_HOST + thumb_url
    # Drop the trailing file name, keeping only the directory path.
    directory, _sep, _name = rehosted.rpartition('/')
    return directory + '/'
def infer_cube_urls(resource_base):
    """Guess the six cube-face URLs under resource_base, following 720yun's
    usual mobile_<face>.jpg naming (front/right/back/left/up/down); these
    correspond to the local image/mobile_*.jpg files."""
    return ['{}mobile_{}.jpg'.format(resource_base, face)
            for face in ('f', 'r', 'b', 'l', 'u', 'd')]
def _browser_headers(referer='https://www.720yun.com/'):
return {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'image/avif,image/webp,image/apng,image/*,*/*;q=0.9',
'Referer': referer,
}
def fetch_tour_json(json_path, base_url='https://www.720yun.com/'):
    """Request the scene JSON and return the parsed object.

    json_path is the window.json value, e.g. json/4ca3fae5e7x/.../3.json.
    Best effort: any network or parse failure yields None.
    """
    url = '{}/{}'.format(base_url.rstrip('/'), json_path.lstrip('/'))
    request = urllib.request.Request(url, headers=_browser_headers())
    try:
        with urllib.request.urlopen(request, timeout=15) as response:
            body = response.read().decode('utf-8', errors='replace')
            return json.loads(body)
    except Exception:
        return None
def download_to_file(url, dest_path):
    """Fetch url with browser-like headers and save the bytes to dest_path
    (a Path), so the payload matches what Chrome would receive."""
    request = urllib.request.Request(url, headers=_browser_headers())
    with urllib.request.urlopen(request, timeout=30) as response:
        payload = response.read()
    dest_path.write_bytes(payload)
def extract_image_urls_from_tour(tour_data, resource_base):
    """Recursively pull every image URL out of a krpano/720 scene JSON blob.

    Returns absolute URLs, de-duplicated while preserving first-seen order.
    Relative values are resolved against resource_base.
    """
    url_keys = ('url', 'panorama', 'image', 'src', 'path', 'thumbUrl', 'basePath')
    found = []

    def walk(node):
        if isinstance(node, dict):
            for key, value in node.items():
                if key in url_keys:
                    # Only absolute URLs or root-relative paths qualify;
                    # other values under these keys are ignored entirely.
                    if isinstance(value, str) and value.startswith(('http', '/')):
                        if value.startswith('http'):
                            found.append(value)
                        else:
                            found.append(resource_base.rstrip('/') + value)
                elif key == 'cubeMap' and isinstance(value, list):
                    for entry in value:
                        if isinstance(entry, str):
                            if entry.startswith('http'):
                                found.append(entry)
                            else:
                                found.append(resource_base.rstrip('/') + '/' + entry.lstrip('/'))
                else:
                    walk(value)
        elif isinstance(node, list):
            for element in node:
                walk(element)

    walk(tour_data)
    # dict.fromkeys de-duplicates while keeping insertion order.
    return list(dict.fromkeys(found))
def main():
    """CLI entry point: parse the saved document, resolve resource URLs,
    write parsed_720yun_resources.json, print a summary, and optionally
    fetch the scene JSON (--fetch) / download the images (--download)."""
    # Default input document; an explicit path may be given as the first
    # non-flag argument.
    doc_path = ROOT / 'text.md'
    if len(sys.argv) >= 2 and not sys.argv[1].startswith('-'):
        doc_path = Path(sys.argv[1])
    do_fetch = '--fetch' in sys.argv
    do_download = '--download' in sys.argv
    if not doc_path.exists():
        print('未找到文档:', doc_path)
        sys.exit(1)
    html = read_doc(doc_path)
    data = parse_window_data(html)
    json_path = parse_window_json(html)
    if not data and not json_path:
        print('未能从文档中解析出 window.data 或 window.json')
        sys.exit(1)
    # Aggregated result; serialized to parsed_720yun_resources.json below.
    result = {
        'window_data': data,
        'window_json_path': json_path,
        'resource_base': None,
        'thumb_url': None,
        'inferred_cube_urls': [],
        'tour_json_url': None,
        'tour_image_urls': [],
    }
    if data:
        thumb = data.get('thumbUrl') or ''
        # Relative thumb paths live on the thumbnail CDN host.
        if thumb and not thumb.startswith('http'):
            result['thumb_url'] = 'https://thumb-t.720static.com' + thumb
        else:
            result['thumb_url'] = thumb or None
        result['resource_base'] = build_resource_base(thumb) if thumb else None
        result['tid'] = data.get('tid')
        result['name'] = data.get('name')
        result['sceneCount'] = data.get('sceneCount')
        if result['resource_base']:
            result['inferred_cube_urls'] = infer_cube_urls(result['resource_base'])
    if json_path:
        result['tour_json_url'] = 'https://www.720yun.com/' + json_path.lstrip('/')
        # NOTE(review): fetching is skipped when resource_base is unknown,
        # since relative URLs in the tour JSON could not be resolved then.
        if do_fetch and result['resource_base']:
            tour = fetch_tour_json(json_path)
            if tour:
                result['tour_image_urls'] = extract_image_urls_from_tour(tour, result['resource_base'])
            else:
                print('请求场景 JSON 失败:', result['tour_json_url'], file=sys.stderr)
    out_path = ROOT / 'parsed_720yun_resources.json'
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    print('已写入:', out_path)
    # Gather every distinct URL (thumb, inferred cube faces, tour JSON hits)
    # for the summary printout; tour URLs are skipped if already listed.
    all_urls = []
    if result.get('thumb_url'):
        all_urls.append(('thumb', result['thumb_url']))
    for u in result.get('inferred_cube_urls', []):
        all_urls.append(('cube', u))
    for u in result.get('tour_image_urls', []):
        if not any(u == x for _, x in all_urls):
            all_urls.append(('tour', u))
    print('\n--- 解析出的图片资源 ---')
    for kind, url in all_urls:
        print(kind, url)
    print('\n', len(all_urls), '个 URL')
    if do_download and result.get('resource_base'):
        out_dir = ROOT / 'image'
        out_dir.mkdir(exist_ok=True)
        print('\n--- 使用浏览器头下载到 image/ ---')
        # Download the thumbnail plus the six inferred cube faces, naming
        # each file so it matches the local image/mobile_*.jpg convention.
        for face, url in [('thumb', result.get('thumb_url'))] + list(zip(['mobile_f', 'mobile_r', 'mobile_b', 'mobile_l', 'mobile_u', 'mobile_d'], result.get('inferred_cube_urls', []))):
            if not url:
                continue
            name = face + '.jpg' if face != 'thumb' else 'thumb.jpg'
            dest = out_dir / name
            try:
                download_to_file(url, dest)
                print('OK', name)
            except Exception as e:
                # Best effort: report the failed face and continue.
                print('FAIL', name, e)
    return result


if __name__ == '__main__':
    main()