236 lines
8.2 KiB
Python
236 lines
8.2 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
从 text.md(720yun 页面保存的文档)中解析 window.data / window.json,
|
||
并解析出最终的全景图片资源 URL。
|
||
|
||
用法:
|
||
python3 parse_720yun_doc.py [text.md]
|
||
python3 parse_720yun_doc.py --fetch # 并请求场景 JSON,解析出所有图片 URL
|
||
"""
|
||
import re
|
||
import sys
|
||
import json
|
||
import urllib.request
|
||
from pathlib import Path
|
||
|
||
|
||
def read_doc(path):
    """Return the full text of the file at *path*.

    The file is decoded as UTF-8; undecodable bytes are replaced with
    U+FFFD instead of raising, so a partially corrupted page dump can
    still be scanned.
    """
    with open(path, 'r', encoding='utf-8', errors='replace') as f:
        return f.read()
|
||
|
||
|
||
def parse_window_data(html):
    """Parse the object assigned by ``window.data={...}`` / ``window.data = {...}``.

    Only the first ``window.data = {`` occurrence in *html* is considered.
    The object may be arbitrarily nested and may be followed by any other
    text (``;``, more script, markup).

    Returns the decoded ``dict``, or ``None`` when no assignment is found
    or the object literal is not valid JSON (for example it is truncated
    or uses JavaScript-only syntax such as single-quoted strings).
    """
    m = re.search(r'window\.data\s*=\s*\{', html)
    if not m:
        return None
    start = m.end() - 1  # index of the opening '{'
    try:
        # raw_decode parses exactly one JSON value beginning at `start`
        # and ignores whatever follows it, so no manual brace matching
        # or string/escape tracking is needed.
        obj, _end = json.JSONDecoder().raw_decode(html, start)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    return obj
|
||
|
||
|
||
def parse_window_json(html):
    """Return the string assigned by ``window.json="..."`` (or ``'...'``).

    The closing quote must be the same character as the opening quote, so
    a value containing the other quote character is returned intact.
    Returns ``None`` when no non-empty quoted assignment is found.
    """
    m = re.search(r'window\.json\s*=\s*(["\'])((?:(?!\1).)+)\1', html)
    return m.group(2) if m else None
|
||
|
||
|
||
# Actual CDN host of 720yun panorama images: the domain the browser can reach.
# It differs from resource-t, and this domain must be used to fetch correctly.
RESOURCE_CDN_HOST = 'ssl-panoimg130.720static.com'


def build_resource_base(thumb_url):
    """Derive the resource directory base URL from a thumbnail URL.

    *thumb_url* may be an absolute ``http(s)://`` URL, a protocol-relative
    ``//host/...`` URL, or a bare path such as
    ``/resource/prod/4ca3fae5e7x/d22jkguytw6/59446768/imgs/thumb.jpg``
    (a missing leading slash is tolerated). Any scheme and host are
    discarded and replaced by ``https://`` + ``RESOURCE_CDN_HOST``, because
    the panorama images live on that CDN host rather than on resource-t.

    Returns the URL of the directory containing the thumbnail, always
    ending in ``/``, ready to have file names such as cube faces appended.
    """
    # Drop an optional scheme plus the host, keeping only the path part.
    path = re.sub(r'^(?:https?:)?//[^/]*', '', thumb_url)
    if not path.startswith('/'):
        path = '/' + path
    directory = path.rsplit('/', 1)[0] + '/'
    return 'https://' + RESOURCE_CDN_HOST + directory
|
||
|
||
|
||
def infer_cube_urls(resource_base):
    """Infer the six cube-face image URLs under *resource_base*.

    Uses the ``mobile_<face>.jpg`` naming (matching the local
    ``image/mobile_*.jpg`` files), in the order front, right, back, left,
    up, down. A non-empty *resource_base* lacking a trailing ``/`` gets
    one added so the file name is not glued onto the last path segment.
    """
    if resource_base and not resource_base.endswith('/'):
        resource_base += '/'
    faces = ('f', 'r', 'b', 'l', 'u', 'd')  # front right back left up down
    return [resource_base + 'mobile_' + face + '.jpg' for face in faces]
|
||
|
||
|
||
# Request headers matching a browser. Per the original author's note, the
# 720yun CDN checks Referer and does not return correct data without it.
def _browser_headers(referer='https://www.720yun.com/'):
    """Return a fresh dict of browser-like HTTP request headers.

    *referer* is sent as the ``Referer`` header; ``User-Agent`` and
    ``Accept`` are fixed values imitating desktop Chrome.
    """
    return {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'image/avif,image/webp,image/apng,image/*,*/*;q=0.9',
        'Referer': referer,
    }
|
||
|
||
|
||
def fetch_tour_json(json_path, base_url='https://www.720yun.com/'):
    """Request and decode the scene JSON.

    *json_path* is the value of ``window.json``, e.g.
    ``json/4ca3fae5e7x/.../3.json``; it is joined onto *base_url* with
    exactly one ``/`` between them. The request is sent with the
    browser-like headers and a 15 second timeout.

    Returns the decoded JSON value, or ``None`` on any failure (network
    error, HTTP error, timeout, invalid JSON). This is deliberately
    best-effort, but the failure reason is reported on stderr rather
    than discarded.
    """
    url = base_url.rstrip('/') + '/' + json_path.lstrip('/')
    req = urllib.request.Request(url, headers=_browser_headers())
    try:
        with urllib.request.urlopen(req, timeout=15) as r:
            return json.loads(r.read().decode('utf-8', errors='replace'))
    except Exception as e:  # best-effort boundary: report, then signal failure
        print('fetch_tour_json failed:', url, repr(e), file=sys.stderr)
        return None
|
||
|
||
|
||
def download_to_file(url, dest_path):
    """Download *url* with browser-like headers and save it to *dest_path*.

    *dest_path* may be a ``pathlib.Path`` or a plain string path. The
    whole response body is read (30 second timeout) and the connection is
    closed before the file is written. Network and file-system errors
    propagate to the caller.
    """
    req = urllib.request.Request(url, headers=_browser_headers())
    with urllib.request.urlopen(req, timeout=30) as r:
        payload = r.read()
    Path(dest_path).write_bytes(payload)
|
||
|
||
|
||
def extract_image_urls_from_tour(tour_data, resource_base):
    """Recursively extract image URLs from a krpano/720 scene JSON value.

    Walks every dict and list inside *tour_data* and collects:

    * string values under the keys ``url``, ``panorama``, ``image``,
      ``src``, ``path``, ``thumbUrl`` and ``basePath`` that start with
      ``http`` (taken as-is) or ``/`` (appended to *resource_base* with
      its trailing slash removed);
    * string entries of a ``cubeMap`` list (taken as-is when they start
      with ``http``, otherwise joined onto *resource_base* with a single
      ``/``).

    Containers stored under any key, including the URL-ish keys above and
    non-string ``cubeMap`` entries, are traversed as well, so nested
    structures such as ``"image": {"url": ...}`` are not skipped.

    Returns the URLs in first-seen order with duplicates removed.
    """
    url_keys = frozenset(
        ('url', 'panorama', 'image', 'src', 'path', 'thumbUrl', 'basePath')
    )
    base = resource_base.rstrip('/')
    found = []

    def collect(obj):
        if isinstance(obj, dict):
            for key, value in obj.items():
                if isinstance(value, str):
                    if key in url_keys and value.startswith(('http', '/')):
                        # NOTE(review): a root-relative value is appended to
                        # the resource directory, not to the host root --
                        # confirm this matches how 720yun resolves such paths.
                        found.append(value if value.startswith('http') else base + value)
                elif key == 'cubeMap' and isinstance(value, list):
                    for item in value:
                        if isinstance(item, str):
                            found.append(
                                item if item.startswith('http')
                                else base + '/' + item.lstrip('/')
                            )
                        else:
                            collect(item)
                else:
                    collect(value)
        elif isinstance(obj, list):
            for item in obj:
                collect(item)

    collect(tour_data)
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(found))
|
||
|
||
|
||
def main():
    """Command-line entry point.

    Arguments (read from ``sys.argv``):

    * first argument not starting with ``-``: path of the saved page
      document; defaults to ``text.md`` next to this script. It may
      appear before or after the flags.
    * ``--fetch``: also request the scene JSON and extract image URLs.
    * ``--download``: also download the thumbnail and the inferred cube
      faces into an ``image/`` directory next to this script.

    Writes a summary to ``parsed_720yun_resources.json`` next to this
    script, prints the resolved URLs, and returns the summary dict.
    Exits with status 1 when the document is missing or neither
    ``window.data`` nor ``window.json`` can be parsed from it.
    """
    script_dir = Path(__file__).resolve().parent
    args = sys.argv[1:]

    # Take the first positional argument wherever it appears, so that
    # "--fetch text.md" works as well as "text.md --fetch".
    positional = [a for a in args if not a.startswith('-')]
    doc_path = Path(positional[0]) if positional else script_dir / 'text.md'
    do_fetch = '--fetch' in args
    do_download = '--download' in args

    if not doc_path.exists():
        print('未找到文档:', doc_path)
        sys.exit(1)

    html = read_doc(doc_path)
    data = parse_window_data(html)
    json_path = parse_window_json(html)

    if not data and not json_path:
        print('未能从文档中解析出 window.data 或 window.json')
        sys.exit(1)

    # Parsed results.
    result = {
        'window_data': data,
        'window_json_path': json_path,
        'resource_base': None,
        'thumb_url': None,
        'inferred_cube_urls': [],
        'tour_json_url': None,
        'tour_image_urls': [],
    }

    if data:
        thumb = data.get('thumbUrl') or ''
        if thumb and not thumb.startswith('http'):
            # Relative thumbnail paths are served from the thumb host; make
            # sure host and path stay separated by a slash.
            thumb_path = thumb if thumb.startswith('/') else '/' + thumb
            result['thumb_url'] = 'https://thumb-t.720static.com' + thumb_path
        else:
            result['thumb_url'] = thumb or None
        result['resource_base'] = build_resource_base(thumb) if thumb else None
        result['tid'] = data.get('tid')
        result['name'] = data.get('name')
        result['sceneCount'] = data.get('sceneCount')

    if result['resource_base']:
        result['inferred_cube_urls'] = infer_cube_urls(result['resource_base'])

    if json_path:
        result['tour_json_url'] = 'https://www.720yun.com/' + json_path.lstrip('/')
        # The scene JSON is only fetched when a resource base is known,
        # because relative URLs inside it are resolved against that base.
        if do_fetch and result['resource_base']:
            tour = fetch_tour_json(json_path)
            if tour:
                result['tour_image_urls'] = extract_image_urls_from_tour(
                    tour, result['resource_base']
                )
            else:
                print('请求场景 JSON 失败:', result['tour_json_url'], file=sys.stderr)

    # Output: write the JSON summary first, then list the final images.
    out_path = script_dir / 'parsed_720yun_resources.json'
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    print('已写入:', out_path)

    # Final image list: thumbnail, inferred cube faces, then any tour URL
    # not already listed.
    all_urls = []
    if result.get('thumb_url'):
        all_urls.append(('thumb', result['thumb_url']))
    all_urls.extend(('cube', u) for u in result.get('inferred_cube_urls', []))
    seen = {u for _, u in all_urls}
    for u in result.get('tour_image_urls', []):
        if u not in seen:
            seen.add(u)
            all_urls.append(('tour', u))

    print('\n--- 解析出的图片资源 ---')
    for kind, url in all_urls:
        print(kind, url)
    print('\n共', len(all_urls), '个 URL')

    if do_download and result.get('resource_base'):
        out_dir = script_dir / 'image'
        out_dir.mkdir(exist_ok=True)
        print('\n--- 使用浏览器头下载到 image/ ---')
        face_names = ['mobile_f', 'mobile_r', 'mobile_b', 'mobile_l', 'mobile_u', 'mobile_d']
        targets = [('thumb', result.get('thumb_url'))]
        targets += zip(face_names, result.get('inferred_cube_urls', []))
        for stem, url in targets:
            if not url:
                continue
            name = stem + '.jpg'
            try:
                download_to_file(url, out_dir / name)
                print('OK', name)
            except Exception as e:  # keep going: one failed file must not stop the rest
                print('FAIL', name, e)
    return result
|
||
|
||
|
||
# Run the CLI only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|