Files
hometown/parse_720yun_doc.py
2026-03-07 19:13:49 +08:00

236 lines
8.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
从 text.md720yun 页面保存的文档)中解析 window.data / window.json
并解析出最终的全景图片资源 URL。
用法:
python3 parse_720yun_doc.py [text.md]
python3 parse_720yun_doc.py --fetch # 并请求场景 JSON解析出所有图片 URL
"""
import re
import sys
import json
import urllib.request
from pathlib import Path
def read_doc(path):
    """Return the saved document's text, decoded as UTF-8.

    Undecodable bytes are replaced (errors='replace') so a partially
    mangled page export still parses.
    """
    return Path(path).read_text(encoding='utf-8', errors='replace')
def parse_window_data(html):
    """Parse ``window.data={...};`` / ``window.data = {...}`` from *html*.

    Scans forward from the opening brace, tracking nesting depth while
    skipping over quoted strings (both quote styles, with backslash
    escapes), then feeds the balanced ``{...}`` span to ``json.loads``.
    Returns the decoded dict, or None when the assignment is absent or the
    literal is not valid JSON (e.g. a single-quoted JS object).
    """
    match = re.search(r'window\.data\s*=\s*\{', html)
    if match is None:
        return None
    start = match.end() - 1  # index of the opening '{'
    depth = 0
    quote = None        # active string delimiter, or None outside strings
    skip_next = False   # previous char was a backslash inside a string
    for idx in range(start, len(html)):
        ch = html[idx]
        if skip_next:
            skip_next = False
            continue
        if quote is not None:
            if ch == '\\':
                skip_next = True
            elif ch == quote:
                quote = None
            continue
        if ch in ('"', "'"):
            quote = ch
            continue
        if ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                try:
                    return json.loads(html[start:idx + 1])
                except json.JSONDecodeError:
                    return None
    # Ran off the end without closing the outermost brace.
    return None
def parse_window_json(html):
    """Extract the quoted value of ``window.json="...";`` from *html*.

    Returns the string between the quotes, or None when no such
    assignment is present.
    """
    found = re.search(r'window\.json\s*=\s*["\']([^"\']+)["\']', html)
    if found is None:
        return None
    return found.group(1)
# Actual CDN host serving 720yun panorama tiles. The resource-t host seen in
# the page markup does not serve them to scripts; this host matches what the
# browser fetches.
RESOURCE_CDN_HOST = 'ssl-panoimg130.720static.com'


def build_resource_base(thumb_url):
    """Derive the resource-directory base URL from a thumbUrl.

    *thumb_url* is typically root-relative, e.g.
    "/resource/prod/4ca3fae5e7x/d22jkguytw6/59446768/imgs/thumb.jpg".
    The host is rewritten (or prepended) to RESOURCE_CDN_HOST so script
    fetches behave like the browser; the trailing filename is then dropped,
    leaving a base ending in '/' suitable for joining cube-face names.
    """
    if thumb_url.startswith('http'):
        rehosted = re.sub(r'^https?://[^/]+',
                          'https://' + RESOURCE_CDN_HOST, thumb_url)
    else:
        rehosted = 'https://{}{}'.format(RESOURCE_CDN_HOST, thumb_url)
    # rehosted always contains '/' (scheme prefix), so rpartition is safe.
    directory, _slash, _filename = rehosted.rpartition('/')
    return directory + '/'
def infer_cube_urls(resource_base):
    """Guess the six cube-face image URLs under *resource_base*.

    Uses 720yun's common ``mobile_<face>.jpg`` naming; faces are ordered
    front, right, back, left, up, down (matching local image/mobile_*.jpg).
    """
    return ['{}mobile_{}.jpg'.format(resource_base, side)
            for side in ('f', 'r', 'b', 'l', 'u', 'd')]
# 与浏览器一致的请求头720yun CDN 校验 Referer否则拿不到正确数据
def _browser_headers(referer='https://www.720yun.com/'):
return {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'image/avif,image/webp,image/apng,image/*,*/*;q=0.9',
'Referer': referer,
}
def fetch_tour_json(json_path, base_url='https://www.720yun.com/'):
    """Fetch and decode the scene/tour JSON.

    *json_path* is the value of ``window.json``, e.g.
    ``json/4ca3fae5e7x/.../3.json``; it is joined onto *base_url*.
    Returns the decoded JSON object, or None on any network or decode
    failure — callers treat None as "tour JSON unavailable".
    """
    url = base_url.rstrip('/') + '/' + json_path.lstrip('/')
    req = urllib.request.Request(url, headers=_browser_headers())
    try:
        with urllib.request.urlopen(req, timeout=15) as resp:
            return json.loads(resp.read().decode('utf-8', errors='replace'))
    # Deliberately broad: this is a best-effort fetch and every failure mode
    # (DNS, HTTP error, bad JSON) maps to the same None result.  The unused
    # bound exception variable from the original was dropped.
    except Exception:
        return None
def download_to_file(url, dest_path):
    """Fetch *url* with browser-like headers and save the bytes.

    *dest_path* is a pathlib.Path; using the same headers as the browser
    ensures the CDN serves the same content Chrome would receive.
    """
    request = urllib.request.Request(url, headers=_browser_headers())
    with urllib.request.urlopen(request, timeout=30) as response:
        payload = response.read()
    dest_path.write_bytes(payload)
def extract_image_urls_from_tour(tour_data, resource_base):
    """Recursively harvest every image URL from the krpano/720 scene JSON.

    String values under known URL-bearing keys are collected when they are
    absolute (``http...``) or root-relative (``/...``); relative ones are
    joined onto *resource_base*.  ``cubeMap`` lists get the same treatment
    entry by entry.  First-seen order is preserved; duplicates are dropped.
    """
    url_keys = frozenset(('url', 'panorama', 'image', 'src', 'path',
                          'thumbUrl', 'basePath'))
    base = resource_base.rstrip('/')
    found = []

    def walk(node):
        if isinstance(node, list):
            for item in node:
                walk(item)
            return
        if not isinstance(node, dict):
            return
        for key, value in node.items():
            if key in url_keys:
                # Note: non-string / non-URL values under these keys are
                # intentionally skipped, not recursed into.
                if isinstance(value, str) and (value.startswith('http')
                                               or value.startswith('/')):
                    found.append(value if value.startswith('http')
                                 else base + value)
            elif key == 'cubeMap' and isinstance(value, list):
                for entry in value:
                    if not isinstance(entry, str):
                        continue
                    if entry.startswith('http'):
                        found.append(entry)
                    else:
                        found.append(base + '/' + entry.lstrip('/'))
            else:
                walk(value)

    walk(tour_data)
    # dict.fromkeys de-duplicates while keeping insertion order.
    return list(dict.fromkeys(found))
def main():
    """CLI entry point: parse a saved 720yun page and summarize its resources.

    Optional argv: a document path (default: text.md beside this script),
    plus flags ``--fetch`` (also request the scene JSON and extract its
    image URLs) and ``--download`` (save thumb + cube faces into image/).
    Writes parsed_720yun_resources.json beside the script, prints the
    merged URL list, and exits with status 1 when the document is missing
    or yields neither window.data nor window.json.  Returns the result
    dict.
    """
    doc_path = Path(__file__).resolve().parent / 'text.md'
    # First positional (non-flag) argument overrides the default document.
    if len(sys.argv) >= 2 and not sys.argv[1].startswith('-'):
        doc_path = Path(sys.argv[1])
    do_fetch = '--fetch' in sys.argv
    do_download = '--download' in sys.argv
    if not doc_path.exists():
        print('未找到文档:', doc_path)
        sys.exit(1)
    html = read_doc(doc_path)
    data = parse_window_data(html)
    json_path = parse_window_json(html)
    if not data and not json_path:
        print('未能从文档中解析出 window.data 或 window.json')
        sys.exit(1)
    # Parsed-result summary; also serialized to JSON below.
    result = {
        'window_data': data,
        'window_json_path': json_path,
        'resource_base': None,
        'thumb_url': None,
        'inferred_cube_urls': [],
        'tour_json_url': None,
        'tour_image_urls': [],
    }
    if data:
        thumb = data.get('thumbUrl') or ''
        # Root-relative thumb paths are served from the thumbnail CDN host.
        if thumb and not thumb.startswith('http'):
            result['thumb_url'] = 'https://thumb-t.720static.com' + thumb
        else:
            result['thumb_url'] = thumb or None
        result['resource_base'] = build_resource_base(thumb) if thumb else None
        result['tid'] = data.get('tid')
        result['name'] = data.get('name')
        result['sceneCount'] = data.get('sceneCount')
        if result['resource_base']:
            result['inferred_cube_urls'] = infer_cube_urls(result['resource_base'])
    if json_path:
        result['tour_json_url'] = 'https://www.720yun.com/' + json_path.lstrip('/')
        # --fetch: pull the scene JSON and harvest every image URL from it.
        if do_fetch and result['resource_base']:
            tour = fetch_tour_json(json_path)
            if tour:
                result['tour_image_urls'] = extract_image_urls_from_tour(tour, result['resource_base'])
            else:
                print('请求场景 JSON 失败:', result['tour_json_url'], file=sys.stderr)
    # Output: write the JSON summary first, then print the final image list.
    out_path = Path(__file__).resolve().parent / 'parsed_720yun_resources.json'
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    print('已写入:', out_path)
    # Final image list: merge thumb + inferred cube faces + tour URLs,
    # skipping tour URLs already present under another kind.
    all_urls = []
    if result.get('thumb_url'):
        all_urls.append(('thumb', result['thumb_url']))
    for u in result.get('inferred_cube_urls', []):
        all_urls.append(('cube', u))
    for u in result.get('tour_image_urls', []):
        if not any(u == x for _, x in all_urls):
            all_urls.append(('tour', u))
    print('\n--- 解析出的图片资源 ---')
    for kind, url in all_urls:
        print(kind, url)
    print('\n', len(all_urls), '个 URL')
    # --download: best-effort save of thumb + six cube faces into image/,
    # reporting OK/FAIL per file without aborting on errors.
    if do_download and result.get('resource_base'):
        out_dir = Path(__file__).resolve().parent / 'image'
        out_dir.mkdir(exist_ok=True)
        print('\n--- 使用浏览器头下载到 image/ ---')
        for face, url in [('thumb', result.get('thumb_url'))] + list(zip(['mobile_f', 'mobile_r', 'mobile_b', 'mobile_l', 'mobile_u', 'mobile_d'], result.get('inferred_cube_urls', []))):
            if not url:
                continue
            name = face + '.jpg' if face != 'thumb' else 'thumb.jpg'
            dest = out_dir / name
            try:
                download_to_file(url, dest)
                print('OK', name)
            except Exception as e:
                print('FAIL', name, e)
    return result
# Run the CLI only when executed as a script, not on import.
if __name__ == '__main__':
    main()