Files
hometown/fetch_720yun.py
2026-03-07 19:13:49 +08:00

256 lines
9.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
从 720yun 页面抓取全景资源并本地化。
用法: python3 fetch_720yun.py "https://www.720yun.com/vr/c8525usOunr"
若页面由 JS 动态加载,请使用「手动获取」方式(见 README
"""
import re
import sys
import json
import urllib.request
import urllib.error
from pathlib import Path
def fetch_html(url):
req = urllib.request.Request(url, headers={
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
})
with urllib.request.urlopen(req, timeout=15) as r:
return r.read().decode('utf-8', errors='replace')
def _extract_balanced_json(html, start_marker, open_char='{', close_char='}'):
"""从 html 中查找 start_marker 后紧跟的完整 JSON匹配括号"""
results = []
i = 0
if open_char == '{':
other_open, other_close = '[', ']'
else:
other_open, other_close = '{', '}'
while True:
pos = html.find(start_marker, i)
if pos < 0:
break
start = pos + len(start_marker)
# 跳过空白与等号
while start < len(html) and html[start] in ' \t\n=':
start += 1
if start >= len(html):
i = pos + 1
continue
if html[start] == open_char:
depth = 0
in_string = None
escape = False
j = start
while j < len(html):
c = html[j]
if escape:
escape = False
j += 1
continue
if c == '\\' and in_string:
escape = True
j += 1
continue
if in_string:
if c == in_string:
in_string = None
j += 1
continue
if c in '"\'':
in_string = c
j += 1
continue
if c == open_char:
depth += 1
elif c == close_char:
depth -= 1
if depth == 0:
results.append(html[start:j + 1])
break
elif c == other_open:
depth += 1
elif c == other_close:
depth -= 1
j += 1
elif html[start] == other_open:
depth = 0
in_string = None
escape = False
j = start
while j < len(html):
c = html[j]
if escape:
escape = False
j += 1
continue
if c == '\\' and in_string:
escape = True
j += 1
continue
if in_string:
if c == in_string:
in_string = None
j += 1
continue
if c in '"\'':
in_string = c
j += 1
continue
if c == other_open:
depth += 1
elif c == other_close:
depth -= 1
if depth == 0:
results.append(html[start:j + 1])
break
elif c == open_char:
depth += 1
elif c == close_char:
depth -= 1
j += 1
i = pos + 1
return results
def find_json_assignments(html):
"""查找页面中常见的 __INITIAL_STATE__、window.__DATA__ 等 JSON 赋值(支持嵌套)。"""
markers = [
'window.__INITIAL_STATE__',
'__INITIAL_STATE__',
'window.__DATA__',
'window.__NUXT_DATA__',
]
results = []
for marker in markers:
results.extend(_extract_balanced_json(html, marker, '{', '}'))
# 也尝试匹配 "panorama":"url" 或 "scenes":[...] 的简单模式
for m in re.finditer(r'"panorama"\s*:\s*"([^"]+)"', html):
results.append(m.group(1))
return results
def find_image_urls(html):
"""从 HTML 中提取可能是全景图的 URL720yun CDN 等)。"""
# 常见 720 云图片域名
url_pattern = re.compile(
r'https?://[^\s"\'<>]+?\.(?:720yun\.com|qpic\.cn|gtimg\.com)[^\s"\'<>]*\.(?:jpg|jpeg|png|webp)',
re.I
)
urls = list(set(url_pattern.findall(html)))
# 也匹配任意包含 panorama / scene / photo 的图片 URL
alt_pattern = re.compile(
r'https?://[^\s"\'<>]+?/(?:panorama|scene|photo|pano|vr)[^\s"\'<>]*\.(?:jpg|jpeg|png|webp)',
re.I
)
for u in alt_pattern.findall(html):
if u not in urls:
urls.append(u)
return urls
# 720yun CDN 会校验 Referer脚本请求需与浏览器一致才能拿到正确数据
def _browser_headers(url=''):
h = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.9',
}
if '720static.com' in url or '720yun.com' in url:
h['Referer'] = 'https://www.720yun.com/'
return h
def download_file(url, dest_path):
req = urllib.request.Request(url, headers=_browser_headers(url))
with urllib.request.urlopen(req, timeout=30) as r:
dest_path.write_bytes(r.read())
def main():
if len(sys.argv) < 2:
print('用法: python3 fetch_720yun.py <720yun页面URL>')
print('例: python3 fetch_720yun.py "https://www.720yun.com/vr/c8525usOunr"')
sys.exit(1)
url = sys.argv[1].strip()
base = Path(__file__).resolve().parent
panorama_dir = base / 'panorama'
panorama_dir.mkdir(exist_ok=True)
config_path = base / 'config.json'
print('正在请求页面...')
try:
html = fetch_html(url)
except Exception as e:
print('请求失败:', e)
print('请使用 README 中的「手动获取」方式在浏览器中抓取资源。')
sys.exit(1)
image_urls = find_image_urls(html)
json_candidates = find_json_assignments(html)
# 尝试从 JSON 中解析 panorama 或 scenes
for raw in json_candidates:
try:
if raw.startswith('http'):
image_urls.append(raw)
continue
if raw.startswith('{'):
data = json.loads(raw)
elif raw.startswith('['):
data = json.loads(raw)
if data and isinstance(data[0], dict) and 'url' in data[0]:
image_urls.extend([s.get('url') for s in data if s.get('url')])
continue
else:
continue
# 递归查找 url / panorama / image 字段
def collect_urls(obj, out):
if isinstance(obj, dict):
for k, v in obj.items():
if k in ('url', 'panorama', 'image', 'src', 'pic') and isinstance(v, str) and v.startswith('http'):
out.append(v)
else:
collect_urls(v, out)
elif isinstance(obj, list):
for x in obj:
collect_urls(x, out)
collect_urls(data, image_urls)
except (json.JSONDecodeError, TypeError):
pass
image_urls = list(dict.fromkeys([u for u in image_urls if u and u.startswith('http')]))
if not image_urls:
print('未在页面 HTML 中发现全景图 URL页面可能由 JavaScript 动态加载)。')
print('请按 README 使用浏览器开发者工具手动获取:')
print(' 1. 打开该 720yun 链接')
print(' 2. F12 -> Network -> 刷新 -> 筛选 Img 或 XHR')
print(' 3. 找到全景图或 scene 接口返回的图片 URL下载到 panorama/ 并命名为 panorama.jpg')
print(' 4. 确保 config.json 中 panorama 为 "panorama/panorama.jpg"')
sys.exit(0)
print('发现可能的全景图 URL:', len(image_urls))
local_path = panorama_dir / 'panorama.jpg'
try:
first = image_urls[0]
print('正在下载:', first[:80], '...')
download_file(first, local_path)
print('已保存到:', local_path)
except Exception as e:
print('下载失败:', e)
print('请手动将上面列出的任一 URL 在浏览器中打开并另存为 panorama/panorama.jpg')
sys.exit(1)
# 确保 config 指向本地
if config_path.exists():
with open(config_path, 'r', encoding='utf-8') as f:
config = json.load(f)
else:
config = {}
config['panorama'] = 'panorama/panorama.jpg'
config['type'] = config.get('type', 'equirectangular')
config['title'] = config.get('title', '本地全景')
with open(config_path, 'w', encoding='utf-8') as f:
json.dump(config, f, ensure_ascii=False, indent=2)
print('已更新 config.json。运行本地服务器后打开 index.html 即可离线查看。')
if __name__ == '__main__':
main()