Files
hometown/scripts/fetch_720yun.py
2026-03-07 23:35:08 +08:00

252 lines
8.1 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
从 720yun 页面抓取全景资源并本地化。
脚本位于 scripts/,输出到项目根目录的 panorama/、config.json。
若页面由 JS 动态加载,请使用「手动获取」方式(见 README)。
用法(在项目根目录执行):
python3 scripts/fetch_720yun.py "https://www.720yun.com/vr/c8525usOunr"
"""
import re
import sys
import json
import urllib.request
import urllib.error
from pathlib import Path
# Project root directory: two levels up from this file's resolved path
# (the module docstring places this script in scripts/ under the root).
ROOT = Path(__file__).resolve().parent.parent
def fetch_html(url):
    """Fetch *url* and return the response body decoded as text.

    The request carries a desktop-browser User-Agent and times out after
    15 seconds. The body is decoded as UTF-8; undecodable bytes are
    replaced with U+FFFD rather than raising.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request fails (propagated from
            ``urllib.request.urlopen``).
    """
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    request = urllib.request.Request(url)
    request.add_header('User-Agent', user_agent)
    with urllib.request.urlopen(request, timeout=15) as response:
        body = response.read()
    return str(body, 'utf-8', 'replace')
def _scan_balanced(text, start, opener, closer, alt_opener, alt_closer):
    """Return the index of the bracket that closes ``text[start]``, or -1.

    Scanning starts at ``start`` (expected to hold ``opener``). Both bracket
    kinds share a single depth counter, but only ``closer`` can end the scan,
    and only when it brings the depth back to zero. Brackets inside single-
    or double-quoted strings are ignored, and a backslash inside a string
    escapes the following character.

    Args:
        text: The text being scanned.
        start: Index of the opening bracket.
        opener: Bracket character found at ``start``.
        closer: Character that closes ``opener``.
        alt_opener: The other kind of opening bracket that may be nested.
        alt_closer: Character that closes ``alt_opener``.

    Returns:
        Index of the matching ``closer``, or -1 if the text ends first.
    """
    depth = 0
    quote = None      # quote character of the string literal we are inside
    escaped = False   # True when the previous character was a backslash in a string
    for idx in range(start, len(text)):
        ch = text[idx]
        if escaped:
            escaped = False
            continue
        if quote:
            if ch == '\\':
                escaped = True
            elif ch == quote:
                quote = None
            continue
        if ch in '"\'':
            quote = ch
            continue
        if ch == opener:
            depth += 1
        elif ch == closer:
            depth -= 1
            if depth == 0:
                return idx
        elif ch == alt_opener:
            depth += 1
        elif ch == alt_closer:
            depth -= 1
    return -1


def _extract_balanced_json(html, start_marker, open_char='{', close_char='}'):
    """Extract bracket-balanced literals that follow each ``start_marker``.

    For every occurrence of ``start_marker`` in ``html``, whitespace
    (space, tab, CR, LF) and ``=`` characters after the marker are skipped.
    If the next character opens an object or an array, the text up to the
    matching closing bracket is captured. Occurrences followed by anything
    else, or whose bracket is never closed, are skipped.

    Args:
        html: Text to search.
        start_marker: Literal text that precedes the value, e.g. a variable name.
        open_char: Primary opening bracket. When it is ``'{'`` the secondary
            pair is ``[``/``]``; otherwise the secondary pair is ``{``/``}``.
        close_char: Character closing ``open_char``.

    Returns:
        A list of captured substrings, in order of appearance. The substrings
        are not validated as JSON.
    """
    if open_char == '{':
        other_open, other_close = '[', ']'
    else:
        other_open, other_close = '{', '}'

    results = []
    size = len(html)
    search_from = 0
    while True:
        pos = html.find(start_marker, search_from)
        if pos < 0:
            break
        # Resume right after this hit so overlapping occurrences are still seen.
        search_from = pos + 1

        start = pos + len(start_marker)
        # '\r' is included so "marker =\r\n{...}" on CRLF pages is recognised.
        while start < size and html[start] in ' \t\r\n=':
            start += 1
        if start >= size:
            continue

        first = html[start]
        if first == open_char:
            end = _scan_balanced(html, start, open_char, close_char,
                                 other_open, other_close)
        elif first == other_open:
            end = _scan_balanced(html, start, other_open, other_close,
                                 open_char, close_char)
        else:
            continue
        if end >= 0:
            results.append(html[start:end + 1])
    return results
def find_json_assignments(html):
    """Collect candidate JSON payloads and panorama values embedded in *html*.

    Two kinds of candidates are gathered:

    * bracket-balanced literals that follow one of the known global-state
      markers (extracted with ``_extract_balanced_json``);
    * the string value of every ``"panorama": "..."`` pair found by regex.

    Args:
        html: Page source to search.

    Returns:
        A list of unique candidate strings in first-seen order. Entries are
        either raw JSON text (starting with ``{`` or ``[``) or the captured
        ``panorama`` value.
    """
    markers = (
        'window.__INITIAL_STATE__',
        '__INITIAL_STATE__',
        'window.__DATA__',
        'window.__NUXT_DATA__',
    )
    found = []
    for marker in markers:
        found.extend(_extract_balanced_json(html, marker, '{', '}'))
    found.extend(
        m.group(1) for m in re.finditer(r'"panorama"\s*:\s*"([^"]+)"', html)
    )
    # '__INITIAL_STATE__' also matches inside 'window.__INITIAL_STATE__', so
    # the same payload is extracted twice; drop repeats but keep the order.
    return list(dict.fromkeys(found))
def find_image_urls(html):
    """Return image URLs in *html* that look like panorama resources.

    Two case-insensitive patterns are applied:

    * URLs on a known image host (a subdomain of 720yun.com, 720static.com,
      qpic.cn or gtimg.com) ending in jpg/jpeg/png/webp;
    * URLs on any host with a path segment starting with
      panorama/scene/photo/pano/vr, ending in one of the same extensions.

    Args:
        html: Page source to search.

    Returns:
        A list of unique URLs. Known-host matches come first, then the
        remaining path-based matches, each group in document order. The
        order is deterministic so that callers picking the first entry get
        the same URL on every run.
    """
    host_pattern = re.compile(
        # 720static.com is included to agree with _browser_headers, which
        # already treats it as a 720yun image host.
        r'https?://[^\s"\'<>]+?\.(?:720yun\.com|720static\.com|qpic\.cn|gtimg\.com)[^\s"\'<>]*\.(?:jpg|jpeg|png|webp)',
        re.I
    )
    path_pattern = re.compile(
        r'https?://[^\s"\'<>]+?/(?:panorama|scene|photo|pano|vr)[^\s"\'<>]*\.(?:jpg|jpeg|png|webp)',
        re.I
    )
    # A dict is used as an insertion-ordered set: duplicates collapse while
    # first-seen order is kept, and membership checks stay O(1).
    ordered = dict.fromkeys(host_pattern.findall(html))
    ordered.update(dict.fromkeys(path_pattern.findall(html)))
    return list(ordered)
def _browser_headers(url=''):
    """Build browser-like request headers for downloading *url*.

    Args:
        url: Target URL. When it contains ``720static.com`` or
            ``720yun.com`` a ``Referer`` pointing at the 720yun home page
            is added.

    Returns:
        A new dict with ``User-Agent`` and ``Accept`` headers, plus
        ``Referer`` for 720yun-hosted URLs.
    """
    headers = dict([
        ('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'),
        ('Accept', 'image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.9'),
    ])
    is_720_host = any(host in url for host in ('720static.com', '720yun.com'))
    if is_720_host:
        headers.update(Referer='https://www.720yun.com/')
    return headers
def download_file(url, dest_path):
    """Download *url* and write the response body to *dest_path*.

    The request uses the headers produced by ``_browser_headers`` and a
    30-second timeout. The whole body is read into memory before writing.

    Args:
        url: Address of the resource to download.
        dest_path: Path-like object providing ``write_bytes``; an existing
            file is overwritten.

    Raises:
        urllib.error.URLError: If the request fails (propagated from
            ``urllib.request.urlopen``).
        OSError: If the destination cannot be written.
    """
    request = urllib.request.Request(url, headers=_browser_headers(url))
    with urllib.request.urlopen(request, timeout=30) as response:
        payload = response.read()
        dest_path.write_bytes(payload)
# Keys whose string values are treated as image URLs when walking parsed JSON.
_URL_KEYS = ('url', 'panorama', 'image', 'src', 'pic')


def _collect_urls(obj, out):
    """Recursively append http(s) URLs stored under ``_URL_KEYS`` in *obj* to *out*."""
    if isinstance(obj, dict):
        for key, value in obj.items():
            if key in _URL_KEYS and isinstance(value, str) and value.startswith('http'):
                out.append(value)
            else:
                _collect_urls(value, out)
    elif isinstance(obj, list):
        for item in obj:
            _collect_urls(item, out)


def _urls_from_candidate(raw):
    """Return the image URLs contained in one string from find_json_assignments."""
    if raw.startswith('http'):
        # Already a bare URL (captured "panorama" value).
        return [raw]
    if not raw.startswith(('{', '[')):
        return []
    try:
        data = json.loads(raw)
    except json.JSONDecodeError:
        # Extracted text is JavaScript, not necessarily strict JSON; skip it.
        return []
    if isinstance(data, list) and data and isinstance(data[0], dict) and 'url' in data[0]:
        # A flat list of {"url": ...} records: take the URLs directly and
        # ignore any element that is not a dict instead of crashing on it.
        return [item['url'] for item in data
                if isinstance(item, dict) and item.get('url')]
    found = []
    _collect_urls(data, found)
    return found


def main():
    """Command-line entry point: fetch a 720yun page and localise its panorama.

    Reads the page URL from ``sys.argv[1]``, gathers candidate image URLs
    from the HTML and from embedded JSON, downloads the first candidate to
    ``ROOT/panorama/panorama.jpg`` and records it in ``ROOT/config.json``
    (existing ``type`` and ``title`` entries are kept).

    Exits with status 1 on a usage error, a failed page request, a failed
    download, or an unreadable ``config.json``; exits with status 0 when no
    candidate URL is found.
    """
    if len(sys.argv) < 2:
        print('用法: python3 scripts/fetch_720yun.py <720yun页面URL>')
        print('例: python3 scripts/fetch_720yun.py "https://www.720yun.com/vr/c8525usOunr"')
        sys.exit(1)
    url = sys.argv[1].strip()
    panorama_dir = ROOT / 'panorama'
    panorama_dir.mkdir(exist_ok=True)
    config_path = ROOT / 'config.json'

    print('正在请求页面...')
    try:
        html = fetch_html(url)
    except Exception as e:
        print('请求失败:', e)
        print('请使用 README 中的「手动获取」方式在浏览器中抓取资源。')
        sys.exit(1)

    image_urls = find_image_urls(html)
    for raw in find_json_assignments(html):
        image_urls.extend(_urls_from_candidate(raw))
    # Keep only http(s) strings (JSON "url" fields may hold other types) and
    # drop duplicates while preserving the discovery order.
    image_urls = list(dict.fromkeys(
        u for u in image_urls if isinstance(u, str) and u.startswith('http')
    ))
    if not image_urls:
        print('未在页面 HTML 中发现全景图 URL(页面可能由 JavaScript 动态加载)。')
        print('请按 README 使用浏览器开发者工具手动获取。')
        sys.exit(0)
    print('发现可能的全景图 URL:', len(image_urls))

    local_path = panorama_dir / 'panorama.jpg'
    try:
        first = image_urls[0]
        print('正在下载:', first[:80], '...')
        download_file(first, local_path)
        print('已保存到:', local_path)
    except Exception as e:
        print('下载失败:', e)
        sys.exit(1)

    config = {}
    if config_path.exists():
        try:
            with open(config_path, 'r', encoding='utf-8') as f:
                config = json.load(f)
        except json.JSONDecodeError as e:
            # Leave a corrupt file untouched rather than overwrite user data.
            print('config.json 解析失败,未做修改:', e)
            sys.exit(1)
        if not isinstance(config, dict):
            print('config.json 顶层不是 JSON 对象,未做修改。')
            sys.exit(1)
    config['panorama'] = 'panorama/panorama.jpg'
    config.setdefault('type', 'equirectangular')
    config.setdefault('title', '本地全景')
    with open(config_path, 'w', encoding='utf-8') as f:
        json.dump(config, f, ensure_ascii=False, indent=2)
    print('已更新 config.json。运行 npm start 后即可离线查看。')
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()