252 lines
8.1 KiB
Python
252 lines
8.1 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
从 720yun 页面抓取全景资源并本地化。
|
||
脚本位于 scripts/,输出到项目根目录的 panorama/、config.json。
|
||
若页面由 JS 动态加载,请使用「手动获取」方式(见 README)。
|
||
|
||
用法(在项目根目录执行):
|
||
python3 scripts/fetch_720yun.py "https://www.720yun.com/vr/c8525usOunr"
|
||
"""
|
||
import re
|
||
import sys
|
||
import json
|
||
import urllib.request
|
||
import urllib.error
|
||
from pathlib import Path
|
||
|
||
# Project root: the parent of the directory that contains this script.
ROOT = Path(__file__).resolve().parent.parent
|
||
|
||
|
||
def fetch_html(url):
    """Download ``url`` and return the response body as text.

    The request carries a desktop-browser ``User-Agent`` header and uses a
    15-second timeout. The body is decoded as UTF-8; bytes that cannot be
    decoded are replaced instead of raising.
    """
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    request = urllib.request.Request(url)
    request.add_header('User-Agent', user_agent)
    with urllib.request.urlopen(request, timeout=15) as response:
        raw_body = response.read()
    return raw_body.decode('utf-8', errors='replace')
|
||
|
||
|
||
def _find_balanced_end(text, start, openers, closers, primary_close):
    """Return the index of the bracket closing the literal that opens at ``start``.

    Nesting depth is tracked across both bracket kinds in ``openers`` /
    ``closers``. Brackets inside single- or double-quoted strings are
    ignored, and a backslash inside a string escapes the next character.
    Returns -1 when ``text`` ends before the literal is closed.
    """
    depth = 0
    quote = None
    escaped = False
    for idx in range(start, len(text)):
        ch = text[idx]
        if escaped:
            escaped = False
        elif quote:
            if ch == '\\':
                escaped = True
            elif ch == quote:
                quote = None
        elif ch in ('"', "'"):
            quote = ch
        elif ch in openers:
            depth += 1
        elif ch in closers:
            depth -= 1
            # Only the closer matching the opening bracket ends the literal.
            if depth == 0 and ch == primary_close:
                return idx
    return -1


def _extract_balanced_json(html, start_marker, open_char='{', close_char='}'):
    """Return every balanced bracketed literal that follows ``start_marker``.

    For each occurrence of ``start_marker`` in ``html``, whitespace and ``=``
    after the marker are skipped. If the next character opens an object or an
    array, the text up to its matching closing bracket is collected.

    Args:
        html: Text to scan.
        start_marker: Substring that precedes the literal, e.g. a variable name.
        open_char: Opening bracket of the primary pair. When it is ``{`` the
            secondary pair is ``[``/``]``; otherwise it is ``{``/``}``.
        close_char: Closing bracket of the primary pair.

    Returns:
        A list of raw substrings in order of appearance. Occurrences that are
        not followed by a bracket, or whose literal is never closed, are
        skipped. The substrings are not validated as JSON.
    """
    if open_char == '{':
        other_open, other_close = '[', ']'
    else:
        other_open, other_close = '{', '}'
    # open_char is inserted last so it takes precedence, as the primary pair.
    close_for = {other_open: other_close, open_char: close_char}
    openers = (open_char, other_open)
    closers = (close_char, other_close)

    results = []
    search_from = 0
    while True:
        pos = html.find(start_marker, search_from)
        if pos < 0:
            break
        # Always resume one character later so overlapping markers are seen.
        search_from = pos + 1
        start = pos + len(start_marker)
        # '\r' is skipped too so that CRLF line endings do not hide the literal.
        while start < len(html) and html[start] in ' \t\r\n=':
            start += 1
        if start >= len(html):
            continue
        primary_close = close_for.get(html[start])
        if primary_close is None:
            continue
        end = _find_balanced_end(html, start, openers, closers, primary_close)
        if end >= 0:
            results.append(html[start:end + 1])
    return results
|
||
|
||
|
||
def find_json_assignments(html):
    """Collect raw JSON candidates and ``"panorama"`` values embedded in ``html``.

    Two sources are combined, in this order:

    1. Bracketed literals that follow one of the known state variable names,
       as extracted by ``_extract_balanced_json``.
    2. The string value of every ``"panorama": "..."`` pair found in the text.

    Returns:
        A list of unique strings in first-seen order. Entries from the first
        source are raw text and may not be valid JSON.
    """
    markers = (
        'window.__INITIAL_STATE__',
        '__INITIAL_STATE__',
        'window.__DATA__',
        'window.__NUXT_DATA__',
    )
    candidates = []
    for marker in markers:
        candidates.extend(_extract_balanced_json(html, marker, '{', '}'))

    for match in re.finditer(r'"panorama"\s*:\s*"([^"]+)"', html):
        value = match.group(1)
        # The capture is the escaped form of a JSON string (e.g. "https:\/\/");
        # decode it, and keep the raw text when it is not a valid JSON string.
        try:
            value = json.loads('"' + value + '"')
        except json.JSONDecodeError:
            pass
        candidates.append(value)

    # '__INITIAL_STATE__' also matches inside 'window.__INITIAL_STATE__', so
    # the same literal can be extracted twice; drop repeats, keep the order.
    return list(dict.fromkeys(candidates))
|
||
|
||
|
||
def find_image_urls(html):
    """Return the image URLs found in ``html``, without duplicates.

    Two case-insensitive patterns are applied, both requiring a ``jpg``,
    ``jpeg``, ``png`` or ``webp`` extension:

    * URLs on a subdomain of ``720yun.com``, ``720static.com``, ``qpic.cn``
      or ``gtimg.com``.
    * URLs on any host whose path contains a segment starting with
      ``panorama``, ``scene``, ``photo``, ``pano`` or ``vr``.

    Host matches come first, then path matches, each in document order, so
    the result is the same on every run.
    """
    host_pattern = re.compile(
        r'https?://[^\s"\'<>]+?\.(?:720yun\.com|720static\.com|qpic\.cn|gtimg\.com)[^\s"\'<>]*\.(?:jpg|jpeg|png|webp)',
        re.I,
    )
    path_pattern = re.compile(
        r'https?://[^\s"\'<>]+?/(?:panorama|scene|photo|pano|vr)[^\s"\'<>]*\.(?:jpg|jpeg|png|webp)',
        re.I,
    )
    matches = host_pattern.findall(html) + path_pattern.findall(html)
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(matches))
|
||
|
||
|
||
def _browser_headers(url=''):
    """Build browser-like request headers for fetching ``url``.

    The result always has ``User-Agent`` and ``Accept``. A ``Referer``
    pointing at the 720yun home page is added when ``url`` contains
    ``720static.com`` or ``720yun.com``.
    """
    headers = dict([
        ('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'),
        ('Accept', 'image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.9'),
    ])
    needs_referer = any(host in url for host in ('720static.com', '720yun.com'))
    if needs_referer:
        headers['Referer'] = 'https://www.720yun.com/'
    return headers
|
||
|
||
|
||
def download_file(url, dest_path):
    """Fetch ``url`` and write the response body to ``dest_path``.

    The request uses the headers from ``_browser_headers(url)`` and a
    30-second timeout. The whole body is read into memory and written with
    ``dest_path.write_bytes``, so ``dest_path`` must provide that method
    (as ``pathlib.Path`` does).
    """
    request = urllib.request.Request(url, headers=_browser_headers(url))
    with urllib.request.urlopen(request, timeout=30) as response:
        payload = response.read()
        dest_path.write_bytes(payload)
|
||
|
||
|
||
def _collect_urls(obj, out):
    """Recursively append image-like URLs found in ``obj`` to ``out``.

    A value is taken when it is a string starting with ``http`` stored under
    one of the keys ``url``, ``panorama``, ``image``, ``src`` or ``pic``.
    Every other dict value and every list item is searched recursively.
    """
    if isinstance(obj, dict):
        for key, value in obj.items():
            if (key in ('url', 'panorama', 'image', 'src', 'pic')
                    and isinstance(value, str) and value.startswith('http')):
                out.append(value)
            else:
                _collect_urls(value, out)
    elif isinstance(obj, list):
        for item in obj:
            _collect_urls(item, out)


def _urls_from_candidate(raw):
    """Return the URLs carried by one string from ``find_json_assignments``.

    ``raw`` is either a bare URL, or text starting with ``{`` / ``[`` that is
    parsed as JSON. Text that is neither, or that fails to parse, yields an
    empty list.
    """
    if raw.startswith('http'):
        return [raw]
    if not raw.startswith(('{', '[')):
        return []
    try:
        data = json.loads(raw)
    except json.JSONDecodeError:
        return []
    # A top-level array whose first item has a 'url' key: take each item's
    # 'url' directly. Non-dict items are skipped instead of raising.
    if isinstance(data, list) and data and isinstance(data[0], dict) and 'url' in data[0]:
        return [item['url'] for item in data
                if isinstance(item, dict) and item.get('url')]
    found = []
    _collect_urls(data, found)
    return found


def main():
    """Fetch a 720yun page, download its first panorama image, update config.

    Reads the page URL from ``sys.argv[1]``. The first discovered image URL
    is saved as ``panorama/panorama.jpg`` under ``ROOT``, and ``config.json``
    under ``ROOT`` is created or updated to point at it.

    Exits with status 1 when the URL argument is missing, the page request
    fails, or the download fails; exits with status 0 when no image URL is
    found in the page.
    """
    if len(sys.argv) < 2:
        print('用法: python3 scripts/fetch_720yun.py <720yun页面URL>')
        print('例: python3 scripts/fetch_720yun.py "https://www.720yun.com/vr/c8525usOunr"')
        sys.exit(1)
    url = sys.argv[1].strip()
    panorama_dir = ROOT / 'panorama'
    panorama_dir.mkdir(exist_ok=True)
    config_path = ROOT / 'config.json'

    print('正在请求页面...')
    try:
        html = fetch_html(url)
    except Exception as e:
        print('请求失败:', e)
        print('请使用 README 中的「手动获取」方式在浏览器中抓取资源。')
        sys.exit(1)

    # URLs matched directly in the HTML come first, then those found in JSON.
    image_urls = find_image_urls(html)
    for raw in find_json_assignments(html):
        image_urls.extend(_urls_from_candidate(raw))

    # Keep only http(s) strings and drop duplicates, preserving order. The
    # isinstance check guards against non-string 'url' values taken from JSON.
    image_urls = list(dict.fromkeys(
        u for u in image_urls if isinstance(u, str) and u.startswith('http')
    ))

    if not image_urls:
        print('未在页面 HTML 中发现全景图 URL(页面可能由 JavaScript 动态加载)。')
        print('请按 README 使用浏览器开发者工具手动获取。')
        sys.exit(0)

    print('发现可能的全景图 URL:', len(image_urls))
    local_path = panorama_dir / 'panorama.jpg'
    try:
        first = image_urls[0]
        print('正在下载:', first[:80], '...')
        download_file(first, local_path)
        print('已保存到:', local_path)
    except Exception as e:
        print('下载失败:', e)
        sys.exit(1)

    if config_path.exists():
        with open(config_path, 'r', encoding='utf-8') as f:
            config = json.load(f)
    else:
        config = {}
    config['panorama'] = 'panorama/panorama.jpg'
    # Existing 'type' and 'title' values are kept; defaults fill in only
    # when the keys are missing.
    config['type'] = config.get('type', 'equirectangular')
    config['title'] = config.get('title', '本地全景')
    with open(config_path, 'w', encoding='utf-8') as f:
        json.dump(config, f, ensure_ascii=False, indent=2)
    print('已更新 config.json。运行 npm start 后即可离线查看。')
|
||
|
||
|
||
# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
|