fix:优化代码
This commit is contained in:
26
scripts/README.md
Normal file
26
scripts/README.md
Normal file
@@ -0,0 +1,26 @@
|
||||
# 720 云资源提取脚本
|
||||
|
||||
本目录为 Python 脚本,用于从 720yun 页面或保存的 HTML 中提取并下载全景图资源。**请在项目根目录下执行**,脚本会自动读写根目录下的 `text.md`、`image/`、`panorama/`、`config.json` 等。
|
||||
|
||||
## 脚本说明
|
||||
|
||||
| 脚本 | 用途 |
|
||||
|------|------|
|
||||
| **fetch_720yun.py** | 根据 720yun 页面 URL 抓取 HTML,解析其中的全景图 URL 并下载到 `panorama/panorama.jpg`,同时更新根目录 `config.json`。适用于页面内直接包含图片链接的情况。 |
|
||||
| **parse_720yun_doc.py** | 从项目根目录的 `text.md`(720yun 页面另存为的文档)解析 `window.data` / `window.json`,得到六面图、缩略图等 URL;可选 `--fetch` 请求场景 JSON,`--download` 将六面图 + 缩略图下载到根目录 `image/`。 |
|
||||
|
||||
## 使用示例
|
||||
|
||||
```bash
|
||||
# 在项目根目录 720yun-offline/ 下执行
|
||||
|
||||
# 方式一:按 URL 抓取(若页面由 JS 动态加载可能无结果)
|
||||
python3 scripts/fetch_720yun.py "https://www.720yun.com/vr/xxxxx"
|
||||
|
||||
# 方式二:先浏览器打开 720 链接,整页另存为 text.md 放到项目根目录,再解析并下载六面图
|
||||
python3 scripts/parse_720yun_doc.py # 仅解析,输出 parsed_720yun_resources.json
|
||||
python3 scripts/parse_720yun_doc.py --fetch # 解析并请求场景 JSON
|
||||
python3 scripts/parse_720yun_doc.py --download # 解析并将六面图、缩略图下载到 image/
|
||||
```
|
||||
|
||||
下载到 `image/` 的文件可直接被前端使用(`config.json` 中已配置 `image/mobile_*.jpg`)。
|
||||
251
scripts/fetch_720yun.py
Normal file
251
scripts/fetch_720yun.py
Normal file
@@ -0,0 +1,251 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
从 720yun 页面抓取全景资源并本地化。
|
||||
脚本位于 scripts/,输出到项目根目录的 panorama/、config.json。
|
||||
若页面由 JS 动态加载,请使用「手动获取」方式(见 README)。
|
||||
|
||||
用法(在项目根目录执行):
|
||||
python3 scripts/fetch_720yun.py "https://www.720yun.com/vr/c8525usOunr"
|
||||
"""
|
||||
import re
|
||||
import sys
|
||||
import json
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
from pathlib import Path
|
||||
|
||||
# Project root directory (one level above scripts/); all outputs are
# written relative to this path so the script can run from anywhere.
ROOT = Path(__file__).resolve().parent.parent
|
||||
|
||||
|
||||
def fetch_html(url):
    """Fetch *url* with a browser-like User-Agent and return the body as text.

    Undecodable bytes are replaced rather than raising.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    }
    request = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(request, timeout=15) as response:
        return response.read().decode('utf-8', errors='replace')
|
||||
|
||||
|
||||
def _extract_balanced_json(html, start_marker, open_char='{', close_char='}'):
|
||||
results = []
|
||||
i = 0
|
||||
if open_char == '{':
|
||||
other_open, other_close = '[', ']'
|
||||
else:
|
||||
other_open, other_close = '{', '}'
|
||||
while True:
|
||||
pos = html.find(start_marker, i)
|
||||
if pos < 0:
|
||||
break
|
||||
start = pos + len(start_marker)
|
||||
while start < len(html) and html[start] in ' \t\n=':
|
||||
start += 1
|
||||
if start >= len(html):
|
||||
i = pos + 1
|
||||
continue
|
||||
if html[start] == open_char:
|
||||
depth = 0
|
||||
in_string = None
|
||||
escape = False
|
||||
j = start
|
||||
while j < len(html):
|
||||
c = html[j]
|
||||
if escape:
|
||||
escape = False
|
||||
j += 1
|
||||
continue
|
||||
if c == '\\' and in_string:
|
||||
escape = True
|
||||
j += 1
|
||||
continue
|
||||
if in_string:
|
||||
if c == in_string:
|
||||
in_string = None
|
||||
j += 1
|
||||
continue
|
||||
if c in '"\'':
|
||||
in_string = c
|
||||
j += 1
|
||||
continue
|
||||
if c == open_char:
|
||||
depth += 1
|
||||
elif c == close_char:
|
||||
depth -= 1
|
||||
if depth == 0:
|
||||
results.append(html[start:j + 1])
|
||||
break
|
||||
elif c == other_open:
|
||||
depth += 1
|
||||
elif c == other_close:
|
||||
depth -= 1
|
||||
j += 1
|
||||
elif html[start] == other_open:
|
||||
depth = 0
|
||||
in_string = None
|
||||
escape = False
|
||||
j = start
|
||||
while j < len(html):
|
||||
c = html[j]
|
||||
if escape:
|
||||
escape = False
|
||||
j += 1
|
||||
continue
|
||||
if c == '\\' and in_string:
|
||||
escape = True
|
||||
j += 1
|
||||
continue
|
||||
if in_string:
|
||||
if c == in_string:
|
||||
in_string = None
|
||||
j += 1
|
||||
continue
|
||||
if c in '"\'':
|
||||
in_string = c
|
||||
j += 1
|
||||
continue
|
||||
if c == other_open:
|
||||
depth += 1
|
||||
elif c == other_close:
|
||||
depth -= 1
|
||||
if depth == 0:
|
||||
results.append(html[start:j + 1])
|
||||
break
|
||||
elif c == open_char:
|
||||
depth += 1
|
||||
elif c == close_char:
|
||||
depth -= 1
|
||||
j += 1
|
||||
i = pos + 1
|
||||
return results
|
||||
|
||||
|
||||
def find_json_assignments(html):
    """Collect JSON payloads assigned to known globals, plus direct panorama URLs.

    Returns a list of raw strings: balanced object/array spans for each known
    marker, followed by any "panorama": "..." string values found in *html*.
    """
    known_markers = (
        'window.__INITIAL_STATE__',
        '__INITIAL_STATE__',
        'window.__DATA__',
        'window.__NUXT_DATA__',
    )
    found = []
    for marker in known_markers:
        found.extend(_extract_balanced_json(html, marker, '{', '}'))
    found.extend(
        m.group(1) for m in re.finditer(r'"panorama"\s*:\s*"([^"]+)"', html)
    )
    return found
|
||||
|
||||
|
||||
def find_image_urls(html):
    """Return de-duplicated candidate panorama image URLs found in *html*.

    Two passes: URLs on known 720yun/Tencent CDN hosts, then URLs whose path
    contains a panorama-ish segment (panorama/scene/photo/pano/vr).
    """
    cdn_pattern = re.compile(
        r'https?://[^\s"\'<>]+?\.(?:720yun\.com|qpic\.cn|gtimg\.com)[^\s"\'<>]*\.(?:jpg|jpeg|png|webp)',
        re.I,
    )
    path_pattern = re.compile(
        r'https?://[^\s"\'<>]+?/(?:panorama|scene|photo|pano|vr)[^\s"\'<>]*\.(?:jpg|jpeg|png|webp)',
        re.I,
    )
    urls = list(set(cdn_pattern.findall(html)))
    seen = set(urls)
    for candidate in path_pattern.findall(html):
        if candidate not in seen:
            seen.add(candidate)
            urls.append(candidate)
    return urls
|
||||
|
||||
|
||||
def _browser_headers(url=''):
|
||||
h = {
|
||||
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Accept': 'image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.9',
|
||||
}
|
||||
if '720static.com' in url or '720yun.com' in url:
|
||||
h['Referer'] = 'https://www.720yun.com/'
|
||||
return h
|
||||
|
||||
|
||||
def download_file(url, dest_path):
    """Download *url* (with browser-like headers) and write the bytes to *dest_path*."""
    request = urllib.request.Request(url, headers=_browser_headers(url))
    with urllib.request.urlopen(request, timeout=30) as response:
        dest_path.write_bytes(response.read())
|
||||
|
||||
|
||||
def _collect_urls(obj, out):
    """Recursively append http(s) URL string values under image-ish keys to *out*.

    Hoisted to module level: the original re-defined this closure on every
    iteration of the JSON-candidate loop.
    """
    if isinstance(obj, dict):
        for k, v in obj.items():
            if k in ('url', 'panorama', 'image', 'src', 'pic') and isinstance(v, str) and v.startswith('http'):
                out.append(v)
            else:
                _collect_urls(v, out)
    elif isinstance(obj, list):
        for item in obj:
            _collect_urls(item, out)


def main():
    """CLI entry point.

    Fetches the given 720yun page, extracts candidate panorama URLs from the
    HTML and embedded JSON, downloads the first URL to panorama/panorama.jpg
    and points config.json at the local copy.  Exits 1 on fetch/download
    failure, 0 when no URL was found (manual fallback applies).
    """
    if len(sys.argv) < 2:
        print('用法: python3 scripts/fetch_720yun.py <720yun页面URL>')
        print('例: python3 scripts/fetch_720yun.py "https://www.720yun.com/vr/c8525usOunr"')
        sys.exit(1)
    url = sys.argv[1].strip()
    panorama_dir = ROOT / 'panorama'
    panorama_dir.mkdir(exist_ok=True)
    config_path = ROOT / 'config.json'

    print('正在请求页面...')
    try:
        html = fetch_html(url)
    except Exception as e:
        print('请求失败:', e)
        print('请使用 README 中的「手动获取」方式在浏览器中抓取资源。')
        sys.exit(1)

    image_urls = find_image_urls(html)

    for raw in find_json_assignments(html):
        try:
            if raw.startswith('http'):
                # Bare URL captured straight from a "panorama" string value.
                image_urls.append(raw)
                continue
            if not raw.startswith(('{', '[')):
                continue
            data = json.loads(raw)  # single parse; original had duplicated branches
            if raw.startswith('[') and data and isinstance(data[0], dict) and 'url' in data[0]:
                image_urls.extend([s.get('url') for s in data if s.get('url')])
                continue
            _collect_urls(data, image_urls)
        except (json.JSONDecodeError, TypeError):
            # Best-effort: skip malformed candidates silently, as before.
            pass

    # De-duplicate while keeping first-seen order.
    image_urls = list(dict.fromkeys(u for u in image_urls if u and u.startswith('http')))

    if not image_urls:
        print('未在页面 HTML 中发现全景图 URL(页面可能由 JavaScript 动态加载)。')
        print('请按 README 使用浏览器开发者工具手动获取。')
        sys.exit(0)

    print('发现可能的全景图 URL:', len(image_urls))
    local_path = panorama_dir / 'panorama.jpg'
    try:
        first = image_urls[0]
        print('正在下载:', first[:80], '...')
        download_file(first, local_path)
        print('已保存到:', local_path)
    except Exception as e:
        print('下载失败:', e)
        sys.exit(1)

    # Merge into the existing config (if any) and repoint the panorama path.
    if config_path.exists():
        with open(config_path, 'r', encoding='utf-8') as f:
            config = json.load(f)
    else:
        config = {}
    config['panorama'] = 'panorama/panorama.jpg'
    config.setdefault('type', 'equirectangular')
    config.setdefault('title', '本地全景')
    with open(config_path, 'w', encoding='utf-8') as f:
        json.dump(config, f, ensure_ascii=False, indent=2)
    print('已更新 config.json。运行 npm start 后即可离线查看。')
|
||||
|
||||
|
||||
# Run the CLI only when executed as a script (not on import).
if __name__ == '__main__':
    main()
|
||||
234
scripts/parse_720yun_doc.py
Normal file
234
scripts/parse_720yun_doc.py
Normal file
@@ -0,0 +1,234 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
从 text.md(720yun 页面保存的文档)中解析 window.data / window.json,
|
||||
并解析出最终的全景图片资源 URL。
|
||||
脚本位于 scripts/,读写路径均相对于项目根目录。
|
||||
|
||||
用法(在项目根目录执行):
|
||||
python3 scripts/parse_720yun_doc.py [text.md]
|
||||
python3 scripts/parse_720yun_doc.py --fetch # 并请求场景 JSON
|
||||
python3 scripts/parse_720yun_doc.py --download # 下载六面图到 image/
|
||||
"""
|
||||
import re
|
||||
import sys
|
||||
import json
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
|
||||
# Project root directory (one level above scripts/); read/write paths
# (text.md, image/, parsed_720yun_resources.json) are resolved against it.
ROOT = Path(__file__).resolve().parent.parent
|
||||
|
||||
|
||||
def read_doc(path):
    """Return the file contents as text, replacing undecodable UTF-8 bytes."""
    return Path(path).read_text(encoding='utf-8', errors='replace')
|
||||
|
||||
|
||||
def parse_window_data(html):
    """Parse the object assigned to ``window.data`` (nested braces supported).

    Finds ``window.data = {`` and scans forward, tracking brace depth while
    skipping quoted string literals (with backslash escapes), until the
    matching closing brace.  Returns the parsed dict, or None when the
    assignment is absent or the span is not valid JSON.
    """
    match = re.search(r'window\.data\s*=\s*\{', html)
    if not match:
        return None
    start = match.end() - 1  # index of the opening '{'
    depth = 0
    quote = None     # active string delimiter, if any
    escaped = False  # previous char was a backslash inside a string
    for i in range(start, len(html)):
        c = html[i]
        if escaped:
            escaped = False
            continue
        if quote:
            if c == '\\':
                escaped = True
            elif c == quote:
                quote = None
            continue
        if c in '"\'':
            quote = c
        elif c == '{':
            depth += 1
        elif c == '}':
            depth -= 1
            if depth == 0:
                try:
                    return json.loads(html[start:i + 1])
                except json.JSONDecodeError:
                    return None
    return None
|
||||
|
||||
|
||||
def parse_window_json(html):
    """Return the quoted string assigned to ``window.json``, or None."""
    match = re.search(r'window\.json\s*=\s*["\']([^"\']+)["\']', html)
    if match is None:
        return None
    return match.group(1)
|
||||
|
||||
|
||||
# Actual 720yun panorama CDN host reachable from a browser (differs from the
# resource-t host embedded in the page data; fetches need this host to work).
RESOURCE_CDN_HOST = 'ssl-panoimg130.720static.com'


def build_resource_base(thumb_url):
    """Derive the resource directory base URL (with trailing '/') from a thumbUrl.

    The host is rewritten to RESOURCE_CDN_HOST so that script downloads match
    what the browser fetches; the final path component (the thumb filename)
    is stripped off.
    """
    if thumb_url.startswith('http'):
        rebased = re.sub(r'^https?://[^/]+', 'https://' + RESOURCE_CDN_HOST, thumb_url)
    else:
        rebased = 'https://' + RESOURCE_CDN_HOST + thumb_url
    directory, _sep, _name = rebased.rpartition('/')
    return directory + '/'
|
||||
|
||||
|
||||
def infer_cube_urls(resource_base):
    """Infer the six cube-face URLs from 720yun's usual naming.

    Faces are front, right, back, left, up, down — matching the local
    image/mobile_*.jpg files.
    """
    return [f'{resource_base}mobile_{face}.jpg' for face in 'frblud']
|
||||
|
||||
|
||||
def _browser_headers(referer='https://www.720yun.com/'):
|
||||
return {
|
||||
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Accept': 'image/avif,image/webp,image/apng,image/*,*/*;q=0.9',
|
||||
'Referer': referer,
|
||||
}
|
||||
|
||||
|
||||
def fetch_tour_json(json_path, base_url='https://www.720yun.com/'):
    """Fetch and parse the scene JSON at base_url + json_path.

    *json_path* is the window.json value, e.g. ``json/4ca3fae5e7x/.../3.json``.
    Returns the parsed object, or None on any network/parse failure.
    """
    full_url = '{}/{}'.format(base_url.rstrip('/'), json_path.lstrip('/'))
    request = urllib.request.Request(full_url, headers=_browser_headers())
    try:
        with urllib.request.urlopen(request, timeout=15) as response:
            return json.loads(response.read().decode('utf-8', errors='replace'))
    except Exception:
        return None
|
||||
|
||||
|
||||
def download_to_file(url, dest_path):
    """Fetch *url* with browser-like headers and save the bytes to *dest_path*.

    Using the same headers Chrome sends keeps the CDN response identical to
    what the browser would receive.
    """
    request = urllib.request.Request(url, headers=_browser_headers())
    with urllib.request.urlopen(request, timeout=30) as response:
        dest_path.write_bytes(response.read())
|
||||
|
||||
|
||||
def extract_image_urls_from_tour(tour_data, resource_base):
    """Recursively gather every image URL mentioned in a krpano/720 tour JSON.

    String values under known URL-ish keys are collected (relative paths are
    prefixed with *resource_base*); ``cubeMap`` lists are expanded entry by
    entry.  Returns the URLs de-duplicated in first-seen order.
    """
    url_keys = ('url', 'panorama', 'image', 'src', 'path', 'thumbUrl', 'basePath')
    base = resource_base.rstrip('/')
    found = []

    def walk(node):
        if isinstance(node, dict):
            for key, value in node.items():
                if key in url_keys:
                    if isinstance(value, str) and value.startswith(('http', '/')):
                        found.append(value if value.startswith('http') else base + value)
                elif key == 'cubeMap' and isinstance(value, list):
                    for entry in value:
                        if not isinstance(entry, str):
                            continue
                        if entry.startswith('http'):
                            found.append(entry)
                        else:
                            found.append(base + '/' + entry.lstrip('/'))
                else:
                    walk(value)
        elif isinstance(node, list):
            for entry in node:
                walk(entry)

    walk(tour_data)
    return list(dict.fromkeys(found))
|
||||
|
||||
|
||||
def main():
    """CLI entry point.

    Parses text.md (or a path given as argv[1]) for window.data/window.json,
    writes the findings to parsed_720yun_resources.json, prints every
    resolved image URL, and — with ``--fetch`` / ``--download`` — requests
    the scene JSON and downloads thumb + cube faces into image/.
    Returns the result dict; exits 1 when the document is missing or yields
    no data.
    """
    doc_path = ROOT / 'text.md'
    if len(sys.argv) >= 2 and not sys.argv[1].startswith('-'):
        doc_path = Path(sys.argv[1])
    do_fetch = '--fetch' in sys.argv
    do_download = '--download' in sys.argv

    if not doc_path.exists():
        print('未找到文档:', doc_path)
        sys.exit(1)

    html = read_doc(doc_path)
    data = parse_window_data(html)
    json_path = parse_window_json(html)

    if not data and not json_path:
        print('未能从文档中解析出 window.data 或 window.json')
        sys.exit(1)

    result = {
        'window_data': data,
        'window_json_path': json_path,
        'resource_base': None,
        'thumb_url': None,
        'inferred_cube_urls': [],
        'tour_json_url': None,
        'tour_image_urls': [],
    }

    if data:
        thumb = data.get('thumbUrl') or ''
        # Relative thumb paths live on the thumb-t host; absolute ones pass through.
        if thumb and not thumb.startswith('http'):
            result['thumb_url'] = 'https://thumb-t.720static.com' + thumb
        else:
            result['thumb_url'] = thumb or None
        result['resource_base'] = build_resource_base(thumb) if thumb else None
        result['tid'] = data.get('tid')
        result['name'] = data.get('name')
        result['sceneCount'] = data.get('sceneCount')

    if result['resource_base']:
        result['inferred_cube_urls'] = infer_cube_urls(result['resource_base'])

    if json_path:
        result['tour_json_url'] = 'https://www.720yun.com/' + json_path.lstrip('/')
        if do_fetch and result['resource_base']:
            tour = fetch_tour_json(json_path)
            if tour:
                result['tour_image_urls'] = extract_image_urls_from_tour(tour, result['resource_base'])
            else:
                print('请求场景 JSON 失败:', result['tour_json_url'], file=sys.stderr)

    out_path = ROOT / 'parsed_720yun_resources.json'
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    print('已写入:', out_path)

    all_urls = []
    if result.get('thumb_url'):
        all_urls.append(('thumb', result['thumb_url']))
    for u in result.get('inferred_cube_urls', []):
        all_urls.append(('cube', u))
    # Seen-set membership instead of the previous O(n^2) scan over all_urls.
    seen = {u for _, u in all_urls}
    for u in result.get('tour_image_urls', []):
        if u not in seen:
            seen.add(u)
            all_urls.append(('tour', u))

    print('\n--- 解析出的图片资源 ---')
    for kind, url in all_urls:
        print(kind, url)
    print('\n共', len(all_urls), '个 URL')

    if do_download and result.get('resource_base'):
        out_dir = ROOT / 'image'
        out_dir.mkdir(exist_ok=True)
        print('\n--- 使用浏览器头下载到 image/ ---')
        face_names = ['mobile_f', 'mobile_r', 'mobile_b', 'mobile_l', 'mobile_u', 'mobile_d']
        targets = [('thumb', result.get('thumb_url'))]
        targets += list(zip(face_names, result.get('inferred_cube_urls', [])))
        for face, url in targets:
            if not url:
                continue
            # The old `face + '.jpg' if face != 'thumb' else 'thumb.jpg'` was a
            # no-op conditional: both arms produced the same filename.
            name = face + '.jpg'
            dest = out_dir / name
            try:
                download_to_file(url, dest)
                print('OK', name)
            except Exception as e:
                print('FAIL', name, e)
    return result
|
||||
|
||||
|
||||
# Run the CLI only when executed as a script (not on import).
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user